You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 lines
6.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

extern "C" {
#include"hashTable.h"
#include"rbtree.h"
#include"List.h"
}
#include<iostream>
#include <fstream>
#include<algorithm>
#include<vector>
#include<string>
#define TRUE 1
#define FALSE 0
using namespace std;
vector<string> stopword; //stopword向量有一些词出现次数太多得不偿失意义不大如 a the
void Insertword(string voc, string filename, int address, hashTable * Index);//建立索引时要插入元素
void BuildIndex(hashTable *Index);// 建立索引
void collect_stopword();//把stopword加入向量
bool isstopword(string W);//判断stopword
string Filename(int N, int M);//用这种方式获取一个文件名
bool isword(char ch);//判断这个char是否合法
void collect_stopword()
{
ifstream fp;
string stop;
char ch;
fp.open("stopword.txt", ios_base::in | ios_base::binary);
if (!fp)
{
cout << "读取屏蔽词出错!" << endl;
return;
}
fp.get(ch);
while (fp)
{
stop = "";
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
if (!fp)
break;
while (isword(ch) && !fp.eof())
{
stop += ch;
fp.get(ch);
}
transform(stop.begin(), stop.end(), stop.begin(), tolower);
stopword.push_back(stop);
}
}
//判断是不是stopword——从上一个函数中取得了所有stopword拼成的字符串判断是否能在里面找到就行
bool isstopword(string W)
{
if (find(stopword.begin(), stopword.end(), W) == stopword.end())
return 0;
else
return 1;
}
//判断是否是单词
bool isword(char ch)
{
//标准传入ch在A-Z或者a-z中间或者是\配合n吧就算
int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\''));
return ((D == 1) ? TRUE : FALSE);
}
//构造文件名的函数因为示范文件是10xx x的格式所以这么构造
string Filename(int N, int M)
{
string res;
char file[15];
char chap[10];
_itoa_s(N, file, 10);
_itoa_s(M, chap, 10);
strcat_s(file, 15, " ");
strcat_s(file, 15, chap);
strcat_s(file, 15, ".txt");
res = file;
return res;
}
int hashfile(char*p)
{
int i = strlen(p);
int hash=0;
int j;
for (j = 0; j < i; j++)
{
if (p[j] >= '0'&&p[j] <= '9')
{
hash = hash * 10 + p[j] - '0';
}
}
return hash;
}
//整理扫描到的当前单词信息(在哪个文件的哪里),再插入 address标志了单词位置
void Insertword(string voc, string filename, int address, hashTable* t)
{
char p[50];
strcpy(p, voc.c_str());
char *fname = (char*)filename.c_str();
if (findValue(t,p)==NULL)//假如索引里头没有这个词我们就把他加进去
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBRoot *root;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(t, p, root);
//cout << "haha" << endl;
//inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value);
}
else//有的话就得更新它的数据
{
RBRoot *tempRB=findValue(t, p);
//如果小向量红黑树sour中没有找到当前文件名意思是现在找到的词是本文件中第一次出现
if (rbtree_search(tempRB,hashfile(fname))==NULL)
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(tempRB, *leaf);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
//insertEntry(t, p, tempRB);
}
//如果小向量sour中找到了当前文件名意思是现在找到的词在本文件中出现过NA向量已经建立过了直接加就行
else
{
RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname));
ListInsert(&templeaf->index, 1, address);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
}
}
}
void BuildIndex(hashTable *Index)
{
int cou;
int num;
string filename = "";
ifstream fp;
string voc = "";
int address;
char ch;
//这个双层循环是配合Filename函数自己写的完成->读所有文件的任务
for (cou = 1001; cou <= 1042; cou++)
{
for (num = 1; num <= 153; num++)
{
filename = Filename(cou, num);
fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary);
if (!fp.is_open())
continue;
cout << filename << endl;
address = 0;
fp.get(ch);
while (fp)//以前学过的逐个单词读
{
voc = "";
//不是单词且没到结尾就继续
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
//到结尾就退出(去下一个文件)
if (!fp)
break;
//是单词的操作就在String vocabulary中连缀上它
while (isword(ch) && !fp.eof())
{
voc += ch;
fp.get(ch);
}
transform(voc.begin(), voc.end(), voc.begin(), tolower); //转化成小写字母
//下标移动(数在第几个位置嘛)
address++;
//假如不是屏蔽词汇就整理单词信息再插入索引用之前的Insert函数
if (!isstopword(voc))
{
Insertword(voc, filename, address, Index);
//getchar();
}
}
fp.close();
}
}
}
void test() //测试功能专用
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, 10);
ListInsert(&L, 1, 30);
hashTable t;
initHashTable(&t);
RBRoot *root = NULL;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahah";
leaf->hashfile = 20;
leaf->index = L;
insert_rbtree(root, *leaf);
leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahaha";
leaf->hashfile = 10;
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(&t, "son", root);
insertEntry(&t, "on", root);
RBRoot *b = findValue(&t, "son");
inorder_rbtree(b);
getchar();
}
int main(){
//test();
hashTable t;
initHashTable(&t);
cout << "屏蔽词载入中..." << endl;
collect_stopword();
cout << "屏蔽词库建立完成" << endl;
cout << "倒排索引建立中..." << endl;
BuildIndex(&t);
cout << "倒排索引建立完成" << endl;
while (1) {
string s;
cin >> s;
char *p = (char *)s.c_str();
inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value);
}
//inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value);
getchar();
getchar();
}