extern "C" { #include"hashTable.h" #include"rbtree.h" #include"List.h" } #include #include #include #include #include #define TRUE 1 #define FALSE 0 using namespace std; vector stopword; //stopword向量(有一些词出现次数太多,得不偿失,意义不大如 a the) void Insertword(string voc, string filename, int address, hashTable * Index);//建立索引时要插入元素 void BuildIndex(hashTable *Index);// 建立索引 void collect_stopword();//把stopword加入向量 bool isstopword(string W);//判断stopword string Filename(int N, int M);//用这种方式获取一个文件名 bool isword(char ch);//判断这个char是否合法 void collect_stopword() { ifstream fp; string stop; char ch; fp.open("stopword.txt", ios_base::in | ios_base::binary); if (!fp) { cout << "读取屏蔽词出错!" << endl; return; } fp.get(ch); while (fp) { stop = ""; while (!isword(ch) && !fp.eof()) { fp.get(ch); } if (!fp) break; while (isword(ch) && !fp.eof()) { stop += ch; fp.get(ch); } transform(stop.begin(), stop.end(), stop.begin(), tolower); stopword.push_back(stop); } } //判断是不是stopword——从上一个函数中取得了所有stopword拼成的字符串,判断是否能在里面找到就行 bool isstopword(string W) { if (find(stopword.begin(), stopword.end(), W) == stopword.end()) return 0; else return 1; } //判断是否是单词 bool isword(char ch) { //标准:传入ch在A-Z或者a-z中间或者是\(配合n吧)就算 int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\'')); return ((D == 1) ? TRUE : FALSE); } //构造文件名的函数(因为示范文件是10xx x的格式所以这么构造) string Filename(int N, int M) { string res; char file[15]; char chap[10]; _itoa_s(N, file, 10); _itoa_s(M, chap, 10); strcat_s(file, 15, " "); strcat_s(file, 15, chap); strcat_s(file, 15, ".txt"); res = file; return res; } int hashfile(char*p) { int i = strlen(p); int hash=0; int j; for (j = 0; j < i; j++) { if (p[j] >= '0'&&p[j] <= '9') { hash = hash * 10 + p[j] - '0'; } } return hash; } //整理扫描到的当前单词信息(在哪个文件的哪里),再插入 , address标志了单词位置 void Insertword(string voc, string filename, int address, hashTable* t) { char p[50]; strcpy(p, voc.c_str()); char *fname = (char*)filename.c_str(); if (findValue(t,p)==NULL)//假如索引里头没有这个词我们就把他加进去 { LinkList L; InitList(&L); ListInsert(&L, 1, address); RBRoot *root; root = create_rbtree(); RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf)); strcpy(leaf->filename, filename.c_str()); leaf->hashfile = hashfile(fname); leaf->index = L; insert_rbtree(root, *leaf); insertEntry(t, p, root); //cout << "haha" << endl; //inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value); } else//有的话就得更新它的数据 { RBRoot *tempRB=findValue(t, p); //如果小向量红黑树sour中没有找到当前文件名(意思是现在找到的词是本文件中第一次出现) if (rbtree_search(tempRB,hashfile(fname))==NULL) { LinkList L; InitList(&L); ListInsert(&L, 1, address); RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf)); strcpy(leaf->filename, filename.c_str()); leaf->hashfile = hashfile(fname); leaf->index = L; insert_rbtree(tempRB, *leaf); //inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value); //insertEntry(t, p, tempRB); } //如果小向量sour中找到了当前文件名(意思是现在找到的词在本文件中出现过,NA向量已经建立过了,直接加就行) else { RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname)); ListInsert(&templeaf->index, 1, address); //inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value); } } } void BuildIndex(hashTable *Index) { int cou; int num; string filename = ""; ifstream fp; string voc = ""; int address; char ch; //这个双层循环是配合Filename函数(自己写的)完成->读所有文件的任务 for (cou = 1001; cou <= 1042; cou++) { for (num = 1; num <= 153; num++) { filename = Filename(cou, num); fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary); if (!fp.is_open()) continue; cout << filename << endl; address = 0; fp.get(ch); while (fp)//以前学过的逐个单词读 { voc = ""; //不是单词且没到结尾就继续 while (!isword(ch) && !fp.eof()) { fp.get(ch); } //到结尾就退出(去下一个文件) if (!fp) break; //是单词的操作就在String vocabulary中连缀上它 while (isword(ch) && !fp.eof()) { voc += ch; fp.get(ch); } transform(voc.begin(), voc.end(), voc.begin(), tolower); //转化成小写字母 //下标移动(数在第几个位置嘛) address++; //假如不是屏蔽词汇,就整理单词信息再插入索引(用之前的Insert函数) if (!isstopword(voc)) { Insertword(voc, filename, address, Index); //getchar(); } } fp.close(); } } } void test() //测试功能专用 { LinkList L; InitList(&L); ListInsert(&L, 1, 10); ListInsert(&L, 1, 30); hashTable t; initHashTable(&t); RBRoot *root = NULL; root = create_rbtree(); RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf)); //leaf->filename = "hahah"; leaf->hashfile = 20; leaf->index = L; insert_rbtree(root, *leaf); leaf = (RBleaf*)malloc(sizeof(RBleaf)); //leaf->filename = "hahaha"; leaf->hashfile = 10; leaf->index = L; insert_rbtree(root, *leaf); insertEntry(&t, "son", root); insertEntry(&t, "on", root); RBRoot *b = findValue(&t, "son"); inorder_rbtree(b); getchar(); } int main(){ //test(); hashTable t; initHashTable(&t); cout << "屏蔽词载入中..." << endl; collect_stopword(); cout << "屏蔽词库建立完成" << endl; cout << "倒排索引建立中..." << endl; BuildIndex(&t); cout << "倒排索引建立完成" << endl; while (1) { string s; cin >> s; char *p = (char *)s.c_str(); inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value); } //inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value); getchar(); getchar(); }