|
|
extern "C" {
|
|
|
#include"hashTable.h"
|
|
|
#include"rbtree.h"
|
|
|
#include"List.h"
|
|
|
}
|
|
|
|
|
|
#include<iostream>
|
|
|
#include <fstream>
|
|
|
#include<algorithm>
|
|
|
#include<vector>
|
|
|
#include<string>
|
|
|
#define TRUE 1
|
|
|
#define FALSE 0
|
|
|
using namespace std;
|
|
|
vector<string> stopword; //stopword向量(有一些词出现次数太多,得不偿失,意义不大如 a the)
|
|
|
void Insertword(string voc, string filename, int address, hashTable * Index);//建立索引时要插入元素
|
|
|
void BuildIndex(hashTable *Index);// 建立索引
|
|
|
void collect_stopword();//把stopword加入向量
|
|
|
bool isstopword(string W);//判断stopword
|
|
|
string Filename(int N, int M);//用这种方式获取一个文件名
|
|
|
bool isword(char ch);//判断这个char是否合法
|
|
|
|
|
|
void collect_stopword()
|
|
|
{
|
|
|
ifstream fp;
|
|
|
string stop;
|
|
|
char ch;
|
|
|
fp.open("stopword.txt", ios_base::in | ios_base::binary);
|
|
|
if (!fp)
|
|
|
{
|
|
|
cout << "读取屏蔽词出错!" << endl;
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
fp.get(ch);
|
|
|
while (fp)
|
|
|
{
|
|
|
stop = "";
|
|
|
|
|
|
while (!isword(ch) && !fp.eof())
|
|
|
{
|
|
|
fp.get(ch);
|
|
|
}
|
|
|
|
|
|
if (!fp)
|
|
|
break;
|
|
|
|
|
|
while (isword(ch) && !fp.eof())
|
|
|
{
|
|
|
stop += ch;
|
|
|
fp.get(ch);
|
|
|
}
|
|
|
transform(stop.begin(), stop.end(), stop.begin(), tolower);
|
|
|
|
|
|
stopword.push_back(stop);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
//判断是不是stopword——从上一个函数中取得了所有stopword拼成的字符串,判断是否能在里面找到就行
|
|
|
bool isstopword(string W)
|
|
|
{
|
|
|
if (find(stopword.begin(), stopword.end(), W) == stopword.end())
|
|
|
return 0;
|
|
|
else
|
|
|
return 1;
|
|
|
}
|
|
|
//判断是否是单词
|
|
|
bool isword(char ch)
|
|
|
{
|
|
|
//标准:传入ch在A-Z或者a-z中间或者是\(配合n吧)就算
|
|
|
int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\''));
|
|
|
return ((D == 1) ? TRUE : FALSE);
|
|
|
}
|
|
|
|
|
|
//构造文件名的函数(因为示范文件是10xx x的格式所以这么构造)
|
|
|
string Filename(int N, int M)
|
|
|
{
|
|
|
string res;
|
|
|
char file[15];
|
|
|
char chap[10];
|
|
|
_itoa_s(N, file, 10);
|
|
|
_itoa_s(M, chap, 10);
|
|
|
strcat_s(file, 15, " ");
|
|
|
strcat_s(file, 15, chap);
|
|
|
strcat_s(file, 15, ".txt");
|
|
|
res = file;
|
|
|
return res;
|
|
|
}
|
|
|
int hashfile(char*p)
|
|
|
{
|
|
|
int i = strlen(p);
|
|
|
int hash=0;
|
|
|
int j;
|
|
|
for (j = 0; j < i; j++)
|
|
|
{
|
|
|
if (p[j] >= '0'&&p[j] <= '9')
|
|
|
{
|
|
|
hash = hash * 10 + p[j] - '0';
|
|
|
}
|
|
|
}
|
|
|
return hash;
|
|
|
}
|
|
|
//整理扫描到的当前单词信息(在哪个文件的哪里),再插入 , address标志了单词位置
|
|
|
void Insertword(string voc, string filename, int address, hashTable* t)
|
|
|
{
|
|
|
char p[50];
|
|
|
strcpy(p, voc.c_str());
|
|
|
char *fname = (char*)filename.c_str();
|
|
|
if (findValue(t,p)==NULL)//假如索引里头没有这个词我们就把他加进去
|
|
|
{
|
|
|
LinkList L;
|
|
|
InitList(&L);
|
|
|
ListInsert(&L, 1, address);
|
|
|
RBRoot *root;
|
|
|
root = create_rbtree();
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
strcpy(leaf->filename, filename.c_str());
|
|
|
leaf->hashfile = hashfile(fname);
|
|
|
leaf->index = L;
|
|
|
insert_rbtree(root, *leaf);
|
|
|
insertEntry(t, p, root);
|
|
|
//cout << "haha" << endl;
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value);
|
|
|
}
|
|
|
else//有的话就得更新它的数据
|
|
|
{
|
|
|
RBRoot *tempRB=findValue(t, p);
|
|
|
//如果小向量红黑树sour中没有找到当前文件名(意思是现在找到的词是本文件中第一次出现)
|
|
|
if (rbtree_search(tempRB,hashfile(fname))==NULL)
|
|
|
{
|
|
|
LinkList L;
|
|
|
InitList(&L);
|
|
|
ListInsert(&L, 1, address);
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
strcpy(leaf->filename, filename.c_str());
|
|
|
leaf->hashfile = hashfile(fname);
|
|
|
leaf->index = L;
|
|
|
insert_rbtree(tempRB, *leaf);
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
|
|
|
//insertEntry(t, p, tempRB);
|
|
|
}
|
|
|
//如果小向量sour中找到了当前文件名(意思是现在找到的词在本文件中出现过,NA向量已经建立过了,直接加就行)
|
|
|
else
|
|
|
{
|
|
|
RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname));
|
|
|
ListInsert(&templeaf->index, 1, address);
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
void BuildIndex(hashTable *Index)
|
|
|
{
|
|
|
int cou;
|
|
|
int num;
|
|
|
|
|
|
string filename = "";
|
|
|
ifstream fp;
|
|
|
string voc = "";
|
|
|
int address;
|
|
|
char ch;
|
|
|
|
|
|
//这个双层循环是配合Filename函数(自己写的)完成->读所有文件的任务
|
|
|
for (cou = 1001; cou <= 1042; cou++)
|
|
|
{
|
|
|
|
|
|
for (num = 1; num <= 153; num++)
|
|
|
{
|
|
|
filename = Filename(cou, num);
|
|
|
fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary);
|
|
|
if (!fp.is_open())
|
|
|
continue;
|
|
|
cout << filename << endl;
|
|
|
address = 0;
|
|
|
fp.get(ch);
|
|
|
while (fp)//以前学过的逐个单词读
|
|
|
{
|
|
|
voc = "";
|
|
|
|
|
|
|
|
|
//不是单词且没到结尾就继续
|
|
|
while (!isword(ch) && !fp.eof())
|
|
|
{
|
|
|
fp.get(ch);
|
|
|
}
|
|
|
|
|
|
//到结尾就退出(去下一个文件)
|
|
|
if (!fp)
|
|
|
break;
|
|
|
|
|
|
//是单词的操作就在String vocabulary中连缀上它
|
|
|
while (isword(ch) && !fp.eof())
|
|
|
{
|
|
|
voc += ch;
|
|
|
fp.get(ch);
|
|
|
}
|
|
|
transform(voc.begin(), voc.end(), voc.begin(), tolower); //转化成小写字母
|
|
|
|
|
|
//下标移动(数在第几个位置嘛)
|
|
|
address++;
|
|
|
|
|
|
//假如不是屏蔽词汇,就整理单词信息再插入索引(用之前的Insert函数)
|
|
|
if (!isstopword(voc))
|
|
|
{
|
|
|
Insertword(voc, filename, address, Index);
|
|
|
//getchar();
|
|
|
}
|
|
|
}
|
|
|
fp.close();
|
|
|
}
|
|
|
}
|
|
|
|
|
|
}
|
|
|
void test() //测试功能专用
|
|
|
{
|
|
|
LinkList L;
|
|
|
InitList(&L);
|
|
|
ListInsert(&L, 1, 10);
|
|
|
ListInsert(&L, 1, 30);
|
|
|
hashTable t;
|
|
|
initHashTable(&t);
|
|
|
RBRoot *root = NULL;
|
|
|
root = create_rbtree();
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
//leaf->filename = "hahah";
|
|
|
leaf->hashfile = 20;
|
|
|
leaf->index = L;
|
|
|
insert_rbtree(root, *leaf);
|
|
|
leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
//leaf->filename = "hahaha";
|
|
|
leaf->hashfile = 10;
|
|
|
leaf->index = L;
|
|
|
insert_rbtree(root, *leaf);
|
|
|
insertEntry(&t, "son", root);
|
|
|
insertEntry(&t, "on", root);
|
|
|
RBRoot *b = findValue(&t, "son");
|
|
|
inorder_rbtree(b);
|
|
|
getchar();
|
|
|
}
|
|
|
int main(){
|
|
|
//test();
|
|
|
hashTable t;
|
|
|
initHashTable(&t);
|
|
|
cout << "屏蔽词载入中..." << endl;
|
|
|
collect_stopword();
|
|
|
cout << "屏蔽词库建立完成" << endl;
|
|
|
cout << "倒排索引建立中..." << endl;
|
|
|
BuildIndex(&t);
|
|
|
cout << "倒排索引建立完成" << endl;
|
|
|
while (1) {
|
|
|
string s;
|
|
|
cin >> s;
|
|
|
char *p = (char *)s.c_str();
|
|
|
inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value);
|
|
|
}
|
|
|
//inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value);
|
|
|
getchar();
|
|
|
getchar();
|
|
|
} |