|
|
|
|
extern "C" {
|
|
|
|
|
#include"hashTable.h"
|
|
|
|
|
#include"rbtree.h"
|
|
|
|
|
#include"List.h"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#include<iostream>
|
|
|
|
|
#include <fstream>
|
|
|
|
|
#include<algorithm>
|
|
|
|
|
#include<vector>
|
|
|
|
|
#include<string>
|
|
|
|
|
#define TRUE 1
|
|
|
|
|
#define FALSE 0
|
|
|
|
|
using namespace std;
|
|
|
|
|
vector<string> stopword; //stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һЩ<D2BB>ʳ<EFBFBD><CAB3>ִ<EFBFBD><D6B4><EFBFBD>̫<EFBFBD>࣬<EFBFBD>ò<EFBFBD><C3B2><EFBFBD>ʧ<EFBFBD><CAA7><EFBFBD><EFBFBD><EFBFBD>岻<EFBFBD><E5B2BB><EFBFBD><EFBFBD> a the<68><65>
|
|
|
|
|
void Insertword(string voc, string filename, int address, hashTable * Index);//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱҪ<CAB1><D2AA><EFBFBD><EFBFBD>Ԫ<EFBFBD><D4AA>
|
|
|
|
|
void BuildIndex(hashTable *Index);// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
void collect_stopword();//<2F><>stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
bool isstopword(string W);//<2F>ж<EFBFBD>stopword
|
|
|
|
|
string Filename(int N, int M);//<2F><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD>ʽ<EFBFBD><CABD>ȡһ<C8A1><D2BB><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
|
|
|
|
bool isword(char ch);//<2F>ж<EFBFBD><D0B6><EFBFBD><EFBFBD><EFBFBD>char<61>Ƿ<EFBFBD><C7B7>Ϸ<EFBFBD>
|
|
|
|
|
|
|
|
|
|
void collect_stopword()
|
|
|
|
|
{
|
|
|
|
|
ifstream fp;
|
|
|
|
|
string stop;
|
|
|
|
|
char ch;
|
|
|
|
|
fp.open("stopword.txt", ios_base::in | ios_base::binary);
|
|
|
|
|
if (!fp)
|
|
|
|
|
{
|
|
|
|
|
cout << "<EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD><EFBFBD>δʳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
while (fp)
|
|
|
|
|
{
|
|
|
|
|
stop = "";
|
|
|
|
|
|
|
|
|
|
while (!isword(ch) && !fp.eof())
|
|
|
|
|
{
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!fp)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
while (isword(ch) && !fp.eof())
|
|
|
|
|
{
|
|
|
|
|
stop += ch;
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
}
|
|
|
|
|
transform(stop.begin(), stop.end(), stop.begin(), tolower);
|
|
|
|
|
|
|
|
|
|
stopword.push_back(stop);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//<2F>ж<EFBFBD><D0B6>Dz<EFBFBD><C7B2><EFBFBD>stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>stopwordƴ<64>ɵ<EFBFBD><C9B5>ַ<EFBFBD><D6B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ж<EFBFBD><D0B6>Ƿ<EFBFBD><C7B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
bool isstopword(string W)
|
|
|
|
|
{
|
|
|
|
|
if (find(stopword.begin(), stopword.end(), W) == stopword.end())
|
|
|
|
|
return 0;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
//<2F>ж<EFBFBD><D0B6>Ƿ<EFBFBD><C7B7>ǵ<EFBFBD><C7B5><EFBFBD>
|
|
|
|
|
bool isword(char ch)
|
|
|
|
|
{
|
|
|
|
|
//<2F><><EFBFBD><D7BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ch<63><68>A-Z<><5A><EFBFBD><EFBFBD>a-z<>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\<5C><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>n<EFBFBD>ɣ<EFBFBD><C9A3><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\''));
|
|
|
|
|
return ((D == 1) ? TRUE : FALSE);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD>ĺ<EFBFBD><C4BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊʾ<CEAA><CABE><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>10xx x<>ĸ<EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ô<EFBFBD><C3B4><EFBFBD>죩
|
|
|
|
|
string Filename(int N, int M)
|
|
|
|
|
{
|
|
|
|
|
string res;
|
|
|
|
|
char file[15];
|
|
|
|
|
char chap[10];
|
|
|
|
|
_itoa_s(N, file, 10);
|
|
|
|
|
_itoa_s(M, chap, 10);
|
|
|
|
|
strcat_s(file, 15, " ");
|
|
|
|
|
strcat_s(file, 15, chap);
|
|
|
|
|
strcat_s(file, 15, ".txt");
|
|
|
|
|
res = file;
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
int hashfile(char*p)
|
|
|
|
|
{
|
|
|
|
|
int i = strlen(p);
|
|
|
|
|
int hash=0;
|
|
|
|
|
int j;
|
|
|
|
|
for (j = 0; j < i; j++)
|
|
|
|
|
{
|
|
|
|
|
if (p[j] >= '0'&&p[j] <= '9')
|
|
|
|
|
{
|
|
|
|
|
hash = hash * 10 + p[j] - '0';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return hash;
|
|
|
|
|
}
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD>ɨ<EFBFBD>赽<EFBFBD>ĵ<EFBFBD>ǰ<EFBFBD><C7B0><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2><EFBFBD><EFBFBD><EFBFBD>ĸ<EFBFBD><C4B8>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFA3A9><EFBFBD>ٲ<EFBFBD><D9B2><EFBFBD> <20><> address<73><73>־<EFBFBD>˵<EFBFBD><CBB5><EFBFBD>λ<EFBFBD><CEBB>
|
|
|
|
|
void Insertword(string voc, string filename, int address, hashTable* t)
|
|
|
|
|
{
|
|
|
|
|
char p[50];
|
|
|
|
|
strcpy(p, voc.c_str());
|
|
|
|
|
char *fname = (char*)filename.c_str();
|
|
|
|
|
if (findValue(t,p)==NULL)//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͷû<CDB7><C3BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǾͰ<C7BE><CDB0><EFBFBD><EFBFBD>ӽ<EFBFBD>ȥ
|
|
|
|
|
{
|
|
|
|
|
LinkList L;
|
|
|
|
|
InitList(&L);
|
|
|
|
|
ListInsert(&L, 1, address);
|
|
|
|
|
RBRoot *root;
|
|
|
|
|
root = create_rbtree();
|
|
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
|
|
strcpy(leaf->filename, filename.c_str());
|
|
|
|
|
leaf->hashfile = hashfile(fname);
|
|
|
|
|
leaf->index = L;
|
|
|
|
|
insert_rbtree(root, *leaf);
|
|
|
|
|
insertEntry(t, p, root);
|
|
|
|
|
//cout << "haha" << endl;
|
|
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value);
|
|
|
|
|
}
|
|
|
|
|
else//<2F>еĻ<D0B5><C4BB>͵ø<CDB5><C3B8><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
{
|
|
|
|
|
RBRoot *tempRB=findValue(t, p);
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD>С<EFBFBD><D0A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>sour<75><72>û<EFBFBD><C3BB><EFBFBD>ҵ<EFBFBD><D2B5><EFBFBD>ǰ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˼<EFBFBD><CBBC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5>Ĵ<EFBFBD><C4B4>DZ<EFBFBD><C7B1>ļ<EFBFBD><C4BC>е<EFBFBD>һ<EFBFBD>γ<EFBFBD><CEB3>֣<EFBFBD>
|
|
|
|
|
if (rbtree_search(tempRB,hashfile(fname))==NULL)
|
|
|
|
|
{
|
|
|
|
|
LinkList L;
|
|
|
|
|
InitList(&L);
|
|
|
|
|
ListInsert(&L, 1, address);
|
|
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
|
|
strcpy(leaf->filename, filename.c_str());
|
|
|
|
|
leaf->hashfile = hashfile(fname);
|
|
|
|
|
leaf->index = L;
|
|
|
|
|
insert_rbtree(tempRB, *leaf);
|
|
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
|
|
|
|
|
//insertEntry(t, p, tempRB);
|
|
|
|
|
}
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD>С<EFBFBD><D0A1><EFBFBD><EFBFBD>sour<75><72><EFBFBD>ҵ<EFBFBD><D2B5>˵<EFBFBD>ǰ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˼<EFBFBD><CBBC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5>Ĵ<EFBFBD><C4B4>ڱ<EFBFBD><DAB1>ļ<EFBFBD><C4BC>г<EFBFBD><D0B3>ֹ<EFBFBD><D6B9><EFBFBD>NA<4E><41><EFBFBD><EFBFBD><EFBFBD>Ѿ<EFBFBD><D1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ˣ<EFBFBD>ֱ<EFBFBD>ӼӾ<D3BC><D3BE>У<EFBFBD>
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname));
|
|
|
|
|
ListInsert(&templeaf->index, 1, address);
|
|
|
|
|
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void BuildIndex(hashTable *Index)
|
|
|
|
|
{
|
|
|
|
|
int cou;
|
|
|
|
|
int num;
|
|
|
|
|
|
|
|
|
|
string filename = "";
|
|
|
|
|
ifstream fp;
|
|
|
|
|
string voc = "";
|
|
|
|
|
int address;
|
|
|
|
|
char ch;
|
|
|
|
|
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD>˫<EFBFBD><CBAB>ѭ<EFBFBD><D1AD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Filename<6D><65><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Լ<EFBFBD>д<EFBFBD>ģ<EFBFBD><C4A3><EFBFBD><EFBFBD><EFBFBD>-><3E><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
for (cou = 1001; cou <= 1042; cou++)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
for (num = 1; num <= 153; num++)
|
|
|
|
|
{
|
|
|
|
|
filename = Filename(cou, num);
|
|
|
|
|
fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary);
|
|
|
|
|
if (!fp.is_open())
|
|
|
|
|
continue;
|
|
|
|
|
cout << filename << endl;
|
|
|
|
|
address = 0;
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
while (fp)//<2F><>ǰѧ<C7B0><D1A7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʶ<EFBFBD>
|
|
|
|
|
{
|
|
|
|
|
voc = "";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//<2F><><EFBFBD>ǵ<EFBFBD><C7B5><EFBFBD><EFBFBD><EFBFBD>û<EFBFBD><C3BB><EFBFBD><EFBFBD>β<EFBFBD>ͼ<EFBFBD><CDBC><EFBFBD>
|
|
|
|
|
while (!isword(ch) && !fp.eof())
|
|
|
|
|
{
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//<2F><><EFBFBD><EFBFBD>β<EFBFBD><CEB2><EFBFBD>˳<EFBFBD><CBB3><EFBFBD>ȥ<EFBFBD><C8A5>һ<EFBFBD><D2BB><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
|
|
|
|
if (!fp)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
//<2F>ǵ<EFBFBD><C7B5>ʵIJ<CAB5><C4B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>String vocabulary<72><79><EFBFBD><EFBFBD><EFBFBD><D7BA><EFBFBD><EFBFBD>
|
|
|
|
|
while (isword(ch) && !fp.eof())
|
|
|
|
|
{
|
|
|
|
|
voc += ch;
|
|
|
|
|
fp.get(ch);
|
|
|
|
|
}
|
|
|
|
|
transform(voc.begin(), voc.end(), voc.begin(), tolower); //ת<><D7AA><EFBFBD><EFBFBD>Сд<D0A1><D0B4>ĸ
|
|
|
|
|
|
|
|
|
|
//<2F>±<EFBFBD><C2B1>ƶ<EFBFBD><C6B6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڵڼ<DAB5><DABC><EFBFBD>λ<EFBFBD><CEBB><EFBFBD>
|
|
|
|
|
address++;
|
|
|
|
|
|
|
|
|
|
//<2F><><EFBFBD>粻<EFBFBD><E7B2BB><EFBFBD><EFBFBD><EFBFBD>δʻ㣬<CABB><E3A3AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD>ٲ<EFBFBD><D9B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֮ǰ<D6AE><C7B0>Insert<72><74><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
if (!isstopword(voc))
|
|
|
|
|
{
|
|
|
|
|
Insertword(voc, filename, address, Index);
|
|
|
|
|
//getchar();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
fp.close();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
void test() //<2F><><EFBFBD>Թ<EFBFBD><D4B9><EFBFBD>ר<EFBFBD><D7A8>
|
|
|
|
|
{
|
|
|
|
|
LinkList L;
|
|
|
|
|
InitList(&L);
|
|
|
|
|
ListInsert(&L, 1, 10);
|
|
|
|
|
ListInsert(&L, 1, 30);
|
|
|
|
|
hashTable t;
|
|
|
|
|
initHashTable(&t);
|
|
|
|
|
RBRoot *root = NULL;
|
|
|
|
|
root = create_rbtree();
|
|
|
|
|
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
|
|
//leaf->filename = "hahah";
|
|
|
|
|
leaf->hashfile = 20;
|
|
|
|
|
leaf->index = L;
|
|
|
|
|
insert_rbtree(root, *leaf);
|
|
|
|
|
leaf = (RBleaf*)malloc(sizeof(RBleaf));
|
|
|
|
|
//leaf->filename = "hahaha";
|
|
|
|
|
leaf->hashfile = 10;
|
|
|
|
|
leaf->index = L;
|
|
|
|
|
insert_rbtree(root, *leaf);
|
|
|
|
|
insertEntry(&t, "son", root);
|
|
|
|
|
insertEntry(&t, "on", root);
|
|
|
|
|
RBRoot *b = findValue(&t, "son");
|
|
|
|
|
inorder_rbtree(b);
|
|
|
|
|
getchar();
|
|
|
|
|
}
|
|
|
|
|
int main(){
|
|
|
|
|
//test();
|
|
|
|
|
hashTable t;
|
|
|
|
|
initHashTable(&t);
|
|
|
|
|
cout << "<EFBFBD><EFBFBD><EFBFBD>δ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>..." << endl;
|
|
|
|
|
collect_stopword();
|
|
|
|
|
cout << "<EFBFBD><EFBFBD><EFBFBD>δʿ⽨<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
|
|
|
|
|
cout << "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>..." << endl;
|
|
|
|
|
BuildIndex(&t);
|
|
|
|
|
cout << "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
|
|
|
|
|
while (1) {
|
|
|
|
|
string s;
|
|
|
|
|
cin >> s;
|
|
|
|
|
char *p = (char *)s.c_str();
|
|
|
|
|
inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value);
|
|
|
|
|
}
|
|
|
|
|
//inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value);
|
|
|
|
|
getchar();
|
|
|
|
|
getchar();
|
|
|
|
|
}
|