You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 lines
6.0 KiB

extern "C" {
#include"hashTable.h"
#include"rbtree.h"
#include"List.h"
}
#include<iostream>
#include <fstream>
#include<algorithm>
#include<vector>
#include<string>
#define TRUE 1
#define FALSE 0
using namespace std;
vector<string> stopword; //stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һЩ<D2BB>ʳ<EFBFBD><CAB3>ִ<EFBFBD><D6B4><EFBFBD>̫<EFBFBD><EFBFBD>ò<EFBFBD><C3B2><EFBFBD>ʧ<EFBFBD><CAA7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><E5B2BB><EFBFBD><EFBFBD> a the<68><65>
void Insertword(string voc, string filename, int address, hashTable * Index);//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱҪ<CAB1><D2AA><EFBFBD><EFBFBD>Ԫ<EFBFBD><D4AA>
void BuildIndex(hashTable *Index);// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
void collect_stopword();//<2F><>stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
bool isstopword(string W);//<2F>ж<EFBFBD>stopword
string Filename(int N, int M);//<2F><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD>ʽ<EFBFBD><CABD>ȡһ<C8A1><D2BB><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
bool isword(char ch);//<2F>ж<EFBFBD><D0B6><EFBFBD><EFBFBD><EFBFBD>char<61>Ƿ<EFBFBD><C7B7>Ϸ<EFBFBD>
void collect_stopword()
{
ifstream fp;
string stop;
char ch;
fp.open("stopword.txt", ios_base::in | ios_base::binary);
if (!fp)
{
cout << "<EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD><EFBFBD>δʳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
return;
}
fp.get(ch);
while (fp)
{
stop = "";
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
if (!fp)
break;
while (isword(ch) && !fp.eof())
{
stop += ch;
fp.get(ch);
}
transform(stop.begin(), stop.end(), stop.begin(), tolower);
stopword.push_back(stop);
}
}
//<2F>ж<EFBFBD><D0B6>Dz<EFBFBD><C7B2><EFBFBD>stopword<72><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>stopwordƴ<64>ɵ<EFBFBD><C9B5>ַ<EFBFBD><D6B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ж<EFBFBD><D0B6>Ƿ<EFBFBD><C7B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5><EFBFBD><EFBFBD><EFBFBD>
bool isstopword(string W)
{
if (find(stopword.begin(), stopword.end(), W) == stopword.end())
return 0;
else
return 1;
}
//<2F>ж<EFBFBD><D0B6>Ƿ<EFBFBD><C7B7>ǵ<EFBFBD><C7B5><EFBFBD>
bool isword(char ch)
{
//<2F><>׼<EFBFBD><D7BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ch<63><68>A-Z<><5A><EFBFBD><EFBFBD>a-z<>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\<5C><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>n<EFBFBD>ɣ<EFBFBD><C9A3><EFBFBD><EFBFBD><EFBFBD>
int D = ((ch >= 'a' && ch <= 'z') || (ch >= 'A'&& ch <= 'Z') || (ch == '\''));
return ((D == 1) ? TRUE : FALSE);
}
//<2F><><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD>ĺ<EFBFBD><C4BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊʾ<CEAA><CABE><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>10xx x<>ĸ<EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ô<EFBFBD><C3B4><EFBFBD>
string Filename(int N, int M)
{
string res;
char file[15];
char chap[10];
_itoa_s(N, file, 10);
_itoa_s(M, chap, 10);
strcat_s(file, 15, " ");
strcat_s(file, 15, chap);
strcat_s(file, 15, ".txt");
res = file;
return res;
}
int hashfile(char*p)
{
int i = strlen(p);
int hash=0;
int j;
for (j = 0; j < i; j++)
{
if (p[j] >= '0'&&p[j] <= '9')
{
hash = hash * 10 + p[j] - '0';
}
}
return hash;
}
//<2F><><EFBFBD><EFBFBD>ɨ<EFBFBD><EFBFBD>ĵ<EFBFBD>ǰ<EFBFBD><C7B0><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2><EFBFBD><EFBFBD><EFBFBD>ĸ<EFBFBD><C4B8>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFA3A9><EFBFBD>ٲ<EFBFBD><D9B2><EFBFBD> <20><> address<73><73>־<EFBFBD>˵<EFBFBD><CBB5><EFBFBD>λ<EFBFBD><CEBB>
void Insertword(string voc, string filename, int address, hashTable* t)
{
char p[50];
strcpy(p, voc.c_str());
char *fname = (char*)filename.c_str();
if (findValue(t,p)==NULL)//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͷû<CDB7><C3BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǾͰ<C7BE><CDB0><EFBFBD><EFBFBD>ӽ<EFBFBD>ȥ
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBRoot *root;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(t, p, root);
//cout << "haha" << endl;
//inorder_rbtree(t->bucket[hashfunc(p,strlen(p))].value);
}
else//<2F>еĻ<D0B5><C4BB>͵ø<CDB5><C3B8><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
{
RBRoot *tempRB=findValue(t, p);
//<2F><><EFBFBD><EFBFBD>С<EFBFBD><D0A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>sour<75><72>û<EFBFBD><C3BB><EFBFBD>ҵ<EFBFBD><D2B5><EFBFBD>ǰ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˼<EFBFBD><CBBC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5>Ĵ<EFBFBD><C4B4>DZ<EFBFBD><C7B1>ļ<EFBFBD><C4BC>е<EFBFBD>һ<EFBFBD>γ<EFBFBD><CEB3>֣<EFBFBD>
if (rbtree_search(tempRB,hashfile(fname))==NULL)
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, address);
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
strcpy(leaf->filename, filename.c_str());
leaf->hashfile = hashfile(fname);
leaf->index = L;
insert_rbtree(tempRB, *leaf);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
//insertEntry(t, p, tempRB);
}
//<2F><><EFBFBD><EFBFBD>С<EFBFBD><D0A1><EFBFBD><EFBFBD>sour<75><72><EFBFBD>ҵ<EFBFBD><D2B5>˵<EFBFBD>ǰ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˼<EFBFBD><CBBC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҵ<EFBFBD><D2B5>Ĵ<EFBFBD><C4B4>ڱ<EFBFBD><DAB1>ļ<EFBFBD><C4BC>г<EFBFBD><D0B3>ֹ<EFBFBD><D6B9><EFBFBD>NA<4E><41><EFBFBD><EFBFBD><EFBFBD>Ѿ<EFBFBD><D1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ˣ<EFBFBD>ֱ<EFBFBD>ӼӾ<D3BC><D3BE>У<EFBFBD>
else
{
RBleaf*templeaf = rbtree_search(tempRB, hashfile(fname));
ListInsert(&templeaf->index, 1, address);
//inorder_rbtree(t->bucket[hashfunc(p, strlen(p))].value);
}
}
}
void BuildIndex(hashTable *Index)
{
int cou;
int num;
string filename = "";
ifstream fp;
string voc = "";
int address;
char ch;
//<2F><><EFBFBD><EFBFBD>˫<EFBFBD><CBAB>ѭ<EFBFBD><D1AD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Filename<6D><65><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Լ<EFBFBD>д<EFBFBD>ģ<EFBFBD><C4A3><EFBFBD><EFBFBD><EFBFBD>-><3E><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
for (cou = 1001; cou <= 1042; cou++)
{
for (num = 1; num <= 153; num++)
{
filename = Filename(cou, num);
fp.open("Shakespeare/" + filename, ios_base::in | ios_base::binary);
if (!fp.is_open())
continue;
cout << filename << endl;
address = 0;
fp.get(ch);
while (fp)//<2F><>ǰѧ<C7B0><D1A7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʶ<EFBFBD>
{
voc = "";
//<2F><><EFBFBD>ǵ<EFBFBD><C7B5><EFBFBD><EFBFBD><EFBFBD>û<EFBFBD><C3BB><EFBFBD><EFBFBD>β<EFBFBD>ͼ<EFBFBD><CDBC><EFBFBD>
while (!isword(ch) && !fp.eof())
{
fp.get(ch);
}
//<2F><><EFBFBD><EFBFBD>β<EFBFBD><CEB2><EFBFBD>˳<EFBFBD><CBB3><EFBFBD>ȥ<EFBFBD><C8A5>һ<EFBFBD><D2BB><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
if (!fp)
break;
//<2F>ǵ<EFBFBD><C7B5>ʵIJ<CAB5><C4B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>String vocabulary<72><79><EFBFBD><EFBFBD>׺<EFBFBD><D7BA><EFBFBD><EFBFBD>
while (isword(ch) && !fp.eof())
{
voc += ch;
fp.get(ch);
}
transform(voc.begin(), voc.end(), voc.begin(), tolower); //ת<><D7AA><EFBFBD><EFBFBD>Сд<D0A1><D0B4>ĸ
//<2F>±<EFBFBD><C2B1>ƶ<EFBFBD><C6B6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڵڼ<DAB5><DABC><EFBFBD>λ<EFBFBD><CEBB><EFBFBD>
address++;
//<2F><><EFBFBD><EFBFBD><E7B2BB><EFBFBD><EFBFBD><EFBFBD>δʻ<CABB><E3A3AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD>ٲ<EFBFBD><D9B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֮ǰ<D6AE><C7B0>Insert<72><74><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
if (!isstopword(voc))
{
Insertword(voc, filename, address, Index);
//getchar();
}
}
fp.close();
}
}
}
void test() //<2F><><EFBFBD>Թ<EFBFBD><D4B9><EFBFBD>ר<EFBFBD><D7A8>
{
LinkList L;
InitList(&L);
ListInsert(&L, 1, 10);
ListInsert(&L, 1, 30);
hashTable t;
initHashTable(&t);
RBRoot *root = NULL;
root = create_rbtree();
RBleaf *leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahah";
leaf->hashfile = 20;
leaf->index = L;
insert_rbtree(root, *leaf);
leaf = (RBleaf*)malloc(sizeof(RBleaf));
//leaf->filename = "hahaha";
leaf->hashfile = 10;
leaf->index = L;
insert_rbtree(root, *leaf);
insertEntry(&t, "son", root);
insertEntry(&t, "on", root);
RBRoot *b = findValue(&t, "son");
inorder_rbtree(b);
getchar();
}
int main(){
//test();
hashTable t;
initHashTable(&t);
cout << "<EFBFBD><EFBFBD><EFBFBD>δ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>..." << endl;
collect_stopword();
cout << "<EFBFBD><EFBFBD><EFBFBD>δʿ⽨<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
cout << "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>..." << endl;
BuildIndex(&t);
cout << "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>" << endl;
while (1) {
string s;
cin >> s;
char *p = (char *)s.c_str();
inorder_rbtree(t.bucket[hashfunc(p, strlen(p))].value);
}
//inorder_rbtree((RBRoot*)t.bucket[hashfunc(p,strlen(p))].value);
getchar();
getchar();
}