2. 索引模块
- 采用二元分词技术对数据进行切分和存储,以提高搜索效率和准确性。
3. 搜索模块
3.1 ASP.NET界面
- 提供了用户友好的前端界面,方便用户输入查询条件并显示搜索结果。
3.2 搜索方法
- 实现了核心的搜索逻辑,包括分词处理、停用词过滤、搜索请求处理等。
/// <summary>
/// Runs a search for the current query (<c>this.Q</c>) with the prefix <c>this.T</c> and
/// fills <c>this.Results</c> with one row (<c>title</c>, <c>content</c>, <c>url</c>) per
/// accepted hit of the current page.
/// </summary>
/// <remarks>
/// Also updates <c>m_total</c>, <c>m_startAt</c>, <c>m_invalidCount</c>, <c>m_duration</c>,
/// <c>m_fromItem</c> and <c>m_toItem</c>. When there are no hits, a single hint row is added
/// and the timing/paging fields are left untouched. Any exception raised while searching is
/// written to the console and the method returns with whatever rows were added so far.
/// </remarks>
private void Search() {
    string searchStr = this.Q;
    string prefix = this.T;
    SearchTest searcher = new SearchTest();
    DateTime start = DateTime.Now;

    // Create the columns of the result DataTable.
    this.Results.Columns.Add("title", typeof(string));
    this.Results.Columns.Add("content", typeof(string));
    this.Results.Columns.Add("url", typeof(string));

    // A query without spaces that is longer than 3 characters is tokenized, stripped of
    // stop words and re-joined with single spaces.
    if ((searchStr.IndexOf(" ") == -1) && searchStr.Length > 3) {
        List<string> resultList = Sj110.Com.Chinese.Tokenizer.Tokenize(searchStr);
        StringBuilder sb = new StringBuilder();
        foreach (string result in resultList) {
            bool bStop = false;
            foreach (string stop in m_stopWords) {
                if (result == stop) {
                    bStop = true;
                    break;
                }
            }
            if (!bStop) {
                sb.Append(result);
                sb.Append(" ");
            }
        }

        // Drop the trailing separator. If every token was a stop word (or the tokenizer
        // returned nothing) the builder is empty; removing a character from it would throw,
        // so the query is kept as typed in that case.
        if (sb.Length > 0) {
            searchStr = sb.ToString(0, sb.Length - 1);
        }
    }

    try {
        Hits h = searcher.search(searchStr, prefix);
        this.m_total = GetValidLength(h);
        this.m_startAt = initStartAt();
        int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);
        int hitCount = h.Length();

        if (hitCount == 0) {
            // No hits: show a single hint row that links back to the start page.
            // NOTE(review): in the published listing this literal was broken across two
            // lines, so whatever separator (if any) stood between the two sentences was
            // lost — confirm against the original project.
            DataRow row = this.Results.NewRow();
            row["title"] = "您查询的关键字" + searchStr + "暂无结果。"
                + "提示:多个关键字之间请加空格。“公交 线路”比“公交线路”更容易搜到结果。";
            row["url"] = "default.aspx";
            this.Results.Rows.Add(row);
            return;
        }

        // The search terms do not change per hit, so split them once.
        string[] searchArr = searchStr.Split(' ');

        // resultsCount grows by one for every skipped hit so that the page can still be
        // filled; the extra hitCount bound keeps the index inside the hit list when many
        // hits are skipped.
        for (int i = m_startAt; i < resultsCount && i < hitCount; i++) {
            Document doc = h.Doc(i);
            string url = doc.Get("url");

            // Skip URLs rejected by m_oldUrls.CheckRepeatUrl and URLs ending in "/".
            if (m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/")) {
                m_invalidCount++;
                resultsCount++;
                continue;
            }

            string content = doc.Get("content");
            string title = doc.Get("title");
            if (title.Trim() == "") {
                title = "无标题";
            }

            content = GetBestFragments(content, searchArr);
            content = Hilighter(content, searchArr);
            title = Hilighter(title, searchArr);

            DataRow row = this.Results.NewRow();
            row["title"] = title;
            row["content"] = content;
            row["url"] = url;
            this.Results.Rows.Add(row);
        }

        this.m_duration = DateTime.Now - start;
        this.m_fromItem = this.m_startAt + 1;
        this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);
    } catch (Exception ex) {
        // Best-effort: a failed search leaves the rows gathered so far and only reports
        // the message.
        Console.WriteLine(ex.Message);
        return;
    }
}
string searchStr = this.Q;
string prefix = this.T;
SearchTest searcher = new SearchTest();
DateTime start = DateTime.Now;
// 创建结果DataTable
this.Results.Columns.Add("title", typeof(string));
this.Results.Columns.Add("content", typeof(string));
this.Results.Columns.Add("url", typeof(string));
if ((searchStr.IndexOf(" ") == -1) && searchStr.Length > 3) {
List
StringBuilder sb = new StringBuilder();
foreach (string result in resultList) {
bool bStop = false;
foreach (string stop in m_stopWords) {
if (result == stop) {
bStop = true;
break;
}
}
if (bStop == false) {
sb.Append(result);
sb.Append(" ");
}
}
sb.Remove(sb.Length - 1, 1);
searchStr = sb.ToString();
}
try {
string[] fields = { "content", "title" };
Hits h = searcher.search(searchStr, prefix);
this.m_total = GetValidLength(h);
this.m_startAt = initStartAt();
int resultsCount = smallerOf(m_total, this.m_maxResults + this.m_startAt);
if (h.Length() == 0) {
DataRow row = this.Results.NewRow();
row["title"] = "您查询的关键字" + searchStr + "暂无结果。
提示:多个关键字之间请加空格。“公交 线路”比“公交线路”更容易搜到结果。";
row["url"] = "default.aspx";
this.Results.Rows.Add(row);
return;
}
for (int i = m_startAt; i
string url = doc.Get("url");
if (m_oldUrls.CheckRepeatUrl(url) || url.EndsWith("/")) {
m_invalidCount++;
resultsCount++;
continue;
}
string cOntent= doc.Get("content");
string title = doc.Get("title");
if (title.Trim() == "") title = "无标题";
string[] searchArr = searchStr.Split(' ');
cOntent= GetBestFragments(content, searchArr);
cOntent= Hilighter(content, searchArr);
title = Hilighter(title, searchArr);
DataRow row = this.Results.NewRow();
row["title"] = title;
row["content"] = content;
row["url"] = url;
this.Results.Rows.Add(row);
}
this.m_duration = DateTime.Now - start;
this.m_fromItem = this.m_startAt + 1;
this.m_toItem = smallerOf(this.m_startAt + m_maxResults, m_total);
} catch (Exception ex) {
Console.WriteLine(ex.Message);
return;
}
}
来源: https://www.cnblogs.com/jadepark/archive/2007/08/06/844982.html