dotlucene - 爱码网

DotLucene搜索引擎Demo之：创建索引

DotLucene的官方网站这样介绍：DotLucene是一个专门为.NET设计的功能强大的搜索引擎！它上面也有一个在线demo，搜索3.5GB的文本数据所花费的时间在0.1秒左右！大家可以点这里测试。我也记得在我的一个网站
99收藏夹（注1）里面有个在线帮助，它是用StreamReader来读文本数据的，其他的是读数据库。我发现不管是读数据库还是以xml的形式读xml文档，不管你的数据库如何优化，也不管你的机器配置如何之高，读的速度与读文本数据的速度是不可比的，大家也可到http://www.99scj.com测试下：点在线帮助，一闪就出来了。
dotlucene 本文是按照DotLucene官方网站的一个demo 写的，不同点在于，
1，本文的demo采用了DotLucene最新稳定版1.4.3
2，开发环境是vs2005。
3，demo被我划成了两个部分，一个部分是一个console程序，就是本文所讲的，主要就是怎么样创建索引，另
 一个部分是个web程序，关键说的是搜索这个部分所建立的索引。
dotlucene 4，源代码将在下个部分提供下载，因为这两个部分同属一个解决方案。
好了，我们现在开始进入怎么用DotLucene来创建索引了。
什么是索引呢？我也不太明白，我是这样理解的，索引就是用来加快查询数据的速度的，比如我们小时候读书的时候课本前面有那个第一课：什么什么的。。。。。。第几页，这应该就是索引吧。用DotLucene创建索引也就是说把某些文件内容编入某个目录下的索引。
首先运行vs2005，选择文件--新建项目，在弹出的对话框选：其他项目类型里的Visual Studio解决方案，选右边的空白解决方案，输入名字：SearchDemo,位置选D:\确定。
再在解决方案SearchDemo右键选添加---再选---新建解决方案文件夹，输入文件夹名字为Indexer.我再找到d:\SearchDemo,在这个文件夹里面新建个目录，叫wwwroot,大家就知道这个目录是放web程序的，我们在iis管理器里面新建一个虚拟目录，指向d:\SearchDemo\wwwroot目录，名字叫SearchDemo.
dotlucene 我们再在vs的解决方案SearchDemo单击右键选添加--新建解决方案文件夹，输入文件夹名字为web，其实这两个文件夹名都是vs虚拟的，其实并不存在。我们然后在第一个项目Indexer上右键选：添加--新建项目，在弹出的面板左边选Visual C#--windows--右边选控制台应用程序，输入名字Indexer,确定，这个时候vs会在d:\searchdemo目录下添加个Indexer目录，然后我们再在新添加的名为web项目上右键--添加---现有网站---选择我们刚刚建立的SearchDemo就是。
现在我们建设好了两个项目，一个console控制台项目和一个SearchDemo的web项目，我们这部分只讲第一个项目怎么样建立索引，要建立索引，我们首先必须明白索引应该建立在什么地方？为了方便我把索引建立在D:\SearchDemo\wwwroot目录下新建的index目录下；还有我们必须明白哪些文件将被编入索引？也为了方便，我把要被编入索引的文件放在d:\SearchDemo\wwwroot目录下新建的documents目录下，也就是说documents目录下的所有文件都将被我编入索引。因为我们这个demo 演示的是搜索DotLucene的帮助文档，所以我们把所有下载来的帮助文档文件全部拷入d:\SearchDemo\wwwroot\documents目录下。同时我们还必须赋予index目录写的权限。
我们现在给Indexer控制台项目添加引用Lucene.Net.dll
我们现在在Indexer控制台项目里添加个类：IntrnetIndexer.cs;
dotlucene 先说明下doc.Add(Field.UnStored("text", ParseHtml(html)));
 doc.Add(Field.Keyword("path", relativePath));
 doc.Add(Field.Text("title", GetTitle(html)));
索引是由Document对象组成，而Document对象又是由字段对象组成.
Field.UnStored方法在其官方网站上的说明是：Constructs a String-valued Field that is tokenized and indexed, but that is not stored in the index. Term vector will not be stored for this Field.搜价110的Eunge帮忙翻译下成这样：构造一个String类型的字段，它将被分词和索引，但是它不会被存储在索引中。关于这个字段的词向量不会被存储，我一直都未能够理解关于这个字段的词向量不会被存储的含义，汗。
dotlucene 代码如下：

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace Indexer
 }
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Data.SqlClient;
using System.Drawing;
using System.Threading;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.Analysis.Cn;
using System.IO;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Search.Highlight;
using System.Text.RegularExpressions;
using XK_Vote.DAL;
using System.Net;
namespace XK_Vote
{
/// <summary>
/// WebForm1 的摘要说明。
/// </summary>
public class WebForm1 : System.Web.UI.Page
{
 // Server controls declared on the page.
 protected System.Web.UI.WebControls.Button Button2;
 protected System.Web.UI.WebControls.Button Button1;
 /// <summary>
 /// Result set returned by the search.
 /// </summary>
 protected DataTable Results = new DataTable();
 /// <summary>
 /// Total number of results returned by the search.
 /// NOTE(review): in the visible class the only assignment is inside the commented-out Search() method.
 /// </summary>
 private int total;
 /// <summary>
 /// Number of records displayed per result page.
 /// </summary>
 private readonly int maxResults = 10;
 /// <summary>
 /// Time taken by the current search.
 /// </summary>
 private TimeSpan duration;
 /// <summary>
 /// Index of the record the current search page starts at. The paging members
 /// of this class derive the page number and page links from it.
 /// </summary>
 private int startAt;
 /// <summary>
 /// Number of the first record displayed on the current page; intended to always be startAt+1.
 /// </summary>
 private int fromItem;
 /// <summary>
 /// Number of the last record displayed on the current page.
 /// </summary>
 private int toItem;
 protected System.Web.UI.WebControls.TextBox TextBox1;
 // NOTE(review): not referenced anywhere in the visible class; presumably read by the .aspx markup - confirm.
 protected string pg;
 /// <summary>
 /// Handler for the page's Load event (subscribed in InitializeComponent).
 /// The body is currently empty, so no per-request initialisation happens here.
 /// </summary>
 protected void Page_Load(object sender, System.EventArgs e)
 {
 // Place user code here to initialize the page

}

#region Web 窗体设计器生成的代码
 /// <summary>
 /// Wires up the designer-generated event handlers, then continues with the base class initialisation.
 /// </summary>
 override protected void OnInit(EventArgs e)
 {
 //
 // CODEGEN: This call is required by the ASP.NET Web Form Designer.
 //
 InitializeComponent();
 base.OnInit(e);
 }

 /// <summary>
 /// Required method for Designer support - do not modify
 /// the contents of this method with the code editor.
 /// </summary>
 private void InitializeComponent()
 {
 // Button1 starts indexing, Button2 handles the search form, Load runs Page_Load.
 this.Button1.Click += new System.EventHandler(this.Button1_Click);
 this.Button2.Click += new System.EventHandler(this.Button2_Click);
 this.Load += new System.EventHandler(this.Page_Load);

}
 #endregion
 /// <summary>
 /// Turns one <c>Result</c> record into a Lucene document and appends it to the index.
 /// Title, CopyFrom and UpdateTime are added with <c>Field.Keyword</c>;
 /// Content, DefaultPicUrl and IncludePic are added with <c>Field.Text</c>.
 /// </summary>
 /// <param name="rs">Record supplying the field values.</param>
 /// <param name="writer">Index writer that receives the new document.</param>
 protected void IndexBook(Result rs,IndexWriter writer)
 {
     // Build the fields in the same order they are added to the document.
     Field[] entryFields = new Field[]
     {
         Field.Keyword("Title", rs.Title),
         Field.Keyword("CopyFrom", rs.CopyFrom),
         Field.Keyword("UpdateTime", rs.UpdateTime.ToString()),
         Field.Text("Content", rs.Content),
         Field.Text("DefaultPicUrl", rs.DefaultPicUrl),
         Field.Text("IncludePic", rs.InclucePic.ToString())
     };

     Document entry = new Document();
     foreach (Field entryField in entryFields)
     {
         entry.Add(entryField);
     }

     writer.AddDocument(entry);
 }
 /// <summary>
 /// Strips HTML markup from a fragment so that only its text is passed on for indexing.
 /// </summary>
 /// <param name="html">HTML fragment; <c>null</c> is treated as empty.</param>
 /// <returns>The text with tags and stray angle brackets removed and
 /// non-breaking spaces converted to ordinary spaces.</returns>
 private string ParseHtml(string html)
 {
     if (html == null)
     {
         return string.Empty;
     }

     // Remove complete tags first, then any unmatched '<' or '>' left by malformed markup.
     string temp = Regex.Replace(html,"<[^>]*>|<|>","");

     // The previous code replaced " " with " ", which does nothing. Normalise both the
     // &nbsp; entity and the U+00A0 character so words are separated by plain spaces.
     return temp.Replace("&nbsp;"," ").Replace("\u00A0"," ");
 }
 /// <summary>
 /// Click handler for Button1: starts a single new thread that runs <c>InitIndex</c>.
 /// The handler returns without waiting for that thread to finish.
 /// </summary>
 protected void Button1_Click(object sender, System.EventArgs e)
 {
     ThreadStart indexingWork = new ThreadStart(InitIndex);
     Thread indexingThread = new Thread(indexingWork);
     indexingThread.Start();
 }
  /// <summary>
  /// Opens an <c>IndexWriter</c> (using <c>ChineseAnalyzer</c>) on the given directory.
  /// If the directory already contains a file named "segments" the writer is opened
  /// with the create flag set to false; otherwise the flag is true.
  /// </summary>
  /// <param name="physicalPath">Directory that holds (or will hold) the index.</param>
  /// <returns>The newly constructed writer; the caller is responsible for closing it.</returns>
  private IndexWriter GetWriter(string physicalPath)
  {
      string segmentFile = System.IO.Path.Combine(physicalPath, "segments");
      bool createNew = !System.IO.File.Exists(segmentFile);
      return new IndexWriter(physicalPath, new Lucene.Net.Analysis.Cn.ChineseAnalyzer(), createNew);
  }
 /// <summary>
 /// Reads articles from the database in batches through the <c>GZ_SelectText</c> stored
 /// procedure and adds each one to the Lucene index at <c>D:\study\XK_Vote\Index</c>.
 /// Each batch is requested with the ArticleID of the last row indexed so far.
 /// The loop stops when a batch returns fewer than 99 rows or after 201 batches.
 /// </summary>
 /// <remarks>
 /// Every data reader is closed after its batch, and the index writer is closed even when
 /// an exception interrupts indexing, so that an open writer is not left behind.
 /// NOTE(review): <c>DB</c> is a project type; if it exposes its own Close/Dispose it should
 /// also be released per batch - confirm against its definition.
 /// </remarks>
 private void InitIndex()
 {
     int index = 0;   // ArticleID of the last indexed row, sent as @id for the next batch
     int end = 0;     // number of batches processed so far
     IndexWriter iw = GetWriter(@"D:\study\XK_Vote\Index");
     try
     {
         iw.mergeFactor = 15;
         while (true)
         {
             DB db = new DB();
             System.Data.SqlClient.SqlDataReader sdr = null;
             int count = 0;
             try
             {
                 System.Data.SqlClient.SqlParameter[] p = { db.MakeInParam("@id", SqlDbType.Int, 4, index) };
                 db.RunProc("GZ_SelectText", p, out sdr);
                 while (sdr.Read())
                 {
                     Result rs = new Result();
                     rs.Title = sdr["Title"].ToString();
                     rs.Content = this.ParseHtml(sdr["Content"].ToString());
                     rs.CopyFrom = sdr["CopyFrom"].ToString();

                     // Only rows flagged with IncludePic == 1 keep their picture URL.
                     int includePic = Convert.ToInt32(sdr["IncludePic"]);
                     if (includePic == 1)
                     {
                         rs.InclucePic = includePic;
                         rs.DefaultPicUrl = sdr["DefaultPicUrl"].ToString();
                     }
                     else
                     {
                         rs.InclucePic = 0;
                         rs.DefaultPicUrl = "";
                     }

                     rs.UpdateTime = Convert.ToDateTime(sdr["UpdateTime"].ToString());
                     IndexBook(rs, iw);
                     index = Convert.ToInt32(sdr["ArticleID"]);
                     count++;
                 }
             }
             finally
             {
                 // Previously the reader was never closed, leaking one reader per batch.
                 if (sdr != null)
                 {
                     sdr.Close();
                 }
             }

             end++;
             if (count < 100 - 1)
             {
                 break;   // short batch: presumably no more rows to fetch
             }
             if (end > 200)
             {
                 break;   // hard cap on the number of batches
             }
         }

         iw.Optimize();
     }
     finally
     {
         // Always close the writer, including on failure.
         iw.Close();
     }
 }
  /// <summary>
  /// Click handler for Button2: copies the posted form value named "Content" into
  /// <c>Query</c> and then data-binds the page.
  /// </summary>
  private void Button2_Click(object sender, System.EventArgs e)
  {
      string submitted = Request.Form["Content"];
      this.Query = Convert.ToString(submitted);
      // The call to Search() is intentionally left disabled, as in the original code.
      this.DataBind();
  }
 /*protected void Search()
 {
 DateTime start = DateTime.Now;//搜索的开始时间
 Lucene.Net.Search.IndexSearcher search=new Lucene.Net.Search.IndexSearcher(@"D:\LuceneIndex");
 string [] fields={"Content","Title"};
 Query multiquery = MultiFieldQueryParser.Parse(this.Query,fields,new ChineseAnalyzer());
 this.Results.Columns.Add("Title",typeof(string));
 this.Results.Columns.Add("Content",typeof(string));
// this.Results.Columns.Add("CopyFrom",typeof(string));
// this.Results.Columns.Add("UpdateTime",typeof(string));
 QueryHighlightExtractor highlighter = new QueryHighlightExtractor(multiquery, new ChineseAnalyzer(), "", "");
 Lucene.Net.Search.Hits hit =search.Search(multiquery);

 //初始化startAt,以便得到要显示的结果集
 this.total=hit.Length();
 this.startAt = initStartAt();
 //得到当前页要显示的记录数量，包括以前所有页的记录数，这样把他与this.startAt结合就能够很好的知道当前页要显示的记录数了
 int resultsCount = smallOf(this.total,this.startAt+this.maxResults);
 //开始循环得到当前页要显示的记录ex
 for (int i = this.total - this.startAt-1; i >= this.total - resultsCount; i--)
 {
 DataRow dr = this.Results.NewRow();
 dr["Title"] = highlighter.GetBestFragments(hit.Doc(i).GetField("Title").StringValue(),1,2,"...");
 dr["Content"] = highlighter.GetBestFragments(hit.Doc(i).GetField("Content").StringValue(),80,2,"...");
// dr["CopyFrom"]= hit.Doc(i).GetField("CopyFrom").StringValue();
// dr["UpdateTime"]=hit.Doc(i).GetField("UpdateTime").StringValue();
 //把行添加进DataTable
 this.Results.Rows.Add(dr);

}
 //循环完毕，关闭搜索
 search.Close();
 //搜索花费多少时间
 this.duration = DateTime.Now - start;
 Response.Write(this.duration);
 //给fromItem赋值，他总是startAt+1
 this.fromItem = this.startAt + 1;
 //给toItem赋值
 this.toItem = smallOf(this.total,this.startAt+this.maxResults);

 }*/
 /// <summary>
 /// Builds the pager for the current result page.
 /// </summary>
 /// <value>
 /// A table with a single "html" column. Its rows are, in order: links to up to 10
 /// preceding pages, the current page number (not a link), links to up to 10 following
 /// pages, and - only when a following page exists - a "下一页" (next page) link.
 /// </value>
 protected DataTable Paging
 {
     get
     {
         // Zero-based number of the current page, derived from startAt (rounded up).
         int pageNumber = (this.startAt + this.maxResults - 1) / this.maxResults;
         DataTable dt = new DataTable();
         dt.Columns.Add("html");

         // Current page; displayed page numbers are one-based, hence the +1.
         DataRow dr = dt.NewRow();
         dr["html"] = pagingNumberHtml(startAt,pageNumber+1,false);
         dt.Rows.Add(dr);

         // Up to 10 preceding pages, inserted at the front so they end up in ascending order.
         int previousPagesCount = 10;
         for (int i = pageNumber - 1; i >= 0 && i >= pageNumber - previousPagesCount; i--)
         {
             DataRow r = dt.NewRow();
             r["html"] = pagingNumberHtml(i*this.maxResults,i+1,true);
             dt.Rows.InsertAt(r,0);
         }

         // Up to 10 following pages.
         int nextPagesCount = 10;
         for (int i = pageNumber + 1; i <= this.pageCount && i <= pageNumber + nextPagesCount; i++)
         {
             DataRow r = dt.NewRow();
             r["html"] = pagingNumberHtml(i*this.maxResults,i+1,true);
             dt.Rows.Add(r);
         }

         // "Next page" link. It used to be emitted unconditionally, which produced a link
         // pointing past the end of the results on the last page.
         if (pageNumber < this.pageCount)
         {
             DataRow lastRow = dt.NewRow();
             lastRow["html"] = "<a href='WebForm1.aspx?q=" + HttpUtility.UrlEncode(this.Query) + "&start=" + (pageNumber + 1) * this.maxResults + "'>下一页</a>";
             dt.Rows.Add(lastRow);
         }

         return dt;
     }
 }
 /// <summary>
 /// Produces the HTML for one entry of the pager.
 /// </summary>
 /// <param name="start">Index of the first record of that page (value of the "start" query parameter).</param>
 /// <param name="number">Page number to display.</param>
 /// <param name="active">True to render a hyperlink; false to render the number as plain emphasised text.</param>
 /// <returns>An anchor to WebForm1.aspx for linked pages, otherwise the page number wrapped in &lt;b&gt;.</returns>
 private string pagingNumberHtml(int start, int number, bool active)
 {
     if (active)
     {
         return "<a href='WebForm1.aspx?q="+HttpUtility.UrlEncode(this.Query)+"&start="+start+"'>"+number+"</a>";
     }

     // The previous code returned the literal text '+number+'' instead of the page number.
     return "<b>" + number + "</b>";
 }
 /// <summary>
 /// Returns the smaller of two integers.
 /// </summary>
 private int smallOf(int first, int second)
 {
     return Math.Min(first, second);
 }
 /// <summary>
 /// Reads the "start" request parameter and clamps it to a usable record index.
 /// </summary>
 /// <returns>
 /// 0 when the parameter cannot be converted or is negative; the first index of the
 /// last page when the value is at or beyond <c>total - 1</c>; otherwise the value itself.
 /// </returns>
 private int initStartAt()
 {
     int requested;
     try
     {
         requested = Convert.ToInt32(this.Request.Params["start"]);
     }
     catch
     {
         // Any failure while reading or converting the parameter falls back to the first record.
         return 0;
     }

     if (requested < 0)
     {
         return 0;
     }

     return requested >= this.total - 1 ? lastPageStartAt : requested;
 }
 /// <summary>
 /// Zero-based number of the last result page, computed from <c>total</c> and <c>maxResults</c>.
 /// </summary>
 private int pageCount
 {
     get
     {
         int lastRecordIndex = this.total - 1;
         return lastRecordIndex / this.maxResults;
     }
 }
 /// <summary>
 /// Index of the first record on the last result page.
 /// </summary>
 private int lastPageStartAt
 {
     get
     {
         int lastPage = pageCount;
         return lastPage * this.maxResults;
     }
 }
 // Backing field for Query; starts out as the empty string.
 private string query="";
 /// <summary>
 /// The search text for the current request.
 /// </summary>
 protected string Query
 {
     get { return this.query; }
     set { query = value; }
 }
 /// <summary>
 /// One-line, user-facing summary of the current search.
 /// </summary>
 /// <value>
 /// When there are results: the total count, the range of records shown and the search
 /// duration in seconds. Otherwise a "no results found" message.
 /// </value>
 protected string Summary
 {
     get
     {
         if (this.total > 0)
         {
             // The previous code returned the whole expression as one string literal,
             // so none of the values were ever inserted into the message.
             return "共有结果" + this.total
                 + ",当前从第" + this.fromItem
                 + "条到第" + this.toItem
                 + "条,本次搜索耗时" + this.duration.TotalSeconds + "秒";
         }

         return "对不起，本次搜索没有找到任何结果";
     }
 }

}