引子
What’s Lucene
Lucene是一个信息检索的函数库(Library),利用它你可以为你的应用加上索引和搜索的功能.
Lucene的使用者不需要深入了解有关全文检索的知识,仅仅学会使用库中的一个类,你就可以为你的应用实现全文检索的功能.
不过千万别以为Lucene是一个像Google那样的搜索引擎,Lucene甚至不是一个应用程序,它仅仅是一个工具,一个Library.你也可以把它理解为一套将索引、搜索功能封装得很好的简单易用的API.利用这套API你可以做很多有关搜索的事情,而且很方便.
What Can Lucene Do
Lucene可以对任何的数据做索引和搜索. Lucene不管数据源是什么格式,只要它能被转化为文字的形式,就可以被Lucene所分析利用.也就是说不管是MS Word、HTML、PDF还是其他什么形式的文件,只要你可以从中抽取出文字形式的内容,就可以用Lucene对它们进行索引以及搜索.
以上内容在lucene中国网站上有更详细的描述,我也在这里上传了一份,可以点击下载.
下面贴一个简单的实例代码:
using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Text;
using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using Lucene.Net.Analysis.Standard;
/// <summary>
/// Code-behind for a demo page that builds a Lucene.Net index over the ".txt"
/// files under a directory and runs keyword searches against that index.
/// TextBox3 holds the directory to index, TextBox2 the search keyword, and
/// TextBox1 receives status and result text.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// On the first (non-postback) request, pre-fills TextBox3 with the
    /// physical path that the relative path "doc" maps to.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath("doc");
        }
    }

    #region Build index

    /// <summary>
    /// Rebuilds the index stored under the mapped "index" path from the
    /// directory named in TextBox3, then reports completion in TextBox1.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index"); // where the index is stored
        string sourcePath = TextBox3.Text;               // directory to be indexed

        // The third argument (true) asks the writer to create a new index.
        IndexWriter writer = new IndexWriter(indexStorePath, new StandardAnalyzer(), true);
        try
        {
            IndexDirectory(writer, new FileInfo(sourcePath));
            writer.Optimize();
        }
        finally
        {
            // Close the writer even when indexing fails, so it is not left open.
            writer.Close();
        }

        TextBox1.Text = "提示:索引完成\n";
    }

    /// <summary>
    /// Recursively walks <paramref name="file"/>: directories are descended
    /// into, and files with a ".txt" extension (compared case-insensitively)
    /// are added to the index. Anything else is ignored.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">Path of a file or a directory.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry)); // recurse into each entry
            }
        }
        else if (string.Equals(file.Extension, ".txt", StringComparison.OrdinalIgnoreCase))
        {
            // Case-insensitive so that "A.TXT" is indexed just like "a.txt".
            IndexFile(file, writer);
        }
    }

    /// <summary>
    /// Adds one document for <paramref name="file"/>: a stored, untokenized
    /// "filename" field holding the full path, and a "contents" field fed
    /// from the file's text read with the system default encoding.
    /// </summary>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        // Dispose the reader ourselves so the file handle is released even if
        // AddDocument throws. NOTE(review): this relies on AddDocument having
        // fully consumed the reader before it returns - confirm for the
        // Lucene.Net version in use.
        using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.Default))
        {
            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("contents", reader));
            writer.AddDocument(doc);
        }
    }

    #endregion

    #region Search

    /// <summary>
    /// Parses the keyword in TextBox2 as a query on the "contents" field,
    /// searches the index under the mapped "index" path, and lists the hit
    /// count plus each hit's stored "filename" in TextBox1.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index"); // where the index is stored
        string keyword = TextBox2.Text;                  // search keyword

        IndexSearcher searcher = new IndexSearcher(indexStorePath);
        try
        {
            QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
            Query query = parser.Parse(keyword);
            Hits hits = searcher.Search(query);

            // Build the text once instead of re-assigning TextBox1.Text per hit.
            int hitCount = hits.Length();
            StringBuilder output = new StringBuilder();
            output.Append("搜索结果为 ").Append(hitCount).Append(" 个\n");
            for (int i = 0; i < hitCount; i++)
            {
                Document doc = hits.Doc(i);
                output.Append("第").Append(i + 1)
                      .Append("个搜索结果,文件路径为: ")
                      .Append(doc.Get("filename"))
                      .Append("\n");
            }

            TextBox1.Text = output.ToString();
        }
        finally
        {
            // Close the searcher even when parsing or searching throws.
            searcher.Close();
        }
    }

    #endregion
}
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Text;
using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using Lucene.Net.Analysis.Standard;
/// <summary>
/// Code-behind for a demo page that builds a Lucene.Net index over the ".txt"
/// files under a directory and runs keyword searches against that index.
/// TextBox3 holds the directory to index, TextBox2 the search keyword, and
/// TextBox1 receives status and result text.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// On the first (non-postback) request, pre-fills TextBox3 with the
    /// physical path that the relative path "doc" maps to.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath("doc");
        }
    }

    #region Build index

    /// <summary>
    /// Rebuilds the index stored under the mapped "index" path from the
    /// directory named in TextBox3, then reports completion in TextBox1.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index"); // where the index is stored
        string sourcePath = TextBox3.Text;               // directory to be indexed

        // The third argument (true) asks the writer to create a new index.
        IndexWriter writer = new IndexWriter(indexStorePath, new StandardAnalyzer(), true);
        try
        {
            IndexDirectory(writer, new FileInfo(sourcePath));
            writer.Optimize();
        }
        finally
        {
            // Close the writer even when indexing fails, so it is not left open.
            writer.Close();
        }

        TextBox1.Text = "提示:索引完成\n";
    }

    /// <summary>
    /// Recursively walks <paramref name="file"/>: directories are descended
    /// into, and files with a ".txt" extension (compared case-insensitively)
    /// are added to the index. Anything else is ignored.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">Path of a file or a directory.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry)); // recurse into each entry
            }
        }
        else if (string.Equals(file.Extension, ".txt", StringComparison.OrdinalIgnoreCase))
        {
            // Case-insensitive so that "A.TXT" is indexed just like "a.txt".
            IndexFile(file, writer);
        }
    }

    /// <summary>
    /// Adds one document for <paramref name="file"/>: a stored, untokenized
    /// "filename" field holding the full path, and a "contents" field fed
    /// from the file's text read with the system default encoding.
    /// </summary>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        // Dispose the reader ourselves so the file handle is released even if
        // AddDocument throws. NOTE(review): this relies on AddDocument having
        // fully consumed the reader before it returns - confirm for the
        // Lucene.Net version in use.
        using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.Default))
        {
            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("contents", reader));
            writer.AddDocument(doc);
        }
    }

    #endregion

    #region Search

    /// <summary>
    /// Parses the keyword in TextBox2 as a query on the "contents" field,
    /// searches the index under the mapped "index" path, and lists the hit
    /// count plus each hit's stored "filename" in TextBox1.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index"); // where the index is stored
        string keyword = TextBox2.Text;                  // search keyword

        IndexSearcher searcher = new IndexSearcher(indexStorePath);
        try
        {
            QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
            Query query = parser.Parse(keyword);
            Hits hits = searcher.Search(query);

            // Build the text once instead of re-assigning TextBox1.Text per hit.
            int hitCount = hits.Length();
            StringBuilder output = new StringBuilder();
            output.Append("搜索结果为 ").Append(hitCount).Append(" 个\n");
            for (int i = 0; i < hitCount; i++)
            {
                Document doc = hits.Doc(i);
                output.Append("第").Append(i + 1)
                      .Append("个搜索结果,文件路径为: ")
                      .Append(doc.Get("filename"))
                      .Append("\n");
            }

            TextBox1.Text = output.ToString();
        }
        finally
        {
            // Close the searcher even when parsing or searching throws.
            searcher.Close();
        }
    }

    #endregion
}
例子代码下载:点这里(vs.net2008)
lucene.net的版本为2.0.0.4,可以在lucene的网站上下载得到:http://incubator.apache.org/lucene.net/
截至今天,Lucene的Java版本是2.3,.NET版本是2.0