Lucene.net是一个搜索引擎框架,它自身并不能直接实现搜索,需要我们自己利用它提供的API来实现索引的建立和索引的查找。Lucene.net源自基于Java的Lucene,是其移植到.NET平台的版本,可以在ASP.NET中用它来实现站内搜索。
要实现基于汉语的搜索引擎,首先要实现汉语的分词。目前网上大部分分词系统都是利用已有的盘古分词来实现的,但是盘古分词效果不太好。在这里我把最新的ICTCLAS2014嵌入到Lucene.net中。Lucene.net中所有的分词系统都是通过继承Analyzer类来实现的。所以如果要把ICTCLAS2014嵌入到Lucene.net中,就必须继承Analyzer类来实现自己的分词类。
1 ICTCLAS的引入
首先我们要把ICTCLAS的dll引入到C#文件中 ,因为这个dll不是在C#中建立的类库,所以无法直接将其加入到C#的引用中。我们考虑使用下面的方法来实现,为了方便,我们把引入的函数以及结构体放入一个类中。如下所示:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.DChinese
{
    /// <summary>
    /// Managed view of one entry written by <see cref="SplitWord.NLPIR_ParagraphProcessAW"/>.
    /// The layout is explicit: every field is pinned to a fixed byte offset.
    /// </summary>
    /// <remarks>
    /// NOTE(review): sPos1..sPos10 cover the 40 bytes at offsets 8-47; presumably they
    /// stand in for a 40-byte part-of-speech character buffer in the native struct —
    /// confirm against NLPIR.h.
    /// NOTE(review): <c>weight</c> is a double pinned at offset 60, which is not 8-byte
    /// aligned. A naturally aligned native double would sit at offset 64 unless the
    /// native struct is packed — verify against NLPIR.h before relying on this field.
    /// </remarks>
    [StructLayout(LayoutKind.Explicit)]
    public struct result_t
    {
        [FieldOffset(0)] public int start;
        [FieldOffset(4)] public int length;
        [FieldOffset(8)] public int sPos1;
        [FieldOffset(12)] public int sPos2;
        [FieldOffset(16)] public int sPos3;
        [FieldOffset(20)] public int sPos4;
        [FieldOffset(24)] public int sPos5;
        [FieldOffset(28)] public int sPos6;
        [FieldOffset(32)] public int sPos7;
        [FieldOffset(36)] public int sPos8;
        [FieldOffset(40)] public int sPos9;
        [FieldOffset(44)] public int sPos10;
        [FieldOffset(48)] public int POS_id;
        [FieldOffset(52)] public int word_ID;
        [FieldOffset(56)] public int word_type;
        [FieldOffset(60)] public double weight;
    }

    /// <summary>
    /// P/Invoke declarations for the functions exported by NLPIR.dll.
    /// Every member is a static extern bound with the Cdecl calling convention and
    /// ANSI string marshalling; the entry point name equals the managed method name.
    /// </summary>
    public class SplitWord
    {
        // File name of the native library; resolved through the normal DLL search order.
        const string path = @"NLPIR.dll";

        /// <summary>Binds the native <c>NLPIR_Init</c> export.</summary>
        /// <param name="sInitDirPath">Directory handed to the native initializer.</param>
        /// <param name="encoding">Encoding selector passed through unchanged (default 0).</param>
        /// <param name="sLicenceCode">Licence code, or null.</param>
        /// <returns>The native boolean result.</returns>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_Init", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_Init(String sInitDirPath, int encoding = 0, String sLicenceCode = null);

        /// <summary>Binds the native <c>NLPIR_ParagraphProcess</c> export.</summary>
        /// <remarks>
        /// The C prototype is
        /// <c>NLPIR_API const char * NLPIR_ParagraphProcess(const char *sParagraph, int bPOStagged = 1);</c>
        /// and it must be declared exactly like this: the result is returned as a raw
        /// <see cref="IntPtr"/> and converted by the caller (e.g. with
        /// <see cref="Marshal.PtrToStringAnsi(IntPtr)"/>) instead of being marshalled
        /// as a managed string.
        /// </remarks>
        [DllImport(path, CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl, EntryPoint = "NLPIR_ParagraphProcess")]
        public static extern IntPtr NLPIR_ParagraphProcess(String sParagraph, int bPOStagged = 1);

        /// <summary>Binds the native <c>NLPIR_Exit</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_Exit", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_Exit();

        /// <summary>Binds the native <c>NLPIR_ImportUserDict</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_ImportUserDict", CallingConvention = CallingConvention.Cdecl)]
        public static extern int NLPIR_ImportUserDict(String sFilename);

        /// <summary>Binds the native <c>NLPIR_FileProcess</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_FileProcess", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged = 1);

        /// <summary>Binds the native <c>NLPIR_FileProcessEx</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_FileProcessEx", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_FileProcessEx(String sSrcFilename, String sDestFilename);

        /// <summary>Binds the native <c>NLPIR_GetParagraphProcessAWordCount</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_GetParagraphProcessAWordCount", CallingConvention = CallingConvention.Cdecl)]
        public static extern int NLPIR_GetParagraphProcessAWordCount(String sParagraph);

        /// <summary>Binds the native <c>NLPIR_ParagraphProcessAW</c> export.</summary>
        /// <param name="nCount">Element count passed to the native side.</param>
        /// <param name="result">Caller-allocated array, marshalled as an out LPArray.</param>
        /// <remarks>
        /// NOTE(review): presumably nCount is the value returned by
        /// <see cref="NLPIR_GetParagraphProcessAWordCount"/> for the same paragraph — confirm.
        /// </remarks>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_ParagraphProcessAW", CallingConvention = CallingConvention.Cdecl)]
        public static extern void NLPIR_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

        /// <summary>Binds the native <c>NLPIR_AddUserWord</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_AddUserWord", CallingConvention = CallingConvention.Cdecl)]
        public static extern int NLPIR_AddUserWord(String sWord);

        /// <summary>Binds the native <c>NLPIR_SaveTheUsrDic</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_SaveTheUsrDic", CallingConvention = CallingConvention.Cdecl)]
        public static extern int NLPIR_SaveTheUsrDic();

        /// <summary>Binds the native <c>NLPIR_DelUsrWord</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_DelUsrWord", CallingConvention = CallingConvention.Cdecl)]
        public static extern int NLPIR_DelUsrWord(String sWord);

        /// <summary>Binds the native <c>NLPIR_NWI_Start</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_NWI_Start", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_NWI_Start();

        /// <summary>Binds the native <c>NLPIR_NWI_Complete</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_NWI_Complete", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_NWI_Complete();

        /// <summary>Binds the native <c>NLPIR_NWI_AddFile</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_NWI_AddFile", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_NWI_AddFile(String sText);

        /// <summary>Binds the native <c>NLPIR_NWI_AddMem</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_NWI_AddMem", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool NLPIR_NWI_AddMem(String sText);

        /// <summary>
        /// Binds the native <c>NLPIR_NWI_GetResult</c> export; the result is a raw pointer
        /// the caller must convert itself.
        /// </summary>
        [DllImport(path, CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl, EntryPoint = "NLPIR_NWI_GetResult")]
        public static extern IntPtr NLPIR_NWI_GetResult(bool bWeightOut = false);

        /// <summary>Binds the native <c>NLPIR_NWI_Result2UserDict</c> export.</summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_NWI_Result2UserDict", CallingConvention = CallingConvention.Cdecl)]
        public static extern uint NLPIR_NWI_Result2UserDict();

        /// <summary>
        /// Binds the native <c>NLPIR_GetKeyWords</c> export; the result is a raw pointer
        /// the caller must convert itself.
        /// </summary>
        [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "NLPIR_GetKeyWords", CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr NLPIR_GetKeyWords(String sText, int nMaxKeyLimit = 50, bool bWeightOut = false);

        /// <summary>
        /// Binds the native <c>NLPIR_GetFileKeyWords</c> export; the result is a raw pointer
        /// the caller must convert itself.
        /// </summary>
        [DllImport(path, CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl, EntryPoint = "NLPIR_GetFileKeyWords")]
        public static extern IntPtr NLPIR_GetFileKeyWords(String sFilename, int nMaxKeyLimit = 50, bool bWeightOut = false);
    }
}
这个类里面包含了所有的ICTCLAS的API函数,包括初始化、添加词语、添加词典、词典保存、分词等各种API,并且都是static函数。
2 分词类DChineseAnalyzer的建立
分词类的建立我们参考StandardAnalyzer分词的实现,在此基础上实现了DChineseAnalyzer类。在分词类中实现必要的构造函数,以及
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
这两个函数。它们的作用是在函数中调用分词器Tokenizer的派生类来实现分词。在现有的版本中,使用分词类的时候一般是直接调用ReusableTokenStream函数,而不是调用TokenStream函数,这样可以做到建立一个分词类对象即可供多个分词文本使用,从而减少内存的浪费,提高效率。
以及一些字段,利用这些字段,我们可以加入一些停用词,用户自己的词典。
3 分词器DChineseTokenizer的建立
这个类是分词的核心关键所在。我们要在其中调用ICTCLAS中的分词。在这里面要注意的一个函数是public override bool IncrementToken()
它是我们获取下一个分词结果要用到的函数,如果想要遍历分词结果,就要建立一个循环,不断的调用IncrementToken函数。
整个分词系统代码如下所示:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Runtime.InteropServices;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Util;
using Lucene.Net.Documents;
using Lucene.Net.Analysis.Tokenattributes;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.DChinese
{
    /// <summary>
    /// Analyzer whose token source is <see cref="DChineseTokenizer"/>, followed by
    /// a lower-case filter, a stop-word filter and a Porter stem filter.
    /// </summary>
    public class DChineseAnalyzer : Analyzer
    {
        private ISet<string> stopSet;

        /// <summary>Default stop words: <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.</summary>
        public static readonly ISet<string> STOP_WORDS_SET;

        private Version matchVersion;
        private bool replaceInvalidAcronym;
        private bool enableStopPositionIncrements;

        /// <summary>Creates an analyzer with an explicit stop-word set.</summary>
        /// <param name="version">Lucene compatibility version.</param>
        /// <param name="stopWords">Words removed by the stop filter.</param>
        public DChineseAnalyzer(Version version, ISet<string> stopWords)
        {
            stopSet = stopWords;
            matchVersion = version;
            enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(version);
            // Derive the flag from the constructor argument. The previous code read the
            // matchVersion field before it had been assigned, so the flag was computed
            // from the field's default value rather than from the requested version.
            replaceInvalidAcronym = version.OnOrAfter(Version.LUCENE_24);
        }

        /// <summary>Creates an analyzer that uses <see cref="STOP_WORDS_SET"/>.</summary>
        /// <param name="version">Lucene compatibility version.</param>
        public DChineseAnalyzer(Version version)
            : this(version, STOP_WORDS_SET)
        {
        }

        /// <summary>Creates an analyzer whose stop words are loaded from a file.</summary>
        /// <param name="version">Lucene compatibility version.</param>
        /// <param name="stopWords">File passed to <c>WordlistLoader.GetWordSet</c>.</param>
        public DChineseAnalyzer(Version version, System.IO.FileInfo stopWords)
            : this(version, WordlistLoader.GetWordSet(stopWords))
        {
        }

        static DChineseAnalyzer()
        {
            STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        }

        /// <summary>Builds a brand-new tokenizer and filter chain for <paramref name="reader"/>.</summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Text to analyze.</param>
        /// <returns>The outermost filter of the chain.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            return BuildFilterChain(new DChineseTokenizer(matchVersion, reader));
        }

        /// <summary>Tokenizer and outermost filter cached in <c>PreviousTokenStream</c>.</summary>
        private class SavedStreams
        {
            protected internal DChineseTokenizer source;
            protected internal TokenStream result;
        }

        /// <summary>
        /// Returns the cached chain stored in <c>PreviousTokenStream</c>, creating it on
        /// first use and otherwise re-pointing the cached tokenizer at <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Text to analyze.</param>
        /// <returns>The outermost filter of the cached chain.</returns>
        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            SavedStreams streams = (SavedStreams)PreviousTokenStream;
            if (streams == null)
            {
                streams = new SavedStreams();
                streams.source = new DChineseTokenizer(matchVersion, reader);
                streams.result = BuildFilterChain(streams.source);
                PreviousTokenStream = streams;
            }
            else
            {
                streams.source.Reset(reader);
            }

            streams.source.SetReplaceInvalidAcronym(replaceInvalidAcronym);
            return streams.result;
        }

        /// <summary>Wraps a tokenizer in the lower-case, stop-word and Porter stem filters.</summary>
        private TokenStream BuildFilterChain(TokenStream source)
        {
            TokenStream chain = new LowerCaseFilter(source);
            chain = new StopFilter(enableStopPositionIncrements, chain, stopSet);
            chain = new PorterStemFilter(chain);
            return chain;
        }
    }

    /// <summary>
    /// Tokenizer that sends the whole input through <c>SplitWord.NLPIR_ParagraphProcess</c>
    /// (with <c>bPOStagged</c> = 0) and then emits the whitespace-delimited pieces of the
    /// returned string as tokens.
    /// </summary>
    /// <remarks>
    /// Token offsets are positions inside the string returned by NLPIR, because that
    /// string replaces the reader the tokenizer scans.
    /// NOTE(review): presumably they therefore differ from offsets in the original
    /// text whenever NLPIR inserts separators — confirm before using them for highlighting.
    /// </remarks>
    public sealed class DChineseTokenizer : Tokenizer
    {
        private bool m_replaceInvalidAcronym;

        // Number of characters consumed by earlier buffer fills.
        private int offset = 0;
        // Read position inside ioBuffer.
        private int bufferIndex = 0;
        // Number of valid characters currently in ioBuffer.
        private int dataLen = 0;

        private const int MAX_WORD_LEN = 255;
        private const int IO_BUFFER_SIZE = 4096;
        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;

        /// <summary>Shared constructor logic: segments the input and registers the attributes.</summary>
        private void Init(System.IO.TextReader input, Version matchVersion)
        {
            m_replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);

            // The tokenizer never scans the caller's reader directly; it scans the
            // segmented text produced by NLPIR.
            this.input = ChangeInput(input);

            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }

        /// <summary>Creates a tokenizer over <paramref name="input"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        /// <exception cref="InvalidOperationException">NLPIR could not be initialized.</exception>
        public DChineseTokenizer(Version matchVersion, System.IO.TextReader input)
            : base()
        {
            Init(input, matchVersion);
        }

        /// <summary>Creates a tokenizer that shares the attributes of <paramref name="source"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        /// <exception cref="InvalidOperationException">NLPIR could not be initialized.</exception>
        public DChineseTokenizer(Version matchVersion, System.IO.TextReader input, AttributeSource source)
            : base(source)
        {
            Init(input, matchVersion);
        }

        /// <summary>Creates a tokenizer that builds its attributes with <paramref name="factory"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        /// <exception cref="InvalidOperationException">NLPIR could not be initialized.</exception>
        public DChineseTokenizer(Version matchVersion, System.IO.TextReader input, AttributeFactory factory)
            : base(factory)
        {
            Init(input, matchVersion);
        }

        /// <summary>
        /// Advances to the next run of non-whitespace characters in the segmented text.
        /// A run longer than <c>MAX_WORD_LEN</c> is cut at that length.
        /// </summary>
        /// <returns>true if a token was produced; false at end of input.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length = 0;
            int start = bufferIndex;
            char[] buffer = termAtt.TermBuffer();

            while (true)
            {
                if (bufferIndex >= dataLen)
                {
                    // Buffer exhausted: remember how much was consumed, then refill.
                    offset += dataLen;
                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        dataLen = 0;
                        if (length > 0)
                        {
                            break; // flush the token collected before end of input
                        }
                        return false;
                    }
                    bufferIndex = 0;
                }

                char c = ioBuffer[bufferIndex++];
                if (!System.Char.IsWhiteSpace(c))
                {
                    if (length == 0)
                    {
                        // First character of a token: record its absolute position.
                        start = offset + bufferIndex - 1;
                    }
                    else if (length == buffer.Length)
                    {
                        buffer = termAtt.ResizeTermBuffer(1 + length);
                    }

                    buffer[length++] = c;
                    if (length == MAX_WORD_LEN)
                    {
                        break;
                    }
                }
                else if (length > 0)
                {
                    break; // whitespace terminates the current token
                }
            }

            termAtt.SetTermLength(length);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            posIncrAtt.PositionIncrement = 1;
            return true;
        }

        /// <summary>Clears the buffer bookkeeping while keeping the current input.</summary>
        public override void Reset()
        {
            base.Reset(input);
            ResetBufferState();
        }

        /// <summary>Segments a new input and restarts tokenization from its beginning.</summary>
        /// <param name="input">Text to tokenize next.</param>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public override void Reset(TextReader input)
        {
            this.input = Segment(input);
            ResetBufferState();
        }

        /// <summary>Sets the final offset to the number of characters consumed by earlier buffer fills.</summary>
        public override void End()
        {
            int finalOffset = CorrectOffset(offset);
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>Stores the flag; nothing else in this class reads it.</summary>
        public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.m_replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Initializes NLPIR from the application base directory and returns a reader
        /// over the segmented form of <paramref name="input"/>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// <c>NLPIR_Init</c> reported failure. The previous code returned null here,
        /// which only surfaced later as a NullReferenceException in IncrementToken.
        /// </exception>
        private TextReader ChangeInput(TextReader input)
        {
            // NOTE(review): this runs for every tokenizer instance; presumably NLPIR
            // tolerates repeated initialization — confirm, or initialize once per process.
            string baseDirectory = System.AppDomain.CurrentDomain.BaseDirectory;
            if (!SplitWord.NLPIR_Init(baseDirectory, 0, null))
            {
                throw new InvalidOperationException(
                    "NLPIR_Init failed for directory '" + baseDirectory + "'.");
            }

            return Segment(input);
        }

        /// <summary>Reads all of <paramref name="reader"/>, segments it with NLPIR and wraps the result.</summary>
        private static TextReader Segment(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("input");
            }

            string text = reader.ReadToEnd();
            IntPtr nativeResult = SplitWord.NLPIR_ParagraphProcess(text, 0);

            // PtrToStringAnsi yields null for a null pointer and StringReader rejects
            // null, so a missing result is treated as empty text.
            string segmented = nativeResult == IntPtr.Zero ? null : Marshal.PtrToStringAnsi(nativeResult);
            return new StringReader(segmented ?? string.Empty);
        }

        /// <summary>Resets the read position, the consumed-character count and the buffer length.</summary>
        private void ResetBufferState()
        {
            bufferIndex = 0;
            offset = 0;
            dataLen = 0;
        }
    }
}