由于工作的需要,要对 lucene.net 的索引进行即时更新。毕竟 lucene.net 的索引重建比较慢,数据量大的时候重建一次要好几天,所以就写了个缓冲类来解决即时更新的问题,其实还是比较简单的。
大体上的原理就是:初始化的时候创建两组 Directory,一组在内存中,一组在硬盘上,搜索的时候可以把两者合并起来搜索;后续的写入操作都先暂存到内存中,再定时同步到硬盘上。删除或者更新的时候,内存中有一个缓存的 Query,用来排除被更新或者删除的内容,搜索的时候自动从结果集中忽略掉这部分内容,这样就解决了大多数的问题了。
OK.看代码的实现。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using S = System.Timers;
using IO = System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Store;
using Lucene.Net.Search;
using Lucene.Net.Index;
using Lucene.Net.Analysis;
using System.Collections;
using Lucene.Net.Analysis.MMSeg;

namespace SearchSample
{
    /// <summary>
    /// Write buffer in front of an on-disk Lucene.Net index.
    /// New documents are added to an in-memory index and become searchable
    /// immediately through <see cref="Reader"/>; <see cref="UpdateOrSaved"/>
    /// merges the in-memory index into the on-disk index and empties the buffer.
    /// Queries registered through <see cref="IgnoreQuery(Query, Occur)"/> are
    /// accumulated and applied as deletions on the disk index at flush time.
    /// </summary>
    public class BufferDirectory
    {
        private static readonly Analyzer _analysis;
        private static readonly string _path;

        // The RAM directory and its writer live for the whole process and are
        // never replaced, so the writer can never point at a stale directory.
        private static readonly RAMDirectory _ramIndex;
        private static readonly IndexWriter _ramWrite;

        private static BufferDirectory _directory;
        private static readonly object _lock = new object();
        private static BooleanQuery _query = new BooleanQuery();
        private static readonly object _querylock = new object();

        // Serializes document additions against the flush so that a document
        // added while a flush is running cannot be wiped with the old buffer.
        private static readonly object _bufferLock = new object();

        /// <summary>
        /// Creates the in-memory index and makes sure an index exists under
        /// the "Index" folder of the application base directory.
        /// </summary>
        static BufferDirectory()
        {
            _path = IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Index");
            _analysis = DefaultAnalyzer.Analyzer;

            _ramIndex = new RAMDirectory();
            _ramWrite = new IndexWriter(_ramIndex, _analysis, true, IndexWriter.MaxFieldLength.LIMITED);
            // Commit once so the RAM directory has a commit point; otherwise
            // opening a reader on it before the first document is added fails.
            _ramWrite.Commit();

            using (FSDirectory diskDirectory = FSDirectory.Open(_path))
            {
                // Open in "create" mode only when there is no index yet.
                bool createNew = !IndexReader.IndexExists(diskDirectory);
                using (IndexWriter bootstrap = new IndexWriter(diskDirectory, _analysis, createNew, IndexWriter.MaxFieldLength.LIMITED))
                {
                }
            }
        }

        // NOTE: there is intentionally no finalizer. The RAM directory and its
        // writer are static and shared; disposing them when one (possibly stale,
        // see Reset) instance is collected would break every other user.

        /// <summary>
        /// Creates the singleton instance if it does not exist yet and returns it.
        /// </summary>
        public static BufferDirectory InitConfig()
        {
            if (_directory == null)
            {
                lock (_lock)
                {
                    if (_directory == null)
                    {
                        _directory = new BufferDirectory();
                    }
                }
            }
            return _directory;
        }

        /// <summary>
        /// Returns the current singleton instance, or null when
        /// <see cref="InitConfig"/> has not been called yet.
        /// </summary>
        public static BufferDirectory Instance()
        {
            return _directory;
        }

        /// <summary>
        /// Restarts the buffer: replaces the singleton instance (handlers
        /// subscribed on the previous instance are not carried over) and
        /// clears the accumulated ignore query.
        /// </summary>
        public static void Reset()
        {
            lock (_lock)
            {
                _directory = null;
                InitConfig();
            }
            lock (_querylock)
            {
                _query = new BooleanQuery();
            }
        }

        /// <summary>
        /// Registers a handler that runs before a document is added to the buffer.
        /// Register it during initialization; it applies to the current singleton.
        /// </summary>
        /// <param name="before">Handler invoked with the document about to be added.</param>
        /// <exception cref="NullReferenceException">The singleton has not been initialized.</exception>
        public static void BeforeConfig(BeforeSetDocument before)
        {
            if (_directory == null)
            {
                throw new NullReferenceException("BufferDirectory为空");
            }
            _directory.BeforeSetDocumentHandler += before;
        }

        /// <summary>
        /// Registers a handler that runs after a document was added to the buffer.
        /// Register it during initialization; it applies to the current singleton.
        /// </summary>
        /// <param name="after">Handler invoked with the document that was added.</param>
        /// <exception cref="NullReferenceException">The singleton has not been initialized.</exception>
        public static void AfterConfig(AfterSetDocument after)
        {
            if (_directory == null)
            {
                throw new NullReferenceException("BufferDirectory为空");
            }
            _directory.AfterSetDocumentHandler += after;
        }

        /// <summary>
        /// Adds a clause to the accumulated ignore query (documents that were
        /// updated or deleted since the last flush).
        /// </summary>
        /// <param name="query">Clause to add.</param>
        /// <param name="occur">How the clause combines with the existing ones.</param>
        public static void IgnoreQuery(Query query, Occur occur)
        {
            lock (_querylock)
            {
                _query.Add(query, occur);
            }
        }

        /// <summary>
        /// Returns the accumulated ignore query.
        /// </summary>
        public static Query IgnoreQuery()
        {
            return _query;
        }

        public delegate void BeforeSetDocument(Document document);
        public event BeforeSetDocument BeforeSetDocumentHandler;

        public delegate void AfterSetDocument(Document document);
        public event AfterSetDocument AfterSetDocumentHandler;

        private BufferDirectory()
        {
        }

        /// <summary>
        /// Adds a document to the in-memory index and commits it. Failures are
        /// reported through <see cref="System.Diagnostics.Trace"/> instead of
        /// being thrown, so a bad document does not break the caller.
        /// </summary>
        /// <param name="documnet">Document to add.</param>
        public void CreateDocument(Document documnet)
        {
            lock (_bufferLock)
            {
                try
                {
                    // Copy the delegates so a concurrent unsubscribe cannot
                    // null them between the check and the call.
                    BeforeSetDocument before = BeforeSetDocumentHandler;
                    if (before != null)
                    {
                        before(documnet);
                    }

                    _ramWrite.AddDocument(documnet);

                    AfterSetDocument after = AfterSetDocumentHandler;
                    if (after != null)
                    {
                        after(documnet);
                    }
                }
                catch (Exception ex)
                {
                    System.Diagnostics.Trace.TraceError("BufferDirectory.CreateDocument failed: " + ex);
                }
                finally
                {
                    _ramWrite.Commit();
                }
            }
        }

        /// <summary>
        /// Flushes the in-memory index into the on-disk index, applies the
        /// accumulated ignore query as deletions, and empties the in-memory
        /// index. When the merge fails the disk changes are rolled back and
        /// the in-memory documents are kept for the next attempt.
        /// </summary>
        public static void UpdateOrSaved()
        {
            lock (_bufferLock)
            {
                _ramWrite.Optimize();
                _ramWrite.Commit();

                bool merged = false;
                using (FSDirectory diskDirectory = FSDirectory.Open(_path))
                using (IndexWriter update = new IndexWriter(diskDirectory, _analysis, false, IndexWriter.MaxFieldLength.LIMITED))
                {
                    try
                    {
                        using (IndexReader buffered = IndexReader.Open(_ramIndex, true))
                        {
                            update.AddIndexes(buffered);
                        }

                        // NOTE(review): deletions are applied after the merge, so a
                        // freshly merged document matching the ignore query is removed
                        // as well. The ignore query is also not cleared here (only
                        // Reset does that). Both are kept as in the original design.
                        lock (_querylock)
                        {
                            update.DeleteDocuments(_query);
                            update.Commit();
                        }
                        merged = true;

                        update.Optimize();
                    }
                    catch (Exception ex)
                    {
                        System.Diagnostics.Trace.TraceError("BufferDirectory.UpdateOrSaved failed: " + ex);
                        try
                        {
                            // Discard whatever was not committed; disposing the
                            // writer would otherwise commit a partial merge.
                            update.Rollback();
                        }
                        catch (Exception rollbackEx)
                        {
                            System.Diagnostics.Trace.TraceError("BufferDirectory.UpdateOrSaved rollback failed: " + rollbackEx);
                        }
                    }
                }

                if (merged)
                {
                    // Empty the buffer in place: writer and directory stay paired.
                    _ramWrite.DeleteAll();
                    _ramWrite.Commit();
                }
            }
        }

        /// <summary>
        /// Opens a reader that spans both the on-disk and the in-memory index.
        /// The caller owns the returned reader and must dispose it.
        /// </summary>
        public static IndexReader Reader()
        {
            IndexReader diskReader = IndexReader.Open(FSDirectory.Open(_path), false);
            IndexReader ramReader;
            try
            {
                ramReader = IndexReader.Open(_ramIndex, false);
            }
            catch
            {
                // Do not leak the disk reader when the second open fails.
                diskReader.Dispose();
                throw;
            }
            return new MultiReader(diskReader, ramReader);
        }
    }
}
大体上也就是这样的逻辑,但是还有一个问题,就是内存中索引的持久化:如果不进行持久化,AppDomain 重启会导致内存中的索引丢失,进而导致索引不同步。
这可以通过 BeforeSetDocumentHandler 和 AfterSetDocumentHandler 事件把文档写入到持久化的临时缓冲区来解决,加载的时候自动优先加载这部分临时缓冲区,这样就算比较完整地实现了 lucene.net 索引的动态维护。
当然这些是比较粗暴的方法,还有一些更好的维护策略,就以后再慢慢总结了。