lucene3.0+版本中文分词测试+搜索结果+创建索引测试

lucene3.0+版本中文分词测试+搜索结果+创建索引测试


import java.io.File; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; public class AnalzyerTest { 	/** 	 * lucene3.0开始已经抛弃了原来的分词方式,转而使用新的分词方式<br> 	 * 本方法以SmartChineseAnalyzer为例,演示如何分词以及取得分词之后的term 	 * http://blog.csdn.net/yjflinchong/article/details/7906116 	 * @throws Exception 	 */ 	public static void analysis() throws Exception { 		Analyzer analyzer = new IKAnalyzer(); 		String string = "据外媒报道,菲律宾国防部长加斯明9日称,多种新式战机、船只将于年内陆续交付军方,菲国防实力将得到大幅增强。但加斯明同时强调,此次军备采购与黄岩岛争端无关。"; 		StringReader reader = new StringReader(string); 		TokenStream ts = analyzer.tokenStream("", reader); 		TermAttribute termAttribute = ts.getAttribute(TermAttribute.class); 		while (ts.incrementToken()) { 			System.out.println(termAttribute.term() + "  "); 		} 		System.out.println(); 	} 	/** 	 * 建索引 	 * 在构造IndexWriter时必须使用Directory作为参数了 	 *  	 * @throws CorruptIndexException 	 * @throws LockObtainFailedException 	 * @throws IOException 	 */ 	private static void build() throws CorruptIndexException, LockObtainFailedException, IOException { 		String path = "index"; 		IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new IKAnalyzer(), true, MaxFieldLength.LIMITED); 		Document document = new Document(); 		document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?", Store.YES, Index.ANALYZED)); 		writer.addDocument(document); 		writer.optimize(); 		writer.close(); 	} 	/** 	 *  	 * @param keyword 	 * @throws CorruptIndexException 	 * @throws IOException 	 * @throws ParseException 	 * @throws InvalidTokenOffsetsException 	 */ 	private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException { 		Analyzer analyzer = new IKAnalyzer(); 		QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer); 		IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index"))); 		Query query = parser.parse(keyword); 		System.out.println(query); 		TopDocs topDocs = searcher.search(query, 10); 		ScoreDoc[] scoreDocs = topDocs.scoreDocs; 		System.out.println("hits:" + topDocs.totalHits); 		for (ScoreDoc scoreDoc : scoreDocs) { 			Document doc = searcher.doc(scoreDoc.doc); 			String text = doc.get("text"); 			System.out.println(highlight(text, query, analyzer)); 		} 	} 	/** 	 * 高亮关键词 	 * http://blog.csdn.net/yjflinchong/article/details/7906116 	 * @param content 	 *            需要高亮的内容 	 * @param query 	 *            搜索时使用的Query对象 	 * @param analyzer 	 *            分词器 	 * @return 高亮之后的文本 	 * @throws IOException 	 * @throws InvalidTokenOffsetsException 	 */ 	private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException { 		SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>"); 		Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); 		highlighter.setTextFragmenter(new SimpleFragmenter(100)); 		String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content); 		return resultString + "..."; 	} 	public static void main(String[] args) throws Exception { 		analysis(); 		build(); 		search("人民币 升值"); 	} } 


注明来源:http://blog.csdn.net/yjflinchong/article/details/7906116

lucene3.0以上版本中文分词测试+搜索结果+创建索引测试




      本文转自yjflinchong 51CTO博客,原文链接:http://blog.51cto.com/yjflinchong/1165005,如需转载请自行联系原作者

上一篇:python 前缀和总结


下一篇:大数据人才缺口高达150万,商业智能分析师是怎样的存在?