【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

环境
  虚拟机:VMware 10
  Linux版本:CentOS-6.5-x86_64
  客户端:Xshell4
  FTP:Xftp4
  jdk8
  elasticsearch-2.2.0

第一步:获取数据
主流搜索引擎,会使用爬虫,来获取网站的html数据,常用的工具有nutch,Python(主流),wget(c语言)
这里使用wget模拟
#安装wget
yum install wget
#使用wget爬取数据,并将运行日志输出到文件wget.log
#参数:
#-o 指定爬取内容输出日志名
#-P 爬取生成文件目录父目录
#-m 镜像下载(递归抓取整个站点)
#-D 列举爬取域名清单
#-N 不重新检索文件, 除非更新文件时间晚于本地时间
#--convert-links 将页面中的链接转换为本地相对链接,便于离线浏览
#--random-wait 随机等待 间隔性爬取 以防止被禁止访问
#-A 指定文档类型
#最后指定爬取网站地址

[cluster@PCS101 /] wget -o /tmp/wget.log -P /root/data --no-parent --no-verbose -m -D news.cctv.com -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com

第二步:ES集群安装分词器IK
注意:必须选择与ES版本对应的IK版本 https://github.com/medcl/elasticsearch-analysis-ik

【Elasticsearch学习之三】Elasticsearch 搜索引擎案例
#将ik目录直接ftp上传到目录下/opt/cluster/es/elasticsearch-2.2.1/plugins
#更改目录所有者
[cluster@PCS101 plugins]$ chown -R cluster:cluster ik
修改配置 plugin-descriptor.properties
elasticsearch.version=2.2.1

分发至102、103:
[cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS102:`pwd`
[cluster@PCS101 plugins]$ scp -r ik/ cluster@PCS103:`pwd`

第三步:数据抽取:从网页中抽取数据
HtmlTool.java

package com.sxt.util;

import java.io.File;

import com.sxt.es.HtmlBean;
import com.sxt.es.IndexService;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

/**
 * Extracts the title, plain-text content and site-relative url from a
 * crawled html file and wraps them in a {@link HtmlBean}.
 */
public class HtmlTool {

    /**
     * Parses and cleans one html file into a bean.
     *
     * @param path absolute path of the html file on disk
     * @return the populated bean, or {@code null} when the page has no
     *         &lt;title&gt; element (such pages are skipped by the indexer)
     * @throws Throwable on any I/O or parse failure
     */
    public static HtmlBean parserHtml(String path) throws Throwable {
        Source source = new Source(new File(path));
        source.fullSequentialParse();

        Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
        if (titleElement == null) {
            return null;
        }

        HtmlBean bean = new HtmlBean();
        bean.setTitle(CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent()));
        bean.setContent(source.getTextExtractor().setIncludeAttributes(true).toString());
        // strip the local data-dir prefix so the remainder is the site-relative url
        bean.setUrl(path.substring(IndexService.DATA_DIR.length()));
        return bean;
    }

    /**
     * Manual smoke test: parse one sample file and dump its text content.
     */
    public static void main(String[] args) {
        try {
            System.out.println(parserHtml("e:\\data\\news.cctv.com\\2017\\05\\01\\ARTI0k5MFLx2cvzQZffwQcUp170501.shtml").getContent());
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

HtmlBean.java

package com.sxt.es;

public class HtmlBean {

    private int id;
private String title;
private String content;
private String url;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
} }

第四步:把抽取出来的数据同ES建立索引
#创建索引库
IndexService.java::createIndex
#数据同ES建立索引
IndexService.java::addHtmlToES

package com.sxt.es;

import java.io.File;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryParser;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.junit.Test;
import org.springframework.stereotype.Service;
import com.sxt.util.HtmlTool;

/**
 * Builds the "testes" index over crawled html pages and runs highlighted,
 * paged searches against it via the ES 2.x transport client.
 */
@Service
public class IndexService {
// directory holding the crawled html files (consumed by HtmlTool)
public static String DATA_DIR="e:\\data\\";
// shared transport client, connected once in the static initialiser below
public static Client client;
static {
// cluster name must match "cluster.name" in elasticsearch.yml
Settings settings = Settings.settingsBuilder().put("cluster.name", "wjy-es").build();
try {
// connect to the transport endpoints (port 9300) of the three cluster nodes
client = TransportClient
.builder()
.settings(settings)
.build()
.addTransportAddress(
new InetSocketTransportAddress(InetAddress
.getByName("134.32.123.101"), 9300))
.addTransportAddress(
new InetSocketTransportAddress(InetAddress
.getByName("134.32.123.102"), 9300))
.addTransportAddress(
new InetSocketTransportAddress(InetAddress
.getByName("134.32.123.103"), 9300));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * (Re)creates the "testes" index and registers the "htmlbean" type mapping.
 * admin() manages the indices themselves: client.admin().indices();
 * document-level operations go through client.prepare*().
 */
@Test
public void createIndex() throws Exception {
IndicesExistsResponse resp = client.admin().indices().prepareExists("testes").execute().actionGet();
// drop any previous copy so the mapping can be recreated from scratch
if(resp.isExists()){
client.admin().indices().prepareDelete("testes").execute().actionGet();
}
// create the empty index
client.admin().indices().prepareCreate("testes").execute().actionGet();
// NOTE(review): this instantiation is dead code; jsonBuilder() below is a static call
new XContentFactory();
XContentBuilder builder = XContentFactory.jsonBuilder()
.startObject()
.startObject("htmlbean")
.startObject("properties")// field definitions of the type
.startObject("title")
.field("type", "string")// string field
.field("store", "yes")// keep the original value retrievable
.field("analyzer", "ik_max_word")// index-time IK analyzer
.field("search_analyzer", "ik_max_word")// query-time analyzer
.endObject()
.startObject("content")
.field("type", "string")
.field("store", "yes")
.field("analyzer", "ik_max_word")
.field("search_analyzer", "ik_max_word")
.endObject()
.startObject("url")
.field("type", "string")
.field("store", "yes")
.field("analyzer", "ik_max_word")
.field("search_analyzer", "ik_max_word")
.endObject()
.endObject()
.endObject()
.endObject();
// register the mapping for type "htmlbean"
PutMappingRequest mapping = Requests.putMappingRequest("testes").type("htmlbean").source(builder);
client.admin().indices().putMapping(mapping).actionGet();
}
/**
 * Indexes every crawled html source file under DATA_DIR into ES.
 */
@Test
public void addHtmlToES(){
readHtml(new File(DATA_DIR));
}
/**
 * Recursively walks the data directory and indexes each html file.
 * @param file a file or directory under DATA_DIR
 */
public void readHtml(File file){
if(file.isDirectory()){
File[] fs =file.listFiles();
for (int i = 0; i < fs.length; i++) {
File f = fs[i];
readHtml(f);
}
}else{
HtmlBean bean;
try {
bean = HtmlTool.parserHtml(file.getPath());
if(bean!=null){
Map<String, String> dataMap =new HashMap<String, String>();
dataMap.put("title", bean.getTitle());
dataMap.put("content", bean.getContent());
dataMap.put("url", bean.getUrl());
// index the document
// NOTE(review): no explicit document id, so re-running this job creates duplicates
client.prepareIndex("testes", "htmlbean").setSource(dataMap).execute().actionGet();
}
} catch (Throwable e) {
e.printStackTrace();
} }
}
/**
 * Searches title and content for the keyword, highlighting matches.
 * @param kw keyword to search for
 * @param num 1-based page number
 * @param count total hit count carried over from the first page's query
 * @return one page of results wrapped in a PageBean
 */
public PageBean<HtmlBean> search(String kw,int num,int count){
PageBean<HtmlBean> wr =new PageBean<HtmlBean>();
wr.setIndex(num);
// // single-field match conditions
// MatchQueryBuilder q1 =new MatchQueryBuilder("title", kw);
// MatchQueryBuilder q2 =new MatchQueryBuilder("content", kw);
//
// // combine them with OR ("should") semantics
// BoolQueryBuilder q =new BoolQueryBuilder();
// q.should(q1);
// q.should(q2);
// RangeQueryBuilder q1 =new RangeQueryBuilder("age");
// q1.from(18);
// q1.to(40);
// one query matching either of the two fields
MultiMatchQueryBuilder q =new MultiMatchQueryBuilder(kw, new String[]{"title","content"});
SearchResponse resp=null;
if(wr.getIndex()==1){// first page
resp = client.prepareSearch("testes")
.setTypes("htmlbean")
.setQuery(q)
.addHighlightedField("title")
.addHighlightedField("content")
.setHighlighterPreTags("<font color=\"red\">")
.setHighlighterPostTags("</font>")
.setHighlighterFragmentSize(40)// max length of one highlighted fragment
.setHighlighterNumOfFragments(5)// max fragments per hit; joined with "..." below
.setFrom(0)// offset of the first hit
.setSize(10)// page size of the first page
.execute()
.actionGet(); }else{
wr.setTotalCount(count);
resp = client.prepareSearch("testes")
.setTypes("htmlbean")
.setQuery(q)
.addHighlightedField("title")// highlight matches in the title
.addHighlightedField("content")
.setHighlighterPreTags("<font color=\"red\">")// render matches in red
.setHighlighterPostTags("</font>")
.setHighlighterFragmentSize(40)
.setHighlighterNumOfFragments(5)
.setFrom(wr.getStartRow())
.setSize(10)// NOTE(review): hardcoded page size; presumably should be wr.getSize()
.execute()
.actionGet();
}
SearchHits hits= resp.getHits();
// NOTE(review): this overwrites the setTotalCount(count) call above for pages > 1
wr.setTotalCount((int)hits.getTotalHits());
for(SearchHit hit : hits.getHits())
{
HtmlBean bean =new HtmlBean();
if(hit.getHighlightFields().get("title")==null)
{// keyword absent from title: fall back to the stored, un-highlighted title
bean.setTitle(hit.getSource().get("title").toString());
}
else
{
bean.setTitle(hit.getHighlightFields().get("title").getFragments()[0].toString());
} if(hit.getHighlightFields().get("content")==null)
{// keyword absent from content: fall back to the stored content
bean.setContent(hit.getSource().get("content").toString());
}
else
{
// join the highlighted fragments with "..." separators
StringBuilder sb =new StringBuilder();
for(Text text: hit.getHighlightFields().get("content").getFragments())
{
sb.append(text.toString()+"...");
}
bean.setContent(sb.toString());
} bean.setUrl("http://"+hit.getSource().get("url").toString());
wr.setBean(bean);
} return wr;
}
// @Test
// public void del(){
//// client.admin().indices().prepareDelete("testes").execute().actionGet();
// client.admin().indices().prepareDelete("testes").execute().actionGet();
// }
}

第五步:搜索数据

IndexService.java::search  见上面代码
PageBean.java

package com.sxt.es;

import java.util.ArrayList;
import java.util.List; public class PageBean<T> {
private int size = 10;//每页显示记录
private int index = 1;// 当前页号
private int totalCount = 0;// 记录总数 private int totalPageCount = 1;// 总页 private int[] numbers;//展示页数集合
protected List<T> list;//要显示到页面的数据集 /**
* 得到
* @return
*/
public int getStartRow() { return (index - 1) * size;
} /**
* 得到结束记录
* @return
*/
public int getEndRow() { return index * size;
} /**
* @return Returns the size.
*/
public int getSize() {
return size;
} /**
* @param size
* The size to set.
*/
public void setSize(int size) {
if (size > 0) {
this.size = size;
}
}
/**
* @return Returns the currentPageNo.
*/
public int getIndex() {
if (totalPageCount == 0) { return 0;
} return index;
} /**
* @param currentPageNo
* The currentPageNo to set.
*/
public void setIndex(int index) {
if (index > 0) {
this.index = index;
}
} /**
* @return Returns the totalCount.
*/
public int getTotalCount() {
return totalCount;
} /**
* @param totalCount
* The totalCount to set.
*/
public void setTotalCount(int totalCount) {
if (totalCount >= 0) {
this.totalCount = totalCount;
setTotalPageCountByRs();//根据总记录数计算总页
}
} public int getTotalPageCount() {
return this.totalPageCount;
} /**
* 根据总记录数计算总页
*/
private void setTotalPageCountByRs() {
if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size == 0) {
this.totalPageCount = this.totalCount / this.size;
} else if (this.size > 0 && this.totalCount > 0 && this.totalCount % this.size > 0) {
this.totalPageCount = (this.totalCount / this.size) + 1;
} else {
this.totalPageCount = 0;
}
setNumbers(totalPageCount);//获取展示页数集合
} public int[] getNumbers() {
return numbers;
} /**
* 设置显示页数集合
* @param totalPageCount
*/
public void setNumbers(int totalPageCount) {
if(totalPageCount>0){
//!.当前数组的长度
int[] numbers = new int[totalPageCount>10?10:totalPageCount];//页面要显示的页数集合
int k =0;
//
//1.数组长度<10 1 2 3 4 .... 7
//2.数组长度>=10
// 当前页<=6 1 2 3 4 10
// 当前页>=总页数-5 ......12 13 14 15
// 其他 5 6 7 8 当前页(10) 10 11 12 13
for(int i = 0;i < totalPageCount;i++){
//保证当前页为集合的中�?
if((i>=index- (numbers.length/2+1) || i >= totalPageCount-numbers.length) && k<numbers.length){
numbers[k] = i+1;
k++;
}else if(k>=numbers.length){
break;
}
} this.numbers = numbers;
} } public void setNumbers(int[] numbers) {
this.numbers = numbers;
} public List<T> getList() {
return list;
} public void setList(List<T> list) {
this.list = list;
} public void setBean(T bean){
if(this.list==null){
list =new ArrayList<T>();
}
list.add(bean);
} /*
public static int getTotalPageCount(int iTotalRecordCount, int iPageSize) {
if (iPageSize == 0) {
return 0;
} else {
return (iTotalRecordCount % iPageSize) == 0 ? (iTotalRecordCount / iPageSize) : (iTotalRecordCount / iPageSize) + 1;
}
}*/
}

启动ES_SEARCH web,访问 http://localhost:8080/ES_SEARCH

【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

输入关键词 搜索:

【Elasticsearch学习之三】Elasticsearch 搜索引擎案例

项目代码下载链接:https://download.csdn.net/download/cac2020/11015439

上一篇:C# 读取Json配置文件


下一篇:node.js一行一行的获取txt文件内容