爬取京东商品集成ES搜索

项目代码地址:https://gitee.com/javaming/springboot-es.git
效果展示如下:

爬取京东商品集成ES搜索

爬取京东数据

截至本文撰写时,下面的代码可以正确解析京东搜索结果页;如果京东后续调整了页面结构,需要对解析代码做相应修改。

/**
 * Fetches the JD.com search result page for a keyword and extracts the products listed on it.
 *
 * <p>The parsing depends on the current JD page markup (an element with id
 * {@code J_goodsList} whose {@code li} items contain {@code p-name} and {@code p-price}
 * blocks); it has to be adjusted if that markup changes.
 *
 * @param keyword search keyword; must not be null. It is URL-encoded before being sent,
 *                so Chinese text, spaces and characters such as {@code &} are safe.
 * @return the products found on the page; an empty list when the result container is
 *         missing or no item could be parsed
 * @throws IOException if the page cannot be fetched
 */
public static List<JdProduct> fetchJD(String keyword) throws IOException {
        // Encode the keyword: a raw keyword containing '&', '#', spaces or non-ASCII text
        // would otherwise corrupt the query string.
        String url = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keyword, "UTF-8");
        // Fetch and parse the page; jsoup returns a Document.
        Document parse = Jsoup.parse(new URL(url), 30000);
        List<JdProduct> list = Lists.newArrayList();
        Element goodsList = parse.getElementById("J_goodsList");
        if (goodsList == null) {
            // Container not present (markup changed, or a login/verification page was served).
            return list;
        }
        for (Element item : goodsList.getElementsByTag("li")) {
            Elements nameBlocks = item.getElementsByClass("p-name");
            Elements priceBlocks = item.getElementsByClass("p-price");
            if (nameBlocks.isEmpty() || priceBlocks.isEmpty()) {
                // getElementsByTag("li") also returns nested list items that are not
                // products; skip anything without a name and a price block.
                continue;
            }
            // Images on this page are lazy-loaded, so the real address is in
            // data-lazy-img rather than src.
            String img = "https:" + item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String name = nameBlocks.get(0).getElementsByTag("em").text();
            String price = priceBlocks.get(0).getElementsByTag("i").text();
            list.add(new JdProduct(img, price, name));
        }
        return list;
    }

数据导入ES

 /**
  * Scrapes JD products for the keyword and indexes them into the
  * {@code ESConstants.JD_GOODS} index with a single bulk request.
  *
  * @param keyword search keyword passed to {@code HtmlParseUtil.fetchJD}
  * @return {@code true} if at least one product was scraped and the bulk request
  *         reported no failures; {@code false} if nothing was scraped, if any item
  *         failed, or if an I/O error occurred
  */
 public boolean fetchJd(String keyword) {
        BulkRequest bulkRequest = new BulkRequest(ESConstants.JD_GOODS);
        try {
            bulkRequest.timeout("2m");
            // One index action per scraped product, serialized as JSON.
            HtmlParseUtil.fetchJD(keyword).forEach(e ->
                    bulkRequest.add(
                            new IndexRequest().source(JSON.toJSONString(e), XContentType.JSON)
                    )
            );
            if (bulkRequest.numberOfActions() == 0) {
                // A bulk request without actions is rejected by request validation
                // ("no requests added") with an unchecked exception, so stop here.
                return false;
            }
            BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
            return !bulk.hasFailures();
        } catch (IOException e) {
            // NOTE(review): replace with the project's logger once one is available here.
            e.printStackTrace();
        }
        return false;
    }

实现搜索并高亮

/**
 * Runs a paged term query on the {@code name} field of the
 * {@code ESConstants.JD_GOODS} index, optionally highlighting the matches.
 *
 * @param keyword   term to look up in the {@code name} field (term queries do not
 *                  analyze the search text)
 * @param pageNo    1-based page number; values below 1 are treated as 1
 * @param pageSize  hits per page; values below 1 are treated as 1
 * @param highlight when {@code true}, the {@code name} value of each hit is replaced by
 *                  its highlighted fragments wrapped in a red {@code <span>}
 * @return the source map of every hit on the requested page
 * @throws IOException if the search request fails
 */
public List<Map<String, Object>> searchPage(
            String keyword,
            int pageNo,
            int pageSize,
            boolean highlight
    ) throws IOException {
        pageNo = Math.max(pageNo, 1);
        pageSize = Math.max(pageSize, 1);
        SearchRequest searchRequest = new SearchRequest(ESConstants.JD_GOODS);
        SearchSourceBuilder builder = new SearchSourceBuilder();
        // "from" is a zero-based hit offset, not a page number: page 1 starts at 0.
        builder.from((pageNo - 1) * pageSize);
        builder.size(pageSize);
        // Exact term match.
        TermQueryBuilder title = QueryBuilders.termQuery("name", keyword);
        builder.query(title);
        builder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        if (highlight) {
            // Configure highlighting of the name field.
            HighlightBuilder highlightBuilder = new HighlightBuilder();
            highlightBuilder.field("name");
            highlightBuilder.preTags("<span style='color:red'>");
            highlightBuilder.postTags("</span>");
            // false: a field is highlighted even if the query did not match on that field.
            highlightBuilder.requireFieldMatch(false);
            builder.highlighter(highlightBuilder);
        }
        searchRequest.source(builder);
        SearchResponse search = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        List<Map<String, Object>> list = Lists.newArrayList();
        Arrays.stream(search.getHits().getHits()).forEach(hit -> {
            Map<String, Object> source = hit.getSourceAsMap();
            if (highlight) {
                HighlightField name = hit.getHighlightFields().get("name");
                if (name != null) {
                    // Join all highlighted fragments and use them in place of the raw name.
                    StringBuilder highlighted = new StringBuilder();
                    for (Text fragment : name.getFragments()) {
                        highlighted.append(fragment.toString());
                    }
                    source.put("name", highlighted.toString());
                }
            }
            list.add(source);
        });
        return list;
    }

使用Vue实现前后端分离

  1. 新建文件夹,安装 Vue、axios:
npm init
npm install vue
npm install axios
  2. 进入下载下来的源码 dist 目录

  3. 将源码(vue.min.js、axios.min.js)导入到项目中
    爬取京东商品集成ES搜索

  4. 在前端页面中编写 Vue 代码

<script th:src="@{/js/axios.min.js}"></script>
<script th:src="@{/js/vue.min.js}"></script>
<script>
    // Root Vue instance for the search page: binds the search box and the result list.
    new Vue({
        el: "#app",
        data: {
            keyword: "",    // search keyword entered by the user
            results: []     // search results returned by the backend
        },
        methods: {
            // Requests the first page (10 items) of results for the current keyword
            // from the backend and stores the response in `results`.
            searchKey(){
                const keyword = this.keyword.trim();
                if (!keyword) {
                    // An empty keyword would produce the malformed path "jd/search//1/10".
                    return;
                }
                // The keyword is a path segment, so it must be encoded: characters such
                // as "/", "?", "#" or "%" would otherwise change the request path.
                axios.get('jd/search/' + encodeURIComponent(keyword) + "/1/10" ).then(e => {
                    this.results = e.data;
                }).catch(err => {
                    console.error(err);
                })
            }
        }
    })
</script>
上一篇:搜索结果关键词高亮显示


下一篇:基于Annoy的语义泛化-模块化进阶标准