倒排索引
这里就涉及到了分词
分词语法
默认的分词器
GET _analyze?pretty { "text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱" }
{ "tokens" : [ { "token" : "haier", "start_offset" : 0, "end_offset" : 5, "type" : "<ALPHANUM>", "position" : 0 }, { "token" : "海", "start_offset" : 6, "end_offset" : 7, "type" : "<IDEOGRAPHIC>", "position" : 1 }, { "token" : "尔", "start_offset" : 7, "end_offset" : 8, "type" : "<IDEOGRAPHIC>", "position" : 2 }, { "token" : "bcd", "start_offset" : 9, "end_offset" : 12, "type" : "<ALPHANUM>", "position" : 3 }, { "token" : "470wdpg", "start_offset" : 13, "end_offset" : 20, "type" : "<ALPHANUM>", "position" : 4 }, { "token" : "十", "start_offset" : 20, "end_offset" : 21, "type" : "<IDEOGRAPHIC>", "position" : 5 }, { "token" : "字", "start_offset" : 21, "end_offset" : 22, "type" : "<IDEOGRAPHIC>", "position" : 6 }, { "token" : "对", "start_offset" : 22, "end_offset" : 23, "type" : "<IDEOGRAPHIC>", "position" : 7 }, { "token" : "开", "start_offset" : 23, "end_offset" : 24, "type" : "<IDEOGRAPHIC>", "position" : 8 }, { "token" : "门", "start_offset" : 24, "end_offset" : 25, "type" : "<IDEOGRAPHIC>", "position" : 9 }, { "token" : "风", "start_offset" : 25, "end_offset" : 26, "type" : "<IDEOGRAPHIC>", "position" : 10 }, { "token" : "冷", "start_offset" : 26, "end_offset" : 27, "type" : "<IDEOGRAPHIC>", "position" : 11 }, { "token" : "变", "start_offset" : 27, "end_offset" : 28, "type" : "<IDEOGRAPHIC>", "position" : 12 }, { "token" : "频", "start_offset" : 28, "end_offset" : 29, "type" : "<IDEOGRAPHIC>", "position" : 13 }, { "token" : "一", "start_offset" : 29, "end_offset" : 30, "type" : "<IDEOGRAPHIC>", "position" : 14 }, { "token" : "级", "start_offset" : 30, "end_offset" : 31, "type" : "<IDEOGRAPHIC>", "position" : 15 }, { "token" : "节", "start_offset" : 31, "end_offset" : 32, "type" : "<IDEOGRAPHIC>", "position" : 16 }, { "token" : "能", "start_offset" : 32, "end_offset" : 33, "type" : "<IDEOGRAPHIC>", "position" : 17 }, { "token" : "家", "start_offset" : 33, "end_offset" : 34, "type" : "<IDEOGRAPHIC>", "position" : 18 }, { "token" : "用", "start_offset" : 34, "end_offset" : 35, 
"type" : "<IDEOGRAPHIC>", "position" : 19 }, { "token" : "官", "start_offset" : 35, "end_offset" : 36, "type" : "<IDEOGRAPHIC>", "position" : 20 }, { "token" : "方", "start_offset" : 36, "end_offset" : 37, "type" : "<IDEOGRAPHIC>", "position" : 21 }, { "token" : "冰", "start_offset" : 37, "end_offset" : 38, "type" : "<IDEOGRAPHIC>", "position" : 22 }, { "token" : "箱", "start_offset" : 38, "end_offset" : 39, "type" : "<IDEOGRAPHIC>", "position" : 23 } ] }
ik_max_word
GET _analyze?pretty { "analyzer": "ik_max_word", "text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱" }
{ "tokens" : [ { "token" : "haier", "start_offset" : 0, "end_offset" : 5, "type" : "ENGLISH", "position" : 0 }, { "token" : "海尔", "start_offset" : 6, "end_offset" : 8, "type" : "CN_WORD", "position" : 1 }, { "token" : "bcd-470wdpg", "start_offset" : 9, "end_offset" : 20, "type" : "LETTER", "position" : 2 }, { "token" : "bcd", "start_offset" : 9, "end_offset" : 12, "type" : "ENGLISH", "position" : 3 }, { "token" : "470", "start_offset" : 13, "end_offset" : 16, "type" : "ARABIC", "position" : 4 }, { "token" : "wdpg", "start_offset" : 16, "end_offset" : 20, "type" : "ENGLISH", "position" : 5 }, { "token" : "十字", "start_offset" : 20, "end_offset" : 22, "type" : "CN_WORD", "position" : 6 }, { "token" : "十", "start_offset" : 20, "end_offset" : 21, "type" : "TYPE_CNUM", "position" : 7 }, { "token" : "字", "start_offset" : 21, "end_offset" : 22, "type" : "COUNT", "position" : 8 }, { "token" : "对开", "start_offset" : 22, "end_offset" : 24, "type" : "CN_WORD", "position" : 9 }, { "token" : "开门", "start_offset" : 23, "end_offset" : 25, "type" : "CN_WORD", "position" : 10 }, { "token" : "门风", "start_offset" : 24, "end_offset" : 26, "type" : "CN_WORD", "position" : 11 }, { "token" : "风冷", "start_offset" : 25, "end_offset" : 27, "type" : "CN_WORD", "position" : 12 }, { "token" : "变频", "start_offset" : 27, "end_offset" : 29, "type" : "CN_WORD", "position" : 13 }, { "token" : "一级", "start_offset" : 29, "end_offset" : 31, "type" : "CN_WORD", "position" : 14 }, { "token" : "一", "start_offset" : 29, "end_offset" : 30, "type" : "TYPE_CNUM", "position" : 15 }, { "token" : "级", "start_offset" : 30, "end_offset" : 31, "type" : "COUNT", "position" : 16 }, { "token" : "节能", "start_offset" : 31, "end_offset" : 33, "type" : "CN_WORD", "position" : 17 }, { "token" : "家用", "start_offset" : 33, "end_offset" : 35, "type" : "CN_WORD", "position" : 18 }, { "token" : "官方", "start_offset" : 35, "end_offset" : 37, "type" : "CN_WORD", "position" : 19 }, { "token" : "冰箱", "start_offset" : 37, 
"end_offset" : 39, "type" : "CN_WORD", "position" : 20 } ] }
ik_smart
GET _analyze?pretty { "analyzer": "ik_smart", "text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱" }
{ "tokens" : [ { "token" : "haier", "start_offset" : 0, "end_offset" : 5, "type" : "ENGLISH", "position" : 0 }, { "token" : "海尔", "start_offset" : 6, "end_offset" : 8, "type" : "CN_WORD", "position" : 1 }, { "token" : "bcd-470wdpg", "start_offset" : 9, "end_offset" : 20, "type" : "LETTER", "position" : 2 }, { "token" : "十字", "start_offset" : 20, "end_offset" : 22, "type" : "CN_WORD", "position" : 3 }, { "token" : "对开", "start_offset" : 22, "end_offset" : 24, "type" : "CN_WORD", "position" : 4 }, { "token" : "门", "start_offset" : 24, "end_offset" : 25, "type" : "CN_CHAR", "position" : 5 }, { "token" : "风冷", "start_offset" : 25, "end_offset" : 27, "type" : "CN_WORD", "position" : 6 }, { "token" : "变频", "start_offset" : 27, "end_offset" : 29, "type" : "CN_WORD", "position" : 7 }, { "token" : "一级", "start_offset" : 29, "end_offset" : 31, "type" : "CN_WORD", "position" : 8 }, { "token" : "节能", "start_offset" : 31, "end_offset" : 33, "type" : "CN_WORD", "position" : 9 }, { "token" : "家用", "start_offset" : 33, "end_offset" : 35, "type" : "CN_WORD", "position" : 10 }, { "token" : "官方", "start_offset" : 35, "end_offset" : 37, "type" : "CN_WORD", "position" : 11 }, { "token" : "冰箱", "start_offset" : 37, "end_offset" : 39, "type" : "CN_WORD", "position" : 12 } ] }
ik_max_word:会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合。
ik_smart:会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
分词使用
使用分词后,会将数据以倒排索引的方式存储,从而实现按词项匹配的模糊(全文)查询。
新建索引并使用ik分词保存
说明:title 字段使用 ik_max_word 分词器保存。
PUT my_index
{
  "mappings": {
    "properties": {
      "title": { "type": "text", "analyzer": "ik_max_word" },
      "name": { "type": "text" },
      "age": { "type": "integer" },
      "created": { "type": "date", "format": "strict_date_optional_time||epoch_millis" }
    }
  }
}
索引插入文档
POST /my_index/_bulk
{ "index": { "_id": 1 }}
{ "title" : "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 2 }}
{ "title" : "【爆款秒杀】海尔冰箱三门家用小型节能省电双门电冰箱官方旗舰店", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 3}}
{ "title" : "Panasonic/松下 NR-TC28WS1-N 风冷无霜家用抑菌三门小体积冰箱", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 4}}
{ "title" : "小米电视4A50英寸4K高清智能网络平板液晶屏家电视机家电官方旗舰", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 5}}
{ "title" : "创维40X6 40英寸高清电视机智能网络wifi平板液晶屏家用彩电32 43", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 6}}
{ "title" : "Changhong/长虹 50D4P 50英寸超薄无边全面屏4K超高清智能电视机", "name" : "王二" , "age": 10, "created": 20190101 }
查看分词
GET _analyze?pretty { "analyzer": "ik_max_word", "text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱" }
通过条件搜索
GET /my_index/_search?pretty { "query": { "match": {"title": "对"} } }
会发现只有当查询条件与分词结果中的某个词项一致时,文档才能被查到:例如 ik_max_word 对“十字对开门”的分词结果中有“对开”,但没有单字“对”,因此搜索“对”查不到结果,而搜索“对开”可以命中。
自定义分词器
参考:https://blog.csdn.net/Barbarousgrowth_yp/article/details/80242811
参考:https://blog.csdn.net/zhou870498/article/details/80501972