官网指南:https://www.elastic.co/guide/en/elasticsearch/reference/current/normalizer.html
在 Elasticsearch 中处理字符串类型的数据时,如果我们想把整个字符串作为一个完整的 term 存储,通常会将其类型(type)设定为 keyword。但有时这种设定又会给我们带来麻烦:比如同一个数据在写入时由于没有做好清洗,导致大小写不一致,例如 apple、Apple 两个实际上都是 apple,但当我们去搜索 apple 时却无法返回 Apple 的文档。要解决这个问题,就需要 Normalizer 出场了。废话不多说,直接上手看!
静态映射创建索引
PUT test
{
  "settings": {
    "analysis": {
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "char_filter": [],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "foo": {
        "type": "keyword",
        "normalizer": "my_normalizer"
      }
    }
  }
}
准备数据
PUT test/_doc/1
{
  "foo": "BÀR"
}
PUT test/_doc/2
{
  "foo": "bar"
}
PUT test/_doc/3
{
  "foo": "baz"
}
测试效果(下面的 term 查询和 match 查询都会命中文档 1 和文档 2,因为 BÀR 在写入和查询时都会被 my_normalizer 归一化为 bar)
GET test/_search
{
  "query": {
    "term": {
      "foo": "BAR"
    }
  }
}
GET test/_search
{
  "query": {
    "match": {
      "foo": "BAR"
    }
  }
}
实战创建索引 demo(以下为请求体,使用时需在前面加上 PUT <索引名>):
{
  "settings": {
    "number_of_replicas": 1,
    "number_of_shards": 3,
    "refresh_interval": "1s",
    "translog": {
      "flush_threshold_size": "1.6gb"
    },
    "merge": {
      "scheduler": {
        "max_thread_count": "1"
      }
    },
    "index": {
      "routing": {
        "allocation": {
          "total_shards_per_node": "2"
        }
      }
    },
    "analysis": {
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "filter": ["lowercase", "asciifolding"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "huid": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "standard_name": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "hcode": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "name": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "name_segments": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "name_segments_loc": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "pcode": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "label": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "hcreatetime": {
        "index": true,
        "store": true,
        "format": "yyyy-MM-dd HH:mm:ss",
        "type": "date"
      },
      "hupdatetime": {
        "index": true,
        "store": true,
        "format": "yyyy-MM-dd HH:mm:ss",
        "type": "date"
      },
      "create_by": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "update_by": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "hisvalid": {
        "index": true,
        "store": true,
        "type": "integer"
      },
      "src": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "SEC_HCODE": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "SEC_TYPE": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "EXCH_HCODE": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "COMB_SYMBOL": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "CNAME": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "CSNAME_PINYIN_FSIM": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "CSNAME": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "ENAME": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "ESNAME": {
        "index": true,
        "store": true,
        "type": "keyword"
      },
      "is_mstr_name": {
        "index": true,
        "store": true,
        "type": "integer"
      },
      "tag": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      },
      "name_rinse": {
        "index": true,
        "store": true,
        "type": "keyword",
        "normalizer": "my_normalizer"
      }
    }
  }
}