倒排索引
这里就涉及到了分词
分词语法
默认的分词器
GET _analyze?pretty
{
"text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱"
}
{
"tokens" : [
{
"token" : "haier",
"start_offset" : 0,
"end_offset" : 5,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "海",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "尔",
"start_offset" : 7,
"end_offset" : 8,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "bcd",
"start_offset" : 9,
"end_offset" : 12,
"type" : "<ALPHANUM>",
"position" : 3
},
{
"token" : "470wdpg",
"start_offset" : 13,
"end_offset" : 20,
"type" : "<ALPHANUM>",
"position" : 4
},
{
"token" : "十",
"start_offset" : 20,
"end_offset" : 21,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "字",
"start_offset" : 21,
"end_offset" : 22,
"type" : "<IDEOGRAPHIC>",
"position" : 6
},
{
"token" : "对",
"start_offset" : 22,
"end_offset" : 23,
"type" : "<IDEOGRAPHIC>",
"position" : 7
},
{
"token" : "开",
"start_offset" : 23,
"end_offset" : 24,
"type" : "<IDEOGRAPHIC>",
"position" : 8
},
{
"token" : "门",
"start_offset" : 24,
"end_offset" : 25,
"type" : "<IDEOGRAPHIC>",
"position" : 9
},
{
"token" : "风",
"start_offset" : 25,
"end_offset" : 26,
"type" : "<IDEOGRAPHIC>",
"position" : 10
},
{
"token" : "冷",
"start_offset" : 26,
"end_offset" : 27,
"type" : "<IDEOGRAPHIC>",
"position" : 11
},
{
"token" : "变",
"start_offset" : 27,
"end_offset" : 28,
"type" : "<IDEOGRAPHIC>",
"position" : 12
},
{
"token" : "频",
"start_offset" : 28,
"end_offset" : 29,
"type" : "<IDEOGRAPHIC>",
"position" : 13
},
{
"token" : "一",
"start_offset" : 29,
"end_offset" : 30,
"type" : "<IDEOGRAPHIC>",
"position" : 14
},
{
"token" : "级",
"start_offset" : 30,
"end_offset" : 31,
"type" : "<IDEOGRAPHIC>",
"position" : 15
},
{
"token" : "节",
"start_offset" : 31,
"end_offset" : 32,
"type" : "<IDEOGRAPHIC>",
"position" : 16
},
{
"token" : "能",
"start_offset" : 32,
"end_offset" : 33,
"type" : "<IDEOGRAPHIC>",
"position" : 17
},
{
"token" : "家",
"start_offset" : 33,
"end_offset" : 34,
"type" : "<IDEOGRAPHIC>",
"position" : 18
},
{
"token" : "用",
"start_offset" : 34,
"end_offset" : 35,
"type" : "<IDEOGRAPHIC>",
"position" : 19
},
{
"token" : "官",
"start_offset" : 35,
"end_offset" : 36,
"type" : "<IDEOGRAPHIC>",
"position" : 20
},
{
"token" : "方",
"start_offset" : 36,
"end_offset" : 37,
"type" : "<IDEOGRAPHIC>",
"position" : 21
},
{
"token" : "冰",
"start_offset" : 37,
"end_offset" : 38,
"type" : "<IDEOGRAPHIC>",
"position" : 22
},
{
"token" : "箱",
"start_offset" : 38,
"end_offset" : 39,
"type" : "<IDEOGRAPHIC>",
"position" : 23
}
]
}
ik_max_word
GET _analyze?pretty
{
"analyzer": "ik_max_word",
"text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱"
}
{
"tokens" : [
{
"token" : "haier",
"start_offset" : 0,
"end_offset" : 5,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "海尔",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "bcd-470wdpg",
"start_offset" : 9,
"end_offset" : 20,
"type" : "LETTER",
"position" : 2
},
{
"token" : "bcd",
"start_offset" : 9,
"end_offset" : 12,
"type" : "ENGLISH",
"position" : 3
},
{
"token" : "470",
"start_offset" : 13,
"end_offset" : 16,
"type" : "ARABIC",
"position" : 4
},
{
"token" : "wdpg",
"start_offset" : 16,
"end_offset" : 20,
"type" : "ENGLISH",
"position" : 5
},
{
"token" : "十字",
"start_offset" : 20,
"end_offset" : 22,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "十",
"start_offset" : 20,
"end_offset" : 21,
"type" : "TYPE_CNUM",
"position" : 7
},
{
"token" : "字",
"start_offset" : 21,
"end_offset" : 22,
"type" : "COUNT",
"position" : 8
},
{
"token" : "对开",
"start_offset" : 22,
"end_offset" : 24,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "开门",
"start_offset" : 23,
"end_offset" : 25,
"type" : "CN_WORD",
"position" : 10
},
{
"token" : "门风",
"start_offset" : 24,
"end_offset" : 26,
"type" : "CN_WORD",
"position" : 11
},
{
"token" : "风冷",
"start_offset" : 25,
"end_offset" : 27,
"type" : "CN_WORD",
"position" : 12
},
{
"token" : "变频",
"start_offset" : 27,
"end_offset" : 29,
"type" : "CN_WORD",
"position" : 13
},
{
"token" : "一级",
"start_offset" : 29,
"end_offset" : 31,
"type" : "CN_WORD",
"position" : 14
},
{
"token" : "一",
"start_offset" : 29,
"end_offset" : 30,
"type" : "TYPE_CNUM",
"position" : 15
},
{
"token" : "级",
"start_offset" : 30,
"end_offset" : 31,
"type" : "COUNT",
"position" : 16
},
{
"token" : "节能",
"start_offset" : 31,
"end_offset" : 33,
"type" : "CN_WORD",
"position" : 17
},
{
"token" : "家用",
"start_offset" : 33,
"end_offset" : 35,
"type" : "CN_WORD",
"position" : 18
},
{
"token" : "官方",
"start_offset" : 35,
"end_offset" : 37,
"type" : "CN_WORD",
"position" : 19
},
{
"token" : "冰箱",
"start_offset" : 37,
"end_offset" : 39,
"type" : "CN_WORD",
"position" : 20
}
]
}
ik_smart
GET _analyze?pretty
{
"analyzer": "ik_smart",
"text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱"
}
{
"tokens" : [
{
"token" : "haier",
"start_offset" : 0,
"end_offset" : 5,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "海尔",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "bcd-470wdpg",
"start_offset" : 9,
"end_offset" : 20,
"type" : "LETTER",
"position" : 2
},
{
"token" : "十字",
"start_offset" : 20,
"end_offset" : 22,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "对开",
"start_offset" : 22,
"end_offset" : 24,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "门",
"start_offset" : 24,
"end_offset" : 25,
"type" : "CN_CHAR",
"position" : 5
},
{
"token" : "风冷",
"start_offset" : 25,
"end_offset" : 27,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "变频",
"start_offset" : 27,
"end_offset" : 29,
"type" : "CN_WORD",
"position" : 7
},
{
"token" : "一级",
"start_offset" : 29,
"end_offset" : 31,
"type" : "CN_WORD",
"position" : 8
},
{
"token" : "节能",
"start_offset" : 31,
"end_offset" : 33,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "家用",
"start_offset" : 33,
"end_offset" : 35,
"type" : "CN_WORD",
"position" : 10
},
{
"token" : "官方",
"start_offset" : 35,
"end_offset" : 37,
"type" : "CN_WORD",
"position" : 11
},
{
"token" : "冰箱",
"start_offset" : 37,
"end_offset" : 39,
"type" : "CN_WORD",
"position" : 12
}
]
}
ik_max_word:会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合。
ik_smart:会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
分词使用
文本字段经过分词后，会以倒排索引（词项 → 文档列表）的方式存储，从而可以按词项进行全文检索（注意：这是基于词项的匹配，并不是任意子串的模糊匹配）
新建索引并使用ik分词保存
PUT my_index
{
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "ik_max_word" //使用ik分词保存
},
"name": {
"type": "text"
},
"age": {
"type": "integer"
},
"created": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
}
索引插入文档
POST /my_index/_bulk
{ "index": { "_id": 1 }}
{ "title" : "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱", "name" : "王二" , "age": 10, "created": "2019-01-01" }
{ "index": { "_id": 2 }}
{ "title" : "【爆款秒杀】海尔冰箱三门家用小型节能省电双门电冰箱官方旗舰店", "name" : "王二" , "age": 10, "created": "2019-01-01" }
{ "index": { "_id": 3}}
{ "title" : "Panasonic/松下 NR-TC28WS1-N 风冷无霜家用抑菌三门小体积冰箱", "name" : "王二" , "age": 10, "created": "2019-01-01" }
{ "index": { "_id": 4}}
{ "title" : "小米电视4A50英寸4K高清智能网络平板液晶屏家电视机家电官方旗舰", "name" : "王二" , "age": 10, "created": "2019-01-01" }
{ "index": { "_id": 5}}
{ "title" : "创维40X6 40英寸高清电视机智能网络wifi平板液晶屏家用彩电32 43", "name" : "王二" , "age": 10, "created": "2019-01-01" }
{ "index": { "_id": 6}}
{ "title" : "Changhong/长虹 50D4P 50英寸超薄无边全面屏4K超高清智能电视机", "name" : "王二" , "age": 10, "created": "2019-01-01" }
查看分词
GET _analyze?pretty
{
"analyzer": "ik_max_word",
"text": "Haier/海尔 BCD-470WDPG十字对开门风冷变频一级节能家用官方冰箱"
}
通过条件搜索
GET /my_index/_search?pretty
{
"query": {
"match": {"title": "对"}
}
}
会发现只有与分词结果（词项）完全一致的搜索条件才能命中文档：例如 ik_max_word 把“对开门”切分成了“对开”、“开门”，倒排索引中并没有单独的“对”这个词项，所以搜索“对”查不到结果，而搜索“对开”或“开门”则可以命中
自定义分词器
参考:https://blog.csdn.net/Barbarousgrowth_yp/article/details/80242811
参考:https://blog.csdn.net/zhou870498/article/details/80501972