Skip to content

Commit 01c35ae

Browse files
committed
elasticsearch
1 parent 6783cf9 commit 01c35ae

File tree

2 files changed

+156
-0
lines changed

2 files changed

+156
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
# @Time    : 2020/7/30 15:04
# @Author  : xiaolu
# @FileName: 001-创建库并插入数据.py
# @Software: PyCharm
"""Create an Elasticsearch index with jieba-based Chinese analyzers and insert sample poems.

Requires a local Elasticsearch node (default http://localhost:9200) with the
jieba analysis plugin installed, plus the stop-word / synonym files referenced
below present on the ES server side.
"""
from elasticsearch import Elasticsearch

# Client for the default local node.
es = Elasticsearch()

# Uncomment to drop the index (ignoring "not found" / "bad request") before recreating it.
# result = es.indices.delete(index='point_type', ignore=[400, 404])
# exit()

# Index definition: analysis chain (filters + analyzers) and field mappings.
mapping = {
    "settings": {
        "analysis": {
            "filter": {
                # Stop-word removal, backed by a word list on the ES server.
                "jieba_stop": {
                    "type": "stop",
                    "stopwords_path": "stopwords/stopwords.txt"
                },
                # Synonym expansion, backed by a synonym file on the ES server.
                # NOTE(review): defined but not referenced by any analyzer below.
                "jieba_synonym": {
                    "type": "synonym",
                    "synonyms_path": "synonyms/synonyms.txt"
                },
                # Emit only 2-token shingles (bigrams), no single tokens.
                "my_shingle_filter": {
                    "type": "shingle",
                    "min_shingle_size": 2,
                    "max_shingle_size": 2,
                    "output_unigrams": False
                }
            },
            "analyzer": {
                # BUG FIX: this analyzer was originally named "word_ans", but the
                # "content" field mapping below references "word_ana" — index
                # creation would fail with an unknown-analyzer error. Renamed to
                # match the reference.
                "word_ana": {
                    "tokenizer": "jieba_search",  # jieba word segmentation
                    "filter": "jieba_stop"        # jieba stop-word filtering
                },
                # Character-level analysis: the standard tokenizer splits CJK
                # text per character; stop words are then removed.
                "char_ana": {
                    "tokenizer": "standard",
                    "filter": "jieba_stop"
                },
                # Character bigrams: per-character tokens combined into shingles.
                "char_bigram_ana": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "jieba_stop",
                        "my_shingle_filter"
                    ]
                },
                # Word bigrams: jieba tokens combined into shingles.
                "word_bigram_ana": {
                    "type": "custom",
                    "tokenizer": "jieba_search",
                    "filter": [
                        "jieba_stop",
                        "my_shingle_filter"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "keyword"
            },
            "author": {
                "type": "keyword"
            },
            "dynasty": {
                "type": "keyword"
            },
            "words": {
                "type": "integer"
            },
            # "content" is segmented and stop-word filtered the same way at
            # index time and at query time, so queries match against the same
            # token stream that was indexed.
            "content": {
                "analyzer": "word_ana",
                "search_analyzer": "word_ana",
                "type": "text"
            }
        }
    }
}

# Uncomment on first run to create the index with the mapping above.
# es.indices.create(index='point_type', body=mapping)

# Sample documents to insert.
# CONSISTENCY FIX: "words" is mapped as integer above, so the values are plain
# ints here (the originals were strings and relied on Elasticsearch's implicit
# string-to-int coercion).
data = [
    {
        "title": "静夜思",
        "author": "李白",
        "dynasty": "唐",
        "words": 20,
        "content": "床前明月光,疑是地上霜。举头望明月,低头思故乡。"
    },

    {
        "title": "观沧海",
        "author": "曹操",
        "dynasty": "东汉末年",
        "words": 56,
        "content": "东临碣石,以观沧海。水何澹澹,山岛竦峙。树木丛生,百草丰茂。秋风萧瑟,洪波涌起。日月之行,若出其中。星汉灿烂,若出其里。幸甚至哉,歌以咏志。"
    },

    {
        "title": "咏鹅",
        "author": "骆宾王",
        "dynasty": "唐",
        "words": 18,
        "content": "鹅鹅鹅,曲项向天歌。白毛浮绿水,红掌拨清波。"
    },

    {
        "title": "将进酒",
        "author": "陈陶",
        "dynasty": "唐",
        "words": 14,
        "content": "银鸭金鹅言待谁,隋家岳渎皇家有"
    },

    {
        "title": "春雪",
        "author": "白居易",
        "dynasty": "唐",
        "words": 10,
        "content": "大似落鹅毛,密如飘玉屑"
    }
]

# Index each document; Elasticsearch assigns the document ids.
for d in data:
    es.index(index='point_type', body=d)

elasticsearch/002-es中的搜索.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# @Time    : 2020/7/30 15:52
# @Author  : xiaolu
# @FileName: 002-es中的搜索.py
# @Software: PyCharm
"""Search the 'point_type' index by title and print the matching documents.

Requires a local Elasticsearch node populated by the companion insert script.
"""
from elasticsearch import Elasticsearch

if __name__ == '__main__':
    es = Elasticsearch()
    # FIX: removed the unused local `querys = '东临碣石'` — leftover from
    # experimentation; it was never referenced by the query below.
    # Match query against the title field.
    dsl = {
        'query': {
            'match': {
                'title': '咏鹅'
            }
        }
    }
    # hits.hits is a list — several documents may match the query.
    results = es.search(index='point_type', body=dsl)['hits']['hits']

    # Keep only the stored documents, dropping ES metadata (_id, _score, ...).
    res = [result['_source'] for result in results]
    print(res)

0 commit comments

Comments
 (0)