# 1. Segmentation modes (模式)
from collections import Counter

import jieba
string = '我喜欢广州奥体中心'

# Full mode: emits every word the dictionary can find in the string,
# including overlapping ones.
w1 = jieba.cut(string, cut_all=True)
for word in w1:
    print(word)
print("------------------------------------")
# Precise mode (the default): one non-overlapping, most-likely segmentation.
w2 = jieba.cut(string, cut_all=False)
for word in w2:
    print(word)
print("------------------------------------")
# Search-engine mode: precise mode, plus long words are re-split into
# shorter sub-words to improve recall for search indexing.
w3 = jieba.cut_for_search(string)
for word in w3:
    print(word)
print("------------------------------------")
# Expected output:
'''
我
喜欢
广州
奥体
奥体中心
中心
------------------------------------
我
喜欢
广州
奥体中心
------------------------------------
我
喜欢
广州
奥体
中心
奥体中心
------------------------------------
'''
# 2. Dictionary loading and configuration (词典加载与设定)
# Point jieba at your own main dictionary. Words not found in the custom
# dictionary still fall back to jieba's built-in default dictionary.
# File requirements: UTF-8 encoding; one entry per line in the form
#   word <space> frequency [<space> POS-tag]
jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')

# Merge a user dictionary on top of the active one (additive, does not
# replace the main dictionary).
# File requirements: UTF-8 encoding; one entry per line in the form
#   word [<space> frequency] [<space> POS-tag]  (frequency/POS optional)
jieba.load_userdict('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
# 3. Part-of-speech tagging (词性标注)
import jieba.posseg

# posseg.cut yields pair objects: .word is the token text, .flag is its
# part-of-speech tag. Reuses `string` defined in section 1.
w4 = jieba.posseg.cut(string)
for pair in w4:
    print(pair.word + "----" + pair.flag)
# 4. Keyword extraction — highest-weighted terms (提取高词性词语)
# extract_tags lives in the jieba.analyse submodule, which must be
# imported explicitly (original code referenced an undefined name `jba`).
import jieba.analyse

# Use a context manager so the file handle is closed promptly.
with open('D:/aaa.txt', encoding='utf-8') as f:
    data = f.read()
# num: how many of the top-weighted (TF-IDF) keywords to return.
num = 10
w3 = jieba.analyse.extract_tags(data, topK=num)
for keyword in w3:
    print(keyword)
# 5. Tokens with their positions (词语与词语位置)
# Precise (default) mode: each item is a (word, start, end) offset tuple.
w1 = jieba.tokenize(string)
for tok in w1:
    print(tok)
print("------------------------------------------------")
# Search-engine mode: long words are additionally re-split, so the same
# span can appear both whole and in pieces.
w2 = jieba.tokenize(string, mode='search')
for tok in w2:
    print(tok)
# Expected output (NOTE: this transcript was captured with a comma in the
# input string, hence the (',', 3, 4) entry):
'''
('我', 0, 1)
('喜欢', 1, 3)
(',', 3, 4)
('广州', 4, 6)
('奥体中心', 6, 10)
------------------------------------------------
('我', 0, 1)
('喜欢', 1, 3)
(',', 3, 4)
('广州', 4, 6)
('奥体', 6, 8)
('中心', 8, 10)
('奥体中心', 6, 10)
'''
# 6. High-frequency word extraction (高频词提取)
import jieba
# Data loading
def get_context(path):
    """Read the whole UTF-8 text file at `path` and return it with
    leading/trailing whitespace stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()
# High-frequency word statistics
def get_TF(split_words, top=10):
    """Return the `top` most frequent words in `split_words` as a list of
    (word, count) tuples, sorted by descending count.

    Uses collections.Counter instead of a hand-rolled dict tally; ties keep
    first-seen order, matching the original stable-sort behavior.
    """
    return Counter(split_words).most_common(top)
def main():
    """Segment the corpus with a custom dictionary and print the 10 most
    frequent tokens (paths are hard-coded for this demo)."""
    jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
    path = 'D:/自然语言处理/test1.txt'
    corpus = get_context(path)
    split_words = list(jieba.cut(corpus))
    tops = get_TF(split_words)
    print(tops)

main()
# Expected output:
'''
[('、', 181853), ('.', 85128), (':', 84800), ('txt', 84800), ('\n', 84799),
('。', 76733), ('-', 26655), (',', 18151), ('coco奶茶', 17284), ('材料', 9961)]
'''
# 7. Stop-word filtering (设置停用词)
import jieba
# Data loading
def get_context(path):
    """Read the whole UTF-8 text file at `path` and return it with
    leading/trailing whitespace stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()
# High-frequency word statistics
def get_TF(split_words, top=10):
    """Return the `top` most frequent words in `split_words` as a list of
    (word, count) tuples, sorted by descending count.

    Uses collections.Counter instead of a hand-rolled dict tally; ties keep
    first-seen order, matching the original stable-sort behavior.
    """
    return Counter(split_words).most_common(top)
# Stop-word loading
def stop_word(path):
    """Return the stop-word list read from `path`, one word per line.

    The file is decoded as utf-8-sig so a leading BOM (common in files
    saved on Windows) is stripped. '\\n' is seeded into the list so bare
    newline tokens are filtered out as well.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return ['\n'] + [line.strip() for line in f]
def main():
    """Segment the corpus, drop stop words, and print the top-10 tokens
    (paths are hard-coded for this demo)."""
    jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
    path = 'D:/自然语言处理/test1.txt'
    path2 = 'D:/自然语言处理/stop.txt'
    corpus = get_context(path)
    stop_words = stop_word(path2)
    print(stop_words)
    # A set gives O(1) membership tests; filtering the whole corpus against
    # the list would be O(len(stop_words)) per token.
    stop_set = set(stop_words)
    split_words = [word for word in jieba.cut(corpus) if word not in stop_set]
    tops = get_TF(split_words)
    print(tops)

main()
# Expected output:
'''
[('coco奶茶', 17284), ('材料', 9961), ('(', 6603), (')', 6349), ('N', 5682),
('金属', 5506), ('盐', 4703), ('溶剂', 4669), ('纳米', 4583), (',', 4358)]
'''