# 1. Segmentation modes (模式)
from collections import Counter

import jieba
string = '我喜欢广州奥体中心'

# Full mode: emits every word the dictionary can find in the string,
# including overlapping ones.
w1 = jieba.cut(string, cut_all=True)
for word in w1:
    print(word)
print("------------------------------------")
# Precise mode (the default): one non-overlapping, most-likely segmentation.
w2 = jieba.cut(string, cut_all=False)
for word in w2:
    print(word)
print("------------------------------------")
# Search-engine mode: precise mode, plus long words are re-split into
# shorter sub-words to improve recall for search indexing.
w3 = jieba.cut_for_search(string)
for word in w3:
    print(word)
print("------------------------------------")
# Expected output:
'''
我
喜欢
广州
奥体
奥体中心
中心
------------------------------------
我
喜欢
广州
奥体中心
------------------------------------
我
喜欢
广州
奥体
中心
奥体中心
------------------------------------
'''
# 2. Dictionary loading and configuration (词典加载与设定)
# Point jieba at your own main dictionary. Words not found in the custom
# dictionary still fall back to jieba's built-in default dictionary.
# File requirements: UTF-8 encoding; one entry per line in the form
#   word <space> frequency [<space> POS-tag]
jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')

# Merge a user dictionary on top of the active one (additive, does not
# replace the main dictionary).
# File requirements: UTF-8 encoding; one entry per line in the form
#   word [<space> frequency] [<space> POS-tag]  (frequency/POS optional)
jieba.load_userdict('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
# 3. Part-of-speech tagging (词性标注)
import jieba.posseg

# posseg.cut yields pair objects: .word is the token text, .flag is its
# part-of-speech tag. Reuses `string` defined in section 1.
w4 = jieba.posseg.cut(string)
for pair in w4:
    print(pair.word + "----" + pair.flag)
# 4. Keyword extraction — highest-weighted terms (提取高词性词语)
# extract_tags lives in the jieba.analyse submodule, which must be
# imported explicitly (original code referenced an undefined name `jba`).
import jieba.analyse

# Use a context manager so the file handle is closed promptly.
with open('D:/aaa.txt', encoding='utf-8') as f:
    data = f.read()
# num: how many of the top-weighted (TF-IDF) keywords to return.
num = 10
w3 = jieba.analyse.extract_tags(data, topK=num)
for keyword in w3:
    print(keyword)
# 5. Tokens with their positions (词语与词语位置)
# Precise (default) mode: each item is a (word, start, end) offset tuple.
w1 = jieba.tokenize(string)
for tok in w1:
    print(tok)
print("------------------------------------------------")
# Search-engine mode: long words are additionally re-split, so the same
# span can appear both whole and in pieces.
w2 = jieba.tokenize(string, mode='search')
for tok in w2:
    print(tok)
# Expected output (NOTE: this transcript was captured with a comma in the
# input string, hence the (',', 3, 4) entry):
'''
('我', 0, 1)
('喜欢', 1, 3)
(',', 3, 4)
('广州', 4, 6)
('奥体中心', 6, 10)
------------------------------------------------
('我', 0, 1)
('喜欢', 1, 3)
(',', 3, 4)
('广州', 4, 6)
('奥体', 6, 8)
('中心', 8, 10)
('奥体中心', 6, 10)
'''
# 6. High-frequency word extraction (高频词提取)
import jieba
# Data loading
def get_context(path):
    """Read the whole UTF-8 text file at `path` and return it with
    leading/trailing whitespace stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()
# High-frequency word statistics
def get_TF(split_words, top=10):
    """Return the `top` most frequent words in `split_words` as a list of
    (word, count) tuples, sorted by descending count.

    Uses collections.Counter instead of a hand-rolled dict tally; ties keep
    first-seen order, matching the original stable-sort behavior.
    """
    return Counter(split_words).most_common(top)
def main():
    """Segment the corpus with a custom dictionary and print the 10 most
    frequent tokens (paths are hard-coded for this demo)."""
    jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
    path = 'D:/自然语言处理/test1.txt'
    corpus = get_context(path)
    split_words = list(jieba.cut(corpus))
    tops = get_TF(split_words)
    print(tops)

main()
# Expected output:
'''
[('、', 181853), ('.', 85128), (':', 84800), ('txt', 84800), ('\n', 84799),
('。', 76733), ('-', 26655), (',', 18151), ('coco奶茶', 17284), ('材料', 9961)]
'''
# 7. Stop-word filtering (设置停用词)
import jieba
# Data loading
def get_context(path):
    """Read the whole UTF-8 text file at `path` and return it with
    leading/trailing whitespace stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()
# High-frequency word statistics
def get_TF(split_words, top=10):
    """Return the `top` most frequent words in `split_words` as a list of
    (word, count) tuples, sorted by descending count.

    Uses collections.Counter instead of a hand-rolled dict tally; ties keep
    first-seen order, matching the original stable-sort behavior.
    """
    return Counter(split_words).most_common(top)
# Stop-word loading
def stop_word(path):
    """Return the stop-word list read from `path`, one word per line.

    The file is decoded as utf-8-sig so a leading BOM (common in files
    saved on Windows) is stripped. '\\n' is seeded into the list so bare
    newline tokens are filtered out as well.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return ['\n'] + [line.strip() for line in f]
def main():
    """Segment the corpus, drop stop words, and print the top-10 tokens
    (paths are hard-coded for this demo)."""
    jieba.set_dictionary('D:/Anaconda/Lib/site-packages/jieba/mydict.txt')
    path = 'D:/自然语言处理/test1.txt'
    path2 = 'D:/自然语言处理/stop.txt'
    corpus = get_context(path)
    stop_words = stop_word(path2)
    print(stop_words)
    # A set gives O(1) membership tests; filtering the whole corpus against
    # the list would be O(len(stop_words)) per token.
    stop_set = set(stop_words)
    split_words = [word for word in jieba.cut(corpus) if word not in stop_set]
    tops = get_TF(split_words)
    print(tops)

main()
# Expected output:
'''
[('coco奶茶', 17284), ('材料', 9961), ('(', 6603), (')', 6349), ('N', 5682),
('金属', 5506), ('盐', 4703), ('溶剂', 4669), ('纳米', 4583), (',', 4358)]
'''