版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_27492735/article/details/82082233
代码如下,停用词表可点击这里获取。
提取密码:cef8
# -*- coding: utf-8 -*-
import jieba
import jieba.analyse
import jieba.posseg as pseg
import re
from pyltp import NamedEntityRecognizer
# jieba.load_userdict('userdict.txt')
# Build the stop-word list.
def stopwordslist(filepath):
    """Read a stop-word file and return its entries as a list.

    Args:
        filepath: Path to a UTF-8 encoded text file, one stop word per line.

    Returns:
        A list of strings, one per line of the file, each with leading and
        trailing whitespace removed. Blank lines yield empty strings.
    """
    # The context manager closes the file even if decoding fails part-way.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
# Segment a sentence into words and drop stop words.
_STOPWORDS_PATH = 'stopwords.txt'  # path of the stop-word file to load
_stopwords_cache = None  # filled on first use by _get_stopwords()


def _get_stopwords():
    """Return the stop words as a frozenset, reading the file only once."""
    global _stopwords_cache
    if _stopwords_cache is None:
        # A frozenset gives O(1) membership tests instead of scanning a list.
        _stopwords_cache = frozenset(stopwordslist(_STOPWORDS_PATH))
    return _stopwords_cache


def seg_sentence(sentence):
    """Segment *sentence* with jieba and remove stop words and tab tokens.

    Args:
        sentence: The text to segment; surrounding whitespace is stripped
            before it is passed to ``jieba.cut``.

    Returns:
        A string in which every kept token is followed by a single space
        (so a non-empty result ends with a trailing space). Returns an
        empty string when no token is kept.
    """
    stopwords = _get_stopwords()
    kept_words = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    ]
    # Each token is followed by one space, matching the original output format.
    return ''.join(word + ' ' for word in kept_words)
# Segment every line of the input corpus and write the result to the output
# file, one output line per input line. The ``with`` statement guarantees both
# files are closed (and the output flushed) even if segmentation raises.
with open('./origin data/yu.txt', 'r', encoding='utf-8') as inputs, \
        open('yuliao3.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # the return value is a string
        outputs.write(line_seg + '\n')
谢谢观看!!!