Python 文本分析 笔记
中文停用词处理
自行下载 stopwords.txt,代码如下:
def stopwordslist(filepath):
    """Load a stop-word list from *filepath*.

    The file is expected to be UTF-8 encoded with one stop word per line.

    Args:
        filepath: Path to the stop-word file.

    Returns:
        A list of stop words with surrounding whitespace stripped.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original called open(...).readlines() and never closed the file.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterating the file object directly avoids materializing
        # an intermediate list via readlines().
        return [line.strip() for line in f]
# Segment a sentence into words
def seg_sentence(sentence):
    """Segment *sentence* with jieba and remove stop words.

    Args:
        sentence: The raw input sentence (leading/trailing whitespace is
            stripped before segmentation).

    Returns:
        The remaining tokens concatenated with a single space after each
        word (including a trailing space, matching the original output
        format). Returns '' when every token is filtered out.
    """
    # Load the stop words once per call; converting to a set turns the
    # per-token membership test from an O(n) list scan into O(1).
    # NOTE(review): the stop-word file path is hard-coded — consider
    # making it a parameter if other callers need a different file.
    stopwords = set(stopwordslist('/root/stopwords.txt'))  # stop-word file path
    # A single join is linear in the output size, unlike the original
    # repeated ``outstr += word`` which can be quadratic.
    return ''.join(
        word + " "
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )
|