Batch-deleting specified text files
The old keyword files are no longer needed, so I wrote a script to batch-delete every file whose name ends in keyword.txt.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import glob

# Delete only the generated keyword files, not the source texts.
filenames = glob.glob(r"D:/TXTandTXTkeyword/TXT/*keyword.txt")
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint = countprint + 1
    if countprint == 10:  # refresh the progress line every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    os.remove(filename)
    count = count + 1
print("%d : %d" % (count, filenameslen))
print("finished")
Removing emoji
In the earlier preprocessing step I had only run jieba word segmentation, and the results were poor: the output was full of garbled tokens. Some investigation showed that the garbage came from two sources: emoji, and characters outside the GBK character set.
Since the texts we are processing are English, both have to be removed. One approach is to use wordnet from nltk.corpus to test whether each extracted keyword is a standard English word; the code is below:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import wordnet
import glob
import jieba.analyse

def read_from_file(directions):
    # Try each candidate encoding until one decodes the file cleanly.
    decode_set = ['utf-8', 'gb18030', 'ISO-8859-2', 'gb2312', 'gbk']
    for k in decode_set:
        try:
            with open(directions, "r", encoding=k) as file:
                return file.read()  # a wrong encoding raises UnicodeDecodeError
        except UnicodeDecodeError:
            continue
    raise Exception("%s had no way to decode" % directions)

filenames = glob.glob(r"D:/allkeyword/TXT/*.txt")
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint = countprint + 1
    if countprint == 10:  # refresh the progress line every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    names = filename.find('TXT') + 4  # start of the bare file name
    namee = filename.find('.txt')     # end of the bare file name
    content = read_from_file(filename)
    # The description block starts just past the "description" label
    # (+15 skips the label and its surrounding formatting characters)
    # and ends where the "comments" block begins.
    start = content.find('description') + 15
    overflow = content.find('comments')
    # Strip hashtags: copy everything except "#tag" tokens. Reassigning
    # the loop variable of a for loop does not skip iterations in
    # Python, so this uses an explicit while loop.
    contentfinal = ""
    i = start
    j = start
    while i < overflow:
        if content[i] == '#':
            contentfinal = contentfinal + content[j:i]
            space = content.find(' ', i)  # jump past the tag word
            if space == -1 or space > overflow:
                space = overflow
            i = space
            j = space
        else:
            i = i + 1
    contentfinal = contentfinal + content[j:overflow]
    file = open(r"D:/allkeyword/TXT/" + filename[names:namee] + 'keyword' + '.txt', 'w')
    # Extract keywords with jieba's TF-IDF ranking.
    keywords = jieba.analyse.extract_tags(contentfinal)
    if not keywords:
        print("error, please check your input")
    for keyword in keywords:
        # Keep a keyword only if wordnet recognises it as a standard
        # English word; emoji and non-GBK garbage have no synsets.
        if wordnet.synsets(keyword):
            file.write(keyword + '\n')
    file.close()
    count = count + 1
print("%d : %d" % (count, filenameslen))
print("finished")
Filtering words against a social-media corpus
nltk ships the webtext corpus, a collection of informal web and social-media text, and I used it to filter the extracted keywords:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import webtext
import glob

# Raw text of the six webtext documents, used as the social-media
# reference corpus.
corpora = [webtext.raw(name) for name in
           ('firefox.txt', 'grail.txt', 'overheard.txt',
            'pirates.txt', 'singles.txt', 'wine.txt')]

filenames = glob.glob(r"D:/allkeyword/TXT/*keyword.txt")
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint = countprint + 1
    if countprint == 10:  # refresh the progress line every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    names = filename.find('TXT') + 4  # start of the bare file name
    namee = filename.find('.txt')     # end of the bare file name
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    countoffile0 = len(lines)  # keyword count before filtering
    countoffile1 = 0           # keyword count after filtering
    file = open(r"D:/allkeyword/dealwithSocialMedia/" + filename[names:namee] + 'new' + '.txt', 'w')
    for line in lines:
        text = line.rstrip('\n')
        # Keep the keyword only if it occurs somewhere in the corpus.
        if any(c.find(text) != -1 for c in corpora):
            file.write(text + '\n')
            countoffile1 = countoffile1 + 1
    file.close()
    # Record how many keywords survived the filter.
    file1 = open(r"D:/allkeyword/stat/" + filename[names:namee] + 'stat' + '.txt', 'w')
    file1.write("%d to %d" % (countoffile0, countoffile1))
    file1.close()
    count = count + 1
print("%d : %d" % (count, filenameslen))
print("finished")