#!python3
# This file uses the naive Bayes algorithm,
# i.e. a simplified version of the Bayesian algorithm.
import numpy as np
from numpy import *  # noqa: F401,F403 - kept: existing code relies on these names
def loadDataSet():
    """Return a small hand-labelled sample corpus for the classifier.

    Returns:
        A tuple ``(postingList, classVec)`` where ``postingList`` is a list
        of six tokenised posts (each a list of words) and ``classVec`` is
        the parallel list of labels: 1 for abusive text, 0 for normal text.
    """
    postingList = [
        ['my', 'dog', 'had', 'flea', 'problems', 'help', 'please'],
        # 'maybe' was misspelled 'mybe' in the original sample data.
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive text, 0 = normal speech
    return postingList, classVec
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word appearing in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding each distinct word exactly once. The order is that of
        set iteration and is therefore not guaranteed.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place update avoids building a brand-new set per document.
        vocabSet.update(document)
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints where slot ``i`` is 1 if
        ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise. Words not in
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list (`in` + `index`) for every word of the document.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
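# Training outline (pseudo-code):
# Count the number of documents in each class
# For each training document:
#     For each class:
#         If a token appears in the document -> increment the count for that token
#         Increment the count for all tokens
# For each class:
#     For each token:
#         Divide the token's count by the total token count to get the conditional probability
# Return the conditional probabilities for each class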
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and labels.

    Args:
        trainMatrix: 2-D array-like, one row per document and one column per
            vocabulary word (word presence or word counts).
        trainCategory: 1-D array-like of labels, one per row. A label equal
            to 1 marks class 1; any other value is treated as class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``:
        ``p0Vect`` / ``p1Vect`` are arrays of per-word *log* conditional
        probabilities for class 0 / class 1, and ``pAbusive`` is the sum of
        the labels divided by the number of documents (the class-1 prior for
        0/1 labels).

    Raises:
        ValueError: If ``trainMatrix`` contains no documents.
    """
    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    numTrainDocs = len(docs)
    if numTrainDocs == 0:
        raise ValueError("trainMatrix must contain at least one document")

    pAbusive = np.sum(labels) / float(numTrainDocs)

    isClass1 = labels == 1
    class1Docs = docs[isClass1]
    class0Docs = docs[~isClass1]

    # Laplace smoothing: start every word count at 1 and every denominator
    # at 2 so an unseen word never yields probability 0 and a class with no
    # documents never divides by zero.
    p1Num = 1.0 + class1Docs.sum(axis=0)
    p0Num = 1.0 + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()

    # classifyNB adds log(prior) to a weighted sum of these vectors, so they
    # must be log-probabilities; logs also avoid underflow from multiplying
    # many small probabilities.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector by comparing the two class log-scores.

    Args:
        vec2Classify: 1-D array-like word vector of the document.
        p0Vec: Per-word log conditional probabilities for class 0.
        p1Vec: Per-word log conditional probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Sum of element-wise products == dot product; adding logs corresponds
    # to multiplying the underlying probabilities.
    p1 = np.dot(vec2Classify, p1Vec) + np.log(pClass1)
    p0 = np.dot(vec2Classify, p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
def testingNB():
    """Train on the built-in sample posts and classify one test entry.

    Builds the vocabulary and training matrix from ``loadDataSet()``, trains
    with ``trainNB0`` and prints the label ``classifyNB`` assigns to the
    entry ``['love', 'my', 'dalmation']``. Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # The original referenced an undefined name (`postinDoc`) here.
    trainMat = [setOfWords2Vec(myVocabList, postingDoc)
                for postingDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
# Bag-of-words document model
def bagOfWord2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocabList: List of vocabulary words; position defines the vector slot.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints where slot ``i`` is the number of
        times ``vocabList[i]`` occurs in ``inputSet``. Words not in the
        vocabulary are silently ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list (`in` + `index`) for every word of the document.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
# File parsing and the complete spam-email test function
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, keeping only
        tokens whose length is greater than 2.
    """
    import re
    # Split on runs of NON-word characters. The original pattern `\w*`
    # split on the words themselves, and a `*` quantifier would also split
    # at every empty match on Python 3.7+.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out test of the classifier on the bundled e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), randomly holds out 10 of
    the 50 documents as a test set, trains on the remaining 40 and prints
    the error rate measured on the held-out documents. Returns nothing.
    """
    docList = []
    classList = []
    # NOTE(review): the original spam path read 'email/span/%d.txt', which
    # looks like a typo for 'spam' (its sibling directory is 'ham') -
    # confirm against the on-disk layout.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathTemplate, label in sources:
            # `with` closes the file; errors='ignore' skips bytes that do
            # not decode instead of aborting the whole run.
            with open(pathTemplate % i, errors='ignore') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Must be a real list: a Python 3 `range` does not support deletion.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        predicted = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print("the error rate is:", float(errorCount) / len(testSet))
# RSS feed classifier and high-frequency word removal functions
def calcMostFreq(vocabList, fullText):
    """Return the 30 vocabulary words that occur most often in a text.

    Args:
        vocabList: Iterable of hashable vocabulary words.
        fullText: Iterable (normally a list) of hashable tokens to count.

    Returns:
        A list of at most 30 ``(word, count)`` tuples sorted by descending
        count. Words with equal counts keep their ``vocabList`` order.
    """
    from collections import Counter
    import operator
    # Count every token once rather than calling list.count per word.
    occurrences = Counter(fullText)
    freqDict = {}
    for token in vocabList:
        freqDict[token] = occurrences[token]
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]


# Backward-compatible alias: the function was originally defined under this
# misspelled name, while its call site uses `calcMostFreq`.
clacMostFreq = calcMostFreq
def localWords(feed1, feed0):
    """Tokenise two feeds and compute their most frequent vocabulary words.

    Args:
        feed1: Mapping with an ``'entries'`` sequence whose items expose a
            ``'summary'`` string; its entries are labelled class 1.
            (Presumably a parsed RSS feed - verify against the caller.)
        feed0: Same structure; its entries are labelled class 0.

    Only the first ``min(len(feed1['entries']), len(feed0['entries']))``
    entries of each feed are used, so both classes contribute equally.

    NOTE(review): this function appears to be truncated. It builds the
    documents, labels, vocabulary and the list of most frequent words, but
    does not use or return them yet.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Alternate class 1 then class 0, matching the original order.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    # The helper is defined in this file under the spelling `clacMostFreq`;
    # the original call used `calcMostFreq`, which was not defined.
    top30Words = clacMostFreq(vocabList, fullText)
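# --- Web-page residue from the source blog post (not code), kept as comments ---
# Naive Bayes classification, NumPy version -- deep learning
# You may also like
# Reposted from blog.csdn.net/mengjiexu_cn/article/details/83019170
# Today's recommendations
# Weekly ranking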