朴素贝叶斯——从个人广告中获取区域倾向
示例:使用朴素贝叶斯分类器从个人广告中获取区域倾向
Craigslist个人广告的原链接已经失效,改用该网站社会事件(eve)与政治(pol)栏目的RSS源:
https://newyork.craigslist.org/search/eve?format=rss&sale_date=2018-06-11
https://losangeles.craigslist.org/search/eve?format=rss&sale_date=2018-06-11
https://newyork.craigslist.org/search/pol?format=rss
https://sfbay.craigslist.org/search/pol?format=rss
代码如下:
def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words as (token, count) pairs.

    Counts how many times each token of vocabList occurs in fullText and
    returns the top 30, sorted by descending count.  Tokens that never
    appear in fullText count as 0.
    """
    from collections import Counter
    # One O(len(fullText)) counting pass instead of calling
    # fullText.count(token) per vocabulary word, which was
    # O(len(vocabList) * len(fullText)).  Building the dict in vocabList
    # order keeps tie-breaking identical (sorted() is stable).
    counts = Counter(fullText)
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=lambda kv: kv[1], reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    Entries of feed1 are labelled class 1 (e.g. NY), entries of feed0
    class 0 (e.g. SF).  Builds a vocabulary, strips the 30 most frequent
    words (mostly stop words that carry no regional signal), holds out a
    random test set, trains with trainNB0 and prints the hold-out error
    rate.

    Returns (vocabList, p0V, p1V): the pruned vocabulary and the
    per-class conditional log-probability vectors.
    """
    import random
    docList = []
    classList = []
    fullText = []
    # Pair entries up so both classes contribute the same number of docs.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # feed1 (NY) is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []  # create test set
    # The original always held out 20 docs and raised IndexError when the
    # feeds supplied fewer than 20; cap the hold-out at ~20% instead
    # (still exactly 20 once the feeds have >= 50 entries each).
    numTest = min(20, (2 * minLen) // 5)
    for _ in range(numTest):
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    if testSet:  # guard the division when the feeds are tiny
        print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Print the most indicative words for each region.

    Trains a classifier via localWords(ny, sf), collects every vocabulary
    word whose conditional log-probability exceeds -6.0 for a class, and
    prints them under a banner for SF (class 0) and NY (class 1), most
    probable first.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    # Walk the vocabulary in lockstep with both probability vectors.
    for word, p0, p1 in zip(vocabList, p0V, p1V):
        if p0 > -6.0:
            topSF.append((word, p0))
        if p1 > -6.0:
            topNY.append((word, p1))
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for word, _ in sorted(topSF, key=lambda pair: pair[1], reverse=True):
        print(word)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for word, _ in sorted(topNY, key=lambda pair: pair[1], reverse=True):
        print(word)
import feedparser  # was used below without ever being imported at this scope


if __name__ == "__main__":
    # Fetch the politics RSS feeds for the two regions.  Guarding with
    # __main__ keeps the network I/O from firing on mere import.
    ny = feedparser.parse('https://newyork.craigslist.org/search/pol?format=rss')
    sf = feedparser.parse('https://sfbay.craigslist.org/search/pol?format=rss')
    vocabList, psf, pny = localWords(ny, sf)
    print(vocabList)
    print(psf)
    print(pny)
    # Intentional second run: localWords draws a fresh random train/test
    # split, so the printed error rate typically differs between runs.
    vocabList, psf, pny = localWords(ny, sf)