朴素贝叶斯模型是贝叶斯分类方法中最简单的一类。它有两个基本假设:一是特征之间相互独立,二是各特征同等重要。这两个假设可以大大简化计算。虽然在实际中属性之间未必相互独立,甚至可能存在严重的依赖性(此时可以改用半朴素贝叶斯模型或贝叶斯网),但这并不影响朴素贝叶斯的使用价值,它在很多情况下仍然可以取得很好的效果。
下面用一个简单的模拟数据集来展示朴素贝叶斯模型的基本实现过程,代码使用 Python 3.6 编写:
# NOTE: the wildcard import is kept so that this module's namespace still
# exposes the NumPy names it always did (it shadows builtins such as ``sum``).
# The code below goes through the explicit ``np`` alias instead.
from numpy import *
import numpy as np


def loaddata():
    """Return the toy training corpus and its class labels.

    Returns:
        A ``(datalist, classvect)`` tuple. ``datalist`` is a list of
        tokenised posts (lists of words) and ``classvect[i]`` is the label
        of ``datalist[i]``.
    """
    datalist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'please', 'help', 'a'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him',
         'have'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # Class labels: 0 = no abusive language, 1 = contains abusive language.
    classvect = [0, 1, 0, 1, 0, 1]
    return datalist, classvect


def createvocabset(dataset):
    """Build the vocabulary: every distinct word occurring in ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of words.

    Returns:
        A sorted list of the distinct words. Sorting makes the column order
        of the word vectors reproducible; plain set iteration order changes
        between interpreter runs because of string hash randomisation.
    """
    vocabset = set()
    for item in dataset:
        vocabset.update(item)
    return sorted(vocabset)


def word2vect(wordset, testword):
    """Convert a tokenised document into a set-of-words (0/1) vector.

    Args:
        wordset: Vocabulary as a sequence of words.
        testword: Iterable of words making up the document.

    Returns:
        A list with one entry per vocabulary word: 1 if the word occurs in
        ``testword`` (however many times), otherwise 0. Words missing from
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its FIRST position (what ``list.index`` would return)
    # once, instead of scanning the vocabulary twice for every token.
    position = {}
    for indx, word in enumerate(wordset):
        position.setdefault(word, indx)

    resultvect = [0] * len(wordset)
    for item in testword:
        indx = position.get(item)
        if indx is None:
            print(item + ' is not in wordlist!')
        else:
            resultvect[indx] = 1
    return resultvect


def calcondiproba(wordlist, wordset, classvect):
    """Estimate the smoothed log word probabilities per class and the prior.

    Args:
        wordlist: Training documents (each an iterable of words).
        wordset: Vocabulary used to vectorise the documents.
        classvect: Label of each training document; label 1 is counted
            towards ``p1``, every other label towards ``p0``.

    Returns:
        ``(p1, p0, pA)`` where ``p1`` / ``p0`` are arrays of log
        probabilities (one entry per vocabulary word) for class 1 / class 0
        and ``pA`` is ``sum(classvect) / len(classvect)``, i.e. the fraction
        of class-1 documents when the labels are 0/1.

    Raises:
        ValueError: If there are no training documents, or if the number of
            documents and labels differ.
    """
    wordmatrix = [word2vect(wordset, item) for item in wordlist]
    if not wordmatrix:
        raise ValueError('wordlist must contain at least one document')
    if len(wordmatrix) != len(classvect):
        raise ValueError('wordlist and classvect must have the same length')

    numwords = len(wordmatrix[0])
    # Start every count at 1 (add-one smoothing) so that a word never seen
    # in a class does not produce log(0).
    p1 = np.ones(numwords)
    p0 = np.ones(numwords)
    for vect, label in zip(wordmatrix, classvect):
        if label == 1:
            p1 += vect
        else:
            p0 += vect

    # NOTE(review): the smoothing ones are already part of the sum, so the
    # extra "+ 2" makes each class's probabilities sum to slightly less than
    # 1. Kept unchanged to preserve the numeric results; confirm intent.
    p1 = np.log(p1 / (np.sum(p1) + 2))
    p0 = np.log(p0 / (np.sum(p0) + 2))
    pA = np.sum(classvect) / len(classvect)
    return p1, p0, pA


def classify(vect2classify, p1, p0, pA):
    """Pick the class with the larger log posterior score.

    Args:
        vect2classify: 0/1 word vector of the document to classify.
        p1: Log word probabilities for class 1.
        p0: Log word probabilities for class 0.
        pA: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    vect = np.asarray(vect2classify)
    # Work in log space: sum the log likelihoods of the words present and
    # add the log prior.
    P1 = np.sum(vect * p1) + np.log(pA)
    P0 = np.sum(vect * p0) + np.log(1 - pA)
    return 1 if P1 > P0 else 0


def test_classify(testwords):
    """Train on the toy corpus and classify one whitespace-separated sentence.

    Args:
        testwords: Sentence to classify; it is split on whitespace.

    Returns:
        The predicted label (0 or 1), which is also printed.
    """
    testwords = testwords.split()
    wordlist, classvect = loaddata()
    wordset = createvocabset(wordlist)
    p1, p0, pA = calcondiproba(wordlist, wordset, classvect)
    testvect = word2vect(wordset, testwords)
    result = classify(testvect, p1, p0, pA)
    print(result)
    return result


# This is a demo driver, not a pytest test. Its name matches pytest's
# ``test_*`` pattern, so opt out of collection explicitly (pytest would
# otherwise look for a fixture named ``testwords``).
test_classify.__test__ = False

# Demo
mysetence1 = "I have a stupid dog"
test_classify(mysetence1)
mysetence2 = "my dog is worthless"
test_classify(mysetence2)