Python数据分析与挖掘之贝叶斯算法实现

代码中有详细的注释
训练文件：
txt文件中保存的是0/1矩阵；将图片转换为0/1矩阵的方法见上一篇博客。
import numpy  
import operator
from os import listdir
class Bayes:
    """Naive Bayes classifier for discrete-valued feature vectors.

    ``fit`` groups the training vectors by class and records each class's
    prior probability; ``btest`` scores a sample against a list of candidate
    classes and returns the most probable one. No smoothing is applied: a
    feature value never seen for a class gives that class zero likelihood.
    """

    def __init__(self):
        # Number of features per sample; -1 means fit() has not been called.
        self.length = -1
        # Prior probability of each class: {label: p}.
        self.labelcount = dict()
        # Training vectors grouped by class: {label: [vector, ...]}.
        self.vectorcount = dict()

    def fit(self, dataSet: list, labels: list):
        """Train the classifier, replacing any previously learned state.

        Args:
            dataSet: Sequence of equally long feature vectors.
            labels: Class label of each vector, in the same order.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            ValueError: If the two sequences differ in length or are empty.
        """
        if(len(dataSet) != len(labels)):
            raise ValueError('您输入的测试数组和类别数组长度不一致')
        total = len(labels)
        if total == 0:
            raise ValueError('训练数据不能为空')

        # Build into a fresh dict so a second fit() does not mix old and new
        # training vectors.
        grouped = {}
        for vector, label in zip(dataSet, labels):
            grouped.setdefault(label, []).append(vector)

        self.length = len(dataSet[0])
        self.vectorcount = grouped
        # Prior of a class = share of the training samples carrying its label.
        self.labelcount = {
            label: len(vectors) / total for label, vectors in grouped.items()
        }
        print('训练结束')
        return self

    def btest(self, testData, labelsSet):
        """Return the candidate label with the highest posterior for a sample.

        Scores are accumulated as log-probabilities: multiplying hundreds of
        per-feature probabilities in linear space underflows to 0.0 for every
        class, which would make the result depend only on candidate order.

        Args:
            testData: Feature vector with the same length as the training
                vectors.
            labelsSet: Iterable of candidate labels; each must have been seen
                by ``fit``. Ties are resolved in favour of the earliest
                candidate.

        Returns:
            The winning label from ``labelsSet``.

        Raises:
            ValueError: If the classifier is untrained, the sample length does
                not match the training data, or ``labelsSet`` is empty.
            KeyError: If a candidate label was not present in the training data.
        """
        if(self.length == -1):
            raise ValueError('您还没有进行训练，请先训练')
        if len(testData) != self.length:
            raise ValueError('测试数据的特征长度与训练数据不一致')

        sample = numpy.asarray(testData)
        scores = dict()  # {label: log P(label) + sum_i log P(feature_i | label)}
        for thislb in labelsSet:
            prior = self.labelcount[thislb]
            vectors = numpy.asarray(self.vectorcount[thislb])
            vnum = len(vectors)
            # Per feature: how many training vectors of this class carry the
            # same value as the sample.
            matches = numpy.count_nonzero(vectors == sample, axis=0)
            # log(0) = -inf is the intended score for an impossible class, so
            # silence numpy's divide-by-zero warning instead of failing.
            with numpy.errstate(divide='ignore'):
                log_likelihood = numpy.log(matches / vnum).sum()
            scores[thislb] = log_likelihood + numpy.log(prior)

        if not scores:
            raise ValueError('候选类别列表不能为空')
        # max() returns the first maximal key in insertion order.
        return max(scores, key=scores.get)


def datatoarray(fname):
    """Load a 32x32 text image of digits into a flat list of 1024 ints.

    Reads the first 32 lines of ``fname`` and converts the first 32
    characters of each line to ``int``, appending them row by row.

    Args:
        fname: Path of the text file to read.

    Returns:
        A list of 1024 integers in row-major order.

    Raises:
        IndexError: If a line has fewer than 32 characters (including when
            the file has fewer than 32 lines).
        ValueError: If a character is not a valid integer literal.
    """
    arr = []
    # The context manager closes the file even if parsing fails part-way.
    with open(fname) as fh:
        for _ in range(32):
            thisline = fh.readline()
            # Index explicitly so a short line raises instead of being
            # silently truncated.
            arr.extend([int(thisline[col]) for col in range(32)])
    return arr

def seplabel(fname):
    """Return the integer label encoded at the start of a file name.

    The label is the text before the first '-' inside the part of the name
    that precedes the first '.', converted with ``int``.
    """
    stem, _, _ = fname.partition('.')
    prefix, _, _ = stem.partition('-')
    return int(prefix)

def traindata(dirpath='E:/programCode/手写数字识别实验'):
    """Build the training matrix and label list from a directory of files.

    Every entry returned by ``listdir(dirpath)`` is treated as one sample:
    its label comes from ``seplabel`` applied to the file name and its
    features from ``datatoarray`` applied to the file.

    Args:
        dirpath: Directory holding the training files. Defaults to the
            location used by the original script.

    Returns:
        A tuple ``(trainarr, labels)`` where ``trainarr`` is a numpy array
        with one row of 1024 values per file and ``labels`` is the list of
        labels in the same order.
    """
    names = listdir(dirpath)
    # One row per file, one column per pixel of the flattened image.
    trainarr = numpy.zeros((len(names), 1024))
    labels = []
    for row, name in enumerate(names):
        labels.append(seplabel(name))
        trainarr[row, :] = datatoarray(dirpath + '/' + name)
    return trainarr, labels


bys=Bayes()
# Load the training set and fit the classifier on it
train_data,labels=traindata()
bys.fit(train_data,labels)

# Load the sample to classify and list the candidate labels passed to btest
thisdata=datatoarray("E:/programCode/test/test.txt")
labelsall=[1,2,3]

# Classify the single handwritten digit and print the predicted label
rst=bys.btest(thisdata,labelsall)
print(rst)
训练文件列表：
Python数据分析与挖掘之贝叶斯算法实现

猜你喜欢