# Copyright notice: original post by the author; reproduction requires the author's
# permission. Source: https://blog.csdn.net/qq_33765907/article/details/83064325
# Uses the same dataset as the previous algorithm (the full MNIST). To keep the loop
# computation cheap, each image is binarized; a couple of small tricks avoid both
# underflow from multiplying 784 floats together and division by zero. The key point
# is understanding the 3-D conditional-probability array.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import random
import numpy as np
import pandas as pd
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Binarize an image: pixels equal to 0 stay 0, pixels greater than 0 become 1.
def binaryzation(image):
    """Return a 1-D int array with 1 where image > 0 and 0 elsewhere.

    Vectorized replacement for the original per-pixel Python loop;
    accepts any 1-D array-like of pixel intensities.
    """
    return (np.asarray(image) > 0).astype(int)
# The priors all share denominator N, so only the numerators (class counts) are kept.
# The conditional probabilities are floats and a 784-term product would underflow,
# so each probability is scaled by 1,000,000 and then incremented by 1 to avoid a
# zero factor, mapping every value into [1, 1000001].
def Train(trainset, train_labels):
    """Fit the naive Bayes model on binarized images.

    Args:
        trainset: iterable of 1-D pixel arrays (length features_num each).
        train_labels: integer class label per training image, in [0, class_num).

    Returns:
        prior_probability: array of per-class sample counts (shared denominator
            N is dropped — it does not affect the argmax).
        conditional_probability: array of shape (class_num, features_num, 2)
            holding P(x_j = v | y) scaled into [1, 1000001].
    """
    prior_probability = np.zeros(class_num)
    # Dims: (class y, feature index j, binarized pixel value in {0, 1}).
    conditional_probability = np.zeros((class_num, features_num, 2))
    feature_idx = np.arange(features_num)
    for i in range(len(trainset)):
        # Binarize: 0 stays 0, any positive pixel becomes 1.
        image = binaryzation(trainset[i])
        label = train_labels[i]
        prior_probability[label] += 1
        # Count numerators of P(x_j | y) for all features at once. Each
        # (j, image[j]) index pair is distinct (one per feature), so plain
        # fancy indexing accumulates correctly — no duplicate-index hazard.
        conditional_probability[label, feature_idx, image] += 1
    # Per (class, feature) totals; for any observed class this equals its count.
    totals = conditional_probability.sum(axis=2)
    # Guard classes absent from the training set: the original code raised
    # ZeroDivisionError here; such entries now map to the neutral value 1.
    safe_totals = np.where(totals == 0, 1.0, totals)
    # Map each probability into [1, 1000001]: *1e6 counters underflow in the
    # later 784-factor product, +1 avoids multiplying by zero.
    conditional_probability = conditional_probability / safe_totals[:, :, None] * 1000000 + 1
    return prior_probability, conditional_probability
# Score a (binarized) image against one class. Not a true probability — two
# constant factors common to every class are omitted, which leaves the argmax
# over classes unchanged.
def Calculate_probability(image, label):
    """Return the unnormalized posterior score of `label` for `image`.

    Reads the module-level `prior_probability` and `conditional_probability`
    produced by Train(). Casting each factor to int keeps the running product
    in Python's arbitrary-precision integers (float64 would overflow to inf).
    """
    score = int(prior_probability[label])
    # Multiply in one factor per pixel/feature dimension.
    for dim, pixel in enumerate(image):
        score *= int(conditional_probability[label][dim][pixel])
    return score
def Predict(testset, prior_probability, conditional_probability):
    """Classify each test image as the class with the highest naive Bayes score.

    Each image is binarized first; ties go to the lowest class index, matching
    the strict-inequality scan of the original implementation.
    """
    results = []
    for raw_image in testset:
        # Binarize before scoring, mirroring the training preprocessing.
        image = binaryzation(raw_image)
        best_label, best_score = 0, -1
        for candidate in range(class_num):
            score = Calculate_probability(image, candidate)
            if score > best_score:
                best_label, best_score = candidate, score
        results.append(best_label)
    return np.array(results)
class_num = 10 # number of classes (digits 0-9)
features_num = 784 # number of features = pixels per image (28*28)
if __name__ == '__main__':
    # Load Kaggle-style MNIST: column 0 is the label, columns 1..784 the pixels.
    print('Start reading data:')
    t_start = time.time()
    raw_data = pd.read_csv('./data/train.csv')
    data = raw_data.values
    img, labels = data[:, 1:], data[:, 0]
    print(img.shape)
    print(labels.shape)
    train_features, test_features, train_labels, test_labels = train_test_split(
        img, labels, test_size=0.33, random_state=11111)
    t_read = time.time()
    print('read data cost %f seconds' % (t_read - t_start))

    print('Starting training:')
    # NOTE: these two names must stay module-level — Calculate_probability
    # reads them as globals rather than taking them as parameters.
    prior_probability, conditional_probability = Train(train_features, train_labels)
    t_train = time.time()
    print('training cost %f seconds' % (t_train - t_read))

    print('Starting predicting:')
    test_predict = Predict(test_features, prior_probability, conditional_probability)
    t_predict = time.time()
    print('predicting cost %f seconds' % (t_predict - t_train))

    # Fraction of exact label matches on the held-out split.
    accuracy = np.sum(test_labels == test_predict.reshape(len(test_labels))) / len(test_labels)
    print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
(42000, 784)
(42000,)
read data cost 6.082135 seconds
Starting training:
training cost 28.842341 seconds
Starting predicting:
predicting cost 147.300343 seconds
The accuracy is 0.841919!
'''