版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_33765907/article/details/83277912
数据集使用了类别仅两种的MNIST。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import math
import random
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
class LogisticRegression(object):
    """Binary logistic regression trained one misclassified sample at a time.

    Labels are expected to be 0 or 1. A constant 1.0 is appended to every
    sample so that the last weight acts as the bias term.

    Attributes:
        learning_rate: Step size applied to each gradient update.
        max_iteration: Maximum number of weight updates performed by
            ``train``. It is also the number of consecutive correctly
            classified draws after which training stops early.
        w: Weight list (bias last); created by ``train``.
    """

    def __init__(self):
        self.learning_rate = 0.00001
        self.max_iteration = 5000

    def _score(self, x):
        """Return the linear score ``w . x`` for a bias-augmented sample."""
        return sum(w_j * x[j] for j, w_j in enumerate(self.w))

    @staticmethod
    def _sigmoid(z):
        """Return ``exp(z) / (1 + exp(z))`` without overflowing for large |z|."""
        if z >= 0:
            # exp(-z) can only underflow to 0.0 here, never overflow.
            return 1.0 / (1.0 + math.exp(-z))
        exp_z = math.exp(z)
        return exp_z / (1.0 + exp_z)

    def train(self, features, labels):
        """Fit the weights by stochastic updates on misclassified samples.

        Samples are drawn uniformly at random. A correctly classified draw
        leaves the weights untouched; a misclassified draw triggers one
        gradient step and counts as one iteration.

        Args:
            features: Sequence of equally sized numeric feature rows.
            labels: Sequence of 0/1 labels, one per feature row.

        Raises:
            ValueError: If ``features`` or ``labels`` is empty.
        """
        sample_count = len(labels)
        if sample_count == 0 or len(features) == 0:
            raise ValueError('cannot train on an empty data set')

        self.w = [0.0] * (len(features[0]) + 1)
        correct_count = 0
        iteration = 0
        while iteration < self.max_iteration:
            index = random.randrange(sample_count)
            # Plain floats keep the arithmetic free of numpy integer
            # wrap-around and make the weight updates cheaper.
            x = [float(value) for value in features[index]]
            x.append(1.0)  # constant input for the bias weight
            y = labels[index]

            wx = self._score(x)
            if y == (1 if wx > 0 else 0):
                correct_count += 1
                # Stop once a long run of draws needed no correction.
                if correct_count > self.max_iteration:
                    break
                continue

            iteration += 1
            correct_count = 0
            # The formula sums the gradient over all samples, but here only
            # the single misclassified sample is used (a stochastic step).
            step = self.learning_rate * (self._sigmoid(wx) - y)
            self.w = [w_j - step * x[j] for j, w_j in enumerate(self.w)]

    def my_predict(self, x):
        """Classify one bias-augmented sample.

        Args:
            x: Feature values followed by the constant 1.0.

        Returns:
            1 if the linear score is positive (P(y=1) > P(y=0)), else 0.
        """
        # sigmoid(wx) > 0.5 exactly when wx > 0, so no exponential is needed
        # and large scores cannot raise OverflowError.
        return 1 if self._score(x) > 0 else 0

    def predict(self, features):
        """Classify every row of ``features``.

        Args:
            features: Iterable of feature rows without the bias input.

        Returns:
            numpy array holding the predicted 0/1 label of each row.
        """
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1.0)  # account for the bias term
            labels.append(self.my_predict(x))
        return np.array(labels)
if __name__ == '__main__':
    # Stage 1: load the CSV and split it into training and test partitions.
    print('Start reading data:')
    read_started = time.time()
    dataset = pd.read_csv('data/train_binary.csv').values
    # Column 0 is used as the label; the remaining columns are the features.
    pixel_columns = dataset[:, 1:]
    label_column = dataset[:, 0]
    train_features, test_features, train_labels, test_labels = train_test_split(
        pixel_columns, label_column, test_size=0.33, random_state=11111)
    read_finished = time.time()
    print('read data cost %f seconds' % (read_finished - read_started))

    # Stage 2: fit the classifier on the training partition.
    print('Start training:')
    model = LogisticRegression()
    model.train(train_features, train_labels)
    train_finished = time.time()
    print('training cost %f seconds' % (train_finished - read_finished))

    # Stage 3: classify the held-out partition.
    print('Start predicting:')
    predictions = model.predict(test_features)
    predict_finished = time.time()
    print('predicting cost %f seconds' % (predict_finished - train_finished))

    # Stage 4: report the fraction of test labels predicted correctly.
    test_count = len(test_labels)
    hit_count = np.sum(test_labels == predictions.reshape(test_count))
    accuracy = hit_count / test_count
    print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
read data cost 5.048630 seconds
Start training:
training cost 124.349997 seconds
Start predicting:
predicting cost 3.422893 seconds
The accuracy is 0.985137!
'''