Statistical Learning Methods: A Logistic Regression Implementation

Copyright notice: this is an original post by the author and may not be reproduced without permission. https://blog.csdn.net/qq_33765907/article/details/83277912

The dataset is a binary-label version of MNIST (only two classes).
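The script below reads data/train_binary.csv, which is not included with the post. A minimal sketch of how such a file could be produced from the standard Kaggle MNIST train.csv (first column label, then 784 pixel columns), assuming the labels are binarized as "digit 0 vs. every other digit", might look like this:

# Hypothetical preprocessing script (not part of the original post).
# Assumes data/train.csv is the Kaggle MNIST training file: a 'label' column plus 784 pixel columns.
import pandas as pd

raw = pd.read_csv('data/train.csv')
# One possible binarization: keep digit 0 as class 0, map every other digit to class 1.
raw['label'] = (raw['label'] > 0).astype(int)
raw.to_csv('data/train_binary.csv', index=False)

The full implementation from the post: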

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import math
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn versions

class LogisticRegression(object):
	def __init__(self):
		self.learning_rate = 0.00001
		self.max_iteration = 5000

	def train(self, features, labels):
		self.w = [0.0] * (len(features[0]) + 1)  # one extra weight for the bias term

		correct_count = 0
		iteration = 0

		# Count iterations only on misclassified samples (i.e. weight updates); stop early once
		# more than max_iteration consecutive samples are classified correctly.
		while iteration < self.max_iteration:
			index = random.randint(0, len(labels) - 1)
			x = list(features[index])
			x.append(1.0)
			y = labels[index]

			if y == self.my_predict(x):
				correct_count += 1
				if correct_count > self.max_iteration:
					break
				continue

			iteration += 1
			correct_count = 0

			wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
			exp_wx = math.exp(wx)

			for j in range(len(self.w)):
				# The book's formula sums over all samples i (the sigma), but here the update uses only
				# the single misclassified point just drawn; strictly the sum would loop over i, so this
				# is per-sample stochastic gradient ascent rather than the batch formula
				# (see the sketch after the output listing).
				self.w[j] -= self.learning_rate * (-y * x[j] + float(x[j] * exp_wx) / float(1 + exp_wx))

	def my_predict(self, x):
		wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
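		# Note: math.exp(wx) can overflow once wx grows large; with the tiny learning rate used here
		# the weights stay small enough in practice.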
		exp_wx = math.exp(wx)

		predict1 = exp_wx / (1 + exp_wx)
		predict0 = 1 / (1 + exp_wx)

		if predict1 > predict0:
			return 1
		else:
			return 0

	def predict(self, features):
		labels = []

		for feature in features:
			x = list(feature)
			x.append(1.0)  # append 1.0 to account for the bias term
			labels.append(self.my_predict(x))
		return np.array(labels)

if __name__ == '__main__':
	print('Start reading data:')
	time1 = time.time()

	raw_data = pd.read_csv('data/train_binary.csv')
	data = raw_data.values

	imgs = data[:, 1:]
	labels = data[:, 0]

	train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=11111)

	time2 = time.time()
	print('read data cost %f seconds' % (time2 - time1))

	print('Start training:')
	lr = LogisticRegression()
	lr.train(train_features, train_labels)
	time3 = time.time()
	print('training cost %f seconds' % (time3 - time2))

	print('Start predicting:')
	test_predict = lr.predict(test_features)
	time4 = time.time()
	print('predicting cost %f seconds' % (time4 - time3))

	accuracy = np.sum(test_labels == test_predict.reshape(len(test_labels))) / len(test_labels)
	print('The accuracy is %f!' % accuracy)

'''
output:
Start reading data:
read data cost 5.048630 seconds
Start training:
training cost 124.349997 seconds
Start predicting:
predicting cost 3.422893 seconds
The accuracy is 0.985137!
'''
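
On the point raised in the comment inside train(): the book's gradient formula sums over all samples i, while the loop above applies the update to a single randomly drawn misclassified sample, which is effectively stochastic gradient ascent. A minimal NumPy sketch of the full-batch step that the formula describes (an illustration only, not part of the original code; the function name and signature are my own):

import numpy as np

def batch_gradient_step(w, X, y, learning_rate=1e-5):
	# One full-batch gradient ascent step on the log-likelihood
	#   L(w) = sum_i [ y_i * (w . x_i) - log(1 + exp(w . x_i)) ],
	# whose gradient is sum_i [ y_i - sigma(w . x_i) ] * x_i.
	# X is assumed to already carry the appended bias column of 1.0s.
	wx = X @ w
	p1 = 1.0 / (1.0 + np.exp(-wx))  # sigma(w . x_i) for every sample at once
	return w + learning_rate * (X.T @ (y - p1))

Per-sample updates (as in the class above) and full-batch updates climb the same log-likelihood; the per-sample version simply uses a noisier estimate of the gradient at each step.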
