数据来源:https://www.kaggle.com/c/titanic
Training
import pandas
import numpy
import time
import matplotlib.pyplot as plt
%matplotlib inline
def _minMaxScale(column):
    """Fill missing entries with the mean of the observed ones, then rescale to [0, 1]."""
    values = column.astype(float)
    if values.notna().any():
        # Series.mean() skips NaN, so missing entries do not drag the mean down.
        values = values.fillna(values.mean())
    else:
        values = values.fillna(0.0)
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Constant column: avoid 0/0 and map every entry to 0.
        return values - low
    return (values - low) / span


def prepareData(filename):
    """Load a Titanic CSV file and turn it into numeric model features.

    Steps performed on the loaded frame:
      * 'Sex' is encoded as female -> 0, male -> 1.
      * 'Embarked' is encoded as S -> 1, C -> 2, Q -> 3; missing ports are
        replaced by the most frequent observed port (0 if none is observed).
      * 'Age' and 'Fare' have missing values replaced by the mean of the
        observed values and are then min-max scaled to [0, 1].
      * A constant column 'ones' is inserted first to act as the bias term.

    The scaling bounds are taken from the file being loaded, so two files
    prepared separately are each scaled by their own minimum and maximum.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The prepared pandas DataFrame. Other columns are left as read.
    """
    data = pandas.read_csv(filename)
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

    embarked = data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
    observed_ports = embarked.dropna()
    # mode() returns the most frequent values in sorted order; take the first.
    fallback_port = observed_ports.mode().iloc[0] if not observed_ports.empty else 0
    data['Embarked'] = embarked.fillna(fallback_port)

    for name in ('Age', 'Fare'):
        data[name] = _minMaxScale(data[name])

    data.insert(0, 'ones', 1)
    return data
def run(X, Y, theta, alpha, steps, log_path='titanic/model.txt'):
    """Fit logistic-regression weights with batch gradient descent.

    Every iteration subtracts ``alpha * getGradient(X, Y, theta)`` from the
    weights and appends the new cost. The weights and cost after each
    iteration are also written as one comma-separated row to ``log_path``,
    below a header of the form ``theta_0,theta_1,...,cost``.

    Args:
        X: Feature rows, one per sample.
        Y: Labels, one per sample.
        theta: Starting weights. The caller's object is not modified.
        alpha: Learning rate.
        steps: Number of gradient-descent iterations.
        log_path: File that receives the per-iteration log (overwritten).

    Returns:
        A tuple ``(costs, theta, time_spent)``: the list of ``steps + 1``
        costs (the initial cost first), the final weight array, and the
        elapsed wall-clock time in seconds.
    """
    init_time = time.time()
    # Work on a float copy: the in-place update below would otherwise change
    # the caller's array and would fail on integer-typed starting weights.
    theta = numpy.array(theta, dtype=float)
    costs = [getCost(X, Y, theta)]
    with open(log_path, 'w') as f:
        header = ['theta_' + str(i) for i in range(len(theta))] + ['cost']
        f.write(','.join(header) + '\n')
        for _ in range(steps):
            theta -= alpha * getGradient(X, Y, theta)
            cost = getCost(X, Y, theta)
            costs.append(cost)
            row = [str(item) for item in theta] + [str(cost)]
            f.write(','.join(row) + '\n')
    time_spent = time.time() - init_time
    return costs, theta, time_spent
def getGradient(X, Y, theta):
    """Return the gradient of the mean logistic loss with respect to theta.

    For each weight j the result is
    ``-1/len(Y) * sum_i x_ij * (y_i - sigmoid(theta . x_i))``.

    The sigmoid of every sample is evaluated once and reused for all
    weights, instead of being recomputed for each weight index.

    Args:
        X: Feature rows, one per sample, each with ``len(theta)`` entries.
        Y: Labels, one per sample.
        theta: Current weights.

    Returns:
        A 1-D float array with one entry per weight.

    Raises:
        ZeroDivisionError: If ``Y`` is empty.
    """
    # Computed first so that empty input fails the same way as a plain division.
    scale = -1.0 / len(Y)
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    predicted = 1 / (1 + numpy.exp(-numpy.dot(features, theta)))
    return scale * numpy.dot(features.T, labels - predicted)
def getCost(X, Y, theta):
    """Return the mean logistic loss of the weights theta on (X, Y).

    With ``z_i = theta . x_i`` the value is
    ``-1/len(Y) * sum_i (y_i * z_i - log(1 + exp(z_i)))``.

    Args:
        X: Feature rows, one per sample, each with ``len(theta)`` entries.
        Y: Labels, one per sample.
        theta: Current weights.

    Returns:
        The mean loss as a float scalar.

    Raises:
        ZeroDivisionError: If ``Y`` is empty.
    """
    # Computed first so that empty input fails the same way as a plain division.
    scale = -1.0 / len(Y)
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    scores = numpy.dot(features, theta)
    # logaddexp(0, z) == log(1 + exp(z)) but stays finite for large z, where
    # exp(z) alone would overflow to inf and make the whole cost infinite.
    log_likelihood = numpy.sum(labels * scores - numpy.logaddexp(0, scores))
    return scale * log_likelihood
def getAccuracy(train_X, train_Y, theta):
    """Return the fraction of samples whose predicted class equals the label.

    A sample is classified as 1 when its sigmoid output
    ``1 / (1 + exp(-theta . x))`` is at least 0.5, and as 0 otherwise.

    Args:
        train_X: Feature rows, one per sample.
        train_Y: Labels, one per sample.
        theta: Model weights.

    Returns:
        Number of matching predictions divided by the number of samples.
    """
    predictions = [
        1 if 1/(1+numpy.exp(-numpy.dot(theta, sample))) >= 0.5 else 0
        for sample in train_X
    ]
    # Start from 0.0 so the hit count (and the ratio) is always a float.
    hits = sum((1 for guess, truth in zip(predictions, train_Y) if guess == truth), 0.0)
    return hits/len(predictions)
# Load the training CSV through prepareData, which returns the prepared DataFrame.
train_data = prepareData('titanic/train.csv')
# Preview the first five prepared rows (shown as the cell output in a notebook).
train_data.head(5)
|  | ones | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 0.271174 | 1 | 0 | A/5 21171 | 0.014151 | NaN | 1 |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | 0 | 0.472229 | 1 | 0 | PC 17599 | 0.139136 | C85 | 2 |
| 2 | 1 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 0.321438 | 0 | 0 | STON/O2. 3101282 | 0.015469 | NaN | 1 |
| 3 | 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 0.434531 | 1 | 0 | 113803 | 0.103644 | C123 | 1 |
| 4 | 1 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 0.434531 | 0 | 0 | 373450 | 0.015713 | NaN | 1 |
# Drop the identifier and free-text columns once, then split the remaining
# frame into the label vector and the feature matrix.
train_features = train_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
train_Y = train_features['Survived'].values
train_X = train_features.drop(['Survived'], axis=1).values
# One random starting weight per feature column; shape[1] is the column
# count and, unlike len(train_X[1]), does not require a second row to exist.
theta = numpy.random.random(train_X.shape[1])
alpha = 0.001
steps = 10000
costs, theta, time_spent = run(train_X, train_Y, theta, alpha, steps)
accuracy = getAccuracy(train_X, train_Y, theta)
# Left panel: cost over the whole run (steps + 1 points, initial cost included).
fig = plt.figure(figsize=(18,5))
ax1 = fig.add_subplot(121)
ax1.plot(range(steps+1), costs)
ax1.set_title('Logistic Regression for Titanic Problem -- Time spent: %f\nAccuracy: %f' % (time_spent, accuracy))
ax1.set_xlabel('steps')
ax1.set_ylabel('cost')
# Right panel: zoom on the last 1000 points. The slice is open-ended so the
# final cost is included; an end index of -1 would silently drop it.
ax2 = fig.add_subplot(122)
ax2.plot(range(steps+1)[-1000:], costs[-1000:])
ax2.set_xlabel('steps')
ax2.set_ylabel('cost')
![costs vs. steps](https://img-blog.csdn.net/20171208110555237?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvdTAxMjg0MTkyMg==/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
Testing
# Load the test CSV through the same prepareData function used for the training file.
test_data = prepareData('titanic/test.csv')
# Preview the first five prepared rows (shown as the cell output in a notebook).
test_data.head(5)
|  | ones | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 892 | 3 | Kelly, Mr. James | 1 | 0.452723 | 0 | 0 | 330911 | 0.015282 | NaN | 3 |
| 1 | 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | 0 | 0.617566 | 1 | 0 | 363272 | 0.013663 | NaN | 1 |
| 2 | 1 | 894 | 2 | Myles, Mr. Thomas Francis | 1 | 0.815377 | 0 | 0 | 240276 | 0.018909 | NaN | 3 |
| 3 | 1 | 895 | 3 | Wirz, Mr. Albert | 1 | 0.353818 | 0 | 0 | 315154 | 0.016908 | NaN | 1 |
| 4 | 1 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | 0 | 0.287881 | 1 | 1 | 3101298 | 0.023984 | NaN | 1 |
# Keep the same feature columns, in the same order, as the training matrix.
test_X = test_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1).values
# Predict 1 when the sigmoid output reaches 0.5, otherwise 0.
Y_hat = [
    1 if 1/(1+numpy.exp(-numpy.dot(theta, x))) >= 0.5 else 0
    for x in test_X
]
results = pandas.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to the two columns PassengerId and Survived;
# the default would prepend the row index as an extra unnamed column.
results.to_csv('titanic/results.csv', index=False)