数据来源：https://www.kaggle.com/c/titanic

Training

import pandas
import numpy
import time
import matplotlib.pyplot as plt
%matplotlib inline

def _minMaxScale(column):
    """Rescale a numeric pandas Series linearly onto [0, 1].

    A column with no spread (constant, or containing only NaN) is returned
    shifted by its minimum instead of being divided by a zero range.
    """
    low = column.min()
    span = column.max() - low
    if not span > 0:
        return column - low
    return (column - low) / span


def prepareData(filename):
    """Load a Titanic CSV and encode its columns as model features.

    Steps applied to the loaded frame:
      * ``Sex`` is mapped to 0 (female) / 1 (male).
      * ``Embarked`` is mapped to 1 (S) / 2 (C) / 3 (Q); missing entries
        receive the most frequent port among the known ones.
      * ``Age`` and ``Fare`` have missing entries replaced by the mean of
        the known values and are then min-max scaled to [0, 1].
      * A constant column ``ones`` (the intercept term) is inserted first.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The processed ``pandas.DataFrame``. Columns not listed above
        (e.g. ``Name``, ``Ticket``, ``Cabin``) are left untouched, so the
        caller decides which ones to drop.
    """
    data = pandas.read_csv(filename)

    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

    embarked = data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
    # Series.mode() skips NaN, so a missing value can never be picked as
    # the "most common port".
    ports = embarked.mode()
    if not ports.empty:
        embarked = embarked.fillna(ports.iloc[0])
    data['Embarked'] = embarked

    # Series.mean() ignores NaN; averaging with the gaps counted as 0 would
    # drag the imputed value towards zero.
    data['Age'] = _minMaxScale(data['Age'].fillna(data['Age'].mean()))
    # Fare can be missing as well; impute it the same way so that no NaN
    # reaches the model.
    data['Fare'] = _minMaxScale(data['Fare'].fillna(data['Fare'].mean()))

    data.insert(0, 'ones', 1)

    return data

def run(X, Y, theta, alpha, steps):
    """Run batch gradient descent and log every iterate to a text file.

    Each iteration subtracts ``alpha * getGradient(X, Y, theta)`` from
    ``theta`` using ``-=`` (for a NumPy array this updates the caller's
    array in place) and records the resulting cost. The file
    'titanic/model.txt' receives a header line followed by one
    comma-separated line per iteration: all theta values, then the cost.

    Args:
        X: Feature rows.
        Y: Labels, one per row of X.
        theta: Initial parameter vector.
        alpha: Learning rate.
        steps: Number of iterations to perform.

    Returns:
        Tuple ``(costs, theta, time_spent)`` where ``costs`` holds the
        initial cost plus one entry per iteration and ``time_spent`` is the
        elapsed wall-clock time in seconds.
    """
    started = time.time()
    history = [getCost(X, Y, theta)]

    with open('titanic/model.txt','w') as log:
        header = ''.join('theta_' + str(index) + ',' for index in range(len(theta)))
        log.write(header + 'cost\n')

        remaining = steps
        while remaining > 0:
            theta -= alpha*getGradient(X, Y, theta)
            current = getCost(X, Y, theta)
            history.append(current)
            row = ''.join(str(value) + ',' for value in theta)
            log.write(row + str(current) + '\n')
            remaining -= 1

    elapsed = time.time() - started
    return history, theta, elapsed

def getGradient(X, Y, theta):
    """Return the gradient of the mean logistic-regression cost.

    Component j equals ``-(1/len(Y)) * sum_i x_ij * (y_i - sigmoid(theta . x_i))``.
    The sigmoid is evaluated once per sample and the sum over samples is a
    single matrix product, instead of being recomputed for every feature.

    Args:
        X: 2-D array-like of feature rows (samples x features).
        Y: 1-D array-like of 0/1 labels, one per row of X.
        theta: 1-D array-like of parameters, one per feature.

    Returns:
        1-D ``numpy.ndarray`` of floats with one entry per parameter.
    """
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    weights = numpy.asarray(theta, dtype=float)

    predicted = 1 / (1 + numpy.exp(-features.dot(weights)))
    # -(1/n) * X^T (y - p) rewritten as X^T (p - y) / n; dividing a float
    # array keeps this a true division on every Python version.
    return features.T.dot(predicted - labels) / len(labels)

def getCost(X, Y, theta):
    """Return the mean negative log-likelihood of the logistic model.

    For scores ``z_i = theta . x_i`` the value is
    ``-(1/len(Y)) * sum_i (y_i * z_i - log(1 + exp(z_i)))``.

    Args:
        X: 2-D array-like of feature rows (samples x features).
        Y: 1-D array-like of 0/1 labels, one per row of X.
        theta: 1-D array-like of parameters, one per feature.

    Returns:
        The cost as a NumPy float scalar.
    """
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    scores = features.dot(numpy.asarray(theta, dtype=float))

    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), but it does
    # not overflow to inf when z is large.
    log_likelihood = numpy.sum(labels * scores - numpy.logaddexp(0, scores))
    return -log_likelihood / len(labels)

def getAccuracy(train_X, train_Y, theta):
    """Return the fraction of samples whose thresholded prediction matches.

    A sample is predicted as 1 when ``sigmoid(theta . x) >= 0.5`` and as 0
    otherwise; predictions are compared pairwise with ``train_Y``.

    Args:
        train_X: Iterable of feature rows.
        train_Y: Iterable of true labels, one per row.
        theta: Parameter vector.

    Returns:
        Number of matching predictions divided by the number of
        predictions, as a float.
    """
    predictions = [
        1 if 1/(1+numpy.exp(-numpy.dot(theta, sample))) >= 0.5 else 0
        for sample in train_X
    ]
    hits = sum(1 for guess, truth in zip(predictions, train_Y) if guess == truth)
    return float(hits)/len(predictions)

# Load and preprocess the training split, then preview its first five rows.
train_data = prepareData('titanic/train.csv')
train_data.head(5)

	ones	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	1	0	3	Braund, Mr. Owen Harris	1	0.271174	1	A/5 21171	0.014151	NaN	1
1	1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th…	0	0.472229	1	PC 17599	0.139136	C85	2
2	1	3	1	3	Heikkinen, Miss. Laina	0	0.321438	0	STON/O2. 3101282	0.015469	NaN	1
3	1	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	0	0.434531	1	113803	0.103644	C123	1
4	1	5	0	3	Allen, Mr. William Henry	1	0.434531	0	373450	0.015713	NaN	1

# Drop the non-numeric / identifier columns once, then split the label
# column off from the feature matrix.
model_frame = train_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
train_Y = model_frame['Survived'].values
train_X = model_frame.drop(['Survived'], axis=1).values

# One starting weight per feature column, drawn uniformly from [0, 1).
theta = numpy.random.random(len(train_X[1]))
alpha, steps = 0.001, 10000

# Fit with gradient descent, then score the fitted weights on the same
# training data.
costs, theta, time_spent = run(train_X, train_Y, theta, alpha, steps)
accuracy = getAccuracy(train_X, train_Y, theta)

fig = plt.figure(figsize=(18,5))

# Left panel: cost after every step of the whole run (costs holds the
# initial cost plus one value per step, hence steps+1 points).
ax1 = fig.add_subplot(121)
ax1.plot(range(steps+1), costs)
ax1.set_title('Logistic Regression for Titanic Problem -- Time spent: %f\nAccuracy: %f' % (time_spent, accuracy))
ax1.set_xlabel('steps')
ax1.set_ylabel('cost')

# Right panel: zoom on the tail of the run. The slice is open-ended so the
# final step is included; a stop index of -1 would silently drop it.
ax2 = fig.add_subplot(122)
ax2.plot(range(steps+1)[-1000:], costs[-1000:])
ax2.set_xlabel('steps')
ax2.set_ylabel('cost')

costs vs. steps

Testing

# Load and preprocess the test split, then preview its first five rows.
test_data = prepareData('titanic/test.csv')
test_data.head(5)

	ones	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	892	3	Kelly, Mr. James	1	0.452723	0	0	330911	0.015282	NaN	3
1	1	893	3	Wilkes, Mrs. James (Ellen Needs)	0	0.617566	1	0	363272	0.013663	NaN	1
2	1	894	2	Myles, Mr. Thomas Francis	1	0.815377	0	0	240276	0.018909	NaN	3
3	1	895	3	Wirz, Mr. Albert	1	0.353818	0	0	315154	0.016908	NaN	1
4	1	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	0	0.287881	1	1	3101298	0.023984	NaN	1

# Build the test feature matrix with the same column drops used for training
# (the test frame has no 'Survived' column to remove).
test_X = test_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1).values

# Same decision rule as getAccuracy: predict 1 when the sigmoid is >= 0.5.
Y_hat = []
for x in test_X:
    y_hat = 1/(1+numpy.exp(-numpy.dot(theta, x)))
    if y_hat >= 0.5:
        Y_hat.append(1)
    else:
        Y_hat.append(0)

results = pandas.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to exactly the PassengerId and Survived columns;
# by default to_csv would prepend the row index as an unnamed first column.
results.to_csv('titanic/results.csv', index=False)

逻辑回归实战 — Kaggle_Titanic

Training

Testing

猜你喜欢