GitHub 地址:https://github.com/fz861062923/Keras
数据来源:http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
查看数据
# Load the Titanic passenger table (titanic3.xls must sit next to this
# notebook) and peek at the first three rows.
import numpy as np
import pandas as pd
df= pd. read_excel( 'titanic3.xls' )
df[ : 3 ]
pclass
survived
name
sex
age
sibsp
parch
ticket
fare
cabin
embarked
boat
body
home.dest
0
1
1
Allen, Miss. Elisabeth Walton
female
29.0000
0
0
24160
211.3375
B5
S
2
NaN
St Louis, MO
1
1
1
Allison, Master. Hudson Trevor
male
0.9167
1
2
113781
151.5500
C22 C26
S
11
NaN
Montreal, PQ / Chesterville, ON
2
1
0
Allison, Miss. Helen Loraine
female
2.0000
1
2
113781
151.5500
C22 C26
S
NaN
NaN
Montreal, PQ / Chesterville, ON
去除与survived无关的变量(eg.ticket,cabin)
# Keep only the label ('survived') and candidate feature columns — this
# drops ticket, cabin, boat, body and home.dest, which are not used as
# predictors here.
cols= [ 'survived' , 'name' , 'pclass' , 'sex' , 'age' , 'sibsp' , 'parch' , 'fare' , 'embarked' ]
last_df= df[ cols]
last_df[ : 2 ]
survived
name
pclass
sex
age
sibsp
parch
fare
embarked
0
1
Allen, Miss. Elisabeth Walton
1
female
29.0000
0
0
211.3375
S
1
1
Allison, Master. Hudson Trevor
1
male
0.9167
1
2
151.5500
S
数据预处理
# Drop the free-text name column — it is not a usable numeric feature.
last_df= last_df. drop( [ 'name' ] , axis= 1 )
last_df[ : 2 ]
survived
pclass
sex
age
sibsp
parch
fare
embarked
0
1
1
female
29.0000
0
0
211.3375
S
1
1
1
male
0.9167
1
2
151.5500
S
last_df. describe( )
survived
pclass
age
sibsp
parch
fare
count
1309.000000
1309.000000
1046.000000
1309.000000
1309.000000
1308.000000
mean
0.381971
2.294882
29.881135
0.498854
0.385027
33.295479
std
0.486055
0.837836
14.413500
1.041658
0.865560
51.758668
min
0.000000
1.000000
0.166700
0.000000
0.000000
0.000000
25%
0.000000
2.000000
21.000000
0.000000
0.000000
7.895800
50%
0.000000
3.000000
28.000000
0.000000
0.000000
14.454200
75%
1.000000
3.000000
39.000000
1.000000
0.000000
31.275000
max
1.000000
3.000000
80.000000
8.000000
9.000000
512.329200
last_df. dtypes
survived int64
pclass int64
sex object
age float64
sibsp int64
parch int64
fare float64
embarked object
dtype: object
# Impute missing values with the column mean: age has 263 NaNs and fare
# has 1 (see the describe() counts above: 1046 and 1308 of 1309 rows).
age_mean= last_df[ 'age' ] . mean( )
last_df[ 'age' ] = last_df[ 'age' ] . fillna( age_mean)
fare_mean= last_df[ 'fare' ] . mean( )
last_df[ 'fare' ] = last_df[ 'fare' ] . fillna( fare_mean)
# Encode sex numerically: female -> 0, male -> 1 (any other value would
# map to NaN).
last_df[ 'sex' ] = last_df[ 'sex' ] . map ( { 'female' : 0 , 'male' : 1 } )
last_df
survived
pclass
sex
age
sibsp
parch
fare
embarked
0
1
1
0
29.000000
0
0
211.3375
S
1
1
1
1
0.916700
1
2
151.5500
S
2
0
1
0
2.000000
1
2
151.5500
S
3
0
1
1
30.000000
1
2
151.5500
S
4
0
1
0
25.000000
1
2
151.5500
S
5
1
1
1
48.000000
0
0
26.5500
S
6
1
1
0
63.000000
1
0
77.9583
S
7
0
1
1
39.000000
0
0
0.0000
S
8
1
1
0
53.000000
2
0
51.4792
S
9
0
1
1
71.000000
0
0
49.5042
C
10
0
1
1
47.000000
1
0
227.5250
C
11
1
1
0
18.000000
1
0
227.5250
C
12
1
1
0
24.000000
0
0
69.3000
C
13
1
1
0
26.000000
0
0
78.8500
S
14
1
1
1
80.000000
0
0
30.0000
S
15
0
1
1
29.881135
0
0
25.9250
S
16
0
1
1
24.000000
0
1
247.5208
C
17
1
1
0
50.000000
0
1
247.5208
C
18
1
1
0
32.000000
0
0
76.2917
C
19
0
1
1
36.000000
0
0
75.2417
C
20
1
1
1
37.000000
1
1
52.5542
S
21
1
1
0
47.000000
1
1
52.5542
S
22
1
1
1
26.000000
0
0
30.0000
C
23
1
1
0
42.000000
0
0
227.5250
C
24
1
1
0
29.000000
0
0
221.7792
S
25
0
1
1
25.000000
0
0
26.0000
C
26
1
1
1
25.000000
1
0
91.0792
C
27
1
1
0
19.000000
1
0
91.0792
C
28
1
1
0
35.000000
0
0
135.6333
S
29
1
1
1
28.000000
0
0
26.5500
S
...
...
...
...
...
...
...
...
...
1279
0
3
0
14.000000
0
0
7.8542
S
1280
0
3
1
22.000000
0
0
7.8958
S
1281
0
3
1
22.000000
0
0
9.0000
S
1282
0
3
1
29.881135
0
0
8.0500
S
1283
0
3
1
29.881135
0
0
7.5500
S
1284
0
3
1
29.881135
0
0
8.0500
S
1285
0
3
1
32.500000
0
0
9.5000
S
1286
1
3
0
38.000000
0
0
7.2292
C
1287
0
3
1
51.000000
0
0
7.7500
S
1288
0
3
1
18.000000
1
0
6.4958
S
1289
0
3
1
21.000000
1
0
6.4958
S
1290
1
3
0
47.000000
1
0
7.0000
S
1291
0
3
1
29.881135
0
0
8.7125
S
1292
0
3
1
29.881135
0
0
7.5500
S
1293
0
3
1
29.881135
0
0
8.0500
S
1294
0
3
1
28.500000
0
0
16.1000
S
1295
0
3
1
21.000000
0
0
7.2500
S
1296
0
3
1
27.000000
0
0
8.6625
S
1297
0
3
1
29.881135
0
0
7.2500
S
1298
0
3
1
36.000000
0
0
9.5000
S
1299
0
3
1
27.000000
1
0
14.4542
C
1300
1
3
0
15.000000
1
0
14.4542
C
1301
0
3
1
45.500000
0
0
7.2250
C
1302
0
3
1
29.881135
0
0
7.2250
C
1303
0
3
1
29.881135
0
0
14.4583
C
1304
0
3
0
14.500000
1
0
14.4542
C
1305
0
3
0
29.881135
1
0
14.4542
C
1306
0
3
1
26.500000
0
0
7.2250
C
1307
0
3
1
27.000000
0
0
7.2250
C
1308
0
3
1
29.000000
0
0
7.8750
S
1309 rows × 8 columns
# One-hot encode the embarkation port (C/Q/S) into embarked_* columns.
last2_df= pd. get_dummies( data= last_df, columns= [ 'embarked' ] )
last2_df[ : 2 ]
survived
pclass
sex
age
sibsp
parch
fare
embarked_C
embarked_Q
embarked_S
0
1
1
0
29.0000
0
0
211.3375
0
0
1
1
1
1
1
0.9167
1
2
151.5500
0
0
1
DF转换为array
array= last2_df. values
array. shape
(1309, 10)
array[ : 1 ]
array([[ 1. , 1. , 0. , 29. , 0. , 0. ,
211.3375, 0. , 0. , 1. ]])
第一个字段为label,后面的特征全部可以定义为feature
# First column of the array is the label (survived); the remaining nine
# columns are the model features.
label= array[ : , 0 ]
feature= array[ : , 1 : ]
label
array([1., 1., 0., ..., 0., 0., 0.])
feature
array([[ 1. , 0. , 29. , ..., 0. , 0. , 1. ],
[ 1. , 1. , 0.9167, ..., 0. , 0. , 1. ],
[ 1. , 0. , 2. , ..., 0. , 0. , 1. ],
...,
[ 3. , 1. , 26.5 , ..., 1. , 0. , 0. ],
[ 3. , 1. , 27. , ..., 1. , 0. , 0. ],
[ 3. , 1. , 29. , ..., 0. , 0. , 1. ]])
这里有个问题年龄,运费fare,sex之间的数值差异十分大
特征字段进行标准化
# Rescale every feature column to [0, 1] so wide-ranged columns such as
# age and fare do not dominate the small-ranged ones (sex, pclass, ...).
from sklearn import preprocessing
scale= preprocessing. MinMaxScaler( feature_range= ( 0 , 1 ) )
last_feature= scale. fit_transform( feature)
last_feature
array([[0. , 0. , 0.36116884, ..., 0. , 0. ,
1. ],
[0. , 1. , 0.00939458, ..., 0. , 0. ,
1. ],
[0. , 0. , 0.0229641 , ..., 0. , 0. ,
1. ],
...,
[1. , 1. , 0.32985358, ..., 1. , 0. ,
0. ],
[1. , 1. , 0.33611663, ..., 1. , 0. ,
0. ],
[1. , 1. , 0.36116884, ..., 0. , 0. ,
1. ]])
金星:完美·
将数据分为train和test两部分
# Re-select the raw columns (name is dropped later inside preprocess).
cols= [ 'survived' , 'name' , 'pclass' , 'sex' , 'age' , 'sibsp' , 'parch' , 'fare' , 'embarked' ]
_df= df[ cols]
# Random ~80/20 row mask for the train/test split.
# NOTE(review): no np.random.seed is set, so this split is not reproducible.
msk= np. random. rand( len ( _df) ) < 0.8
train= _df[ msk]
test= _df[ ~ msk]
len ( train)
1071
len ( test)
238
len ( df)
1309
len ( df) == len ( train) + len ( test)
True
将上面预处理数据的过程写成一个完整的函数
from sklearn import preprocessing

# Fixed output layout: label column first, then the feature columns in the
# exact order the model expects.
_EMBARKED_DUMMIES = ['embarked_C', 'embarked_Q', 'embarked_S']
_FEATURE_COLUMNS = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare'] + _EMBARKED_DUMMIES


def preprocess(df):
    """Turn a raw Titanic DataFrame into (Feature, Label) numpy arrays.

    df: DataFrame with columns ['survived', 'name', 'pclass', 'sex',
        'age', 'sibsp', 'parch', 'fare', 'embarked'].

    Returns (Feature, Label): Feature is min-max scaled to [0, 1]
    column-wise; Label is the 'survived' column.

    NOTE(review): the scaler and the age/fare means are re-fit on every
    call, so train and test end up scaled with different statistics —
    confirm this is acceptable, or fit once on train and reuse.
    """
    df = df.drop(['name'], axis=1)
    # Impute missing ages/fares with the mean of *this* split.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    df['sex'] = df['sex'].map({'female': 0, 'male': 1})
    df = pd.get_dummies(data=df, columns=['embarked'])
    # BUG FIX: get_dummies only emits columns for categories present in
    # this split. If a split happens to lack one port, train and test
    # feature matrices would misalign column-wise. Add any missing port
    # column as all-zero and pin an explicit column order.
    for col in _EMBARKED_DUMMIES:
        if col not in df.columns:
            df[col] = 0
    df = df[['survived'] + _FEATURE_COLUMNS]
    array = df.values
    Label = array[:, 0]
    feature = array[:, 1:]
    scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    Feature = scale.fit_transform(feature)
    return Feature, Label
# Build model-ready arrays for each split. NOTE(review): preprocess fits
# its scaler per call, so train and test use different scaling statistics.
train_feature, train_label= preprocess( train)
test_feature, test_label= preprocess( test)
train_feature
array([[0. , 1. , 0.01015802, ..., 0. , 0. ,
1. ],
[0. , 0. , 0.02483026, ..., 0. , 0. ,
1. ],
[0. , 1. , 0.40406294, ..., 0. , 0. ,
1. ],
...,
[1. , 0. , 0.39950884, ..., 1. , 0. ,
0. ],
[1. , 1. , 0.35665885, ..., 1. , 0. ,
0. ],
[1. , 1. , 0.39051891, ..., 0. , 0. ,
1. ]])
train_label
array([1., 0., 0., ..., 0., 0., 0.])
建立多层感知机模型
from keras. models import Sequential
from keras. layers import Dense, Dropout
C:\Users\admin\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Using TensorFlow backend.
model= Sequential( )
建立隐藏层1
# Hidden layer 1: 40 ReLU units; input_dim=9 matches the nine feature
# columns produced by preprocess (pclass, sex, age, sibsp, parch, fare,
# embarked_C/Q/S).
model. add( Dense( units= 40 , input_dim= 9 ,
kernel_initializer= 'uniform' ,
activation= 'relu' ) )
建立隐藏层2
# Hidden layer 2: 30 ReLU units.
model. add( Dense( units= 30 , kernel_initializer= 'uniform' ,
activation= 'relu' ) )
建立输出层
# Output layer: one sigmoid unit -> survival probability in [0, 1].
model. add( Dense( units= 1 , kernel_initializer= 'uniform' ,
activation= 'sigmoid' ) )
开始训练
# Binary cross-entropy loss with Adam, tracking accuracy; train for 30
# epochs in mini-batches of 30, holding out 10% of the training rows for
# validation.
model. compile ( loss= 'binary_crossentropy' ,
optimizer= 'adam' , metrics= [ 'accuracy' ] )
train_history= model. fit( train_feature, train_label, validation_split= 0.1 ,
epochs= 30 ,
batch_size= 30 ,
verbose= 2 )
Train on 963 samples, validate on 108 samples
Epoch 1/30
- 1s - loss: 0.6886 - acc: 0.5971 - val_loss: 0.6607 - val_acc: 0.7963
Epoch 2/30
- 0s - loss: 0.6632 - acc: 0.6199 - val_loss: 0.5776 - val_acc: 0.7963
Epoch 3/30
- 0s - loss: 0.6057 - acc: 0.6615 - val_loss: 0.4881 - val_acc: 0.8148
Epoch 4/30
- 0s - loss: 0.5522 - acc: 0.7549 - val_loss: 0.4740 - val_acc: 0.7685
Epoch 5/30
- 0s - loss: 0.5153 - acc: 0.7695 - val_loss: 0.4613 - val_acc: 0.7778
Epoch 6/30
- 0s - loss: 0.5006 - acc: 0.7726 - val_loss: 0.4621 - val_acc: 0.7778
Epoch 7/30
- 0s - loss: 0.4930 - acc: 0.7726 - val_loss: 0.4668 - val_acc: 0.7778
Epoch 8/30
- 0s - loss: 0.4873 - acc: 0.7705 - val_loss: 0.4570 - val_acc: 0.7778
Epoch 9/30
- 0s - loss: 0.4816 - acc: 0.7767 - val_loss: 0.4726 - val_acc: 0.7778
Epoch 10/30
- 0s - loss: 0.4805 - acc: 0.7757 - val_loss: 0.4551 - val_acc: 0.7778
Epoch 11/30
- 0s - loss: 0.4751 - acc: 0.7767 - val_loss: 0.4482 - val_acc: 0.7778
Epoch 12/30
- 0s - loss: 0.4728 - acc: 0.7819 - val_loss: 0.4719 - val_acc: 0.7778
Epoch 13/30
- 0s - loss: 0.4733 - acc: 0.7767 - val_loss: 0.4423 - val_acc: 0.7963
Epoch 14/30
- 0s - loss: 0.4688 - acc: 0.7747 - val_loss: 0.4606 - val_acc: 0.7778
Epoch 15/30
- 0s - loss: 0.4743 - acc: 0.7653 - val_loss: 0.4357 - val_acc: 0.8241
Epoch 16/30
- 0s - loss: 0.4660 - acc: 0.7882 - val_loss: 0.4416 - val_acc: 0.7963
Epoch 17/30
- 0s - loss: 0.4648 - acc: 0.7799 - val_loss: 0.4356 - val_acc: 0.8241
Epoch 18/30
- 0s - loss: 0.4641 - acc: 0.7850 - val_loss: 0.4331 - val_acc: 0.8148
Epoch 19/30
- 0s - loss: 0.4648 - acc: 0.7934 - val_loss: 0.4429 - val_acc: 0.8056
Epoch 20/30
- 0s - loss: 0.4608 - acc: 0.7830 - val_loss: 0.4356 - val_acc: 0.8241
Epoch 21/30
- 0s - loss: 0.4640 - acc: 0.7902 - val_loss: 0.4362 - val_acc: 0.8241
Epoch 22/30
- 0s - loss: 0.4579 - acc: 0.7923 - val_loss: 0.4422 - val_acc: 0.8148
Epoch 23/30
- 0s - loss: 0.4587 - acc: 0.7882 - val_loss: 0.4333 - val_acc: 0.8241
Epoch 24/30
- 0s - loss: 0.4573 - acc: 0.7861 - val_loss: 0.4327 - val_acc: 0.8241
Epoch 25/30
- 0s - loss: 0.4559 - acc: 0.7871 - val_loss: 0.4326 - val_acc: 0.8333
Epoch 26/30
- 0s - loss: 0.4549 - acc: 0.7913 - val_loss: 0.4368 - val_acc: 0.8148
Epoch 27/30
- 0s - loss: 0.4544 - acc: 0.7913 - val_loss: 0.4310 - val_acc: 0.8241
Epoch 28/30
- 0s - loss: 0.4559 - acc: 0.7850 - val_loss: 0.4299 - val_acc: 0.8241
Epoch 29/30
- 0s - loss: 0.4533 - acc: 0.7923 - val_loss: 0.4324 - val_acc: 0.8241
Epoch 30/30
- 1s - loss: 0.4544 - acc: 0.7944 - val_loss: 0.4299 - val_acc: 0.8333
可视化
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    """Plot one training metric and its validation counterpart per epoch.

    train_history: the History object returned by `model.fit`.
    train: key of the training-set series in `train_history.history`
        (e.g. 'acc' or 'loss'); also used as the y-axis label.
    validation: key of the matching validation series (e.g. 'val_acc').
    """
    history = train_history.history
    for series_key in (train, validation):
        plt.plot(history[series_key])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
show_train_history( train_history, 'acc' , 'val_acc' )
show_train_history( train_history, 'loss' , 'val_loss' )
评估模型准确率
scores = model. evaluate( test_feature, test_label)
238/238 [==============================] - 0s 67us/step
scores[ 1 ]
0.8319327736101231
做一个有趣的实验
模拟出Jack和Rose两个电影人物,比较他们的生存率
创建数据
# Build two hypothetical passengers (the movie characters) in the same
# column layout as the training data so they can be run through
# `preprocess` together with the real rows.
passenger_cols = ['survived', 'name', 'pclass', 'sex', 'age',
                  'sibsp', 'parch', 'fare', 'embarked']
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.0000, 'S'])
new_df = pd.DataFrame([Jack.tolist(), Rose.tolist()], columns=passenger_cols)
加入数据到DF中
# Stack the two synthetic passengers under the full passenger table.
# NOTE(review): their indices (0 and 1) duplicate existing row labels;
# pass ignore_index=True if label-based lookup is needed later.
df= pd. concat( [ _df, new_df] , sort= False )
df[ - 4 : ]
survived
name
pclass
sex
age
sibsp
parch
fare
embarked
1307
0
Zakarian, Mr. Ortin
3
male
27.0
0
0
7.225
C
1308
0
Zimmerman, Mr. Leo
3
male
29.0
0
0
7.875
S
0
0
Jack
3
male
23.0
1
0
5.000
S
1
1
Rose
1
female
20.0
1
0
100.000
S
进行预测
# Re-run the shared preprocessing on the combined frame, then predict a
# survival probability per row. NOTE(review): the MinMax scaler is re-fit
# on this combined data, so scaling differs slightly from training.
Feature, Label= preprocess( df)
probability= model. predict( Feature)
probability[ : 10 ]
array([[0.9776822 ],
[0.54482657],
[0.9709371 ],
[0.36197898],
[0.969016 ],
[0.2566388 ],
[0.9287646 ],
[0.2852625 ],
[0.9162269 ],
[0.27310023]], dtype=float32)
# WARNING: this rebinds `pd` — until now the pandas module — to the
# DataFrame. Any later pandas call such as pd.Series/pd.read_excel would
# break; rename this variable (e.g. result_df) when refactoring.
pd= df
# Append the predicted survival probability as the last column (in place).
pd. insert( len ( df. columns) , 'probability' , probability)
pd[ - 2 : ]
survived
name
pclass
sex
age
sibsp
parch
fare
embarked
probability
0
0
Jack
3
male
23.0
1
0
5.0
S
0.132897
1
1
Rose
1
female
20.0
1
0
100.0
S
0.964962
可以看出Rose的存活率的确远远大于Jack。
pd[ ( pd[ 'survived' ] == 0 ) & ( pd[ 'probability' ] > 0.9 ) ]
survived
name
pclass
sex
age
sibsp
parch
fare
embarked
probability
2
0
Allison, Miss. Helen Loraine
1
female
2.0
1
2
151.5500
S
0.970937
4
0
Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
1
female
25.0
1
2
151.5500
S
0.969016
105
0
Evans, Miss. Edith Corse
1
female
36.0
0
0
31.6792
C
0.970464
169
0
Isham, Miss. Ann Elizabeth
1
female
50.0
0
0
28.7125
C
0.967463
286
0
Straus, Mrs. Isidor (Rosalie Ida Blun)
1
female
63.0
1
0
221.7792
S
0.959756