This post is my personal summary of a hands-on run at the Titanic starter competition on Kaggle.
The analysis is the result of studying several top-voted Kaggle kernels and adding my own understanding.
Its highlights are the data-exploration phase and the later model tuning; it was the first time I saw how beautiful the exploratory plots drawn with seaborn and the plotly go package can be.
Without further ado, let's begin.
0 Introduction
For a full description of this competition and its data, see the Kaggle page; it is quite complete. The one thing worth mentioning is that the variable descriptions there are in English, and since my English is shaky, I always take care to work out exactly what each variable means before starting a Kaggle case.
The competition page on Kaggle: https://www.kaggle.com/c/titanic
1 Data Exploration
Exploration usually happens twice: once on the raw data, and again after preprocessing.
First pass
1.1 Overall Summary
First, read the data.
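The snippets below assume the usual notebook imports, which the original post does not show; this is a sketch of what is needed:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py  # assumption: offline plotly mode for the interactive plots later on
py.init_notebook_mode(connected=True)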
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
Save a basic description of the data to a CSV file, since Jupyter Notebook does not display it in full.
explore = train.describe(include='all').T
explore['null'] = len(train) - explore['count']
explore.insert(0, 'dtype', train.dtypes)
explore.T.to_csv('explore1.csv')
Because the overall data volume is small, concatenate the training and test sets and explore them together.
alldata = pd.concat([train.loc[:, 'Pclass':'Embarked'], test.loc[:, 'Pclass':'Embarked']]).reset_index(drop=True)
explore = alldata.describe(include='all').T
explore['null'] = len(alldata) - explore['count']
explore.insert(0, 'dtype', alldata.dtypes)
explore.T.to_csv('explore2.csv')
1.2 Data Quality Analysis
1.2.1 Missing Values
Define a helper that summarizes the missing values.
def missing_values(alldata):
    alldata_na = pd.DataFrame(alldata.isnull().sum(), columns=['missingNum'])
    alldata_na['missingRatio'] = alldata_na['missingNum'] / len(alldata) * 100
    alldata_na['existNum'] = len(alldata) - alldata_na['missingNum']
    alldata_na['train_notna'] = len(train) - train.isnull().sum()
    alldata_na['test_notna'] = alldata_na['existNum'] - alldata_na['train_notna']
    alldata_na['dtype'] = alldata.dtypes
    alldata_na = alldata_na[alldata_na['missingNum'] > 0].reset_index().sort_values(
        by=['missingNum', 'index'], ascending=[False, True])
    alldata_na.set_index('index', inplace=True)
    return alldata_na

alldata_na = missing_values(alldata)
alldata_na
1.2.2 Duplicates
train[train.duplicated() == True]  # check for duplicated rows
The result is empty, so there are no duplicate rows.
1.2.3 Outliers
Common checks:
1. Simple summary statistics;
2. The 3-sigma rule: flag values whose deviation from the mean exceeds three standard deviations;
3. Box-plot analysis: flag values outside [QL - 1.5*IQR, QU + 1.5*IQR], where QL and QU are the lower and upper quartiles and IQR = QU - QL. (A sketch of checks 2 and 3 follows below.)
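A minimal sketch of the latter two checks applied to the Fare column (an illustration, not part of the original analysis):

fare = train['Fare']
# 3-sigma rule: flag points more than 3 standard deviations from the mean
sigma_outliers = train[(fare - fare.mean()).abs() > 3 * fare.std()]
# box-plot rule: flag points outside [QL - 1.5*IQR, QU + 1.5*IQR]
ql, qu = fare.quantile(0.25), fare.quantile(0.75)
iqr = qu - ql
iqr_outliers = train[(fare < ql - 1.5 * iqr) | (fare > qu + 1.5 * iqr)]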
train.boxplot()
plt.ylim(0, 1000)
train[train.Fare>400]
# All of these passengers survived, so we can tentatively infer that a higher Fare means a higher survival rate; no obvious outliers so far
1.3 Feature Analysis
1.3.1 Distribution Analysis
fig, ax = plt.subplots(2, 2, figsize=(8, 6))
sns.countplot(x='Pclass', data=train, ax=ax[0, 1])
sns.countplot(x='Survived', data=train, ax=ax[1, 1])
sns.violinplot(x='Survived', y='Age', data=train, ax=ax[1, 0]).set(ylim=(-10, 80))
sns.countplot(x='Embarked', data=train, ax=ax[0, 0])
plt.tight_layout()
plt.hist(x=[train[train['Survived'] == 1]['Pclass'], train[train['Survived'] == 0]['Pclass']],
         stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
plt.title('Pclass Histogram by Survival')
plt.xlabel('Passenger Class')
plt.xticks(train['Pclass'].value_counts().index)
plt.ylabel('# of Passengers')
plt.legend()
# Survival by class and sex
h = sns.FacetGrid(train, row='Sex', col='Pclass', hue='Survived')
h.map(plt.hist, 'Age', alpha=.75)
h.add_legend()
# Survival by age
a = sns.FacetGrid(train, hue='Survived', aspect=4)
a.map(sns.kdeplot, 'Age', shade=True)
a.set(xlim=(0, train['Age'].max()))
a.add_legend()
# Age distribution within each class
plt.figure(figsize=(8, 6))
sns.kdeplot(train[(train['Pclass'] == 1) & (train['Age'].notnull())]['Age'], shade=True).set(ylim=(0, 0.045))
sns.kdeplot(train[(train['Pclass'] == 2) & (train['Age'].notnull())]['Age'], shade=True)
sns.kdeplot(train[(train['Pclass'] == 3) & (train['Age'].notnull())]['Age'], shade=True)
plt.legend(['1st class', '2nd class', '3rd class'], loc='best')
# Effect of a present vs. missing 'Cabin' value on survival
Survived_cabin = train.Survived[pd.notnull(train.Cabin)].value_counts()
Survived_nocabin = train.Survived[pd.isnull(train.Cabin)].value_counts()
df = pd.DataFrame({'with cabin': Survived_cabin, 'no cabin': Survived_nocabin}).transpose()
df.plot(kind='bar')
plt.xticks(rotation=0)
# Survival by port of embarkation
sns.countplot(x='Embarked', hue='Survived', data=train)
fig = plt.figure(figsize=(10, 8))
fig.set(alpha=0.2)  # figure alpha
plt.subplot2grid((2, 3), (0, 0))
sns.countplot(x='Survived', data=train)
plt.subplot2grid((2, 3), (0, 1))
sns.countplot(x='Pclass', hue='Survived', data=train)
plt.subplot2grid((2, 3), (0, 2))
sns.violinplot(x='Survived', y='Age', data=train)
plt.subplot2grid((2, 3), (1, 0), colspan=2)
sns.kdeplot(train[(train['Pclass'] == 1) & (train['Age'].notnull())]['Age'], shade=True)
sns.kdeplot(train[(train['Pclass'] == 2) & (train['Age'].notnull())]['Age'], shade=True)
sns.kdeplot(train[(train['Pclass'] == 3) & (train['Age'].notnull())]['Age'], shade=True)
plt.legend(('1st class', '2nd class', '3rd class'), loc='best')
plt.ylabel('Density', fontsize=22)
plt.title('Age distribution by passenger class')
plt.ylim(0, 0.05)
plt.subplot2grid((2, 3), (1, 2))  # the original drew an Embarked countplot here and then overwrote it with the Sex plot
sns.countplot(x='Sex', hue='Survived', data=train)
plt.tight_layout()
plt.show()
# Mean age by sex and survival
sns.factorplot(x='Sex', y='Age', hue='Survived', estimator=np.mean, data=train, size=3, aspect=1.4)
# Age histograms by sex and survival
h = sns.FacetGrid(train, row='Sex', hue='Survived')
h.map(plt.hist, 'Age', alpha=.75)
h.add_legend()
plt.figure(figsize=[8, 4])
plt.subplot(121)
plt.boxplot(x=train['Fare'], showmeans=True, meanline=True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')
plt.subplot(122)
plt.hist(x=[train[train['Survived'] == 1]['Fare'], train[train['Survived'] == 0]['Fare']],
         stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()
# Lower Fare corresponds to a lower survival rate
1.3.2 Correlation Analysis
# corrmat = train.corr()
plt.subplots(figsize=(10, 8))
corrmat = train[train.columns[1:]].corr()
sns.set(font_scale=1.25)
hm = sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10})
plt.title('Pearson Correlation of Features', y=1.05, size=15)
plt.show()
# From the heatmap we can tentatively see:
# (1) Pclass is negatively correlated with Survived and Fare: the larger the Pclass (i.e. the lower the class), the lower the Fare and the smaller the chance of survival;
# (2) passengers in larger Pclass values (lower classes) tend to be younger;
# (3) Fare has a positive effect on Survived.
2 Data Preprocessing
2.1 Data Cleaning
Handle the missing values found in the exploration step (1_dataExploration) as follows.
# Fare: fill with the mean (the median is another option)
alldata['Fare'] = alldata['Fare'].fillna(alldata['Fare'].mean())
# Embarked: fill with the mode
alldata['Embarked'] = alldata['Embarked'].fillna(alldata['Embarked'].mode()[0])
# Age: impute the missing values with a random forest
from sklearn.ensemble import RandomForestRegressor

### Use a RandomForestRegressor to fill in the missing ages
def set_missing_ages(df):
    # take the numeric features and feed them to the regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target age
    y = known_age[:, 0]
    # X is the feature matrix
    X = known_age[:, 1:]
    # fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=10, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # fill the missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

alldata, rfr = set_missing_ages(alldata)
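A quick sanity check after the imputation (an illustrative line, not in the original post):

alldata['Age'].isnull().sum()  # expect 0 once the predicted ages are filled in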
# Cabin
alldata['Cabin'].value_counts()
# The raw Cabin values show no obvious pattern, but the leading letter can be extracted to build a new feature
2.2 Data Transformation
2.2.1 Feature Construction
# 1. Cabin
alldata['CabinHead'] = alldata['Cabin'].str[0]  # equivalent to alldata['Cabin'].str.extract('(\S{1})')
alldata['CabinHead'] = alldata['CabinHead'].fillna('None')
alldata['CabinHead'].value_counts().sort_index()
train['CabinHead'] = train['Cabin'].str.extract('(\S{1})')
train['CabinHead'] = train['CabinHead'].fillna('None')
a = train['CabinHead'].value_counts().sort_index()
a.name = 'Cabinnum'
b = train.groupby(['CabinHead'])['Survived'].sum().sort_index()
b.name = 'Survivednum'
c = pd.concat([a, b], axis=1)
c['survivedRate'] = c['Survivednum'] / c['Cabinnum'] * 100
c
# The table shows that CabinHead values B, D and E have noticeably higher survival rates, so create a new feature from them
alldata['CabinAlpha'] = (alldata['CabinHead'].isin(['B','D','E']))*1
Create another feature from whether Cabin is present at all:
alldata['NullCabin'] = (alldata['Cabin'].notnull()) * 1
# 2. Family-related features
alldata['NoSibSp'] = (alldata['SibSp'] <= 0) * 1
alldata['NoParch'] = (alldata['Parch'] <= 0) * 1
alldata['Family'] = alldata['SibSp'] + alldata['Parch'] + 1
alldata['isAlone'] = (alldata['Family'] == 1) * 1
# 3. The Ticket feature
Ticket = pd.DataFrame(alldata['Ticket'].value_counts())
Ticket.columns = ['PN']
Ticket
# The Ticket value counts above contain many repeated values, presumably group-purchased tickets,
# so each passenger's individual fare should be the ticket's Fare divided by the number of passengers sharing it
# Build each passenger's real fare
alldata1 = pd.merge(alldata, Ticket, left_on='Ticket', right_index=True)
alldata1['realFare'] = alldata1['Fare'] / alldata1['PN']
# 4. Name: build a new feature from the title contained in the name
alldata1['Title'] = alldata1['Name'].str.split(r", |\.", expand=True)[1]
# 'Ms' and 'Mlle' both mean Miss, and 'Mme' is the same as Mrs, so unify them
alldata1.loc[alldata1['Title'].isin(['Ms', 'Mlle']), 'Title'] = 'Miss'
alldata1.loc[alldata1['Title'].isin(['Mme']), 'Title'] = 'Mrs'
alldata1['mother'] = ((alldata1['Sex'] == 'female') & (alldata1['Parch'] > 0)
                      & (alldata1['Age'] >= 16) & (alldata1['Title'] == 'Mrs')) * 1
# Titles occurring at least stat_min times stay as their own category; the rest are grouped as 'Misc'
stat_min = 10
title_names = (alldata1['Title'].value_counts() < stat_min)
alldata1['Title'] = alldata1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
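A quick look at the resulting title distribution (an illustrative check, not in the original post):

alldata1['Title'].value_counts()  # expect Mr, Miss, Mrs, Master plus a small 'Misc' bucket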
2.2.2 Discretizing Continuous Features
# qcut bins by quantiles; cut bins by equal-width intervals
alldata1['FareBin'] = pd.qcut(alldata1['realFare'], 4, labels=['low', 'normal', 'middle', 'high'])
alldata1['AgeBin'] = pd.cut(alldata1['Age'].astype(int), 5, labels=['youth', 'youngAdult', 'middle', 'senior', 'old'])
alldata2 = alldata1.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'PN'], axis=1)
Save the processed data:
alldata2 = alldata2.sort_index()
alldata2.to_excel('1_afterdataprocessing.xlsx')
2.2.3 Binary Encoding
# Method 1: dummy variables
typeList = ['Pclass', 'Sex', 'Embarked', 'CabinHead', 'Title', 'FareBin', 'AgeBin']
A = pd.concat([pd.get_dummies(alldata2[i], prefix=i) for i in typeList], axis=1)
B = alldata2[['NoSibSp', 'NoParch', 'NullCabin', 'CabinAlpha', 'Family', 'isAlone', 'mother', 'Age', 'realFare']]
alldata3 = pd.concat([A, B], axis=1)
alldata3.head()

# Method 2: label encoding (an alternative; alldata5 here would be a copy of alldata2)
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# label = LabelEncoder()
# alldata5['Sex'] = label.fit_transform(alldata2['Sex'])
# alldata5['Embarked'] = label.fit_transform(alldata2['Embarked'])
# alldata5['Title'] = label.fit_transform(alldata2['Title'])
# alldata5['AgeBin'] = label.fit_transform(alldata2['AgeBin'])
# alldata5['FareBin'] = label.fit_transform(alldata2['FareBin'])
# alldata5['CabinHead'] = label.fit_transform(alldata2['CabinHead'])
alldata3.columns
Output:
Index(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'CabinHead_A', 'CabinHead_B',
       'CabinHead_C', 'CabinHead_D', 'CabinHead_E', 'CabinHead_F',
       'CabinHead_G', 'CabinHead_None', 'CabinHead_T', 'Title_Master',
       'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'FareBin_low',
       'FareBin_normal', 'FareBin_middle', 'FareBin_high', 'AgeBin_youth',
       'AgeBin_youngAdult', 'AgeBin_middle', 'AgeBin_senior', 'AgeBin_old',
       'NoSibSp', 'NoParch', 'NullCabin', 'CabinAlpha', 'Family', 'isAlone',
       'mother', 'Age', 'realFare'],
      dtype='object')
X_train = alldata3[:len(train)]
X_test = alldata3[len(train):]
y_train = train['Survived']  # the label comes from train; train1 is only built in the next section
Save the data for modeling:
alldata3.to_excel('2_afterDataPreprocessing.xlsx')
3 Second-Pass Data Exploration
train1 = pd.concat([alldata2.iloc[:len(train), :], train[['Survived']]], axis=1)
for x in train1.columns[:-1]:
    if train1[x].dtype != 'float64':
        print('Survival Correlation by:', x)
        print(train1[[x, 'Survived']].groupby(x, as_index=False).mean())
        print('-' * 10, '\n')

print(pd.crosstab(train1['Title'], train1['Survived']))
plt.figure(figsize=[16, 12])

plt.subplot(231)
plt.boxplot(x=alldata2['realFare'], showmeans=True, meanline=True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')

plt.subplot(232)
plt.boxplot(alldata2['Age'], showmeans=True, meanline=True)
plt.title('Age Boxplot')
plt.ylabel('Age (Years)')

plt.subplot(233)
plt.boxplot(alldata2['Family'], showmeans=True, meanline=True)
plt.title('Family Size Boxplot')
plt.ylabel('Family Size (#)')

plt.subplot(234)
plt.hist(x=[train1[train1['Survived'] == 1]['realFare'], train1[train1['Survived'] == 0]['realFare']],
         stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()

plt.subplot(235)
plt.hist(x=[train1[train1['Survived'] == 1]['Age'], train1[train1['Survived'] == 0]['Age']],
         stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
plt.title('Age Histogram by Survival')
plt.xlabel('Age (Years)')
plt.ylabel('# of Passengers')
plt.legend()

plt.subplot(236)
plt.hist(x=[train1[train1['Survived'] == 1]['Family'], train1[train1['Survived'] == 0]['Family']],
         stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
plt.title('Family Size Histogram by Survival')
plt.xlabel('Family Size (#)')
plt.ylabel('# of Passengers')
plt.legend()
fig, saxis = plt.subplots(2, 3, figsize=(16, 12))
sns.barplot(x='Embarked', y='Survived', data=train1, ax=saxis[0, 0])
sns.barplot(x='Pclass', y='Survived', order=[1, 2, 3], data=train1, ax=saxis[0, 1])
sns.barplot(x='isAlone', y='Survived', order=[1, 0], data=train1, ax=saxis[0, 2])
sns.pointplot(x='FareBin', y='Survived', data=train1, ax=saxis[1, 0])
sns.pointplot(x='AgeBin', y='Survived', data=train1, ax=saxis[1, 1])
sns.pointplot(x='Family', y='Survived', data=train1, ax=saxis[1, 2])
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(16, 6))
sns.boxplot(x='Pclass', y='realFare', hue='Survived', data=train1, ax=axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=train1, split=True, ax=axis2)
axis2.set_title('Pclass vs Age Survival Comparison')
sns.boxplot(x='Pclass', y='Family', hue='Survived', data=train1, ax=axis3)
axis3.set_title('Pclass vs Family Size Survival Comparison')
fig, qaxis = plt.subplots(1, 3, figsize=(16, 6))
sns.barplot(x='Sex', y='Survived', hue='Embarked', data=train1, ax=qaxis[0])
qaxis[0].set_title('Sex vs Embarked Survival Comparison')
sns.barplot(x='Sex', y='Survived', hue='Pclass', data=train1, ax=qaxis[1])
qaxis[1].set_title('Sex vs Pclass Survival Comparison')
sns.barplot(x='Sex', y='Survived', hue='isAlone', data=train1, ax=qaxis[2])
qaxis[2].set_title('Sex vs IsAlone Survival Comparison')
fig, (maxis1, maxis2) = plt.subplots(1, 2, figsize=(12, 6))

# how does family size factor with sex & survival
sns.pointplot(x="Family", y="Survived", hue="Sex", data=train1,
              palette={"male": "blue", "female": "pink"},
              markers=["*", "o"], linestyles=["-", "--"], ax=maxis1)

# how does class factor with sex & survival
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=train1,
              palette={"male": "blue", "female": "pink"},
              markers=["*", "o"], linestyles=["-", "--"], ax=maxis2)
e = sns.FacetGrid(train1, col='Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', ci=95.0, palette='deep')
e.add_legend()

# plot distributions of Age of passengers who survived or did not survive
a = sns.FacetGrid(train1, hue='Survived', aspect=4)
a.map(sns.kdeplot, 'Age', shade=True)
a.set(xlim=(0, train1['Age'].max()))
a.add_legend()

# histogram
h = sns.FacetGrid(train1, row='Sex', col='Pclass', hue='Survived')
h.map(plt.hist, 'Age', alpha=.75)
h.add_legend()
def correlation_heatmap(df):
    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(train1)
pd.crosstab(train1['Family'],train1['Survived'])
g = sns.pairplot(train1, hue='Survived', palette='seismic', size=1.2,
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])
4 Modeling
alldata = pd.read_excel('2_afterDataPreprocessing.xlsx', index_col=0)  # index_col=0 restores the index saved by to_excel
Define the predictors and the target:
X_train = alldata[:len(train)]
X_test = alldata[len(train):]
y_train = train['Survived']
# Define an evaluation helper: 5-fold cross-validated accuracy
# (sklearn.cross_validation has been removed; model_selection is the current module)
from sklearn import model_selection

def rmsl(clf):
    s = model_selection.cross_val_score(clf, X_train, y_train, cv=5)
    return (s.mean(), s.std())
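For example, the helper can be called on any classifier (an illustrative call, not in the original post):

from sklearn import linear_model
rmsl(linear_model.LogisticRegression())  # -> (mean accuracy, std) across the 5 folds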
My modeling approach is to build base models first and then combine them through ensembling.
4.1 Base Models
# Compare a battery of candidate models
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(n_estimators=60),

    # Gaussian processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6),  # liblinear supports the l1 penalty
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.GaussianNB(),

    # Nearest neighbors
    neighbors.KNeighborsClassifier(n_neighbors=3),

    # SVM
    svm.SVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
]

# create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy Min', 'MLA Time']
MLA_compare = pd.DataFrame(columns=MLA_columns)

# index through MLA and save performance to table
row_index = 0
for alg in MLA:
    # set name and parameters
    MLA_compare.loc[row_index, 'MLA Name'] = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    # score model with cross validation
    cv_results = model_selection.cross_validate(alg, X_train, y_train, cv=5, return_train_score=True)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Min'] = cv_results['test_score'].min()  # let's know the worst that can happen!
    row_index += 1

MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False, inplace=True)
MLA_compare
# cv_results = model_selection.cross_validate(alg, X_train, y_train, cv=5)  # richer output than cross_validation.cross_val_score
# cv_results
4.1.1 Hyperparameter Tuning
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None, 2, 4, 6, 8, 10],
              'min_samples_split': [5, 10, 15, 20, 25],
              'max_features': [None, 'auto', 'sqrt', 'log2']}
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring='accuracy', cv=5)
cv_results = model_selection.cross_validate(tune_model, X_train, y_train, cv=5, return_train_score=True)
print(tune_model.get_params())
print(cv_results['train_score'].mean())
print(cv_results['test_score'].mean())
print(cv_results['test_score'].min())
# Plot the accuracy of each algorithm
sns.barplot(x='MLA Test Accuracy Mean', y='MLA Name', data=MLA_compare, color='m')
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')
4.1.2 Prediction
# From the comparison above, the best-performing models are:
# BaggingClassifier, GradientBoostingClassifier, RidgeClassifierCV, LogisticRegression,
# RandomForestClassifier, AdaBoostClassifier and LogisticRegressionCV
MLA_best = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),                     # 0.76076
    ensemble.BaggingClassifier(),                      # 0.72248
    ensemble.GradientBoostingClassifier(),             # 0.73684
    ensemble.RandomForestClassifier(n_estimators=60),  # 0.72727
    # GLM
    linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6),  # 0.77990
    linear_model.RidgeClassifierCV(),                  # 0.77033
    linear_model.LogisticRegressionCV(),               # 0.77033
]

for alg in MLA_best:
    algname = alg.__class__.__name__
    alg.fit(X_train, y_train)
    predictions = alg.predict(X_test)
    result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                           'Survived': predictions.astype(np.int32)})
    result.to_csv(algname + ".csv", index=False)
4.2 Model Ensembling
4.2.1 Ensemble 1: Simple Voting
voting_est = [
    # Ensemble methods
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier(n_estimators=100)),

    # Gaussian processes
    ('gpc', gaussian_process.GaussianProcessClassifier()),

    # GLM - remove linear models, since this is a classifier algorithm
    ('lrcv', linear_model.LogisticRegressionCV()),
    ('lr', linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)),
    # ('pac', linear_model.PassiveAggressiveClassifier()),
    # ('rc', linear_model.RidgeClassifierCV()),  # breaks soft voting: has no attribute 'predict_proba'
    # ('sgd', linear_model.SGDClassifier()),
    # ('pct', linear_model.Perceptron()),

    # Naive Bayes
    # ('gnb', naive_bayes.GaussianNB()),

    # Nearest neighbors
    ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)),

    # SVM
    ('svc', svm.SVC(probability=True)),
    # ('lsvc', svm.LinearSVC()),

    # Trees
    ('dtc', tree.DecisionTreeClassifier()),
    ('etc2', tree.ExtraTreeClassifier()),
]

# Hard vote, i.e. majority rule over the predicted class labels
voting_hard = ensemble.VotingClassifier(estimators=voting_est, voting='hard')
voting_hard_cv = model_selection.cross_validate(voting_hard, X_train, y_train, cv=5, return_train_score=True)
voting_hard.fit(X_train, y_train)

print("Hard Voting Training w/bin score mean: {:.2f}".format(voting_hard_cv['train_score'].mean() * 100))
print("Hard Voting Test w/bin score mean: {:.2f}".format(voting_hard_cv['test_score'].mean() * 100))
print("Hard Voting Test w/bin score min: {:.2f}".format(voting_hard_cv['test_score'].min() * 100))

# With 'soft' voting the class label is instead the argmax of the summed predicted probabilities,
# which is recommended for an ensemble of well-calibrated classifiers.
voting_soft = ensemble.VotingClassifier(estimators=voting_est, voting='soft')
voting_soft_cv = model_selection.cross_validate(voting_soft, X_train, y_train, cv=5, return_train_score=True)
voting_soft.fit(X_train, y_train)

print("Soft Voting Training w/bin score mean: {:.2f}".format(voting_soft_cv['train_score'].mean() * 100))
print("Soft Voting Test w/bin score mean: {:.2f}".format(voting_soft_cv['test_score'].mean() * 100))
print("Soft Voting Test w/bin score min: {:.2f}".format(voting_soft_cv['test_score'].min() * 100))
predictions = voting_soft.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("voting_soft.csv", index=False)
# 0.73684
predictions = voting_hard.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("voting_hard.csv", index=False)
# 0.75119
4.2.2 Ensemble 2: Stacking
Layer 1: stack the base models. Each base model is trained with 5-fold cross validation; its out-of-fold predictions on the training set, together with its fold-averaged predictions on the test set, become the input features of the second layer.
ntrain = train.shape[0]  # 891
ntest = test.shape[0]  # 418
SEED = 0  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction

from sklearn.model_selection import KFold  # the old sklearn.cross_validation.KFold API is gone
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)  # shuffle so random_state takes effect
# Wrap the basic operations of an sklearn estimator
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)
        return self.clf.fit(x, y).feature_importances_
# Out-of-fold predictions with 5-fold cross validation
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    # reshape into single-column arrays, however many rows that takes
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# Define the parameters of four different weak (base) classifiers
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Random Forest parameters
rf_params = {
    'n_jobs': -1, 'n_estimators': 500, 'warm_start': True,
    'max_depth': 6, 'min_samples_leaf': 2,
    'max_features': 'sqrt', 'verbose': 0  # 'max_features': 0.2,
}
# Extra Trees parameters
et_params = {
    'n_jobs': -1, 'n_estimators': 500, 'max_depth': 8, 'min_samples_leaf': 2, 'verbose': 0  # 'max_features': 0.5,
}
# AdaBoost parameters
ada_params = {
    'n_estimators': 500, 'learning_rate': 0.75
}
# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 2, 'verbose': 0  # 'max_features': 0.2,
}
# Support Vector Classifier parameters
# svc_params = {
#     'kernel': 'linear', 'C': 0.025
# }

# Create the four base classifiers
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
X_train = X_train.values  # convert to arrays here
X_test = X_test.values
y_train = y_train.values  # index positionally, not by label

# Compute each algorithm's out-of-fold predictions; they become the inputs of the second stacking layer
et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test)   # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf, X_train, y_train, X_test)   # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test)  # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb, X_train, y_train, X_test)   # Gradient Boost
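A quick shape check (an illustrative line, not in the original post); each OOF block should be one column per passenger set:

et_oof_train.shape, et_oof_test.shape  # expect (891, 1) and (418, 1)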
# Feature importances reported by each algorithm
rf_feature = rf.feature_importances(X_train, y_train)
et_feature = et.feature_importances(X_train, y_train)
ada_feature = ada.feature_importances(X_train, y_train)
gb_feature = gb.feature_importances(X_train, y_train)
cols = alldata.columns.values
# Create a dataframe with the importances from each model
feature_dataframe = pd.DataFrame({
    'features': cols,
    'Random Forest feature importances': rf_feature,
    'Extra Trees feature importances': et_feature,
    'AdaBoost feature importances': ada_feature,
    'Gradient Boost feature importances': gb_feature,
})
feature_dataframe
Plot each feature's importance (interactive charts drawn with the plotly go package).
# Scatter plot
trace = go.Scatter(
    y=feature_dataframe['Random Forest feature importances'].values,
    x=feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode='diameter', sizeref=1, size=25, colorscale='Portland', showscale=True,
        color=feature_dataframe['Random Forest feature importances'].values,
        # size = feature_dataframe['AdaBoost feature importances'].values,
        # color = np.random.randn(500)  # set color equal to a variable
    ),
    text=feature_dataframe['features'].values
)
data = [trace]
layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    # xaxis=dict(  # hide the x-axis title
    #     title='Pop', ticklen=5, zeroline=False, gridwidth=2,
    # ),
    yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter20170207')
# Scatter plot
trace = go.Scatter(
    y=feature_dataframe['Extra Trees feature importances'].values,
    x=feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode='diameter', sizeref=1, size=25, colorscale='Portland', showscale=True,
        color=feature_dataframe['Extra Trees feature importances'].values,
    ),
    text=feature_dataframe['features'].values
)
data = [trace]
layout = go.Layout(
    autosize=True,
    title='Extra Trees Feature Importance',
    hovermode='closest',
    yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter20170207_2')
# Scatter plot
trace = go.Scatter(
    y=feature_dataframe['AdaBoost feature importances'].values,
    x=feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode='diameter', sizeref=1, size=25, colorscale='Portland', showscale=True,
        color=feature_dataframe['AdaBoost feature importances'].values,
    ),
    text=feature_dataframe['features'].values
)
data = [trace]
layout = go.Layout(
    autosize=True,
    title='AdaBoost feature importances',
    hovermode='closest',
    yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter20170207_3')
# Scatter plot
trace = go.Scatter(
    y=feature_dataframe['Gradient Boost feature importances'].values,
    x=feature_dataframe['features'].values,
    mode='markers',
    marker=dict(
        sizemode='diameter', sizeref=1, size=25, colorscale='Portland', showscale=True,
        color=feature_dataframe['Gradient Boost feature importances'].values,
    ),
    text=feature_dataframe['features'].values
)
data = [trace]
layout = go.Layout(
    autosize=True,
    title='Gradient Boost feature importances',
    hovermode='closest',
    yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter20170207_4')
feature_dataframe['mean'] = feature_dataframe.mean(axis=1, numeric_only=True)  # row-wise mean of the four importance columns
feature_dataframe.head(3)
Plot the mean importance of each feature.
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
    x=x,
    y=y,
    width=0.5,
    marker=dict(
        color=feature_dataframe['mean'].values,
        colorscale='Portland',
        showscale=True,
        reversescale=False
    ),
    opacity=0.6
)]
layout = go.Layout(
    autosize=True,
    title='Barplots of Mean Feature Importance',
    hovermode='closest',
    yaxis=dict(title='Feature Importance', ticklen=5, gridwidth=2),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')
Layer 2: predict from the layer-1 outputs
base_predictions_train = pd.DataFrame({
    'RandomForest': rf_oof_train.ravel(),  # ravel flattens row-major by default
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel()
})
base_predictions_train.head()
Correlation analysis of the layer-1 predictions on the training set.
data = [
    go.Heatmap(
        z=base_predictions_train.astype(float).corr().values,
        x=base_predictions_train.columns.values,
        y=base_predictions_train.columns.values,
        colorscale='Viridis',
        showscale=True,
        reversescale=True
    )
]
py.iplot(data, filename='labelled-heatmap')
# The SVC base model is commented out above, so only the four computed OOF blocks are stacked
X_train2 = np.concatenate((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
X_test2 = np.concatenate((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)
Use XGBoost as the second-level learner.
import xgboost as xgb

gbm = xgb.XGBClassifier(
    # learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(X_train2, y_train)
predictions = gbm.predict(X_test2)
# Generate the submission file
StackingSubmission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
StackingSubmission.to_csv("StackingSubmission.csv", index=False)
# 0.77990