任务
记录五个模型(逻辑回归、svm、决策树、xgboost、lightgbm)关于precision、recall score、f1 score、roc、auc的评分表格。
实现
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 15 13:02:11 2018
@author: keepi
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
# Load the raw data set (the CSV is GB18030-encoded).
data = pd.read_csv('data.csv', encoding='gb18030')
print("data.shape:", data.shape)

# --- Data preparation ---
# Per-column share of missing values; only used for manual inspection.
miss_rate = data.isnull().sum() / len(data)
# print("missing rate:", miss_rate.sort_values(ascending=False))

# Numeric features: impute missing values with the column mean.
# NOTE(review): the means are computed on the full data set, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
X_num = data.select_dtypes('number').copy()
X_num = X_num.fillna(X_num.mean())
print("数值型特征的shape:", X_num.shape)
print(X_num.columns)
# Drop the index-like column and the target so only features remain.
X_num = X_num.drop(['Unnamed: 0', 'status'], axis=1)

# Non-numeric features: missing values become 0, then only
# 'reg_preference_for_trad' is one-hot encoded and used as a feature.
X_str = data.select_dtypes(exclude='number').copy()
X_str = X_str.fillna(0)
print("非数值型特征:", X_str.columns)
print(X_str.head())
X_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])

X = pd.concat([X_num, X_dummy], axis=1, sort=False)
# The fillna(0) above can produce a dummy column labelled with the integer 0.
# Recent scikit-learn versions reject DataFrames whose column labels mix int
# and str, so normalise every label to str before handing X to an estimator.
X.columns = X.columns.astype(str)
y = data['status']

# Split into training and test sets (70% / 30%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1117)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
ss = StandardScaler()
X_train_std = ss.fit_transform(X_train)
X_test_std = ss.transform(X_test)
# Logistic regression (default hyper-parameters).
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train_std, y_train)

# SVM with a linear kernel; probability=True so predict_proba is available.
from sklearn.svm import SVC
svm_linear = SVC(kernel='linear', probability=True)
svm_linear.fit(X_train_std, y_train)

# Decision tree, depth limited to 8.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=8).fit(X_train_std, y_train)

# XGBoost through its scikit-learn wrapper.
from xgboost.sklearn import XGBClassifier
xgbc = XGBClassifier().fit(X_train_std, y_train)

# LightGBM through its scikit-learn wrapper.
from lightgbm.sklearn import LGBMClassifier
lgb = LGBMClassifier().fit(X_train_std, y_train)

print('all done!')
#模型评估
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
def draw_metrics(clf, X_train, X_test, y_train, y_test):
    """Print train/test metrics for a fitted classifier and plot its ROC curves.

    Accuracy, precision, recall, F1 and ROC-AUC are printed for both splits,
    each formatted to four decimals. The ROC curves of both splits are then
    drawn on a new figure together with the chance diagonal.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, X_test : feature matrices passed to ``clf``.
    y_train, y_test : true labels; label 1 is the positive class for the
        ROC curve.

    Returns
    -------
    None
    """
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score.
    y_train_prob = clf.predict_proba(X_train)[:, 1]
    y_test_prob = clf.predict_proba(X_test)[:, 1]

    # (printed title, metric function, train-side input, test-side input).
    # AUC is computed from scores, every other metric from hard predictions.
    reports = [
        ('准确率', accuracy_score, y_train_pred, y_test_pred),
        ('精准率', precision_score, y_train_pred, y_test_pred),
        ('召回率', recall_score, y_train_pred, y_test_pred),
        ('f1-score', f1_score, y_train_pred, y_test_pred),
        ('auc', roc_auc_score, y_train_prob, y_test_prob),
    ]
    for title, metric, train_out, test_out in reports:
        print('%s:\n' % title)
        # Values end with a space instead of a newline, so the next title
        # continues on the same output line.
        print('训练集: ', '%.4f' % metric(y_train, train_out), end=' ')
        print('测试集: ', '%.4f' % metric(y_test, test_out), end=' ')

    # ROC curves for both splits.
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_prob, pos_label=1)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob, pos_label=1)
    label = ['Train - AUC:{:.4f}'.format(auc(fpr_train, tpr_train)),
             'Test - AUC:{:.4f}'.format(auc(fpr_test, tpr_test))]

    # Start a new figure so that consecutive calls do not draw onto the same
    # axes (the legend below only labels the first two lines of a figure).
    plt.figure()
    plt.plot(fpr_train, tpr_train)
    plt.plot(fpr_test, tpr_test)
    plt.plot([0, 1], [0, 1], 'd--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(label, loc=4)
    plt.title('ROC Curve')
    plt.show()
# Report metrics and ROC curves for each fitted model on the standardized
# splits: logistic regression, linear SVM, decision tree, XGBoost, LightGBM.
for fitted_model in (lr, svm_linear, dtc, xgbc, lgb):
    draw_metrics(fitted_model, X_train_std, X_test_std, y_train, y_test)
评估结果
| 模型 | accuracy | precision | recall | f1 score | roc-auc score | ROC曲线 |
| --- | --- | --- | --- | --- | --- | --- |
| 逻辑回归 | 训练集: 0.8019 测试集: 0.7772 | 训练集: 0.7181 测试集: 0.6384 | 训练集: 0.3604 测试集: 0.3763 | 训练集: 0.4799 测试集: 0.4753 | 训练集: 0.8157 测试集: 0.7775 | |
| svm线性核 | 训练集: 0.8019 测试集: 0.7680 | 训练集: 0.7790 测试集: 0.6690 | 训练集: 0.2645 测试集: 0.2553 | 训练集: 0.3949 测试集: 0.3695 | 训练集: 0.8131 测试集: 0.7825 | |
| 决策树 | 训练集: 0.8954 测试集: 0.7337 | 训练集: 0.8756 测试集: 0.5000 | 训练集: 0.6667 测试集: 0.3816 | 训练集: 0.7570 测试集: 0.4328 | 训练集: 0.9060 测试集: 0.6481 | |
| xgboost | 训练集: 0.8584 测试集: 0.7842 | 训练集: 0.8800 测试集: 0.6651 | 训练集: 0.4871 测试集: 0.3816 | 训练集: 0.6271 测试集: 0.4849 | 训练集: 0.9156 测试集: 0.7855 | |
| lightgbm | 训练集: 0.9976 测试集: 0.7730 | 训练集: 1.0000 测试集: 0.6120 | 训练集: 0.9902 测试集: 0.4026 | 训练集: 0.9951 测试集: 0.4857 | 训练集: 1.0000 测试集: 0.7764 | |