import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,roc_curve,auc
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Path of the dataset CSV; change it to wherever the file lives locally.
data_path = '/Users/gaofei/Desktop/ensemble/data.csv'

# Load the data; the file is decoded as GBK.
data = pd.read_csv(data_path, encoding='gbk')

# Min-max scale the feature columns (every column except the last, which is
# used as the label below).
scaler = MinMaxScaler()
values = scaler.fit_transform(data.values[:, :-1])

# Split into training (70%) and test (30%) sets.
# `values` already excludes the label column, so it is passed whole; slicing
# `[:, :-1]` a second time would silently discard the last feature column.
X_train, X_test, y_train, y_test = train_test_split(
    values, data.values[:, -1], test_size=0.3)
# Dataset download: https://pan.baidu.com/s/1e8txYy-PZrwKKP3JD4sJAg
# Extraction code: lg9r
# 不调参的模型效果
def default_param():
    """Evaluate a GradientBoostingClassifier left at its default settings.

    Fits the model on the module-level ``X_train``/``y_train``, draws the ROC
    curve for ``X_test``/``y_test`` (with the AUC in the legend) and prints
    the test-set accuracy.
    """
    model = GradientBoostingClassifier(random_state=10)
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    scores = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, scores)
    legend_text = 'AUC = {0:.4f}'.format(auc(fpr, tpr))

    plt.plot(fpr, tpr, label=legend_text)
    plt.title('ROC曲线')
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('误判率')
    plt.ylabel('命中率')
    plt.show()

    accuracy = accuracy_score(y_test, predicted_labels)
    print('Accuracy: {0:.2f}'.format(accuracy))
# Result without tuning: accuracy 0.82, AUC 0.917.
# 首先从步长(learning rate) 和 迭代次数(n_estimator)入手,将步长初始值设为0.1,对迭代次数进行网格搜索
def adjust_n_estimators():
    """Grid-search the number of boosting iterations (``n_estimators``).

    The learning rate is fixed at 0.1 and ``n_estimators`` is searched over
    10, 20, ..., 100 using 5-fold cross-validation scored by ROC AUC on the
    module-level ``X_train``/``y_train``. The best parameters and the best
    cross-validated score are printed.
    """
    param_dic = {'n_estimators': range(10, 101, 10)}
    base_model = GradientBoostingClassifier(
        learning_rate=0.1, min_samples_split=300, min_samples_leaf=20,
        max_depth=8, max_features='sqrt', subsample=0.8, random_state=10)
    # The `iid` argument is intentionally not passed: it was deprecated in
    # scikit-learn 0.22 and removed in 0.24, so supplying it raises TypeError
    # on current releases.
    gscv = GridSearchCV(estimator=base_model, param_grid=param_dic,
                        scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))
# Output of adjust_n_estimators():
# best_params:{'n_estimators': 60}
# best_score:0.9194666836254305
# 迭代次数有了,接下来对决策树进行调参:
# 首先对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索
def adjust_depth_samples():
    """Grid-search ``max_depth`` together with ``min_samples_split``.

    With ``learning_rate=0.1`` and ``n_estimators=60`` fixed, searches
    ``max_depth`` over 3, 5, ..., 13 and ``min_samples_split`` over
    100, 300, 500, 700 using 5-fold cross-validation scored by ROC AUC on the
    module-level ``X_train``/``y_train``. The best parameters and the best
    cross-validated score are printed.
    """
    param_dic = {
        'max_depth': range(3, 14, 2),
        'min_samples_split': range(100, 801, 200),
    }
    base_model = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=60, min_samples_leaf=20,
        max_features='sqrt', subsample=0.8, random_state=10)
    # The `iid` argument is intentionally not passed: it was deprecated in
    # scikit-learn 0.22 and removed in 0.24, so supplying it raises TypeError
    # on current releases.
    gscv = GridSearchCV(estimator=base_model, param_grid=param_dic,
                        scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))
# Output of adjust_depth_samples():
# best_params:{'max_depth': 7, 'min_samples_split': 500}
# best_score:0.9196201483525961
# 先定下深度为7,但min_samples_split和其他参数还有关联,接下来要和min_samples_leaf一起调参
def adjust_samples_leaf():
    """Grid-search ``min_samples_split`` together with ``min_samples_leaf``.

    With ``learning_rate=0.1``, ``n_estimators=60`` and ``max_depth=7`` fixed,
    searches ``min_samples_split`` over 500, 600, 700, 800 and
    ``min_samples_leaf`` over 50, 60, ..., 150 using 5-fold cross-validation
    scored by ROC AUC on the module-level ``X_train``/``y_train``. The best
    parameters and the best cross-validated score are printed.
    """
    param_dic = {
        'min_samples_split': range(500, 900, 100),
        'min_samples_leaf': range(50, 151, 10),
    }
    base_model = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=60, max_depth=7,
        max_features='sqrt', subsample=0.8, random_state=10)
    # The `iid` argument is intentionally not passed: it was deprecated in
    # scikit-learn 0.22 and removed in 0.24, so supplying it raises TypeError
    # on current releases.
    gscv = GridSearchCV(estimator=base_model, param_grid=param_dic,
                        scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))
# Output of adjust_samples_leaf():
# best_params:{'min_samples_leaf': 110, 'min_samples_split': 500}
# best_score:0.9202495480403471
def best_param():
    """Evaluate the GradientBoostingClassifier built from the tuned settings.

    Fits the model on the module-level ``X_train``/``y_train``, draws the ROC
    curve for ``X_test``/``y_test`` (with the AUC in the legend) and prints
    the test-set accuracy.
    """
    tuned_settings = dict(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=7,
        min_samples_leaf=110,
        min_samples_split=500,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    )
    model = GradientBoostingClassifier(**tuned_settings)
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    scores = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, scores)
    legend_text = 'AUC = {0:.4f}'.format(auc(fpr, tpr))

    plt.plot(fpr, tpr, label=legend_text)
    plt.title('ROC曲线')
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('误判率')
    plt.ylabel('命中率')
    plt.show()

    accuracy = accuracy_score(y_test, predicted_labels)
    print('Accuracy: {0:.2f}'.format(accuracy))
# Result after tuning: accuracy 0.86, AUC 0.92.