1.Xgboost建模,sklearn评估
2.网格搜索交叉验证找最优参数
3.early-stop早停止
4.特征重要度
5.并行训练加速
#预估器建模方式:sklearn与XGboost配合使用
#xgboost建模,sklearn评估
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
rng = np.random.RandomState(31337)
#二分类:混淆矩阵
print("数字0和1的二分类问题")
digits = load_digits(2) #2表示类别的意思
y = digits['target']
X = digits['data']
#数据切分对象
kf = KFold(n_splits=2,shuffle=True,random_state=rng)#对数据进行切分
print("在2着数据上的交叉验证")
#2折交叉验证
for train_index,test_index in kf.split(X):
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])#初始化XGB分类器,拟合数据
predictions = xgb_model.predict(X[test_index])#预估
actuals = y[test_index]
print("混淆矩阵")
print(confusion_matrix(actuals,predictions))
#多分类,混淆矩阵
print('\nIris:多分类')
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
predictions = xgb_model.predict(X[test_index])
actuals = y[test_index]
print("混淆矩阵:")
print(confusion_matrix(actuals, predictions))
#回归问题:MSE
print("\n波士顿房价回归预测问题")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
predictions = xgb_model.predict(X[test_index])
actuals = y[test_index]
print("MSE:",mean_squared_error(actuals, predictions))
#网格搜索交叉验证查找最优超参数
print("参数最优化:")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()#以回归类问题为了,初始化一个回归器
clf = GridSearchCV(xgb_model,
{'max_depth': [2,4,6],
'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
print(clf.best_score_) #最好的得分
print(clf.best_params_)#最好的参数
#early-stopping 早停(另外一种调参方式)
# 在训练集上学习模型,一颗一颗树添加,在验证集上看效果,当验证集效果不再提升,停止树的添加与生长
X = digits['data']
y = digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
#early_stopping_rounds=10:表示再验证集上最多有耐心等10次,如果加了10棵树还未提升,则停止,追加到10次之前未最好的效果
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_val, y_val)])
#
#特征重要度
iris = load_iris()
y = iris['target']
X = iris['data']
xgb_model = xgb.XGBClassifier().fit(X,y)
print('特征排序:')
feature_names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
#模型中的属性feature_importances_
feature_importances = xgb_model.feature_importances_
indices = np.argsort(feature_importances)[::-1]
for index in indices:
print("特征 %s 重要度为 %f" %(feature_names[index], feature_importances[index]))
import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
plt.title("feature importances")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
import os
if __name__ == "__main__":
try:
from multiprocessing import set_start_method
except ImportError:
raise ImportError("Unable to import multiprocessing.set_start_method."
" This example only runs on Python 3.4")
#set_start_method("forkserver")
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_boston
import xgboost as xgb
rng = np.random.RandomState(31337)
print("Parallel Parameter optimization")
boston = load_boston()
os.environ["OMP_NUM_THREADS"] = "2" # or to whatever you want
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)