Loan Risk Prediction with XGBoost

Now let's apply the much-hyped XGBoost to this dataset.


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 19 13:19:26 2017

@author: luogan
"""

import pandas as pd
df = pd.read_csv('loans.csv')

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Fit one LabelEncoder per column so every categorical column is mapped
# to integer codes; the fitted encoders are kept in d for later reuse
d = defaultdict(LabelEncoder)
dff = df.apply(lambda col: d[col.name].fit_transform(col))
dff.to_excel('dff.xls')
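
Since the fitted encoders are kept in d, the integer codes can be mapped back to the original string values at any time. A minimal sketch, assuming dff and d from above are still in scope:

# Decode the integer codes back to the original labels;
# d[col.name] is the fitted LabelEncoder for that column
restored = dff.apply(lambda col: d[col.name].inverse_transform(col))
print(restored.head())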



import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics                       # additional sklearn metrics
from sklearn.model_selection import GridSearchCV  # for performing grid search

import matplotlib.pyplot as plt
#%matplotlib inline
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

train = pd.read_excel('dff.xls')
target = 'safe_loans'
IDcol = 'id'


def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        # Use xgb.cv with early stopping to pick a good number of trees,
        # then write that number back into the estimator
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report (note: these metrics are computed on the training set)
    print(alg)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # Plot feature importances, indexed by feature name so the bars are labeled
    feat_imp = pd.Series(alg.feature_importances_, index=predictors).sort_values(ascending=False)
    print(feat_imp)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()


# Choose all predictors except the target & ID columns
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=18,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb1, train, predictors)





Model Report
Accuracy : 0.9533
AUC Score (Train): 0.990971

95% accuracy on the training set, streets ahead of the decision tree and the BP network!
Clearly the legendary XGBoost lives up to its reputation; no wonder it is used so widely in industrial practice.
The figure below shows the importance of each feature. The code above consists of two files; to reproduce the figure, run xgboost.py.

(Figure: feature importance bar chart produced by the script above)
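
One caveat: the report above is computed on the training data, so it is worth checking performance on rows the model has never seen. A minimal sketch, assuming the same dff.xls and the xgb1 settings above (the train_test_split call and the 30% split size are my additions, not part of the original post):

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows so accuracy/AUC are measured on unseen data
X_train, X_test, y_train, y_test = train_test_split(
    train[predictors], train[target], test_size=0.3, random_state=27)

clf = XGBClassifier(learning_rate=0.1, n_estimators=xgb1.get_params()['n_estimators'],
                    max_depth=18, subsample=0.8, colsample_bytree=0.8,
                    objective='binary:logistic', seed=27)
clf.fit(X_train, y_train)

print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, clf.predict(X_test)))
print("Test AUC      : %f" % metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))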
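
GridSearchCV is imported in the script but never actually used. As a rough sketch of how it could be wired up to tune max_depth and min_child_weight for the same classifier; the parameter grids and n_estimators=140 here are illustrative assumptions, not values from the original post:

# Illustrative grid search over two key tree parameters;
# the value ranges below are assumptions, not from the original post
param_test = {'max_depth': [6, 10, 14, 18], 'min_child_weight': [1, 3, 5]}
gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, gamma=0,
                            subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test, scoring='roc_auc', cv=5)
gsearch.fit(train[predictors], train[target])
print(gsearch.best_params_, gsearch.best_score_)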

Reprinted from blog.csdn.net/kwame211/article/details/80224462