
# Build the training Dataset: name the three feature columns, flag 'c3' as
# categorical, and attach the weights held in `w`.
train_data = lgb.Dataset(
    data,
    label=label,
    weight=w,
    feature_name=['c1', 'c2', 'c3'],
    categorical_feature=['c3'],
)

LightGBM 可以直接使用 categorical features(分类特征)作为 input(输入), 不需要先将其转换成 one-hot encoding(独热编码); 直接使用分类特征比使用独热编码更快(约快 8 倍).

注意: 在你构造 Dataset 之前, 你应该将分类特征转换为 int 类型的值.



# Number of boosting iterations used for the cross-validation run below.
num_round = 10

# 5-fold cross-validation over the training Dataset.
lgb.cv(param, train_data, num_boost_round=num_round, nfold=5)



# Wrap the training split in a LightGBM Dataset.
lgb_train = lgb.Dataset(X_train, y_train)

# Validation data; it references the training Dataset, as LightGBM expects
# for validation sets.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',    # boosting type
    'objective': 'regression',  # objective function
    # Evaluation metrics. AUC is a classification/ranking metric and does not
    # fit a regression objective, so L1 is reported next to L2 instead. A list
    # (rather than a set) keeps the metric order deterministic.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,           # number of leaves per tree
    'learning_rate': 0.05,      # learning rate
    'feature_fraction': 0.9,    # fraction of features sampled per tree
    'bagging_fraction': 0.8,    # fraction of rows sampled per tree
    'bagging_freq': 5,          # k means: perform bagging every k iterations
    'verbose': 1,               # <0 fatal only, =0 errors (warnings), >0 info
}

print('Start training...')

# Training needs the parameter dict and the training Dataset. Early stopping
# is requested through the callback, which works on both old and current
# LightGBM releases (the `early_stopping_rounds=` keyword was removed in 4.0).
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

print('Save model...')

# Persist the trained model to a text file.
gbm.save_model('model.txt')

print('Start predicting...')

# With early stopping enabled during training, best_iteration selects the
# best round for prediction.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Root-mean-squared error between the true and the predicted values.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

import lightgbm as lgb
import pandas as pd
from sklearn import metrics
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# pandas has no `read` function; CSV files are loaded with `read_csv`.
df = pd.read_csv('data.csv')

# NOTE(review): X and Y are not defined anywhere in this snippet; presumably
# they are the feature matrix and the binary label extracted from `df` --
# confirm and define them before running.
train_X, test_X, train_y, test_y = train_test_split(
    X, Y, train_size=0.8, random_state=123
)

lgb_train = lgb.Dataset(train_X, train_y, free_raw_data=False)
lgb_eval = lgb.Dataset(test_X, test_y, reference=lgb_train, free_raw_data=False)

param = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # The original set {'l2', 'auc'} here and then immediately overwrote it
    # with 'auc'; the final value is written directly.
    'metric': 'auc',
    'num_leaves': 40,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

# 6-fold cross-validation with early stopping; the returned history is
# truncated at the best round, so its length is the number of rounds to train.
bst = lgb.cv(
    param,
    lgb_train,
    num_boost_round=1000,
    nfold=6,
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# The result key is 'auc-mean' on older LightGBM releases and
# 'valid auc-mean' on 4.x, so match on the suffix.
auc_mean_key = next(key for key in bst if key.endswith('auc-mean'))

print('Start training...')
estimators = lgb.train(param, lgb_train, num_boost_round=len(bst[auc_mean_key]))

# No early stopping is used in this final fit, so every trained round is used
# for prediction.
y_pred = estimators.predict(test_X)

print('The roc of prediction is:', metrics.roc_auc_score(test_y, y_pred))
print('Feature names:', estimators.feature_name())
print('Feature importances:', list(estimators.feature_importance()))


def XGB_predict(train_x, train_y, val_X, val_Y, test_x, res):
    """Train a binary XGBoost model and store test predictions in ``res``.

    The booster is fitted on ``train_x``/``train_y`` and early-stopped on
    ``val_X``/``val_Y``; the predictions for ``test_x`` are rounded to six
    decimals and written to ``res['score']``.

    Args:
        train_x: Training features.
        train_y: Training labels (0/1); also used to derive
            ``scale_pos_weight``.
        val_X: Validation features.
        val_Y: Validation labels.
        test_x: Features to predict on.
        res: Container that receives the ``'score'`` column; it must support
            item assignment and ``.apply`` on the stored column (presumably a
            pandas DataFrame -- confirm with the caller).

    Returns:
        The same ``res`` object with the ``'score'`` column filled in.
    """
    print("XGB test")

    # Create the DMatrix datasets for XGBoost. The training matrix is built
    # from this function's own arguments (the original referenced the
    # undefined names X_train / y_train).
    xgb_val = xgb.DMatrix(val_X, label=val_Y)
    xgb_train = xgb.DMatrix(train_x, label=train_y)
    xgb_test = xgb.DMatrix(test_x)

    # Class-imbalance weight, usually sum(negative cases) / sum(positive
    # cases). It is computed from train_y (the original used an undefined
    # `y`); float() keeps true division on Python 2 as well.
    labels = np.asarray(train_y)
    neg_pos_ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)

    # Booster configuration.
    params = {
        'booster': 'gbtree',
        # 'objective': 'multi:softmax',   # multi-class problems
        # 'objective': 'multi:softprob',  # multi-class probabilities
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        # 'num_class': 9,  # number of classes, used together with multi:softmax
        'gamma': 0.1,  # controls post-pruning; larger is more conservative, typically 0.1 or 0.2
        'max_depth': 8,  # tree depth; the larger, the easier it overfits
        'alpha': 0,  # L1 regularisation coefficient
        'lambda': 10,  # L2 regularisation on weights; larger values make overfitting less likely
        'subsample': 0.7,  # random subsampling of training rows
        'colsample_bytree': 0.5,  # column subsampling when building each tree
        # Defaults to 1: the minimum sum of h (second-order gradient) in a
        # leaf. For imbalanced 0-1 classification, if h is around 0.01, a
        # value of 1 means a leaf needs at least 100 samples. This parameter
        # strongly affects results; the smaller it is, the easier it overfits.
        'min_child_weight': 3,
        # Setting this to 1 suppresses run-time output; 0 is recommended.
        # NOTE(review): newer XGBoost releases use 'verbosity' instead --
        # confirm against the installed version.
        'silent': 0,
        'eta': 0.03,  # acts like a learning rate
        'seed': 1000,
        'nthread': -1,  # number of CPU threads
        # NOTE(review): 'missing' is normally a DMatrix argument rather than
        # a booster parameter -- confirm this entry is intended.
        'missing': 1,
        'scale_pos_weight': neg_pos_ratio,
    }

    num_rounds = 5000  # maximum number of boosting rounds
    watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

    # Cross-validation alternative:
    # result = xgb.cv(plst, xgb_train, num_boost_round=200, nfold=4, early_stopping_rounds=200, verbose_eval=True, folds=StratifiedKFold(n_splits=4).split(X, y))

    # Train the model. With a large round budget, early_stopping_rounds stops
    # training once the evaluation metric has not improved for that many
    # rounds. `evals` is passed by keyword because newer XGBoost releases no
    # longer accept it positionally.
    model = xgb.train(
        params,
        xgb_train,
        num_boost_round=num_rounds,
        evals=watchlist,
        early_stopping_rounds=200,
    )

    res['score'] = model.predict(xgb_test)
    # Round the scores to six decimal places.
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    return res


