# 版权声明:我是南七小僧,微信: to_my_love ,2020年硕士毕业,寻找 自然语言处理,图像处理,软件开发等相关工作,欢迎交流思想碰撞。 https://blog.csdn.net/qq_25439417/article/details/83590107
# -*- coding: utf-8 -*-
"""
@Author: zzn
@Date: 2018-10-22 15:01:44
@Last Modified by: zzn
@Last Modified time: 2018-10-22 15:01:44
"""
import pandas as pd
import numpy as np
import datetime
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
def concat_data():
    """Load the labelled train, unlabelled train and test CSVs and stack them.

    Unlabelled training rows are tagged ``y = -2`` and test rows ``y = -1`` so
    the single combined frame can be split back apart downstream.

    Returns:
        pd.DataFrame: labelled rows, then unlabelled rows, then test rows.
    """
    labelled = pd.read_csv('../data/train_xy.csv')
    test = pd.read_csv('../data/test_all.csv')
    unlabelled = pd.read_csv('../data/train_x.csv')
    # Sentinel labels let the caller recover each partition later.
    unlabelled['y'] = -2
    test['y'] = -1
    return pd.concat([labelled, unlabelled, test], axis=0)
def drop_useless_cols(data):
    """Drop columns that are almost entirely missing or constant.

    Args:
        data (pd.DataFrame): raw feature frame where ``-99`` marks a missing
            value.

    Returns:
        pd.DataFrame: ``data`` with ``-99`` replaced by NaN and useless
        columns (>= 80% missing, or a single unique value) removed.
    """
    data = data.replace(-99, np.nan)
    col_null_rate = data.isnull().sum() / len(data)
    # Columns that are >= 80% missing carry too little signal to keep.
    high_null_cols = list(col_null_rate[col_null_rate >= 0.8].index)
    cols_nunique = data.nunique()
    # FIX: the original indexed `col_null_rate` with this mask; that only
    # worked because both Series happen to share the same index. Index the
    # Series the mask was built from.
    one_value_cols = list(cols_nunique[cols_nunique == 1].index)
    useless_cols = list(set(high_null_cols) | set(one_value_cols))
    data = data.drop(useless_cols, axis=1)
    return data
def submit(te_id, pred_proba, prefix=''):
    """Write a timestamped submission CSV of customer ids and scores.

    Args:
        te_id: iterable of customer ids (``cust_id`` column).
        pred_proba: predicted probabilities (``pred_prob`` column).
        prefix: string prepended to the output filename.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    submission = pd.DataFrame({'cust_id': te_id, 'pred_prob': pred_proba})
    out_path = '../result/{}_res_{}.csv'.format(prefix, timestamp)
    submission.to_csv(out_path, index=False)
def cross_valid(tr_x, tr_y, te_x, te_id, n_fold, col_names=None, cate_cols=None):
    """Stratified K-fold LightGBM training with optional semi-supervised
    pseudo-labelling, ending in a val-AUC-weighted submission file.

    NOTE(review): this function reads two module-level globals that are set
    in ``__main__``: ``SEMI_FLAG`` (enables the pseudo-label branch for folds
    k >= 1) and ``tr_u_x`` (features of the unlabelled training rows).

    Mechanism: fold 0 trains a plain model and, from its predictions on the
    unlabelled rows, builds 20x20 = 400 candidate pseudo-label index sets
    (one per threshold pair). Each later fold trains a baseline model, then
    retrains once per candidate set from the *previous* fold, keeps the model
    with the best validation AUC, and regenerates the 400 candidate sets from
    its own baseline predictions for the next fold. Fold 0's score and test
    prediction are discarded at the end (``[1:]``).

    Args:
        tr_x (pd.DataFrame): labelled training features.
        tr_y (np.ndarray): binary labels aligned with ``tr_x``.
        te_x (pd.DataFrame): test features to score.
        te_id: test customer ids, forwarded to ``submit``.
        n_fold (int): number of stratified folds.
        col_names: optional feature names for ``lgb.Dataset``/``lgb.train``.
        cate_cols: optional categorical feature names.
    """
    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=2018)
    val_score = []
    pred_proba = []
    # Threshold grids for harvesting pseudo-labels from unlabelled rows.
    # NOTE(review): `low_threshold` (0.18-0.22) is GREATER than
    # `high_threshold` (0.04-0.07), so predictions falling in
    # [high, low] are appended as BOTH positive and negative pseudo-labels
    # (see the neg_index/pos_index construction below). This looks like the
    # two grids were swapped — confirm intent before touching.
    low_threshold = np.linspace(0.18, 0.22, 20)
    high_threshold = np.linspace(0.04, 0.07, 20)
    extra_tr_x_indexs = []  # candidate pseudo-label index sets (400 per fold)
    extra_u_y = []          # matching pseudo-labels, 1s then 0s
    for k, (tr_idx, val_idx) in enumerate(kfold.split(tr_x, tr_y)):
        # With SEMI_FLAG off, force every fold through the plain k == 0
        # branch (no pseudo-label retraining).
        if SEMI_FLAG == True:
            k = k
        else:
            k = 0
        train_x, train_y, valid_x, valid_y = tr_x.iloc[tr_idx], tr_y[
            tr_idx], tr_x.iloc[val_idx], tr_y[val_idx]
        train_dataset = lgb.Dataset(
            train_x, train_y, feature_name=col_names, categorical_feature=cate_cols, free_raw_data=False)
        val_dataset = lgb.Dataset(
            valid_x, valid_y, feature_name=col_names, categorical_feature=cate_cols, free_raw_data=False)
        lgb_paras = {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.1,
            'num_leaves': 31,
            'lambda_l1': 0.1,
            'lambda_l2': 10,
            'seed': 2018,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'bagging_freq': 4,
            'max_depth': -1
        }
        if k == 0:
            # First fold (or every fold when SEMI_FLAG is off): plain
            # training, then seed the pseudo-label candidates for fold 1.
            lgb_model = lgb.train(lgb_paras, train_dataset, num_boost_round=3000,
                                  early_stopping_rounds=50, valid_sets=[val_dataset], verbose_eval=10, categorical_feature=cate_cols, feature_name=col_names)
            val_auc = roc_auc_score(valid_y, lgb_model.predict(
                valid_x, num_iteration=lgb_model.best_iteration))
            val_score.append(val_auc)
            # Score the unlabelled rows (module-level global `tr_u_x`).
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            # One candidate index set per (low, high) threshold pair.
            for low in low_threshold:
                for high in high_threshold:
                    neg_index = list(np.where(tr_u_pred <= low)[0])
                    pos_index = list(np.where(tr_u_pred >= high)[0])
                    extra_tr_x_indexs.append(pos_index+neg_index)
                    extra_u_y.append([1]*len(pos_index)+[0]*len(neg_index))
            pred_proba.append(lgb_model.predict(
                te_x, num_iteration=lgb_model.best_iteration))
        else:
            # Baseline model for this fold, without pseudo-labels.
            lgb_model = lgb.train(lgb_paras, train_dataset, num_boost_round=3000,
                                  early_stopping_rounds=50, valid_sets=[val_dataset], verbose_eval=10, categorical_feature=cate_cols, feature_name=col_names)
            val_auc = roc_auc_score(valid_y, lgb_model.predict(
                valid_x, num_iteration=lgb_model.best_iteration))
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            best_score = 0
            best_model = None
            # Retrain once per candidate pseudo-label set from the previous
            # fold and keep the best model by validation AUC.
            # NOTE(review): this is 400 full trainings per fold — expensive.
            for i, idx in enumerate(extra_tr_x_indexs):
                extra_x = tr_u_x.iloc[idx]
                extra_y = extra_u_y[i]
                train_tmp_x = pd.concat([train_x, extra_x], axis=0)
                train_tmp_y = np.concatenate((train_y, extra_y), axis=0)
                train_tmp_dataset = lgb.Dataset(
                    train_tmp_x, train_tmp_y, feature_name=col_names, categorical_feature=cate_cols)
                lgb_tmp_model = lgb.train(lgb_paras, train_tmp_dataset,
                                          num_boost_round=3000, early_stopping_rounds=50,
                                          valid_sets=[
                                              val_dataset], verbose_eval=0,
                                          categorical_feature=cate_cols, feature_name=col_names)
                val_tmp_auc = roc_auc_score(valid_y, lgb_tmp_model.predict(
                    valid_x, num_iteration=lgb_tmp_model.best_iteration))
                if val_tmp_auc > best_score:
                    print('Current Val AUC: {} ,After Semi-Supervised Val AUC: {}'.format(
                        val_auc, val_tmp_auc))
                    best_score = val_tmp_auc
                    best_model = lgb_tmp_model
            # Regenerate candidate sets from THIS fold's baseline model for
            # use by the next fold.
            extra_tr_x_indexs = []
            extra_u_y = []
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            for low in low_threshold:
                for high in high_threshold:
                    neg_index = list(np.where(tr_u_pred <= low)[0])
                    pos_index = list(np.where(tr_u_pred >= high)[0])
                    extra_tr_x_indexs.append(pos_index+neg_index)
                    extra_u_y.append([1]*len(pos_index)+[0]*len(neg_index))
            # Use the pseudo-labelled model only if it beat the baseline.
            if best_score > val_auc:
                print(
                    'Current Val AUC: {} ,Best Semi-Supervised Val AUC: {}'.format(val_auc, best_score))
                val_score.append(best_score)
                pred_proba.append(best_model.predict(
                    te_x, num_iteration=best_model.best_iteration))
            else:
                val_score.append(val_auc)
                pred_proba.append(lgb_model.predict(
                    te_x, num_iteration=lgb_model.best_iteration))
    # Drop fold 0: it never benefits from pseudo-labels, so it is excluded
    # from both the reported scores and the ensemble.
    val_score = val_score[1:]
    pred_proba = pred_proba[1:]
    print('val auc mean:', np.mean(val_score))
    print('val auc std:', np.std(val_score))
    # Weight each fold's test prediction by its validation AUC.
    pred_proba = (np.array(
        pred_proba)*(np.array(val_score).reshape(-1, 1))).sum(axis=0)/np.sum(val_score)
    # The mean val AUC is passed positionally as the filename `prefix`.
    submit(te_id, pred_proba, np.mean(val_score))
if __name__ == '__main__':
    # Build one frame holding labelled train rows, unlabelled train rows
    # (y = -2) and test rows (y = -1), then prune near-empty/constant columns.
    data = concat_data()
    data = drop_useless_cols(data)
    # Columns x_96 .. x_157 are treated as categorical; keep only those that
    # survived drop_useless_cols.
    cate_cols = ['x_{}'.format(i) for i in range(96, 158)]
    cates = [col for col in data.columns if col in cate_cols]
    # Split the combined frame back apart via the sentinel y values.
    tr_data = data[data['y'] != -1].copy()
    tr_data = tr_data[tr_data['y'] != -2].copy()
    tr_u_data = data[data['y'] == -2].copy()
    te_data = data[data['y'] == -1].copy()
    tr_x = tr_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    # NOTE(review): `tr_u_x` and `SEMI_FLAG` are read as module-level globals
    # inside cross_valid(); renaming either here would break it.
    tr_u_x = tr_u_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    te_x = te_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    te_id = te_data['cust_id'].values
    tr_y = tr_data['y'].values
    SEMI_FLAG = True
    # 10-fold CV; cross_valid writes the submission file itself.
    cross_valid(tr_x, tr_y, te_x, te_id, 10, cate_cols=cates)
# 没做特征的情况下,semi-lgb拿到新网银行大数据的全国第15名