# 版权声明:我是南七小僧,微信: to_my_love ,2020年硕士毕业,寻找 自然语言处理,图像处理,软件开发等相关工作,欢迎交流思想碰撞。 https://blog.csdn.net/qq_25439417/article/details/83590107
# -*- coding: utf-8 -*-
"""
@Author: zzn
@Date: 2018-10-22 15:01:44
@Last Modified by: zzn
@Last Modified time: 2018-10-22 15:01:44
"""
import pandas as pd
import numpy as np
import datetime
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
def concat_data():
    """Load the labelled train, unlabelled train and test CSVs and stack them.

    Unlabelled training rows are tagged ``y = -2`` and test rows ``y = -1`` so
    the single combined frame can be split back apart downstream.

    Returns:
        pd.DataFrame: labelled rows, then unlabelled rows, then test rows.
    """
    labelled = pd.read_csv('../data/train_xy.csv')
    test = pd.read_csv('../data/test_all.csv')
    unlabelled = pd.read_csv('../data/train_x.csv')
    # Sentinel labels let the caller recover each partition later.
    unlabelled['y'] = -2
    test['y'] = -1
    return pd.concat([labelled, unlabelled, test], axis=0)
def drop_useless_cols(data):
    """Drop columns that are almost entirely missing or constant.

    Args:
        data (pd.DataFrame): raw feature frame where ``-99`` marks a missing
            value.

    Returns:
        pd.DataFrame: ``data`` with ``-99`` replaced by NaN and useless
        columns (>= 80% missing, or a single unique value) removed.
    """
    data = data.replace(-99, np.nan)
    col_null_rate = data.isnull().sum() / len(data)
    # Columns that are >= 80% missing carry too little signal to keep.
    high_null_cols = list(col_null_rate[col_null_rate >= 0.8].index)
    cols_nunique = data.nunique()
    # FIX: the original indexed `col_null_rate` with this mask; that only
    # worked because both Series happen to share the same index. Index the
    # Series the mask was built from.
    one_value_cols = list(cols_nunique[cols_nunique == 1].index)
    useless_cols = list(set(high_null_cols) | set(one_value_cols))
    data = data.drop(useless_cols, axis=1)
    return data
def submit(te_id, pred_proba, prefix=''):
    """Write a timestamped submission CSV of customer ids and scores.

    Args:
        te_id: iterable of customer ids (``cust_id`` column).
        pred_proba: predicted probabilities (``pred_prob`` column).
        prefix: string prepended to the output filename.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    submission = pd.DataFrame({'cust_id': te_id, 'pred_prob': pred_proba})
    out_path = '../result/{}_res_{}.csv'.format(prefix, timestamp)
    submission.to_csv(out_path, index=False)
def cross_valid(tr_x, tr_y, te_x, te_id, n_fold, col_names=None, cate_cols=None):
    """Stratified K-fold LightGBM training with optional semi-supervised
    pseudo-labelling, ending in a val-AUC-weighted submission file.

    NOTE(review): this function reads two module-level globals that are set
    in ``__main__``: ``SEMI_FLAG`` (enables the pseudo-label branch for folds
    k >= 1) and ``tr_u_x`` (features of the unlabelled training rows).

    Mechanism: fold 0 trains a plain model and, from its predictions on the
    unlabelled rows, builds 20x20 = 400 candidate pseudo-label index sets
    (one per threshold pair). Each later fold trains a baseline model, then
    retrains once per candidate set from the *previous* fold, keeps the model
    with the best validation AUC, and regenerates the 400 candidate sets from
    its own baseline predictions for the next fold. Fold 0's score and test
    prediction are discarded at the end (``[1:]``).

    Args:
        tr_x (pd.DataFrame): labelled training features.
        tr_y (np.ndarray): binary labels aligned with ``tr_x``.
        te_x (pd.DataFrame): test features to score.
        te_id: test customer ids, forwarded to ``submit``.
        n_fold (int): number of stratified folds.
        col_names: optional feature names for ``lgb.Dataset``/``lgb.train``.
        cate_cols: optional categorical feature names.
    """
    kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=2018)
    val_score = []
    pred_proba = []
    # Threshold grids for harvesting pseudo-labels from unlabelled rows.
    # NOTE(review): `low_threshold` (0.18-0.22) is GREATER than
    # `high_threshold` (0.04-0.07), so predictions falling in
    # [high, low] are appended as BOTH positive and negative pseudo-labels
    # (see the neg_index/pos_index construction below). This looks like the
    # two grids were swapped — confirm intent before touching.
    low_threshold = np.linspace(0.18, 0.22, 20)
    high_threshold = np.linspace(0.04, 0.07, 20)
    extra_tr_x_indexs = []  # candidate pseudo-label index sets (400 per fold)
    extra_u_y = []          # matching pseudo-labels, 1s then 0s
    for k, (tr_idx, val_idx) in enumerate(kfold.split(tr_x, tr_y)):
        # With SEMI_FLAG off, force every fold through the plain k == 0
        # branch (no pseudo-label retraining).
        if SEMI_FLAG == True:
            k = k
        else:
            k = 0
        train_x, train_y, valid_x, valid_y = tr_x.iloc[tr_idx], tr_y[
            tr_idx], tr_x.iloc[val_idx], tr_y[val_idx]
        train_dataset = lgb.Dataset(
            train_x, train_y, feature_name=col_names, categorical_feature=cate_cols, free_raw_data=False)
        val_dataset = lgb.Dataset(
            valid_x, valid_y, feature_name=col_names, categorical_feature=cate_cols, free_raw_data=False)
        lgb_paras = {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.1,
            'num_leaves': 31,
            'lambda_l1': 0.1,
            'lambda_l2': 10,
            'seed': 2018,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'bagging_freq': 4,
            'max_depth': -1
        }
        if k == 0:
            # First fold (or every fold when SEMI_FLAG is off): plain
            # training, then seed the pseudo-label candidates for fold 1.
            lgb_model = lgb.train(lgb_paras, train_dataset, num_boost_round=3000,
                                  early_stopping_rounds=50, valid_sets=[val_dataset], verbose_eval=10, categorical_feature=cate_cols, feature_name=col_names)
            val_auc = roc_auc_score(valid_y, lgb_model.predict(
                valid_x, num_iteration=lgb_model.best_iteration))
            val_score.append(val_auc)
            # Score the unlabelled rows (module-level global `tr_u_x`).
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            # One candidate index set per (low, high) threshold pair.
            for low in low_threshold:
                for high in high_threshold:
                    neg_index = list(np.where(tr_u_pred <= low)[0])
                    pos_index = list(np.where(tr_u_pred >= high)[0])
                    extra_tr_x_indexs.append(pos_index+neg_index)
                    extra_u_y.append([1]*len(pos_index)+[0]*len(neg_index))
            pred_proba.append(lgb_model.predict(
                te_x, num_iteration=lgb_model.best_iteration))
        else:
            # Baseline model for this fold, without pseudo-labels.
            lgb_model = lgb.train(lgb_paras, train_dataset, num_boost_round=3000,
                                  early_stopping_rounds=50, valid_sets=[val_dataset], verbose_eval=10, categorical_feature=cate_cols, feature_name=col_names)
            val_auc = roc_auc_score(valid_y, lgb_model.predict(
                valid_x, num_iteration=lgb_model.best_iteration))
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            best_score = 0
            best_model = None
            # Retrain once per candidate pseudo-label set from the previous
            # fold and keep the best model by validation AUC.
            # NOTE(review): this is 400 full trainings per fold — expensive.
            for i, idx in enumerate(extra_tr_x_indexs):
                extra_x = tr_u_x.iloc[idx]
                extra_y = extra_u_y[i]
                train_tmp_x = pd.concat([train_x, extra_x], axis=0)
                train_tmp_y = np.concatenate((train_y, extra_y), axis=0)
                train_tmp_dataset = lgb.Dataset(
                    train_tmp_x, train_tmp_y, feature_name=col_names, categorical_feature=cate_cols)
                lgb_tmp_model = lgb.train(lgb_paras, train_tmp_dataset,
                                          num_boost_round=3000, early_stopping_rounds=50,
                                          valid_sets=[
                                              val_dataset], verbose_eval=0,
                                          categorical_feature=cate_cols, feature_name=col_names)
                val_tmp_auc = roc_auc_score(valid_y, lgb_tmp_model.predict(
                    valid_x, num_iteration=lgb_tmp_model.best_iteration))
                if val_tmp_auc > best_score:
                    print('Current Val AUC: {} ,After Semi-Supervised Val AUC: {}'.format(
                        val_auc, val_tmp_auc))
                    best_score = val_tmp_auc
                    best_model = lgb_tmp_model
            # Regenerate candidate sets from THIS fold's baseline model for
            # use by the next fold.
            extra_tr_x_indexs = []
            extra_u_y = []
            tr_u_pred = lgb_model.predict(
                tr_u_x, num_iteration=lgb_model.best_iteration)
            for low in low_threshold:
                for high in high_threshold:
                    neg_index = list(np.where(tr_u_pred <= low)[0])
                    pos_index = list(np.where(tr_u_pred >= high)[0])
                    extra_tr_x_indexs.append(pos_index+neg_index)
                    extra_u_y.append([1]*len(pos_index)+[0]*len(neg_index))
            # Use the pseudo-labelled model only if it beat the baseline.
            if best_score > val_auc:
                print(
                    'Current Val AUC: {} ,Best Semi-Supervised Val AUC: {}'.format(val_auc, best_score))
                val_score.append(best_score)
                pred_proba.append(best_model.predict(
                    te_x, num_iteration=best_model.best_iteration))
            else:
                val_score.append(val_auc)
                pred_proba.append(lgb_model.predict(
                    te_x, num_iteration=lgb_model.best_iteration))
    # Drop fold 0: it never benefits from pseudo-labels, so it is excluded
    # from both the reported scores and the ensemble.
    val_score = val_score[1:]
    pred_proba = pred_proba[1:]
    print('val auc mean:', np.mean(val_score))
    print('val auc std:', np.std(val_score))
    # Weight each fold's test prediction by its validation AUC.
    pred_proba = (np.array(
        pred_proba)*(np.array(val_score).reshape(-1, 1))).sum(axis=0)/np.sum(val_score)
    # The mean val AUC is passed positionally as the filename `prefix`.
    submit(te_id, pred_proba, np.mean(val_score))
if __name__ == '__main__':
    # Build one frame holding labelled train rows, unlabelled train rows
    # (y = -2) and test rows (y = -1), then prune near-empty/constant columns.
    data = concat_data()
    data = drop_useless_cols(data)
    # Columns x_96 .. x_157 are treated as categorical; keep only those that
    # survived drop_useless_cols.
    cate_cols = ['x_{}'.format(i) for i in range(96, 158)]
    cates = [col for col in data.columns if col in cate_cols]
    # Split the combined frame back apart via the sentinel y values.
    tr_data = data[data['y'] != -1].copy()
    tr_data = tr_data[tr_data['y'] != -2].copy()
    tr_u_data = data[data['y'] == -2].copy()
    te_data = data[data['y'] == -1].copy()
    tr_x = tr_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    # NOTE(review): `tr_u_x` and `SEMI_FLAG` are read as module-level globals
    # inside cross_valid(); renaming either here would break it.
    tr_u_x = tr_u_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    te_x = te_data.drop(['cust_id', 'cust_group', 'y'], axis=1)
    te_id = te_data['cust_id'].values
    tr_y = tr_data['y'].values
    SEMI_FLAG = True
    # 10-fold CV; cross_valid writes the submission file itself.
    cross_valid(tr_x, tr_y, te_x, te_id, 10, cate_cols=cates)
# 没做特征的情况下,semi-lgb拿到新网银行大数据的全国第15名