# MLR (Mixed Logistic Regression / LS-PLM) CTR model, implemented with deepctr.
# References:
#   https://zhuanlan.zhihu.com/p/100532677
#   https://blog.csdn.net/fyneru_xiaohui/article/details/106390266
import os, warnings, time, sys
import pickle
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, roc_curve, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from deepctr.models import DeepFM, xDeepFM, MLR, DeepFEFM, DIN, AFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.layers import custom_objects
from tensorflow.python.keras.models import save_model, load_model
from tensorflow.keras.models import model_from_yaml
import tensorflow as tf
from tensorflow.python.ops import array_ops
import tensorflow.keras.backend as K
from sklearn import datasets
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.models import model_from_json
from tensorflow.keras.callbacks import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.layers.embeddings import Embedding
from toolsnn import *
def train_MLR():
    """Train an MLR (LS-PLM) CTR model with deepctr and evaluate it.

    Pipeline: load train/test CSVs -> ASCII-encode string id columns ->
    one-hot encode sparse features -> min-max scale dense features ->
    split into train/val/test -> fit deepctr's MLR -> save the model ->
    evaluate on the test split and on the full dataset.

    Side effects: pickles one fitted encoder/scaler per feature to
    ``feature_encode_path`` and writes the trained model to ``save_path``;
    mutates the module-level globals ``sparsecols``, ``densecols``,
    ``namesoh`` and ``feature_names``; reports progress via ``print``.

    NOTE(review): relies on helpers star-imported from ``toolsnn``
    (``train_path_ascii``, ``test_path_ascii``, ``transformDF``,
    ``getRata2``, ``negBpow``, ``GeneratorRandomPatchs``,
    ``full_evaluate2``, ...) whose exact semantics are not visible here.
    """
    print('MLR 模型训练开始 ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()
    # Class balance of the raw files (positives : negatives):
    #   pdtrain:     565485 : 1133910  (~1 : 2.0052)
    #   pdtest:      565485 : 1134505  (~1 : 2.0063)
    #   pdeval_full:     46 : 8253     (~1 : 179.413)
    pdtrain = pd.read_csv(train_path_ascii)
    pdtest = pd.read_csv(test_path_ascii)
    # Training pool = all of pdtrain plus only the NEGATIVE rows of pdtest.
    data = pd.concat([pdtrain, pdtest[pdtest['y'] == 0]], axis=0, ignore_index=True)
    data = data.drop(['WilsonClickRate_all', 'WilsonClickRate_yesterday', 'WilsonAd_clickRate_all',
                      'WilsonAd_clickRate_yesterday'], axis=1)
    # Numeric-encode the string id columns (user id, ad id, device model) by
    # summing the ordinal of every character, so each becomes a single int
    # suitable for an embedding lookup.
    data['suuid'] = data['suuid'].apply(lambda x: sum([ord(i) for i in x]))
    data['advertisement'] = data['advertisement'].apply(lambda x: sum([ord(i) for i in x]))
    # data['position'] = data['position'].apply(lambda x: sum([ord(i) for i in x]))  # 'position' is already float, embedded directly
    data['user_modelMake'] = data['user_modelMake'].apply(lambda x: sum([ord(i) for i in x]))
    # Cast double columns down to float.
    data = transformDF(data, ['reserve_price', 'reserve_price_cpc', 'clickRate_all', 'clickRate_yesterday',
                              'ad_clickRate_yesterday'], float)
    ''' 特征处理 '''
    global sparsecols, densecols
    # Sparse categorical features -> one-hot encoded below.
    sparsecols = ['hour', 'advert_place', 'province_id', 'port_type', 'user_osID', 'is_holidays', 'is_being',
                  'is_outflow', 'advertiser', 'ad_from', 'payment']
    # Id-like columns already ASCII-encoded above -> embedded as-is.
    sparse_ascii = ['suuid', 'advertisement', 'position', 'user_modelMake']
    # Dense features -> min-max normalised below.
    densecols = ['W', 'H', 'reserve_price', 'reserve_price_cpc', 'is_rest_click', 'clickPerHour_yesterday',
                 'display_nums_all', 'click_nums_all', 'display_nums_yesterday', 'click_nums_yesterday',
                 'ad_display_all', 'ad_click_all', 'ad_display_yesterday', 'ad_click_yesterday']
    # Dense click-rate features — not scaled below (presumably already in
    # [0, 1]; confirm upstream).
    ratecols = ['WHrate', 'clickRate_all', 'clickRate_yesterday', 'ad_clickRate_yesterday']
    global namesoh
    # Maps each sparse column name -> list of its generated one-hot column names.
    namesoh = {
    }
    for sparse in sparsecols:
        onehot = OneHotEncoder()
        arrays = onehot.fit_transform(np.array(data[sparse]).reshape(-1, 1))
        # Densify the one-hot matrix and concat it back onto the frame,
        # replacing the original column.
        arrays = arrays.toarray()
        names = [sparse + '_' + str(n) for n in range(len(arrays[0]))]
        namesoh[sparse] = names
        data = pd.concat([data, pd.DataFrame(arrays, columns=names)], axis=1)
        data = data.drop([sparse], axis=1)
        # Persist the fitted encoder so serving can apply the same mapping.
        with open(feature_encode_path.format(sparse) + '.pkl', 'wb') as f:
            pickle.dump(onehot, f)
        # print(' {} onehot完成'.format(sparse))
    print(' onehot完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    for dense in densecols:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense] = mms.fit_transform(np.array(data[dense]).reshape(-1, 1))
        # Persist the fitted scaler alongside the one-hot encoders.
        with open(feature_encode_path.format(dense) + '.pkl', 'wb') as f:
            pickle.dump(mms, f)
        # print(' {} 归一化完成'.format(dense))
    print(' 归一化完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' columns: ', len(list(data.columns)))
    ''' 训练集、测试集、验证集划分 '''
    # NOTE(review): getRata2 comes from toolsnn — presumably splits (and/or
    # resamples) `data` with a ratio controlled by `num`; confirm against its
    # definition.
    train_data, test_data = getRata2(data, num=1)
    # Validation set = a fixed-seed 20% slice of the test split.
    _, val_data = train_test_split(test_data, test_size=0.2, random_state=1, shuffle=True)
    train_data = shuffle(train_data)
    test_data = shuffle(test_data)
    val_data = shuffle(val_data)
    # negBpow presumably reports the positive/negative balance of each split.
    negBpow(train_data, '训练集')
    negBpow(val_data, '验证集')
    negBpow(test_data, '测试集')
    print(' train_data shape: ', train_data.shape)
    print(' val_data shape: ', val_data.shape)
    print(' test_data shape: ', test_data.shape)
    # Flatten the generated one-hot column names into one sparse-feature list.
    sparse_features = []
    for value in namesoh.values():
        for v in value:
            sparse_features.append(v)
    dense_features = densecols + ratecols
    # deepctr feature columns; vocabulary_size = (max observed value) + 1.
    sparse_feature_columns1 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for i, feat in enumerate(sparse_features)]
    sparse_feature_columns2 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for i, feat in enumerate(sparse_ascii)]
    sparse_feature_columns = sparse_feature_columns1 + sparse_feature_columns2
    dense_feature_columns = [DenseFeat(feat, 1)
                             for feat in dense_features]
    print(' sparse_features count: ', len(sparse_features))
    print(' dense_features count: ', len(dense_features))
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    # User-side columns intended for MLR's region (clustering) input.
    tmp_user = ['hour', 'province_id', 'user_osID', 'is_holidays', 'is_being', 'is_outflow']
    region_feature_columns = []
    for key, value in namesoh.items():
        if key in tmp_user:
            for v in value:
                region_feature_columns.append(v)
    # NOTE(review): region_feature_columns is built but never used — the MLR
    # call below passes base_feature_columns (all features) as the region
    # features instead. Confirm whether that is intentional.
    base_feature_columns = linear_feature_columns
    global feature_names
    feature_names = get_feature_names(linear_feature_columns)
    print(' feature_names: ', feature_names)
    ''' feed input '''
    # Model inputs: one numpy array per feature name, keyed by name.
    train_x = {
        name: train_data[name].values for name in feature_names}
    test_x = {
        name: test_data[name].values for name in feature_names}
    val_x = {
        name: val_data[name].values for name in feature_names}
    train_y = train_data[['y']].values
    test_y = test_data[['y']].values
    val_y = val_data[['y']].values
    print(' 数据处理完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    '''
    region_feature_columns: 用于聚类的用户特征
    base_feature_columns:基模型特征,其实可以是全部特征,也可以是用于训练的广告特征
    l2_reg_linear:LR的正则强度(L2正则)
    bias_feature_columns: 偏好特征,不同的人群具有聚类特性,同一类人群具有类似的广告点击偏好。
    '''
    deep = MLR(region_feature_columns=base_feature_columns, region_num=4,
               l2_reg_linear=1e-5, task='binary',)
    mNadam = Adam(lr=1e-4, beta_1=0.95, beta_2=0.96)
    deep.compile(optimizer=mNadam, loss='binary_crossentropy',
                 metrics=['AUC', 'Precision', 'Recall'])
    print(' 组网完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' 训练开始 ', time.strftime("%H:%M:%S", time.localtime(time.time())))
    start_time = time.time()
    ''' 训练 '''
    # Early stopping: stop when validation precision improves by less than
    # min_delta for `patience` epochs. Currently NOT wired into either fit
    # call below (callbacks are commented out).
    earlystop_callback = EarlyStopping(
        monitor='val_precision', min_delta=0.001, mode='max',
        verbose=2, patience=3)
    generator_flag = False  # False -> Model.fit on in-memory arrays
    # generator_flag = True  # True -> fit_generator on a batch generator
    if not generator_flag:
        history = deep.fit(
            train_x, train_y, validation_data=(val_x, val_y),
            batch_size=2000,
            epochs=3,
            verbose=2,
            shuffle=True,
            # callbacks=[earlystop_callback]
        )
    else:
        batch_size = 2000
        train_nums = len(train_data)
        history = deep.fit_generator(
            GeneratorRandomPatchs(train_x, train_y, batch_size, train_nums, feature_names),
            validation_data=(val_x, val_y),
            steps_per_epoch=train_nums // batch_size,
            epochs=3000,
            verbose=2,
            shuffle=True,
            # callbacks=[earlystop_callback]
        )
    end_time = time.time()
    print(' 训练完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print((' 训练运行时间: {:.0f}分 {:.0f}秒'.format((end_time - start_time) // 60, (end_time - start_time) % 60)))
    # Persist the trained model via tf.keras save_model (the original note
    # said "yaml", but this saves the full model, not a yaml spec).
    save_model(deep, save_path)
    print(' 模型保存完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    # # Training-curve visualisation (disabled)
    # visualization(history, saveflag=True, showflag=False, path1=loss_plt_path.format('loss_auc.jpg'),
    #               path2=loss_plt_path.format('precision_recall.jpg'))
    # Test-set evaluation; per compile() above, scores should be
    # [loss, AUC, Precision, Recall].
    scores = deep.evaluate(test_x, test_y, verbose=0)
    print(' %s: %.4f' % (deep.metrics_names[0], scores[0]))
    print(' %s: %.4f' % (deep.metrics_names[1], scores[1]))
    print(' %s: %.4f' % (deep.metrics_names[2], scores[2]))
    print(' %s: %.4f' % (deep.metrics_names[3], scores[3]))
    # F1 = harmonic mean of precision (scores[2]) and recall (scores[3]).
    # NOTE(review): divides by zero if precision + recall == 0.
    print(' %s: %.4f' % ('F1', (2 * scores[2] * scores[3]) / (scores[2] + scores[3])))
    print(' 验证集再评估完成', time.strftime("%H:%M:%S", time.localtime(time.time())))
    # Full-dataset evaluation (toolsnn helper; reads globals set above).
    full_evaluate2()
    end_time_end = time.time()
    print(('MLR 模型训练运行时间: {:.0f}分 {:.0f}秒'.format((end_time_end - start_time_start) // 60,
                                                  (end_time_end - start_time_start) % 60)))
    print(('{:.0f}小时'.format((end_time_end - start_time_start) // 60 / 60)))