# Containers declared up front; nothing in this section fills them.
dvars = {}
scores = {}

# Load the training sample, indexed by customer id, and drop the unnamed
# column carried in the CSV.
train = pd.read_csv('train.csv', index_col='CUST_ID')
del train['Unnamed: 0']

# NOTE(review): label_encoder and rough_del are defined elsewhere; presumably
# they encode string columns and drop columns by a 0.9 threshold -- confirm.
df = label_encoder(train)
df['target'] = df['bad_good']  # rename the label column to 'target'
del df['bad_good']
df = rough_del(df, p=0.9)

# ---- Decide whether each feature is categorical or numerical ----
# Exclude the label by name instead of assuming it is the last column.
features = df.columns.drop('target')

# A feature with at least 6 distinct values is treated as numerical (6 is the
# author's rule of thumb).  nunique() ignores NaN; the previous
# len(set(column)) counted every NaN as a separate value, so a sparse
# low-cardinality column was wrongly classified as numerical.
numerical_var = [col for col in features if df[col].nunique() >= 6]
_numerical_lookup = set(numerical_var)
categorical_var = [col for col in features if col not in _numerical_lookup]
print('数值特征的个数是:{},分别是{}'.format(len(numerical_var),numerical_var))
print('类别特征的个数是:{},分别是{}'.format(len(categorical_var),categorical_var))

# ---- Split the features by whether they contain any missing value ----
notnullCol = []  # features without missing values
isnullCol = []   # features with at least one missing value
for col in features:
    if df[col].isna().any():
        isnullCol.append(col)
    else:
        notnullCol.append(col)
一、以上为对缺失值的判定
二、填补缺失值
# Impute every column that contains missing values; the strategy depends on
# the missing rate.  The bands are contiguous: (0.8, 1], (0.5, 0.8],
# (0.3, 0.5] and [0, 0.3].  The previous chained comparisons left rates of
# exactly 0.8 and 0.5 to fall through to the plain mean/mode fill.
for col in isnullCol:
    missingRate = MissingRate(df, col)  # share of missing values in this column
    print('特征 {} 的缺失率是 {}'.format(col,missingRate))
    if missingRate > 0.8:
        # Very sparse column: replaced by whatever MissingNewFeature derives.
        df[col] = MissingNewFeature(df, col)
    elif missingRate > 0.5:
        df = CategoricalMissingFeature(df, col, n=4)
    elif missingRate > 0.3:
        # Predict the missing entries from (up to) three randomly chosen
        # complete columns.  The sample size is capped so that fewer than
        # three complete columns no longer raises ValueError.
        feature = random.sample(notnullCol, min(3, len(notnullCol)))
        # Classifier for categorical targets, regressor for numerical ones.
        method = 'clf' if col in categorical_var else 'reg'
        df = Model_Fillna(df, col, feature, method=method)
    elif col in categorical_var:
        # Low missing rate, categorical: fill with the most frequent value.
        # Assign the result back instead of using fillna(inplace=True) on
        # df[col], which may act on a temporary object.
        df[col] = df[col].fillna(df[col].mode().iloc[0])
    else:
        # Low missing rate, numerical: fill with the column mean.
        df[col] = df[col].fillna(df[col].mean())
三、类别型变量处理方式
# Categorical variables are handled as follows:
#   1. if a variable has more than 5 distinct values, encode it by bad rate;
#   2. otherwise, if some level contains no bad samples, merge levels so that
#      every bin holds both good and bad samples.
deleted_features = []   # processed/dropped originals, kept out of later modelling
encoded_features = {}   # bad-rate encodings, needed again for test/production data
merged_features = {}    # level-merging schemes for categorical variables
var_IV = {}             # IV value per binned feature
var_WOE = {}            # WOE mapping per binned feature

# Iterate over a snapshot: the loop body removes entries from categorical_var,
# and mutating the list being iterated would skip the following column.
for col in list(categorical_var):
    print('we are processing {}'.format(col))
    if df[col].nunique() > 5:
        print('{} is encoded with bad rate'.format(col))
        # Name of the encoded column.  This assignment is required: col0 is
        # used on the lines below.
        col0 = str(col) + '_encoding'

        # (1) compute the bad rate per level and encode with it
        encoding_result = BadRateEncoding(df, col, 'target')
        df[col0], br_encoding = encoding_result['encoding'], encoding_result['bad_rate']

        # (2) the encoded column is numeric, so queue it for the binning of
        #     numerical variables
        numerical_var.append(col0)

        # (3) remember the encoding
        encoded_features[col] = [col0, br_encoding]

        # (4) mark the raw column as processed
        deleted_features.append(col)
    else:
        bad_bin = df.groupby([col])['target'].sum()
        # Few levels, but at least one level with zero bad samples.
        if min(bad_bin) == 0:
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'

            # (1) find a merge so that each bin has both good and bad samples
            mergeBin = MergeBad0(df, col, 'target')

            # (2) apply the merge
            df[col1] = df[col].map(mergeBin)
            maxPcnt = MaximumBinPcnt(df, col1)  # share of the largest bin

            # Drop the variable if one merged bin holds more than 90%.
            if maxPcnt > 0.9:
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del df[col]
                continue

            # (3) keep the merged variable and record its WOE / IV
            merged_features[col] = [col1, mergeBin]
            WOE_IV = CalcWOE(df, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            deleted_features.append(col)  # raw column is superseded by col1
        else:
            WOE_IV = CalcWOE(df, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']
四、数值型变量处理方式
# Continuous variables are handled as follows:
#   1. split the variable into bins with chi-square merging (ChiMerge);
#   2. check that the bad rate is monotone across bins; if not, merge bins
#      until it is.


def _merged_cutoffs(bin_groups, cutoffs):
    """Return *cutoffs* without the boundaries inside each merged bin group.

    Each group is a sequence of bin labels of the form 'Bin <k>'.  For a group
    with more than one bin, the cutoffs indexed by every member except the
    last are dropped.
    """
    dropped = set()
    for group in bin_groups:
        if len(group) > 1:
            positions = [int(label.replace('Bin ', '')) for label in group]
            dropped.update(positions[:-1])
    return [point for k, point in enumerate(cutoffs) if k not in dropped]


var_cutoff = {}
# Iterate over a snapshot: the loop body removes entries from numerical_var,
# and mutating the list being iterated would skip the following column.
for col in list(numerical_var):
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    # (1) chi-square binning, keeping the cut points.  E.g. [10, 20, 30]
    #     splits the variable into x<10, 10<x<20, 20<x<30 and x>30.
    #     The value -1 (the author's marker for missing) is passed as a
    #     special attribute instead of being binned.
    special_attribute = [-1] if (df[col] == -1).any() else []
    cutOffPoints = ChiMerge(df, col, 'target', special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    df[col1] = df[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))

    # (2) check whether the bad rate is monotone
    BRM = BadRateMonotone(df, col1, 'target', special_attribute=special_attribute)
    if not BRM:
        # Merge only over regular values; with no special attribute both
        # filters below are no-ops, so one code path serves both cases.
        regular_cutoffs = [p for p in cutOffPoints if p not in special_attribute]
        regular_rows = df.loc[~df[col].isin(special_attribute)]
        bin_merged = Monotone_Merge(regular_rows, 'target', col1)
        # The special values are appended to the cut points again, as before.
        cutOffPoints = _merged_cutoffs(bin_merged, regular_cutoffs) + special_attribute
        var_cutoff[col] = cutOffPoints
        df[col1] = df[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))

    # (3) after binning, drop the variable if one bin holds more than 90%
    maxPcnt = MaximumBinPcnt(df, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = CalcWOE(df, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']
五、关于多重共线性分析
##############################################################
# Step 4: univariate and multivariate analysis after WOE coding #
##############################################################

# Add a '<name>_WOE' column for every variable that has a WOE mapping.
for col in var_WOE:
    print(col)
    col2 = str(col) + "_WOE"
    woe_map = var_WOE[col]
    if col in var_cutoff:
        # Binned numerical variable: assign the bin first, then look up WOE.
        cutOffPoints = var_cutoff[col]
        # -1 among the cut points marks a variable binned with -1 as special.
        special_attribute = [-1] if -1 in cutOffPoints else []
        binValue = df[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        df[col2] = binValue.map(lambda x: woe_map[x])
    else:
        # Categorical variable: its values are the WOE keys directly.
        df[col2] = df[col].map(lambda x: woe_map[x])

### (i) keep variables whose IV exceeds the threshold
all_IV = sorted(var_IV.values(), reverse=True)
plt.bar(x=range(len(all_IV)), height=all_IV)
plt.show()
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]

### (ii) pairwise linear correlation between WOE-coded variables
var_IV_selected = {k: var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected, key=var_IV_selected.get, reverse=True)
removed_var = []
roh_thresould = 0.6
for i, var_i in enumerate(var_IV_sorted[:-1]):
    if var_i in removed_var:
        continue
    x1 = var_i + "_WOE"
    for var_j in var_IV_sorted[i + 1:]:
        if var_j in removed_var:
            continue
        x2 = var_j + "_WOE"
        roh = np.corrcoef([df[x1], df[x2]])[0, 1]
        if abs(roh) >= roh_thresould:
            print('the correlation coeffient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
            # Of a highly correlated pair, keep the one with the larger IV.
            if var_IV[var_i] > var_IV[var_j]:
                removed_var.append(var_j)
            else:
                removed_var.append(var_i)
                # var_i is gone; stop using it to eliminate other variables.
                break
var_IV_sortet_2 = [k for k in var_IV_sorted if k not in removed_var]

### (iii) warn about variables whose VIF against all the others exceeds 10
# With fewer than two variables there is nothing to regress on.
if len(var_IV_sortet_2) > 1:
    for var in var_IV_sortet_2:
        x0 = df[var + '_WOE'].to_numpy()
        X_Col = [k + '_WOE' for k in var_IV_sortet_2 if k != var]
        # Plain ndarray instead of np.matrix, which scikit-learn estimators
        # no longer accept.
        X = df[X_Col].to_numpy()
        regr = LinearRegression()
        clr = regr.fit(X, x0)
        x_pred = clr.predict(X)
        R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
        vif = 1 / (1 - R2)
        if vif > 10:
            print("Warning: the vif for {0} is {1}".format(var, vif))
六、建模
##########################################
# Step 5: fit the logistic regression model #
##########################################
multi_analysis = [i + '_WOE' for i in var_IV_sortet_2]
y = df['target']
X = df[multi_analysis].copy()
X['intercept'] = 1  # statsmodels' Logit does not add a constant by itself
print('------------入模特征数--------------')
print(len(multi_analysis))

LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables may be insignificant in the joint model; check each of them
# in a univariate regression.  'intercept' is skipped: it is not a column of
# df, so looking it up would raise KeyError.
varLargeP = sorted(
    (k for k, v in pvals.items() if v >= 0.1 and k != 'intercept'),
    key=pvals.get,
    reverse=True,
)
p_value_list = {}
for var in varLargeP:
    X_temp = df[var].copy().to_frame()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k,v))

# Some variables may have a non-negative coefficient; check the sign of each
# in a univariate regression (again skipping the constant term).
varPositive = [k for k, v in params.items() if v >= 0 and k != 'intercept']
coef_list = {}
for var in varPositive:
    X_temp = df[var].copy().to_frame()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k, v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k,v))

# Forward selection: add a variable only if, after adding it, every p-value
# is below 0.1 and every non-constant coefficient is negative.
selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var + [var]
    X_temp = df[try_vars].copy()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    pvals = LR.pvalues
    # drop() returns a new Series, leaving the fitted result untouched.
    params = LR.params.drop('intercept')
    if max(pvals) < 0.1 and max(params) < 0:
        selected_var.append(var)

# Refit on the variables that were actually selected.  Without this, the
# summary and the model below would use the last *tried* combination, which
# includes the final candidate even when it was rejected.
X_temp = df[selected_var].copy()
X_temp['intercept'] = 1
LR = sm.Logit(y, X_temp).fit()
print(LR.summary2())

from sklearn.linear_model import LogisticRegression
logic = LogisticRegression()
logic.fit(X_temp, y)
y_pred = logic.predict(X_temp)
pred_proba = logic.predict_proba(X_temp)

from scikitplot import metrics
# confusion matrix
metrics.plot_confusion_matrix(y, y_pred)
plt.show()
# ROC curve; curves must be a sequence of names, hence the one-element tuple
metrics.plot_roc_curve(y, pred_proba, curves=('each_class',))
plt.show()
# precision-recall curve
metrics.plot_precision_recall_curve(y, pred_proba, curves=('each_class',))
plt.show()
# KS curve
metrics.plot_ks_statistic(y, pred_proba)
plt.show()
私人分享,仅为记录!!!