Python项目实战调用方法

# Module-level containers; nothing in this section fills them.
dvars, scores = {}, {}

# Load the training sample, indexed by the customer id column.
train = pd.read_csv('train.csv', index_col='CUST_ID')
del train['Unnamed: 0']

# Encode the raw frame, then move the label column 'bad_good' to a new
# trailing column called 'target' (later code treats the last column as label).
df = label_encoder(train)
label_values = df['bad_good']
del df['bad_good']
df['target'] = label_values

# NOTE(review): rough_del is a project helper; presumably it drops
# low-information columns using the 0.9 threshold -- confirm against its source.
df = rough_del(df, p=0.9)

# Classify every feature as numerical or categorical.
# Rule of thumb: a feature with at least 6 distinct non-missing values is
# treated as numerical, everything else as categorical.
numerical_var = []
features = df.columns[:-1]  # every column except the last one (the label)
for col in features:
    # Drop missing values before de-duplicating: NaN != NaN, so a plain
    # set(df[col]) keeps one element per missing row and would make a sparse
    # categorical column look like it has many distinct values.
    uniq_valid_vals = list(set(df[col].dropna()))
    if len(uniq_valid_vals) >= 6:
        numerical_var.append(col)
categorical_var = [i for i in features if i not in numerical_var]

print('数值特征的个数是：{}，分别是{}'.format(len(numerical_var),numerical_var))
print('类别特征的个数是：{}，分别是{}'.format(len(categorical_var),categorical_var))

# Split the features by whether they contain any missing value; a single
# missing entry is enough to queue the column for imputation.
# Series.count() only counts non-missing entries, so a count below the row
# total means the column has gaps.
isnullCol = [col for col in features if df[col].count() < df.shape[0]]
notnullCol = [col for col in features if df[col].count() >= df.shape[0]]

一、以上为对缺失值的判定

二、填补缺失值

# Impute every feature that has missing values. The strategy depends on the
# missing rate returned by the project helper MissingRate:
#   rate > 0.8         -> MissingNewFeature
#   0.5 < rate <= 0.8  -> CategoricalMissingFeature (n=4)
#   0.3 < rate <= 0.5  -> Model_Fillna trained on 3 randomly chosen complete
#                         features ('clf' for categorical, 'reg' otherwise;
#                         the original notes describe these as random-forest
#                         classifier / regressor)
#   rate <= 0.3        -> mode (categorical) or mean (numerical)
# The intervals are closed on the right so that rates of exactly 0.8 or 0.5
# no longer fall through to the simple mean/mode fill.
for col in isnullCol:
    missingRate = MissingRate(df, col)
    print('特征 {} 的缺失率是 {}'.format(col,missingRate))
    if missingRate > 0.8:
        df[col] = MissingNewFeature(df, col)
    elif missingRate > 0.5:
        df = CategoricalMissingFeature(df, col, n=4)
    elif missingRate > 0.3:
        feature = random.sample(notnullCol, 3)
        fill_method = 'clf' if col in categorical_var else 'reg'
        df = Model_Fillna(df, col, feature, method=fill_method)
    elif col in categorical_var:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place form acts on a temporary object and does not
        # update df once pandas Copy-on-Write is active.
        df[col] = df[col].fillna(df[col].dropna().mode().values[0])
    else:
        df[col] = df[col].fillna(df[col].dropna().mean())

三、类别型变量处理方式

# Categorical variables are handled as follows:
# 1. More than 5 distinct values: encode with the bad rate; the encoded column
#    is appended to numerical_var so that it is binned later.
# 2. Otherwise, if some level has zero bad samples, merge levels (MergeBad0)
#    and compute WOE/IV on the merged column; the variable is dropped when one
#    merged bin holds more than 90% of the rows.
# 3. Otherwise compute WOE/IV on the raw column.
deleted_features = []   # original variables that were replaced or dropped
encoded_features = {}   # col -> [encoded column name, bad-rate mapping]; needed again at test/production time
merged_features = {}    # col -> [merged column name, merge mapping]
var_IV = {}             # IV per variable
var_WOE = {}            # WOE table per variable

# Iterate over a snapshot: the loop removes entries from categorical_var, and
# removing from a list while iterating over it silently skips the next element.
for col in list(categorical_var):
    print('we are processing {}'.format(col))
    if len(set(df[col]))>5:
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col)+'_encoding'

        # (1) compute the bad rate per level and encode the column with it
        encoding_result = BadRateEncoding(df, col, 'target')
        df[col0], br_encoding = encoding_result['encoding'],encoding_result['bad_rate']

        # (2) the encoded column goes through the numerical binning step later
        numerical_var.append(col0)

        # (3) keep the encoding so it can be re-applied to new data
        encoded_features[col] = [col0, br_encoding]

        # (4) mark the original column as processed (it is not removed from df here)
        deleted_features.append(col)
    else:
        # number of bad samples (sum of 'target') per level
        bad_bin = df.groupby([col])['target'].sum()
        if min(bad_bin) == 0:
            # At least one level has no bad sample.
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'
            # (1) find a merge of levels; per the original note, the goal is
            #     that every bin contains both good and bad samples
            mergeBin = MergeBad0(df, col, 'target')
            # (2) apply the merge
            df[col1] = df[col].map(mergeBin)
            maxPcnt = MaximumBinPcnt(df, col1)
            # Drop the variable when a single merged bin dominates (> 90%).
            if maxPcnt > 0.9:
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del df[col]
                continue
            # (3) the merged variable is acceptable: keep it and its WOE/IV
            merged_features[col] = [col1, mergeBin]
            WOE_IV = CalcWOE(df, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            deleted_features.append(col)
        else:
            WOE_IV = CalcWOE(df, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']

四、数值型变量处理方式

# Continuous variables are handled as follows:
# 1. Bin the variable with ChiMerge (the original note says 5 bins; the bin
#    count is whatever ChiMerge uses by default -- it is not set here).
# 2. Check that the bad rate is monotone across bins; if not, merge bins
#    until it is.
# 3. Drop the variable if one bin holds more than 90% of the rows, otherwise
#    record its WOE/IV.
var_cutoff = {}   # col -> list of cut-off points actually used

# Iterate over a snapshot: the loop removes entries from numerical_var, and
# removing from a list while iterating over it silently skips the next element.
for col in list(numerical_var):
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    # (1) ChiMerge binning; keep the cut-off points. E.g. [10, 20, 30] splits
    #     the variable into x<10, 10<x<20, 20<x<30 and x>30.
    #     The value -1 marks missing data and is kept out of the binning.
    if -1 in set(df[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = ChiMerge(df, col, 'target',special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    df[col1] = df[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))

    # (2) check whether the bad rate is monotone; merge bins when it is not
    BRM = BadRateMonotone(df, col1, 'target',special_attribute=special_attribute)
    if not BRM:
        if special_attribute == []:
            bin_merged = Monotone_Merge(df, 'target', col1)
            # Within every merged group all bins except the last lose their
            # cut-off point; labels look like 'Bin <index>'.
            removed_index = []
            for merged_group in bin_merged:
                if len(merged_group) > 1:
                    indices = [int(b.replace('Bin ','')) for b in merged_group]
                    removed_index.extend(indices[0:-1])
            removed_point = [cutOffPoints[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints.remove(p)
            var_cutoff[col] = cutOffPoints
            df[col1] = df[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        else:
            # Same procedure, but only on rows and cut-offs that are not the
            # special value; the special value is appended to the stored
            # cut-offs afterwards.
            cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
            temp = df.loc[~df[col].isin(special_attribute)]
            bin_merged = Monotone_Merge(temp, 'target', col1)
            removed_index = []
            for merged_group in bin_merged:
                if len(merged_group) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in merged_group]
                    removed_index.extend(indices[0:-1])
            removed_point = [cutOffPoints2[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints2.remove(p)
            cutOffPoints2 = cutOffPoints2 + special_attribute
            var_cutoff[col] = cutOffPoints2
            df[col1] = df[col].map(lambda x: AssignBin(x, cutOffPoints2, special_attribute=special_attribute))

    # (3) after binning, drop the variable if one bin holds more than 90%
    maxPcnt = MaximumBinPcnt(df, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = CalcWOE(df, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']

五、关于多重共线性分析

##############################################################
# Step 4: univariate and multivariate analysis after WOE coding #
##############################################################

# Add a '<name>_WOE' column for every variable that has a WOE table.
for col, woe_table in var_WOE.items():
    print(col)
    col2 = str(col) + "_WOE"
    if col not in var_cutoff:
        # No cut-off points recorded: the column values are the WOE keys.
        df[col2] = df[col].map(lambda x: woe_table[x])
        continue
    # Binned variable: rebuild the bin label first, then look up its WOE.
    cutOffPoints = var_cutoff[col]
    # A -1 among the stored cut-offs marks the special "missing" value.
    special_attribute = [-1] if -1 in cutOffPoints else []
    binValue = df[col].map(
        lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute)
    )
    df[col2] = binValue.map(lambda x: woe_table[x])

### (i) keep the variables whose IV is above the threshold
iv_threshould = 0.02

# Bar chart of all IV values, largest first.
all_IV = sorted(var_IV.values(), reverse=True)
plt.bar(x=range(len(all_IV)), height=all_IV)
plt.show()

varByIV = [name for name, iv in var_IV.items() if iv > iv_threshould]


### (ii) check pairwise linear correlation between the WOE-encoded variables

# Variable names that passed the IV filter, ordered by IV (largest first).
var_IV_selected = {k: var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d: d[1], reverse=True)
var_IV_sorted = [i[0] for i in var_IV_sorted]

removed_var = []
roh_thresould = 0.6
# For every still-active pair whose |correlation| reaches the threshold,
# drop the variable with the lower IV.
for i in range(len(var_IV_sorted)-1):
    if var_IV_sorted[i] in removed_var:
        continue
    x1 = var_IV_sorted[i]+"_WOE"
    for j in range(i+1,len(var_IV_sorted)):
        if var_IV_sorted[j] in removed_var:
            continue
        x2 = var_IV_sorted[j] + "_WOE"
        roh = np.corrcoef([df[x1], df[x2]])[0, 1]
        if abs(roh) >= roh_thresould:
            print('the correlation coeffient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
            if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
                removed_var.append(var_IV_sorted[j])
            else:
                removed_var.append(var_IV_sorted[i])
                # The outer variable is gone: stop comparing it, otherwise
                # later variables could be dropped only because they correlate
                # with a variable that is no longer in the model.
                break

var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]

### (iii) warn about any variable whose VIF against all other variables exceeds 10
# VIF_i = 1 / (1 - R2_i), where R2_i comes from regressing variable i on the
# remaining WOE-encoded variables.
for i in range(len(var_IV_sortet_2)):
    X_Col = [k+'_WOE' for k in var_IV_sortet_2 if k != var_IV_sortet_2[i]]
    if not X_Col:
        # Only one variable left: there is nothing to regress on, and fitting
        # on a zero-column matrix would raise.
        break
    x0 = np.array(df[var_IV_sortet_2[i]+'_WOE'])
    # Plain ndarray instead of np.matrix: current scikit-learn rejects
    # np.matrix input.
    X = np.array(df[X_Col])
    regr = LinearRegression()
    clr = regr.fit(X, x0)
    x_pred = clr.predict(X)
    R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
    vif = 1/(1-R2)
    if vif > 10:
        print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif))

六、建模

##############################################
# Step 5: fit the logistic regression model  #
##############################################
# Design matrix: the WOE column of every surviving variable plus a constant
# column named 'intercept' (sm.Logit does not add one by itself).
multi_analysis = [name + '_WOE' for name in var_IV_sortet_2]
X = df[multi_analysis].copy()
X['intercept'] = 1
y = df['target']

print('------------入模特征数--------------')
print(len(multi_analysis))

LR = sm.Logit(y, X).fit()
summary = LR.summary2()
# Plain dicts keyed by column name (including 'intercept').
pvals, params = LR.pvalues.to_dict(), LR.params.to_dict()

# Some variables are not significant in the joint model, so each of them is
# re-tested on its own in a univariate regression.
# 'intercept' is skipped: it is a key of pvals but not a column of df, so
# looking it up in df would raise KeyError.
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1 and k != 'intercept'}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    # Univariate design matrix: the variable plus a constant column.
    X_temp = df[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k,v))


# Some variables have a non-negative coefficient in the joint model, so the
# sign of each is re-checked in a univariate regression.
# 'intercept' is skipped: it is a key of params but not a column of df, so
# looking it up in df would raise KeyError.
varPositive = [k for k, v in params.items() if v >= 0 and k != 'intercept']
coef_list = {}
for var in varPositive:
    # Univariate design matrix: the variable plus a constant column.
    X_temp = df[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k, v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k,v))


# Forward selection: start from the first variable and add the candidates one
# at a time. A candidate is kept only if, in the model that includes it, every
# p-value (intercept included) is below 0.1 and every variable coefficient is
# negative.
selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var+[var]
    X_temp = df[try_vars].copy()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    pvals, params = LR.pvalues, LR.params
    del params['intercept']  # the sign rule applies to the variables only
    if max(pvals)<0.1 and max(params)<0:
        selected_var.append(var)

# Refit on the variables that were actually selected. Without this, LR and
# X_temp would still describe the last candidate tried, even when it was
# rejected, and the summary and anything built on X_temp afterwards would use
# the wrong column set.
X_temp = df[selected_var].copy()
X_temp['intercept'] = [1] * X_temp.shape[0]
LR = sm.Logit(y, X_temp).fit()

print(LR.summary2())
# y_pred = LR.predict(X_temp)
# pred_proba = sm.Logit(y,X_temp).pdf(X_temp) # 等于sklearn中的predict_proba()
# y_result = pd.DataFrame({'y_pred':y_pred, 'y_real':list(trainData['target'])})

from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression on the current X_temp design matrix
# to obtain hard labels and class probabilities for the plots.
# fit() returns the estimator itself, so construction and fitting are chained.
logic = LogisticRegression().fit(X_temp, y)
pred_proba = logic.predict_proba(X_temp)
y_pred = logic.predict(X_temp)

from scikitplot import metrics

# `curves` must be a collection of curve names. ('each_class') is just the
# string 'each_class' (parentheses alone do not make a tuple) and only worked
# through substring matching, so a one-element tuple is passed instead.
curves_to_plot = ('each_class',)

# confusion matrix
metrics.plot_confusion_matrix(y, y_pred)
plt.show()
# ROC curve
metrics.plot_roc_curve(y, pred_proba, curves=curves_to_plot)
plt.show()
# precision-recall curve
metrics.plot_precision_recall_curve(y, pred_proba, curves=curves_to_plot)
plt.show()
# KS curve
metrics.plot_ks_statistic(y, pred_proba)
plt.show()

私人分享，仅为记录！！！

Python项目实战 调用方法

猜你喜欢

Python项目实战调用方法