下面就是相关数据。这是一份脱敏数据,已经过一定的处理,因此我们不需要再对相关特征做额外的分析和处理。
如果需要做实验,请在评论中留下邮箱,我会发送数据。下面是代码部分,每一步的具体作用已在代码注释中详细阐述。
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
# ---- Load and preprocess the (anonymised) credit-card data -----------------
credit = pd.read_csv('./data/creditcard.csv')
# print(credit.shape)  # 284807 x 31
print(credit.head())
# Sanity check: the dataset contains no missing values.
# print(pd.isnull(credit).values.any())  # -> False

# Class distribution: 0 = not fraud (284315 rows), 1 = fraud (492 rows).
# NOTE: pd.value_counts(series) is deprecated -- call the Series method.
count_classes = credit['Class'].value_counts(sort=True)
# print(count_classes)

# 'Time' carries no useful signal here; 'Amount' is on a very different scale
# from the PCA features, so standardise it into 'normAmount'.
credit['normAmount'] = StandardScaler().fit_transform(
    credit['Amount'].values.reshape(-1, 1))
# Drop the raw 'Time' and 'Amount' columns.
credit = credit.drop(['Time', 'Amount'], axis=1)

# Shuffle the rows (optional, but harmless).  DataFrame.sample keeps the
# per-column dtypes intact, whereas the old round-trip through np.array /
# np.random.shuffle upcast every column to one common dtype.
credit = credit.sample(frac=1).reset_index(drop=True)

# Split features and label.
X = credit.loc[:, credit.columns != 'Class']
y = credit.loc[:, credit.columns == 'Class']

# The classes are heavily imbalanced (492 fraud vs 284315 normal), so build a
# balanced subset by under-sampling the majority (non-fraud) class.
bad_num = len(credit[credit['Class'] == 1])        # number of fraud rows
bad_indices = credit[credit['Class'] == 1].index   # row indices of fraud
good_indices = credit[credit['Class'] == 0].index  # row indices of non-fraud

# Draw exactly bad_num *distinct* normal rows.  replace=False matters: the
# original call sampled with replacement, so the "balanced" subset could
# contain duplicate normal rows and fewer unique ones than intended.
random_good_indices = np.array(
    np.random.choice(good_indices, bad_num, replace=False))
resample_indices = np.concatenate([bad_indices, random_good_indices])
# print(resample_indices)
resample_data = credit.iloc[resample_indices, :]
# print(len(resample_data))
resample_x = resample_data.loc[:, resample_data.columns != 'Class']
resample_y = resample_data.loc[:, resample_data.columns == 'Class']

# Train/test split for the original (imbalanced) data ...
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
# ... and for the under-sampled (balanced) data.
re_x_train, re_x_test, re_y_train, re_y_test = train_test_split(
    resample_x, resample_y, test_size=0.25, random_state=42)
# print(len(x_train))     # 213605
# print(len(re_x_train))  # 738
# At this point we have two datasets ready: the raw one and the resampled one.
def print_kfold(x_train_data, y_train_data):
    """Pick the best inverse-regularisation strength C for an L1-penalised
    logistic regression via 5-fold cross-validation, scored by recall.

    Parameters
    ----------
    x_train_data : pd.DataFrame
        Training features.
    y_train_data : pd.DataFrame
        Training labels (a single 'Class' column of 0/1).

    Returns
    -------
    float
        The C value with the highest mean cross-validated recall.
    """
    fold = KFold(n_splits=5, shuffle=False)  # 5-fold cross-validation
    # Candidate regularisation strengths to try.
    c_param_range = [0.01, 0.1, 1, 10, 100]
    result = pd.DataFrame(index=range(len(c_param_range)),
                          columns=['C_parameter', 'Mean recall score'])
    result['C_parameter'] = c_param_range
    for j, c_param in enumerate(c_param_range):
        print("*" * 100)
        print('C parameter:', c_param)
        recall_accs = []
        for train_index, test_index in fold.split(x_train_data, y_train_data):
            # The default lbfgs solver does not support the L1 penalty;
            # sklearn >= 0.22 raises without an explicit solver='liblinear'.
            lr = LogisticRegression(C=c_param, penalty='l1',
                                    solver='liblinear')
            # .values.ravel() flattens the single-column label frame to the
            # 1-D shape sklearn expects (avoids DataConversionWarning).
            lr.fit(x_train_data.iloc[train_index, :],
                   y_train_data.iloc[train_index, :].values.ravel())
            y_pred = lr.predict(x_train_data.iloc[test_index, :])
            recall = recall_score(
                y_train_data.iloc[test_index, :].values.ravel(), y_pred)
            recall_accs.append(recall)
            print("此次召回率:", recall)
        result.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        print('平均召回率:', np.mean(recall_accs))
    print(result)
    # The score column was filled row by row, so it is object-typed;
    # cast to float before idxmax.
    result['Mean recall score'] = result['Mean recall score'].astype('float64')
    best_c = result.loc[result['Mean recall score'].idxmax()]['C_parameter']
    print("最好的参数C:", best_c)
    return best_c
# ---- Final model: train on the balanced training set and evaluate ----------
best_c = print_kfold(re_x_train, re_y_train)
# lbfgs (the default solver) does not support the L1 penalty; use liblinear.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(re_x_train, re_y_train.values.ravel())
# Evaluate on the held-out resampled *test* split.  The original code
# predicted on the training data itself, which overstates every metric.
re_y_pred = lr.predict(re_x_test)
matrix = confusion_matrix(re_y_test, re_y_pred)
print("混淆矩阵:\n", matrix)
print("精度:", precision_score(re_y_test, re_y_pred))
print("召回率:", recall_score(re_y_test, re_y_pred))
print("f1分数:", f1_score(re_y_test, re_y_pred))
最后的实验效果如下: