import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
def load_and_analyse_data():
    """Load the credit-card data set and build a SMOTE-balanced training set.

    Steps:
      1. Read ``./data/creditcard.csv``.
      2. Standardise the ``Amount`` column into ``normAmout`` and drop the
         raw ``Time`` and ``Amount`` columns.
      3. Split features / ``Class`` label 70/30 (``random_state=0``).
      4. Over-sample the training part only with SMOTE (``random_state=0``);
         the test part keeps its original class distribution.

    Returns:
        tuple: ``(X_test, y_test, X_sample, y_sample)`` as NumPy arrays, where
        the ``y_*`` arrays are 1-D and ``X_sample`` / ``y_sample`` are the
        SMOTE-resampled training data.
    """
    data = pd.read_csv('./data/creditcard.csv')

    # ---------------------- preprocessing ----------------------
    # Standardise the Amount column.
    # NOTE(review): the scaler is fitted on the full data set before the
    # train/test split, so test-set statistics leak into the feature.
    # Behaviour kept as in the original; consider fitting on the train part.
    scaled_amount = StandardScaler().fit_transform(
        data['Amount'].values.reshape(-1, 1))
    data['normAmout'] = scaled_amount.ravel()
    data = data.drop(['Time', 'Amount'], axis=1)

    # `DataFrame.ix` was removed in pandas 1.0; use label-based selection.
    X = data.loc[:, data.columns != 'Class']
    y = data['Class']  # 1-D label vector
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # ---------------------- sampling ----------------------
    # Over-sample the original training split only; the result is what the
    # model is trained on. `fit_sample` was removed in imbalanced-learn 0.8
    # in favour of `fit_resample`.
    sample_solver = SMOTE(random_state=0)
    X_sample, y_sample = sample_solver.fit_resample(X_train, y_train)

    return (
        np.asarray(X_test),
        np.asarray(y_test).reshape(len(y_test)),
        np.asarray(X_sample),
        np.asarray(y_sample).reshape(len(y_sample)),
    )
if __name__ == '__main__':
    # X_test / y_test keep the original class distribution; X_sample /
    # y_sample are the SMOTE-resampled training data.
    X_test, y_test, X_sample, y_sample = load_and_analyse_data()

    # Carve a dev set out of the resampled data.
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_sample, y_sample, test_size=0.3, random_state=1)
    split_sizes = (len(y_train), len(y_dev), len(y_test))
    print("X_train:{} X_dev:{} X_test:{}".format(*split_sizes))

    # 5-fold grid search over the regularisation strength C, trained on the
    # resampled training split.
    model = LogisticRegression()
    parameters = {'C': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]}
    gs = GridSearchCV(model, parameters, verbose=5, cv=5)
    gs.fit(X_train, y_train)
    print('最佳模型:', gs.best_params_, gs.best_score_)

    # Evaluate on the resampled dev split.
    print('在采样数据上的性能表现:')
    print(gs.score(X_dev, y_dev))
    y_dev_pre = gs.predict(X_dev)
    print(classification_report(y_dev, y_dev_pre))

    # Evaluate on the untouched test split.
    print('在原始数据上的性能表现:')
    print(gs.score(X_test, y_test))
    y_pre = gs.predict(X_test)
    print(classification_report(y_test, y_pre))
数据:
链接: https://pan.baidu.com/s/1OlZ-nkS4sbjSgoaetqqOGg 提取码: ggr8
什么是过采样:
目的:处理数据不平衡问题。
方法:当数据不平衡时,比如标签为1的样本有10000个,标签为0的样本只有100个,这时如果采用下采样会浪费很多样本,
所以引入过采样:过采样根据少数类样本的分布规律生成更多该类样本,从而使数据趋于平衡。
典型的过采样方式是SMOTE等
关于SMOTE具体算法:
https://blog.csdn.net/jiede1/article/details/70215477
1、对于少数类中每一个样本x,以欧氏距离为标准计算它到少数类样本集Smin中所有样本的距离,得到其k近邻。
2、根据样本不平衡比例设置一个采样比例以确定采样倍率N,对于每一个少数类样本x,从其k近邻中随机选择若干个样本,假设选择的近邻为xn。
3、对于每一个随机选出的近邻xn,分别与原样本x按照如下的公式构建新的样本:$x_{new} = x + \mathrm{rand}(0,1) \times (x_n - x)$,其中 rand(0,1) 表示区间 (0,1) 内的一个随机数。
效果对比: