import pandas as pd
import numpy as np

# Breast-cancer classification with logistic regression: load the data,
# clean it, train and persist a model, then report evaluation metrics.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

try:
    import joblib
except ImportError:
    # scikit-learn releases before 0.23 still bundled joblib here.
    from sklearn.externals import joblib

# 1. Load the data.
# NOTE(review): `path` is an empty placeholder; point it at the CSV file
# before running.
path = ""
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
               'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
               'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv(path, names=column_name)

# 2. Pre-processing: handle missing values.
# 1) Missing entries are marked with "?"; turn them into NaN.
data = data.replace(to_replace="?", value=np.nan)
# 2) Drop every sample that has a missing value, because the estimator
#    cannot be fitted on NaN.
data = data.dropna()

# 3. Split the data set.
# Features are every column except the first (sample id) and the last
# (target); the target is the "Class" column.
x = data.iloc[:, 1:-1]
y = data["Class"]
# No random_state is passed, so the split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# 4. Feature engineering: standardization.
# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 5. Estimator: fit the logistic regression model.
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

# Save the fitted model to LogisticRegression.pkl in the current directory.
# Only the model is persisted; `transfer` must be reused (or saved as well)
# to scale new inputs the same way.
joblib.dump(estimator, 'LogisticRegression.pkl')

# Model parameters of the logistic regression: coefficients and intercept.
print("回归系数:", estimator.coef_)
print("误差(偏置):", estimator.intercept_)

# 6. Model evaluation.
# Method 1: compare predictions with the true labels directly.
y_predict = estimator.predict(x_test)
print("y_predict(预测值):\n", y_predict)

# Method 2: accuracy on the test split.
score = estimator.score(x_test, y_test)
print("准确率为:", score)

# Precision, recall and F1-score per class (label 2 / label 4).
report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"])
print(report)

# ROC curve / AUC metric.
# y_true must be binary: 0 for the negative class, 1 for the positive class,
# so map labels greater than 3 to 1 and the rest to 0.
y_true = np.where(y_test > 3, 1, 0)
# AUC: the closer to 1, the better.
AUC = roc_auc_score(y_true, y_predict)
print("AUC:", AUC)

# Sample output from one run of the training/evaluation steps above
# (the train/test split is not seeded, so the numbers vary between runs):
#
# 回归系数: [[1.47945227 0.07579265 0.59505721 0.69195463 0.33274168 1.16446335
#   1.16645995 0.92205206 0.72380317]]
# 误差(偏置): [-0.93015988]
# y_predict(预测值):
#  [4 4 2 4 2 2 4 4 4 2 2 4 2 2 4 2 4 4 2 2 2 2 4 2 2 4 2 2 4 2 4 4 2 4 2 4 2
#  4 2 2 2 4 4 2 4 2 2 4 2 2 2 4 4 2 2 2 4 2 2 2 4 2 2 2 4 2 4 2 4 4 4 2 4 2
#  2 4 2 2 2 4 2 4 2 2 2 4 4 4 2 4 2 2 4 2 4 4 2 2 2 2 2 2 2 4 2 2 2 4 2 2 2
#  2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 4
#  2 2 2 2 2 2 2 4 2 4 2 2 2 4 4 4 2 2 2 2 4 2 2]
# 准确率为: 0.9766081871345029
#               precision    recall  f1-score   support
#
#          良性       0.98      0.98      0.98       114
#          恶性       0.96      0.96      0.96        57
#
# avg / total       0.98      0.98      0.98       171
#
# AUC: 0.9736842105263157

# Predict with the persisted model and show the results as a table.
try:
    import joblib
except ImportError:
    # scikit-learn releases before 0.23 still bundled joblib here.
    from sklearn.externals import joblib

# Feature rows to predict on. As a stand-in for new data, draw 50 random rows
# from the original data set; rows containing NaN are excluded first because
# the estimator cannot predict on missing values.
x = data.iloc[:, 1:-1].dropna().sample(50)

# Standardize with the scaler that was fitted on the training data.
# Fitting a fresh scaler on these 50 rows would scale the features
# differently from what the model was trained on.
x_s = transfer.transform(x)

# Load the saved model. joblib files are pickles: only load files you trust.
estimator = joblib.load('LogisticRegression.pkl')
y_predict = estimator.predict(x_s)
print("y_predict(预测值):\n", y_predict)

# Present the result as a table: the sampled features plus the prediction.
data = x.assign(y_predict=y_predict)
print(data.head())
# Sample output from one run of the prediction step above (the 50 rows are
# drawn at random, so the values vary between runs):
#
# y_predict(预测值):
#  [2 2 4 2 2 4 4 4 2 2 2 2 2 4 4 4 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2
#  2 2 4 2 4 4 4 2 4 2 2 2 2]
# Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses y_predict
# 429 2 1 1 1 2 1 2 1 1 2
# 219 6 1 3 1 2 1 3 1 1 2
# 320 7 6 3 2 5 10 7 4 6 4
# 423 5 1 3 1 2 1 2 1 1 2
# 678 1 1 1 1 2 1 1 1 1 2
