文章目录
一、算法介绍
逻辑回归(Logistic Regression)名字中虽然带有“回归”,但实际上是一种用于分类(主要是二分类)的线性模型
二、算法原理
逻辑回归假设数据服从伯努利分布,通过极大似然估计的方法,运用梯度下降求解参数,从而达到将数据二分类的目的。
三、算法特点
1、实现简单
2、分类时计算量小,速度快
3、容易欠拟合,一般准确度不太高
4、基本形式只能处理二分类问题(多分类需借助 OvR 或 softmax 等扩展),且决策边界是线性的,对线性不可分的数据效果较差
四、癌症分类
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# To inspect the raw dataset bundle, uncomment the lines below:
# out = load_breast_cancer()
# print(out.keys())
# print(out["DESCR"])

# Dataset: 569 samples, 30 attributes (features)
#   212 - Malignant, label 0
#   357 - Benign,    label 1
X, y = load_breast_cancer(return_X_y=True)  # features and labels only
# print(y.sum())

# ----------------------- split the dataset -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    # Stratified split: keeps the class ratio equal in both parts.
    # The original note says this is not used for regression datasets.
    stratify=y,
)
# Per-class sample counts in the test split, if needed:
# print(y_test.sum(), len(y_test) - y_test.sum())

# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ----------------------- algorithm -----------------------
# Accuracies noted by the author:
#   LogisticRegression   -> 0.9912280701754386
#   KNeighborsClassifier -> 0.9736842105263158
model = LogisticRegression().fit(X_train, y_train)
# model = KNeighborsClassifier().fit(X_train, y_train)
# print(model.coef_)       # coefficients
# print(model.intercept_)  # intercept

# Predictions for the held-out samples
y_pred = model.predict(X_test)
# print(y_pred)

# Evaluation metric: accuracy on the test split
accuracy = model.score(X_test, y_test)
print("准确率", accuracy)
五、癌症分类算法比较
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import time
"""
数据集:569样本,30属性(特征)
212 - Malignant(恶性的) 0
357 - Benign(良性的) 1
"""
X, y = load_breast_cancer(return_X_y=True)  # return only features and labels
# print(y.sum())
# print(X)

# ----------------------- split the dataset -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    # Stratified split: keeps the class ratio equal in both parts.
    # The original note says this is not used for regression datasets.
    stratify=y,
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
std = StandardScaler()
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

# ----------------------- algorithm -----------------------
# Swap the active line below to compare classifiers.
# perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations.
start = time.perf_counter()
# alg = LogisticRegression()  # accuracy: 0.9912280701754386
alg = KNeighborsClassifier()  # accuracy: 0.9736842105263158
# alg = GaussianNB()  # accuracy: 0.9210526315789473
alg.fit(X_train, y_train)

# Predictions for the held-out samples
y_pred = alg.predict(X_test)

# Evaluation metric: accuracy on the test split
acc = alg.score(X_test, y_test)

# Stop the clock before printing so console I/O is not part of the measured
# span; the span covers fit + predict + score.
elapsed = time.perf_counter() - start

print("准确率", acc)
print("总时长", elapsed)

# Reference figures recorded from earlier runs (timings vary by machine):
#
#   Gaussian naive Bayes
#     accuracy   0.9210526315789473
#     total time 0.0019948482513427734
#   Logistic regression
#     accuracy   0.9912280701754386
#     total time 0.02098822593688965
#   KNN
#     accuracy   0.9736842105263158
#     total time 0.015989065170288086