1.线性回归
from sklearn. model_selection import train_test_split
from sklearn. datasets import load_boston
from sklearn. metrics import mean_squared_error
from sklearn. linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn. preprocessing import StandardScaler
def linear1():
    """Solve the Boston housing regression with the normal equation.

    Best suited to small datasets (roughly under 100K samples) because the
    closed-form solution has a high time complexity.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only, then
    # reuse the same statistics on the test split.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    linear1 = LinearRegression()
    linear1.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear1.coef_)
    print('偏置:', linear1.intercept_)
    y_predict = linear1.predict(x_test)
    # Conventional argument order is (y_true, y_pred); MSE is symmetric so
    # the result is unchanged.
    error1 = mean_squared_error(y_test, y_predict)
    print('正规方程的均方差误差:', error1)
def linear2():
    """Solve the Boston housing regression with stochastic gradient descent.

    Better suited to large datasets (roughly over 100K samples) where the
    closed-form normal equation becomes too expensive.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # L1 penalty adds lasso-style regularization to the SGD solver.
    linear2 = SGDRegressor(penalty='l1')
    linear2.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear2.coef_)
    print('偏置:', linear2.intercept_)
    y_predict = linear2.predict(x_test)
    error2 = mean_squared_error(y_test, y_predict)
    print('梯度下降的均方差误差:', error2)
def linear3():
    """Solve the Boston housing regression with ridge (L2) regression."""
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # Renamed the local from `linear1` to `ridge`: it shadowed the sibling
    # function linear1() and did not describe the estimator.
    ridge = Ridge()
    ridge.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', ridge.coef_)
    print('偏置:', ridge.intercept_)
    y_predict = ridge.predict(x_test)
    error = mean_squared_error(y_test, y_predict)
    print('岭回归的均方差误差:', error)
if __name__ == '__main__':
    # Run the three regression demos in sequence when executed as a script.
    for demo in (linear1, linear2, linear3):
        demo()
2.逻辑回归
逻辑回归虽然名字里带“回归”,实际上解决的是二分类问题。
原理:逻辑回归的输入就是线性回归的输出,然后经过Sigmoid函数,映射成两个种类的概率。
误差函数:对数似然误差。
数据处理小技巧
y_true = np.where(y_test>2.5,1,0) # 向量化的条件选择(类似三元运算符)
data.replace(to_replace='?',value=np.nan) # 将占位符'?'替换为缺失值
data.isnull().any(axis=0) # 按列检查是否存在缺失值
精准率和召回率
精准率: 你认为的正样本,你猜对了多少。
召回率: 正样本中有多少被你找了出来。
样本不均衡
import pandas as pd
import numpy as np

# Breast Cancer Wisconsin dataset (UCI). The raw file ships without a
# header row, so the column names are supplied explicitly.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_name = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion',
    'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
    'Normal Nucleoli', 'Mitoses', 'Class',
]
data = pd.read_csv(path, names=column_name)
data.head()
Sample code number
Clump Thickness
Uniformity of Cell Size
Uniformity of Cell Shape
Marginal Adhesion
Single Epithelial Cell Size
Bare Nuclei
Bland Chromatin
Normal Nucleoli
Mitoses
Class
0
1000025
5
1
1
1
2
1
3
1
1
2
1
1002945
5
4
4
5
7
10
3
2
1
2
2
1015425
3
1
1
1
2
2
3
1
1
2
3
1016277
6
8
8
1
3
4
3
7
1
2
4
1017023
4
1
1
3
2
1
3
1
1
2
# The raw file marks missing values with '?'; map them to NaN and drop
# the incomplete rows in place.
data = data.replace(to_replace='?', value=np.nan)
data.dropna(inplace=True)
# Sanity check: no column should still contain missing values.
data.isnull().any(axis=0)
Uniformity of Cell Size False
Uniformity of Cell Shape False
Marginal Adhesion False
Single Epithelial Cell Size False
Bare Nuclei False
Bland Chromatin False
Normal Nucleoli False
Mitoses False
Class False
dtype: bool
# Features are every column between the sample id and the Class label.
x = data.iloc[:, 1:-1]
y = data["Class"]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Standardize: fit the scaler on the training split only, then apply the
# same transform to the test split.
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# Fitted weights and bias (inspected interactively in the notebook).
estimator.coef_
estimator.intercept_
# Mean accuracy on the held-out split.
estimator.score(x_test, y_test)
0.9532163742690059
from sklearn.metrics import classification_report

# Per-class precision/recall/F1; in this dataset label 2 = benign and
# label 4 = malignant.
y_predict = estimator.predict(x_test)
report = classification_report(y_test, y_predict, labels=[2, 4],
                               target_names=['良性', '恶性'])
print(report)
良性 0.96 0.96 0.96 112
恶性 0.93 0.93 0.93 59
macro avg 0.95 0.95 0.95 171
weighted avg 0.95 0.95 0.95 171
from sklearn.metrics import roc_auc_score

# Binarize the labels for AUC: 2 (benign) -> 0, 4 (malignant) -> 1.
y_true = np.where(y_test > 2.5, 1, 0)
roc_auc_score(y_true, y_predict)
0.9482445520581114
3.样本的保存与提取
模型保存 oblib.dump(linear1,‘linear1.pkl’)
模型加载 linear1 = joblib.load(‘linear1.pkl’)
from sklearn. datasets import load_boston
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression
from sklearn. preprocessing import StandardScaler
from sklearn. metrics import mean_squared_error
import joblib
def linear1():
    """Normal-equation Boston housing demo that loads a persisted model.

    The training/saving code is kept commented out below; once
    'linear1.pkl' exists on disk, the fitted model is simply reloaded
    with joblib instead of retraining.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize exactly as during training so the reloaded model sees
    # features on the same scale.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    """
    linear1 = LinearRegression()
    linear1.fit(x_train, y_train)
    # Persist the fitted model to disk.
    joblib.dump(linear1, 'linear1.pkl')
    """
    # Reload the previously saved model instead of retraining.
    linear1 = joblib.load('linear1.pkl')
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear1.coef_)
    print('偏置:', linear1.intercept_)
    y_predict = linear1.predict(x_test)
    error1 = mean_squared_error(y_test, y_predict)
    print('正规方程的均方差误差:', error1)
if __name__ == '__main__':
    # Run the model-persistence demo when executed as a script.
    linear1()
4.K-means聚类
随机选取K个初始中心点,将每个样本划分到最近的中心,再重新计算各簇的中心,重复这一过程直到中心点不再变化。
模型评估-轮廓系数
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score

# Cluster the iris measurements into 3 groups (one per known species).
iris = load_iris()
estimator = KMeans(n_clusters=3)
# BUG FIX: KMeans is unsupervised — the original passed iris.target to
# fit(), which sklearn silently ignores; that was misleading to readers.
estimator.fit(iris.data)
labels = estimator.predict(iris.data)
# Silhouette coefficient in [-1, 1]; values closer to 1 indicate
# better-separated clusters.
print(silhouette_score(iris.data, labels))