1.线性回归
from sklearn. model_selection import train_test_split
from sklearn. datasets import load_boston
from sklearn. metrics import mean_squared_error
from sklearn. linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn. preprocessing import StandardScaler
def linear1():
    """Solve the Boston housing regression with the normal equation.

    Best suited to small datasets (roughly under 100K samples) because the
    closed-form solution has a high time complexity.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only, then
    # reuse the same statistics on the test split.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    linear1 = LinearRegression()
    linear1.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear1.coef_)
    print('偏置:', linear1.intercept_)
    y_predict = linear1.predict(x_test)
    # Conventional argument order is (y_true, y_pred); MSE is symmetric so
    # the result is unchanged.
    error1 = mean_squared_error(y_test, y_predict)
    print('正规方程的均方差误差:', error1)
def linear2():
    """Solve the Boston housing regression with stochastic gradient descent.

    Better suited to large datasets (roughly over 100K samples) where the
    closed-form normal equation becomes too expensive.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # L1 penalty adds lasso-style regularization to the SGD solver.
    linear2 = SGDRegressor(penalty='l1')
    linear2.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear2.coef_)
    print('偏置:', linear2.intercept_)
    y_predict = linear2.predict(x_test)
    error2 = mean_squared_error(y_test, y_predict)
    print('梯度下降的均方差误差:', error2)
def linear3():
    """Solve the Boston housing regression with ridge (L2) regression."""
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize features: fit the scaler on training data only.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    # Renamed the local from `linear1` to `ridge`: it shadowed the sibling
    # function linear1() and did not describe the estimator.
    ridge = Ridge()
    ridge.fit(x_train, y_train)
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', ridge.coef_)
    print('偏置:', ridge.intercept_)
    y_predict = ridge.predict(x_test)
    error = mean_squared_error(y_test, y_predict)
    print('岭回归的均方差误差:', error)
if __name__ == '__main__':
    # Run the three regression demos in sequence when executed as a script.
    for demo in (linear1, linear2, linear3):
        demo()
2.逻辑回归
逻辑回归虽然名字里带“回归”,实际上解决的是二分类问题。
原理:逻辑回归的输入就是线性回归的输出,然后经过Sigmoid函数,映射成两个种类的概率。
误差函数:对数似然误差。
数据处理小技巧
y_true = np.where(y_test>2.5,1,0) # 向量化的条件选择(类似三元运算符)
data.replace(to_replace='?',value=np.nan) # 将占位符'?'替换为缺失值
data.isnull().any(axis=0) # 按列检查是否存在缺失值
精准率和召回率
精准率: 你认为的正样本,你猜对了多少。
召回率: 正样本中有多少被你找了出来。
样本不均衡
import pandas as pd
import numpy as np

# Breast Cancer Wisconsin dataset (UCI). The raw file ships without a
# header row, so the column names are supplied explicitly.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_name = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion',
    'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
    'Normal Nucleoli', 'Mitoses', 'Class',
]
data = pd.read_csv(path, names=column_name)
data.head()
Sample code number
Clump Thickness
Uniformity of Cell Size
Uniformity of Cell Shape
Marginal Adhesion
Single Epithelial Cell Size
Bare Nuclei
Bland Chromatin
Normal Nucleoli
Mitoses
Class
0
1000025
5
1
1
1
2
1
3
1
1
2
1
1002945
5
4
4
5
7
10
3
2
1
2
2
1015425
3
1
1
1
2
2
3
1
1
2
3
1016277
6
8
8
1
3
4
3
7
1
2
4
1017023
4
1
1
3
2
1
3
1
1
2
# The raw file marks missing values with '?'; map them to NaN and drop
# the incomplete rows in place.
data = data.replace(to_replace='?', value=np.nan)
data.dropna(inplace=True)
# Sanity check: no column should still contain missing values.
data.isnull().any(axis=0)
Uniformity of Cell Size False
Uniformity of Cell Shape False
Marginal Adhesion False
Single Epithelial Cell Size False
Bare Nuclei False
Bland Chromatin False
Normal Nucleoli False
Mitoses False
Class False
dtype: bool
# Features are every column between the sample id and the Class label.
x = data.iloc[:, 1:-1]
y = data["Class"]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Standardize: fit the scaler on the training split only, then apply the
# same transform to the test split.
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# Fitted weights and bias (inspected interactively in the notebook).
estimator.coef_
estimator.intercept_
# Mean accuracy on the held-out split.
estimator.score(x_test, y_test)
0.9532163742690059
from sklearn.metrics import classification_report

# Per-class precision/recall/F1; in this dataset label 2 = benign and
# label 4 = malignant.
y_predict = estimator.predict(x_test)
report = classification_report(y_test, y_predict, labels=[2, 4],
                               target_names=['良性', '恶性'])
print(report)
良性 0.96 0.96 0.96 112
恶性 0.93 0.93 0.93 59
macro avg 0.95 0.95 0.95 171
weighted avg 0.95 0.95 0.95 171
from sklearn.metrics import roc_auc_score

# Binarize the labels for AUC: 2 (benign) -> 0, 4 (malignant) -> 1.
y_true = np.where(y_test > 2.5, 1, 0)
roc_auc_score(y_true, y_predict)
0.9482445520581114
3.样本的保存与提取
模型保存 oblib.dump(linear1,‘linear1.pkl’)
模型加载 linear1 = joblib.load(‘linear1.pkl’)
from sklearn. datasets import load_boston
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression
from sklearn. preprocessing import StandardScaler
from sklearn. metrics import mean_squared_error
import joblib
def linear1():
    """Normal-equation Boston housing demo that loads a persisted model.

    The training/saving code is kept commented out below; once
    'linear1.pkl' exists on disk, the fitted model is simply reloaded
    with joblib instead of retraining.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this demo
    # requires an older sklearn version.
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=22)
    # Standardize exactly as during training so the reloaded model sees
    # features on the same scale.
    stand = StandardScaler()
    x_train = stand.fit_transform(x_train)
    x_test = stand.transform(x_test)
    """
    linear1 = LinearRegression()
    linear1.fit(x_train, y_train)
    # Persist the fitted model to disk.
    joblib.dump(linear1, 'linear1.pkl')
    """
    # Reload the previously saved model instead of retraining.
    linear1 = joblib.load('linear1.pkl')
    # BUG FIX: coef_ holds the weights and intercept_ holds the bias;
    # the original print labels were swapped.
    print('权重:', linear1.coef_)
    print('偏置:', linear1.intercept_)
    y_predict = linear1.predict(x_test)
    error1 = mean_squared_error(y_test, y_predict)
    print('正规方程的均方差误差:', error1)
if __name__ == '__main__':
    # Run the model-persistence demo when executed as a script.
    linear1()
4.K-means聚类
随机选取K个初始中心点,将每个样本划分到最近的中心,再重新计算各簇的中心,重复这一过程直到中心点不再变化。
模型评估-轮廓系数
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score

# Cluster the iris measurements into 3 groups (one per known species).
iris = load_iris()
estimator = KMeans(n_clusters=3)
# BUG FIX: KMeans is unsupervised — the original passed iris.target to
# fit(), which sklearn silently ignores; that was misleading to readers.
estimator.fit(iris.data)
labels = estimator.predict(iris.data)
# Silhouette coefficient in [-1, 1]; values closer to 1 indicate
# better-separated clusters.
print(silhouette_score(iris.data, labels))