阿里云大学笔记——线性回归:最小二乘法

1.导入波士顿房价数据

(此处原有一张图片,因外链图片转存失败未能显示。)

import numpy as np
import pandas as pd

# Load the Boston housing data set from a local CSV file into a DataFrame.
data=pd.read_csv("data/boston.csv")
# Bare expression: as the last line of a notebook cell it displays the DataFrame.
data
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90 7.88 11.9

506 rows × 14 columns

# Show basic information about the data (column dtypes and non-null counts),
# which also reveals whether any values are missing.
data.info()
# Check whether the data contains any duplicated rows.
data.duplicated().any()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB





False

2. 实现线性回归(最小二乘法)

最小二乘法的目标是最小化残差平方和 $\lVert Xw - y \rVert^2$;当 $X^{T}X$ 可逆时,其解析解为 $\hat{w} = (X^{T}X)^{-1}X^{T}y$。

class LinearRegression:
    """Ordinary least-squares linear regression.

    Attributes:
        w_: Fitted weights as an ``np.matrix`` column of shape
            (n_features, 1). Only available after ``fit`` has been called.
    """

    def fit(self, X, y):
        """Estimate the weights that minimise the squared error on (X, y).

        Args:
            X: Feature values of shape (n_samples, n_features); anything
                ``np.array`` can convert to floats (DataFrame, ndarray,
                nested list). No intercept column is added here: include a
                column of ones in X to fit one.
            y: Target values with n_samples entries (1-D or a column).
        """
        # np.array copies, so the caller's data (possibly a slice/view of a
        # larger object) is never aliased or modified.
        X = np.atleast_2d(np.array(X, dtype=float))
        # Matrix algebra needs the target as an (n_samples, 1) column.
        y = np.array(y, dtype=float).reshape(-1, 1)
        # lstsq minimises ||Xw - y|| directly. For full-rank X this equals
        # the normal-equation solution (X^T X)^-1 X^T y, but it does not
        # invert X^T X explicitly, so collinear features yield the
        # minimum-norm solution instead of a "Singular matrix" error.
        weights = np.linalg.lstsq(X, y, rcond=None)[0]
        # Keep the historical np.matrix column type for existing callers.
        self.w_ = np.asmatrix(weights)

    def predict(self, X):
        """Return predictions for X as a flat 1-D ndarray.

        Args:
            X: Feature values of shape (n_samples, n_features), with the
                same column layout that was passed to ``fit``.

        Returns:
            1-D ``np.ndarray`` with one prediction per row of X.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # Multiply as plain arrays, then flatten the (n_samples, 1) column.
        return (X @ np.asarray(self.w_)).ravel()
        

3. 数据切分,进行预测

3.1 不考虑截距

# Shuffle every row with a fixed seed so the split is reproducible.
t = data.sample(n=len(data), random_state=666)
# The first 400 shuffled rows train the model and the rest are held out.
# The last column is the target; every other column is a feature.
train_X, train_y = t.iloc[:400, :-1], t.iloc[:400, -1]
test_X, test_y = t.iloc[400:, :-1], t.iloc[400:, -1]

my_reg = LinearRegression()
my_reg.fit(train_X, train_y)
result = my_reg.predict(test_X)
# Mean squared error on the held-out rows.
display(np.mean((result - test_y) ** 2))
# Inspect the fitted weights.
display(my_reg.w_)
18.595095881220296



matrix([[-2.13527048e-01],
        [ 4.64325786e-02],
        [-4.40798167e-02],
        [ 3.94148352e+00],
        [-2.43031921e+00],
        [ 5.60499592e+00],
        [-3.53223856e-03],
        [-9.53062020e-01],
        [ 2.00556858e-01],
        [-9.05450000e-03],
        [-2.68508504e-01],
        [ 1.53991325e-02],
        [-4.94776734e-01]])

3.2 考虑截距:

在特征矩阵的最前面增加一列,该列的所有值为1;这一列对应的权重 w0 即为截距。

# Shuffle every row with a fixed seed so the split is reproducible.
t = data.sample(n=len(data), random_state=666)
# By convention the intercept is w0, so its matching x0 column goes first.
new_columns = t.columns.insert(0, "Intercept")
# Reindexing onto the extended column list creates the missing "Intercept"
# column, and fill_value=1 sets every entry of it to 1.
t = t.reindex(columns=new_columns, fill_value=1)


# The first 400 shuffled rows train the model and the rest are held out.
# The last column is the target; every other column is a feature.
train_X, train_y = t.iloc[:400, :-1], t.iloc[:400, -1]
test_X, test_y = t.iloc[400:, :-1], t.iloc[400:, -1]

my_reg2 = LinearRegression()
my_reg2.fit(train_X, train_y)
result2 = my_reg2.predict(test_X)
# Mean squared error on the held-out rows.
display(np.mean((result2 - test_y) ** 2))
# Inspect the fitted weights; the first entry belongs to the intercept column.
display(my_reg2.w_)
18.706069253903184



matrix([[ 3.83679438e+01],
        [-1.70796105e-02],
        [ 4.08402871e-02],
        [-7.84161928e-03],
        [ 3.94149503e+00],
        [-1.85251673e+01],
        [ 3.46054943e+00],
        [ 1.54653190e-03],
        [-1.46077566e+00],
        [ 2.87513235e-01],
        [-1.24453964e-02],
        [-8.57033425e-01],
        [ 9.42230048e-03],
        [-6.08173883e-01]])

4. 可视化展示

import matplotlib as mpl
import matplotlib.pyplot as plt

# Use the SimHei font -- presumably so the Chinese title and labels below
# render correctly; this requires SimHei to be installed on the machine.
mpl.rcParams["font.family"]="SimHei"
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams["axes.unicode_minus"]=False

plt.figure(figsize=(10,10))
# Plot the predicted values (red circles joined by a solid line).
plt.plot(result2,"ro-",label="预测值")
# Plot the true values (green circles joined by a dashed line).
plt.plot(test_y.values,"go--",label="真实值")
plt.title("线性回归预测-最小二乘法")
plt.xlabel("样本序号")
plt.ylabel("房价")
plt.legend()
<matplotlib.legend.Legend at 0x20f9d9defc8>

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/chairon/article/details/107720648