1. 导入波士顿房价数据集
import numpy as np
import pandas as pd
# Load the Boston housing dataset from a local CSV; the last column (MEDV,
# median home value) is the regression target, the rest are features.
data=pd.read_csv("data/boston.csv")
# Preview the first rows (notebook cell displays the returned DataFrame).
data.head()
|
CRIM |
ZN |
INDUS |
CHAS |
NOX |
RM |
AGE |
DIS |
RAD |
TAX |
PTRATIO |
B |
LSTAT |
MEDV |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
3 |
0.03237 |
0.0 |
2.18 |
0 |
0.458 |
6.998 |
45.8 |
6.0622 |
3 |
222 |
18.7 |
394.63 |
2.94 |
33.4 |
4 |
0.06905 |
0.0 |
2.18 |
0 |
0.458 |
7.147 |
54.2 |
6.0622 |
3 |
222 |
18.7 |
396.90 |
5.33 |
36.2 |
2. 实现线性回归:梯度下降
class LinearRegression:
    """Linear regression trained by full-batch gradient descent.

    Minimizes half the sum of squared errors. Weights start at zero and
    are updated for a fixed number of iterations.
    """

    def __init__(self, alpha, times):
        """
        Parameters
        ----------
        alpha : float
            Learning rate controlling the size of each weight update.
        times : int
            Number of gradient-descent iterations to perform.
        """
        self.alpha = alpha
        self.times = times

    def fit(self, X, y):
        """Learn weights from samples X (n_samples, n_features) and targets y.

        After fitting, ``self.w_[0]`` is the intercept, ``self.w_[1:]`` the
        per-feature weights, and ``self.loss_`` the loss at each iteration.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Index 0 holds the bias term; the rest align with the feature columns.
        self.w_ = np.zeros(1 + X.shape[1])
        self.loss_ = []
        for _ in range(self.times):
            predictions = np.dot(X, self.w_[1:]) + self.w_[0]
            residual = y - predictions
            # Loss is SSE/2 so its gradient carries no leading factor of 2.
            self.loss_.append(np.sum(residual ** 2) / 2)
            # Full-batch gradient step (note: gradient is not averaged over
            # the sample count, so alpha effectively scales with n).
            self.w_[0] += self.alpha * np.sum(residual)
            self.w_[1:] += self.alpha * np.dot(X.T, residual)

    def predict(self, X):
        """Return predicted targets for samples X using the learned weights."""
        X = np.asarray(X)
        return np.dot(X, self.w_[1:]) + self.w_[0]
3. 数据切分
# Shuffle all rows reproducibly, then hold out everything after the first
# 400 rows as the test set; the last column is the target.
t = data.sample(len(data), random_state=666)
X_train, y_train = t.iloc[:400, :-1], t.iloc[:400, -1]
X_test, y_test = t.iloc[400:, :-1], t.iloc[400:, -1]
4. 线性回归预测(没标准化)
# Train on the raw (unstandardized) features. The recorded output below
# shows the loss exploding — the step size is far too large for this scale.
my_reg = LinearRegression(alpha=0.0005, times=20)
my_reg.fit(X_train, y_train)
result = my_reg.predict(X_test)
for value in (my_reg.w_, my_reg.loss_, np.mean((result - y_test) ** 2)):
    display(value)
array([-6.59441004e+91, -1.40968186e+92, -6.83088551e+92, -7.91819869e+92,
-4.03805906e+90, -3.74461053e+91, -4.12255808e+92, -4.69487511e+93,
-2.37774656e+92, -7.33988713e+92, -2.93151961e+94, -1.23169166e+93,
-2.34499035e+94, -8.68295194e+92])
[118525.505,
363708382156678.0,
1.4463556286428806e+24,
5.755546457407455e+33,
2.290338437939112e+43,
9.11407858329823e+52,
3.6268189476416507e+62,
1.443241415877832e+72,
5.743175533643604e+81,
2.2854156516968003e+91,
9.094489051263666e+100,
3.619023569832781e+110,
1.4401393552929022e+120,
5.730831321331474e+129,
2.2805034466177901e+139,
9.074941624398248e+148,
3.6112449471793515e+158,
1.4370439621856128e+168,
5.718513641305657e+177,
2.2756017996876753e+187]
4.361920130943844e+194
5. 进行标准化处理
class StandardScaler:
    """Standardize data column-wise to zero mean and unit variance."""

    def fit(self, X):
        """Compute and store the per-column mean and standard deviation of X."""
        values = np.asarray(X)
        self.std_ = np.std(values, axis=0)
        self.mean_ = np.mean(values, axis=0)

    def transform(self, X):
        """Return X centered by ``mean_`` and scaled by ``std_``.

        X is deliberately not converted to an ndarray, so a DataFrame input
        yields a DataFrame output (callers index the result by column name).
        """
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit the scaler on X, then return the standardized X."""
        self.fit(X)
        return self.transform(X)
6. 线性回归预测:标准化处理
# Repeat the identical split (same seed), then standardize before training.
t = data.sample(len(data), random_state=666)
X_train, y_train = t.iloc[:400, :-1], t.iloc[:400, -1]
X_test, y_test = t.iloc[400:, :-1], t.iloc[400:, -1]

# Statistics are learned on the training portion only and reused for the
# test portion, avoiding information leakage from the test set.
s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)
s2 = StandardScaler()
y_train = s2.fit_transform(y_train)
y_test = s2.transform(y_test)

reg = LinearRegression(alpha=0.0005, times=20)
reg.fit(X_train, y_train)
result1 = reg.predict(X_test)
for value in (np.mean((result1 - y_test) ** 2), reg.w_, reg.loss_):
    display(value)
0.21492617454147625
array([ 5.64659430e-16, 1.95047851e-02, 4.17596271e-02, -6.72431209e-02,
1.08055415e-01, -1.00875900e-01, 3.19801377e-01, -2.18037208e-02,
-2.17426807e-01, 8.04056926e-02, -6.75273910e-02, -1.82418768e-01,
9.39966020e-02, -4.13050262e-01])
[200.0,
109.21432152379548,
87.0452598536069,
76.53800264630138,
70.55588355152493,
66.96828493244492,
64.73083847262916,
63.272165227255414,
62.27233491795431,
61.549981193179754,
61.00090033178433,
60.56421933693936,
60.203637354956086,
59.89695732596468,
59.63019822805497,
59.39425170905927,
59.18296126355835,
58.99200309779758,
58.81822318519731,
58.65923694602755]
7. 进行可视化展示
import matplotlib as mpl
import matplotlib.pyplot as plt

# SimHei renders the Chinese labels; disabling unicode_minus keeps the
# minus sign drawable under that font.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

# Predicted vs. actual (standardized) prices over the test samples.
plt.figure(figsize=(10, 10))
plt.plot(result1, "ro-", label="预测值")
plt.plot(y_test.values, "go--", label="真实值")
plt.title("线性回归—梯度下降")
plt.xlabel("样本序号")
plt.ylabel("房价")
plt.legend()
<matplotlib.legend.Legend at 0x219b25f4648>
# Training-loss curve: one point per gradient-descent iteration (1..times).
plt.plot(range(1,reg.times+1),reg.loss_,"ro-")
[<matplotlib.lines.Line2D at 0x219b2792188>]
# Univariate model: keep only column 5:6 (RM — plotted by name below),
# using the same reproducible shuffle and 400-row training split.
lr = LinearRegression(alpha=0.0005, times=50)
t = data.sample(len(data), random_state=666)
X_train, y_train = t.iloc[:400, 5:6], t.iloc[:400, -1]
X_test, y_test = t.iloc[400:, 5:6], t.iloc[400:, -1]

s1 = StandardScaler()
X_train = s1.fit_transform(X_train)
X_test = s1.transform(X_test)
s2 = StandardScaler()
y_train = s2.fit_transform(y_train)
y_test = s2.transform(y_test)

lr.fit(X_train, y_train)
result2 = lr.predict(X_test)
display(np.mean((result2 - y_test) ** 2))
0.3576747881088171
# Scatter the standardized training data and overlay the fitted line.
plt.scatter(X_train["RM"], y_train)
display(lr.w_)
# The line is drawn directly from the trained model; the previous version
# also built an unused `y` from constants hand-copied out of one run's
# printed weights (3.317e-16 + 6.624e-01*x) — dead code, removed.
x = np.arange(-5, 5, 0.1)
plt.plot(x, lr.predict(x.reshape(-1, 1)), "r")
array([3.31734640e-16, 6.62422223e-01])
[<matplotlib.lines.Line2D at 0x219af61f0c8>]