线性回归(linear regression)
理论思路
最小二乘法是一个结论,由最大似然和高斯分布推导而来
如果误差不服从高斯分布,需要先对数据做变换(如取对数),使误差近似服从高斯分布,最小二乘才有理论依据
前提:样本之间独立,工程上一般不满足,效果也还行,can work
回归与拟合 与插值
插值必须要经过样本点
拟合不一定
报错
path = r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv'
data = pd.read_csv(path) # TV、Radio、Newspaper、Sales
OSError: Initializing from file failed
path = open(r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv')
data = pd.read_csv(path) # TV、Radio、Newspaper、Sales
加上open() 就可以了。原因:Windows 下 pandas 的 C 解析器处理含中文(非 ASCII)字符的路径时可能初始化失败,改为传入已打开的文件对象可绕过该问题
完整代码
linear regression
#!/usr/bin/python
# -*- coding:utf-8 -*-
import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pprint import pprint
if __name__ == "__main__":
    # Advertising dataset: columns TV, Radio, Newspaper, Sales (with header row).
    path = r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv'

    # Pass an already-open file object instead of the path string: on Windows,
    # pandas' C parser can raise "OSError: Initializing from file failed" for
    # paths containing non-ASCII characters.  The `with` block also guarantees
    # the handle is closed (the original leaked it).
    # (Alternative readers — manual csv module loop, np.loadtxt — omitted;
    # pandas handles the header row directly.)
    with open(path) as f:
        data = pd.read_csv(f)
    x = data[['TV', 'Radio', 'Newspaper']]
    # x = data[['TV', 'Radio']]
    y = data['Sales']
    print(x)
    print(y)

    # Let Chinese glyphs and the minus sign render correctly in figures.
    mpl.rcParams['font.sans-serif'] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Plot 1: all three ad spends vs. sales on one axes.
    plt.figure(facecolor='w')                                # white figure border
    plt.plot(data['TV'], y, 'ro', label='TV')                # red circles
    plt.plot(data['Radio'], y, 'g^', label='Radio')          # green up-triangles
    plt.plot(data['Newspaper'], y, 'mv', label='Newspaper')  # magenta down-triangles (typo 'Newspaer' fixed)
    plt.legend(loc='lower right')
    plt.xlabel('广告花费', fontsize=16)
    plt.ylabel('销售额', fontsize=16)
    plt.title('广告花费与销售额对比数据', fontsize=18)
    # Visibility flag is passed positionally: the keyword `b=` was removed in
    # matplotlib 3.6 (renamed to `visible`); grid(True, ...) works everywhere.
    plt.grid(True, ls=':')
    plt.show()

    # Plot 2: one subplot per feature.
    plt.figure(facecolor='w', figsize=(9, 10))
    for row, (col, fmt) in enumerate([('TV', 'ro'), ('Radio', 'g^'), ('Newspaper', 'b*')], start=1):
        plt.subplot(3, 1, row)
        plt.plot(data[col], y, fmt)
        plt.title(col)
        plt.grid(True, ls=':')
    plt.tight_layout()
    plt.show()

    # 80/20 train/test split; fixed seed makes the run reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)
    print(type(x_test))  # the model is fed pandas DataFrame / Series objects
    print(x_train.shape, y_train.shape)

    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print(model)
    print(linreg.coef_, linreg.intercept_)  # per-feature coefficients and intercept

    # Sort the test set by target value so the comparison plot reads
    # monotonically left-to-right (argsort gives positional indices).
    order = y_test.argsort(axis=0)
    y_test = y_test.values[order]
    x_test = x_test.values[order, :]
    y_hat = linreg.predict(x_test)

    # Note: MSE is the mean of squared prediction errors, not the variance.
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)                                # Root Mean Squared Error
    print('MSE = ', mse, end=' ')
    print('RMSE = ', rmse)
    # R^2 = 1 - RSS/TSS; the closer to 1 the better the fit.
    print('R2 = ', linreg.score(x_train, y_train))
    print('R2 = ', linreg.score(x_test, y_test))

    # Predicted vs. actual sales over the sorted test samples.
    plt.figure(facecolor='w')
    t = np.arange(len(x_test))  # x-axis: sample index
    plt.plot(t, y_test, 'r-', linewidth=2, label='真实数据')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='预测数据')
    plt.legend(loc='upper left')
    plt.title('线性回归预测销量', fontsize=18)
    plt.grid(True, ls=':')
    plt.show()
linear regression
linear regression