线性回归(linear regression)
理论思路
最小二乘法是一个结论,由最大似然和高斯分布推导而来
如果误差不服从高斯分布,需要先对数据做变换(如取对数),使误差近似服从高斯分布,最小二乘才有理论依据
前提:样本之间独立,工程上一般不满足,效果也还行,can work
回归与拟合 与插值
插值必须要经过样本点
拟合不一定
报错
path = r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv'
data = pd.read_csv(path) # TV、Radio、Newspaper、Sales
OSError: Initializing from file failed
path = open(r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv')
data = pd.read_csv(path) # TV、Radio、Newspaper、Sales
加上open() 就可以了。原因:Windows 下 pandas 的 C 解析器处理含中文(非 ASCII)字符的路径时可能初始化失败,改为传入已打开的文件对象可绕过该问题
完整代码
linear regression
#!/usr/bin/python
# -*- coding:utf-8 -*-
import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pprint import pprint
if __name__ == "__main__":
    # Advertising dataset: columns TV, Radio, Newspaper, Sales (with header row).
    path = r'G:\百度云网盘下载\升级版7\第九课(1.2日更新)\9.Regression\Advertising.csv'

    # Pass an already-open file object instead of the path string: on Windows,
    # pandas' C parser can raise "OSError: Initializing from file failed" for
    # paths containing non-ASCII characters.  The `with` block also guarantees
    # the handle is closed (the original leaked it).
    # (Alternative readers — manual csv module loop, np.loadtxt — omitted;
    # pandas handles the header row directly.)
    with open(path) as f:
        data = pd.read_csv(f)
    x = data[['TV', 'Radio', 'Newspaper']]
    # x = data[['TV', 'Radio']]
    y = data['Sales']
    print(x)
    print(y)

    # Let Chinese glyphs and the minus sign render correctly in figures.
    mpl.rcParams['font.sans-serif'] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Plot 1: all three ad spends vs. sales on one axes.
    plt.figure(facecolor='w')                                # white figure border
    plt.plot(data['TV'], y, 'ro', label='TV')                # red circles
    plt.plot(data['Radio'], y, 'g^', label='Radio')          # green up-triangles
    plt.plot(data['Newspaper'], y, 'mv', label='Newspaper')  # magenta down-triangles (typo 'Newspaer' fixed)
    plt.legend(loc='lower right')
    plt.xlabel('广告花费', fontsize=16)
    plt.ylabel('销售额', fontsize=16)
    plt.title('广告花费与销售额对比数据', fontsize=18)
    # Visibility flag is passed positionally: the keyword `b=` was removed in
    # matplotlib 3.6 (renamed to `visible`); grid(True, ...) works everywhere.
    plt.grid(True, ls=':')
    plt.show()

    # Plot 2: one subplot per feature.
    plt.figure(facecolor='w', figsize=(9, 10))
    for row, (col, fmt) in enumerate([('TV', 'ro'), ('Radio', 'g^'), ('Newspaper', 'b*')], start=1):
        plt.subplot(3, 1, row)
        plt.plot(data[col], y, fmt)
        plt.title(col)
        plt.grid(True, ls=':')
    plt.tight_layout()
    plt.show()

    # 80/20 train/test split; fixed seed makes the run reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)
    print(type(x_test))  # the model is fed pandas DataFrame / Series objects
    print(x_train.shape, y_train.shape)

    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print(model)
    print(linreg.coef_, linreg.intercept_)  # per-feature coefficients and intercept

    # Sort the test set by target value so the comparison plot reads
    # monotonically left-to-right (argsort gives positional indices).
    order = y_test.argsort(axis=0)
    y_test = y_test.values[order]
    x_test = x_test.values[order, :]
    y_hat = linreg.predict(x_test)

    # Note: MSE is the mean of squared prediction errors, not the variance.
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)                                # Root Mean Squared Error
    print('MSE = ', mse, end=' ')
    print('RMSE = ', rmse)
    # R^2 = 1 - RSS/TSS; the closer to 1 the better the fit.
    print('R2 = ', linreg.score(x_train, y_train))
    print('R2 = ', linreg.score(x_test, y_test))

    # Predicted vs. actual sales over the sorted test samples.
    plt.figure(facecolor='w')
    t = np.arange(len(x_test))  # x-axis: sample index
    plt.plot(t, y_test, 'r-', linewidth=2, label='真实数据')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='预测数据')
    plt.legend(loc='upper left')
    plt.title('线性回归预测销量', fontsize=18)
    plt.grid(True, ls=':')
    plt.show()
linear regression
linear regression