线性回归实战

不多说上干货！

数据暂时还不知道怎么上传，你们可以自己造一点数据，文件名为：

Advertising.csv

数据格式如下图（原图缺失：按下文代码及输出，数据共 200 行、5 列，其中用到的列为 TV、Radio、Newspaper、Sales）：

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pprint import pprint
from sklearn import metrics

if __name__ == "__main__":
    # End-to-end demo: load the advertising data, plot it, fit an ordinary
    # least-squares model on TV + Radio, and report MSE / RMSE / R^2.
    path = 'Advertising.csv'

    # Load the CSV with pandas. Columns used below: TV, Radio, Newspaper, Sales.
    data = pd.read_csv(path)
    # Only TV and Radio are used as features; Newspaper is plotted but is not
    # part of the model in this script.
    x = data[['TV', 'Radio']]
    y = data['Sales']
    # Show the dimensions of the full table, then the features and the target.
    print(data.shape)
    print("************" * 5)
    print("x=:\n", x)
    print("========================")
    print("y=:\n", y)

    # Use a font that can render the Chinese labels, and keep the minus sign
    # displayable with that font.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Plot 1: all three advertising channels against sales on shared axes.
    plt.figure(facecolor='w')
    plt.plot(data['TV'], y, 'ro', label='TV')
    plt.plot(data['Radio'], y, 'g^', label='Radio')
    plt.plot(data['Newspaper'], y, 'mv', label='Newspaper')
    plt.legend(loc='lower right')
    plt.xlabel(u'广告花费', fontsize=16)
    plt.ylabel(u'销售额', fontsize=16)
    plt.title(u'广告花费与销售额对比数据', fontsize=20)
    plt.grid()
    plt.show()

    # Plot 2: one subplot per channel, stacked vertically.
    plt.figure(facecolor='w', figsize=(9, 10))
    plt.subplot(311)
    plt.plot(data['TV'], y, 'ro')
    plt.title('TV')
    plt.grid()
    plt.subplot(312)
    plt.plot(data['Radio'], y, 'g^')
    plt.title('Radio')
    plt.grid()
    plt.subplot(313)
    plt.plot(data['Newspaper'], y, 'b*')
    plt.title('Newspaper')
    plt.grid()
    plt.tight_layout()
    plt.show()

    # Split into training (80%) and test sets; random_state fixes the split.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8, random_state=1)
    print(type(x_test))
    # Dimensions of the training features and training target.
    print(x_train.shape, y_train.shape)
    print("========================" * 3)

    # Fit the linear regression on the training set.
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print(model)
    # Learned coefficients and intercept.
    print(linreg.coef_, linreg.intercept_)

    # Reorder the test set by ascending true sales so the comparison plot
    # below is a monotone "truth" curve. The order is taken as a plain
    # ndarray of positions before it is used to index the value arrays.
    order = y_test.argsort().values
    y_test = y_test.values[order]
    x_test = x_test.values[order, :]

    # Predict on the (reordered) test set.
    y_hat = linreg.predict(x_test)
    mse = np.average((y_hat - y_test) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    # Same metrics computed through scikit-learn, as a cross-check.
    smse = metrics.mean_squared_error(y_test, y_hat)
    srmse = np.sqrt(smse)
    print("+++++++++++++++++++++均方误差，均方根误差+++++++++++++++++++++++++++++++")
    print('MSE = ', mse, "sMSE=", smse)
    print('RMSE = ', rmse, "sRMSE=", srmse)
    # First R2 line is the training-set score, second is the test-set score.
    print('R2 = ', linreg.score(x_train, y_train))
    print('R2 = ', linreg.score(x_test, y_test))

    # Plot true vs. predicted sales over the sorted test samples.
    plt.figure(facecolor='w')
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label=u'真实数据')
    plt.plot(t, y_hat, 'g-', linewidth=2, label=u'预测数据')
    plt.legend(loc='upper right')
    plt.title(u'线性回归预测销量', fontsize=18)
    # Pass the flag positionally: the old `b=` keyword was renamed to
    # `visible=` in newer matplotlib, so the positional form is the portable one.
    plt.grid(True)
    plt.show()

结果：

控制台输出：

************************************************************
x=:
TV Radio
0 230.1 37.8
1 44.5 39.3
2 17.2 45.9
3 151.5 41.3
4 180.8 10.8
... ...
197 12.8
198 25.5
199 13.4
[200 rows x 2 columns]
========================
y=:
0 22.1
1 10.4
2 9.3
... ...
198 25.5
199 13.4
Name: Sales, Length: 200, dtype: float64

<class 'pandas.core.frame.DataFrame'>
(160, 2) (160,)
========================================================================
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
[ 0.04686997 0.1800065 ] 2.94751503603
+++++++++++++++++++++均方误差，均方根误差+++++++++++++++++++++++++++++++
MSE = 1.95522188501 sMSE= 1.95522188501
RMSE = 1.39829248908 sRMSE= 1.39829248908
R2 = 0.895852846878
R2 = 0.894734495003

************************************************************************************************

上面的是最原始的回归，然后我们对比一下Lasso回归和Ridge回归

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV#GridSearchCV模块，能够在指定的范围内自动搜索具有不同超参数的不同


if __name__ == "__main__":
    # Load the advertising table with pandas (TV, Radio, Newspaper, Sales)
    # and use all three channels as features.
    data = pd.read_csv('Advertising.csv')
    feature_columns = ['TV', 'Radio', 'Newspaper']
    x = data[feature_columns]
    y = data['Sales']
    print(x)
    print(y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

    # Ridge regression is the active estimator; substitute Lasso() here to
    # run the Lasso variant of the same experiment.
    estimator = Ridge()
    # Candidate alphas: 10 values forming a geometric sequence on 10^-3..10^2.
    alpha_can = np.logspace(-3, 2, 10)
    np.set_printoptions(suppress=True)
    print('alpha_can = ', alpha_can)

    # Grid search over alpha with 8-fold cross-validation.
    search = GridSearchCV(estimator, param_grid={'alpha': alpha_can}, cv=8)
    search.fit(x_train, y_train)
    print('超参数：\n', search.best_params_)

    # Sort the test samples by ascending true sales so the plot is readable.
    order = y_test.argsort(axis=0)
    y_sorted = y_test.values[order]
    x_sorted = x_test.values[order, :]

    y_hat = search.predict(x_sorted)
    print(search.score(x_sorted, y_sorted))
    squared_errors = (y_hat - y_sorted) ** 2
    mse = np.average(squared_errors)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print(mse, rmse)

    # Font able to render the Chinese labels; keep the minus sign displayable.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # True vs. predicted sales over the sorted test samples.
    sample_index = np.arange(len(x_sorted))
    plt.figure(facecolor='w')
    plt.plot(sample_index, y_sorted, 'r-', linewidth=2, label=u'真实数据')
    plt.plot(sample_index, y_hat, 'g-', linewidth=2, label=u'预测数据')
    plt.title(u'线性回归预测销量', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid()
    plt.show()

其实这段代码只是多了对超参数的处理：在多个候选超参数中找出最好的一个。然后我们对比一下各模型的结果：

	Ridge
0.915232753209
MSE：1.98213253677 RMSE：1.40788228797


	Lasso
0.912025386213
MSE：2.0571311562  RMSE：1.43427025215


	原始回归
0.895937263233
MSE：1.99188555183  RMSE：1.41134175586

从上面的数据来看:

MSE：Lasso回归 > 原始回归 > Ridge回归

得到目前Ridge回归模型效果最好！

猜你喜欢