不多说上干货!
数据暂时还不知道怎么上传:你们自己造一点数据吧,文件名字:
Advertising.csv
数据格式如下图:
import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from pprint import pprint from sklearn import metrics if __name__ == "__main__": path = 'Advertising.csv' # # 手写读取数据 # f = file(path) # x = [] # y = [] # for i, d in enumerate(f): # if i == 0: # continue # d = d.strip() # if not d: # continue # d = map(float, d.split(',')) # x.append(d[1:-1]) # y.append(d[-1]) # pprint(x) # pprint(y) # x = np.array(x) # y = np.array(y) # Python自带库 # f = file(path, 'r') # print f # d = csv.reader(f) # for line in d: # print line # f.close() # # numpy读入 # p = np.loadtxt(path, delimiter=',', skiprows=1) # print p # print '\n\n===============\n\n' # pandas读入 data = pd.read_csv(path) # TV、Radio、Newspaper、Sales # x = data[['TV', 'Radio', 'Newspaper']] x = data[['TV', 'Radio']] y = data['Sales'] #我们看看数据的维度(结果有200个样本,每个样本有5列) print(data.shape) print("************"*5) print ("x=:\n",x) print("========================") print ("y=:\n",y) mpl.rcParams['font.sans-serif'] = [u'simHei'] mpl.rcParams['axes.unicode_minus'] = False # 绘制1 plt.figure(facecolor='w') plt.plot(data['TV'], y, 'ro', label='TV') plt.plot(data['Radio'], y, 'g^', label='Radio') plt.plot(data['Newspaper'], y, 'mv', label='Newspaer') plt.legend(loc='lower right') plt.xlabel(u'广告花费', fontsize=16) plt.ylabel(u'销售额', fontsize=16) plt.title(u'广告花费与销售额对比数据', fontsize=20) plt.grid() plt.show() # 绘制2 plt.figure(facecolor='w', figsize=(9, 10)) plt.subplot(311) plt.plot(data['TV'], y, 'ro') plt.title('TV') plt.grid() plt.subplot(312) plt.plot(data['Radio'], y, 'g^') plt.title('Radio') plt.grid() plt.subplot(313) plt.plot(data['Newspaper'], y, 'b*') plt.title('Newspaper') plt.grid() plt.tight_layout() plt.show() #划分训练集和测试集 x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,random_state=1)#注意此处与上面是一行 print (type(x_test)) #查看下训练集和测试集的维度: print (x_train.shape, y_train.shape) print("========================"*3) linreg = LinearRegression()#线性回归 model = linreg.fit(x_train, y_train)#用训练集来拟合出线性回归模型 print (model) #我们看看我们的需要的模型系数结果 print (linreg.coef_, linreg.intercept_) #argsort函数返回的是数组值从小到大的索引值,沿第一轴排序(向下) order = y_test.argsort(axis=0) y_test = y_test.values[order] x_test = x_test.values[order, :] #模型拟合测试集 y_hat = linreg.predict(x_test) mse = np.average((y_hat - np.array(y_test)) ** 2) # Mean Squared Error(均方误差) rmse = np.sqrt(mse) # Root Mean Squared Error(均方根误差) #用scikit-learn计算 MSE,RMSE smse=metrics.mean_squared_error(y_test,y_hat) srmse=np.sqrt(smse) print("+++++++++++++++++++++均方误差,均方根误差+++++++++++++++++++++++++++++++") print('MSE = ', mse,"sMSE=",smse) print('RMSE = ', rmse,"sRMSE=",srmse) print ('R2 = ', linreg.score(x_train, y_train)) print ('R2 = ', linreg.score(x_test, y_test)) #画图 plt.figure(facecolor='w') t = np.arange(len(x_test)) plt.plot(t, y_test, 'r-', linewidth=2, label=u'真实数据') plt.plot(t, y_hat, 'g-', linewidth=2, label=u'预测数据') plt.legend(loc='upper right') plt.title(u'线性回归预测销量', fontsize=18) plt.grid(b=True) plt.show()结果:
控制台输出:
************************************************************
x=:
TV Radio
0 230.1 37.8
1 44.5 39.3
2 17.2 45.9
3 151.5 41.3
4 180.8 10.8
... ...
197 12.8
198 25.5
199 13.4
[200 rows x 2 columns]
========================
y=:
0 22.1
1 10.4
2 9.3
... ...
198 25.5
199 13.4
Name: Sales, Length: 200, dtype: float64
<class 'pandas.core.frame.DataFrame'>
(160, 2) (160,)
========================================================================
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
[ 0.04686997 0.1800065 ] 2.94751503603
+++++++++++++++++++++均方误差,均方根误差+++++++++++++++++++++++++++++++
MSE = 1.95522188501 sMSE= 1.95522188501
RMSE = 1.39829248908 sRMSE= 1.39829248908
R2 = 0.895852846878
R2 = 0.894734495003
************************************************************************************************
上面的是最原始的回归,然后我们对比一下Lasso回归和Ridge回归
import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import Lasso, Ridge from sklearn.model_selection import GridSearchCV#GridSearchCV模块,能够在指定的范围内自动搜索具有不同超参数的不同 if __name__ == "__main__": # pandas读入 data = pd.read_csv('Advertising.csv') # TV、Radio、Newspaper、Sales x = data[['TV', 'Radio', 'Newspaper']] # x = data[['TV', 'Radio']] y = data['Sales'] print (x) print (y) x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1) #Lasso回归 #model = Lasso() #岭回归 model = Ridge() alpha_can = np.logspace(-3, 2, 10)#alpha参数集 10^-3~10^2的等比数列的10个数 np.set_printoptions(suppress=True) print ('alpha_can = ', alpha_can) lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=8)#其中cv8折交叉验证。
lasso_model.fit(x_train, y_train) print ('超参数:\n', lasso_model.best_params_) order = y_test.argsort(axis=0) y_test = y_test.values[order] x_test = x_test.values[order, :] y_hat = lasso_model.predict(x_test) print (lasso_model.score(x_test, y_test)) mse = np.average((y_hat - np.array(y_test)) ** 2) # Mean Squared Error rmse = np.sqrt(mse) # Root Mean Squared Error print(mse, rmse) t = np.arange(len(x_test)) mpl.rcParams['font.sans-serif'] = [u'simHei'] mpl.rcParams['axes.unicode_minus'] = False plt.figure(facecolor='w') plt.plot(t, y_test, 'r-', linewidth=2, label=u'真实数据') plt.plot(t, y_hat, 'g-', linewidth=2, label=u'预测数据') plt.title(u'线性回归预测销量', fontsize=18) plt.legend(loc='upper right') plt.grid() plt.show()
其实这个code只是多了一个对于超参数的处理,多个超参数,找出最好的一个 然后我们看一下结果对比一下
Ridge
0.915232753209
MSE:1.98213253677 RMSE:1.40788228797
Lasso
0.912025386213
MSE:2.0571311562 RMSE:1.43427025215
原始回归
0.895937263233
MSE:1.99188555183 RMSE:1.41134175586
从上面的数据来看:
MSE:Lasso回归>原始回归和>Ridge回归
得到目前Ridge回归模型效果最好!