机器学习——在回归算法中使用多项式扩展与管道(Pipeline)解决欠拟合问题

管道操作
代码如下:
#引入所需要的包
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import time

# Parse a (date, time) pair of strings into numeric calendar fields.
def date_format(dt):
    """Convert a date string and a time string into a tuple of integers.

    Args:
        dt: An iterable of two strings, the date as ``dd/mm/YYYY`` and the
            time as ``HH:MM:SS`` (for example one row of the Date/Time
            columns). The items are joined with a single space before parsing.

    Returns:
        A tuple ``(year, month, day, hour, minute, second)``.

    Raises:
        ValueError: If the joined string does not match
            ``%d/%m/%Y %H:%M:%S``.
    """
    t = time.strptime(' '.join(dt), '%d/%m/%Y %H:%M:%S')
    return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)

# Configure matplotlib fonts so that Chinese labels render correctly.
mpl.rcParams['font.sans-serif'] = [u'simHei']
# Draw the minus sign with an ASCII hyphen; the CJK font may lack the
# Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the data (a ';'-separated text file).
path = './datas/household_power_consumption_1000.txt'
# path = './datas/household_power_consumption_200.txt'
df = pd.read_csv(path, sep=';', low_memory=False)

# Columns: date, time, active power, reactive power, voltage, current,
# kitchen sub-metering, laundry sub-metering, water-heater sub-metering.
names1 = df.columns
print(names1)
names = [
    "Date",
    "Time",
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]

# Data cleaning: '?' marks a missing value in the file; turn it into NaN and
# drop every row that contains at least one NaN.
df = df.replace('?', np.nan)
datas = df.dropna(axis=0, how='any')

# Build the feature matrix and the target.
# X: the Date and Time columns expanded by date_format into six numeric
# columns (year, month, day, hour, minute, second).
X = datas[names[0:2]]
X = X.apply(lambda x: pd.Series(date_format(x)), axis=1)
# Y: the column at names[4], i.e. "Voltage".
Y = datas[names[4]]
# Use the builtin float: the np.float alias was removed in NumPy 1.24.
X = X.astype(float)
Y = Y.astype(float)

# Split the data into a training set (80%) and a test set (20%).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# 1. Build a pipeline object that defines the order of the processing steps:
#    polynomial feature expansion followed by linear regression.
model = Pipeline(steps=[
    ('Poly', PolynomialFeatures()),
    ('Linear', LinearRegression(fit_intercept=False)),
])

# 1.2 Set step parameters through the pipeline (<step name>__<parameter>).
model.set_params(Poly__degree=4)
# The former `Linear__normalize=True` setting is intentionally not applied:
# the `normalize` parameter was removed from LinearRegression in
# scikit-learn 1.2, and it was documented as ignored when fit_intercept=False.

# 2. Train the model. For an n-step pipeline the first n-1 steps run
#    fit + transform and the last step runs fit.
model.fit(X_train, Y_train)
print("多项式模型:{}".format(model.named_steps['Poly']))
print("线性回归模型:{}".format(model.named_steps['Linear']))

# 3. Predict. For an n-step pipeline the first n-1 steps run transform and
#    the last step runs predict.
y_predict = model.predict(X_test)

# Model evaluation: print the fitted coefficients, the pipeline score on the
# training and test sets, and the mean squared error on the test set.
linear_model = model.named_steps['Linear']
print("线性回归的各个特征属性对应的权重参数theta:{}".format(linear_model.coef_))
print("线性回归的截距项的值:{}".format(linear_model.intercept_))
print("在训练集上的模型效果:{}".format(model.score(X_train, Y_train)))
print("在测试集集上的模型效果:{}".format(model.score(X_test, Y_test)))
print("在测试集上的MSE的值:{}".format(mean_squared_error(y_true=Y_test, y_pred=y_predict)))
# Plot predicted versus true values over the test-sample index.
t = np.arange(len(X_test))
plt.figure(facecolor='w')
plt.plot(t, y_predict, 'g-', linewidth=2, label=u'预测值')
plt.plot(t, Y_test, 'r-', linewidth=2, label=u'真实值')
plt.legend(loc='lower right')
# NOTE(review): the title mentions power, but the target column selected
# earlier is names[4] ("Voltage") — confirm which one is intended.
plt.title('线性回归预测时间和功率之间的关系', fontsize=20)
# Pass the flag positionally: the `b` keyword was renamed to `visible` in
# Matplotlib 3.5 and later removed.
plt.grid(True)
# Save before show(), so the figure is written while it is still open.
plt.savefig('Pipeline.png')
plt.show()
测试结果:
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/Yangwenyi115615/article/details/89765170