简单线性回归
y=ax+b
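其中 a、b 由最小二乘法求得:$a = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^{n}(x_i-\bar{x})^2}$,$b = \bar{y} - a\bar{x}$,其中 $\bar{x}$、$\bar{y}$ 分别是 x、y 的样本均值。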
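如何评价回归方程的拟合度呢?常用决定系数 $R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}{\sum_{i=1}^{n}(y_i-\bar{y})^2}$ 来衡量,$R^2$ 越接近 1 说明拟合越好。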
实战案例:
import pymysql # 导入模块
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split #这里是引用了交叉验证
from sklearn.linear_model import LinearRegression #线性回归
def get_df_from_db(sql, columnNames):
    """Run a query against the configured MySQL database and return a DataFrame.

    Args:
        sql: Query text; it is passed to ``cursor.execute`` without parameters.
        columnNames: Column labels for the resulting DataFrame. Their count
            must match the number of columns the query selects.

    Returns:
        A ``pandas.DataFrame`` with one row per fetched record.
    """
    conn = pymysql.connect(
        host='',  # database host
        port=3306,  # port number
        user='',  # user name
        password='',  # password
        database='',  # schema to connect to
        charset='utf8',  # connection character set
    )
    # Always release the connection, even when the query raises.
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
    finally:
        conn.close()
    return pd.DataFrame([list(row) for row in rows], columns=columnNames)
def pridect_cost(x, y, x1_ture, x2_ture):
    """Fit a two-feature linear regression and predict the target for one point.

    The data is split 80/20 with a fixed seed, the model is fitted on the
    training part, and its parameters plus the test-set RMSE are printed.
    Predicted and actual test values are then shown in a line plot (this
    blocks until the window is closed when an interactive backend is used).

    Args:
        x: Feature table with at least two columns; only the first two
            coefficients are used for the final prediction.
        y: Target values aligned with ``x``.
        x1_ture: Value of the first feature for the point to predict.
        x2_ture: Value of the second feature for the point to predict.

    Returns:
        The prediction ``a * x1_ture + b * x2_ture + c`` where ``a``/``b`` are
        the fitted coefficients and ``c`` is the fitted intercept.
    """
    # Hold out 20% of the rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1234
    )

    linreg = LinearRegression()
    model = linreg.fit(X_train, y_train)
    print('模型参数:')
    print(model)
    # Intercept of the fitted model.
    print('模型截距:')
    print(linreg.intercept_)
    # One weight per feature.
    print('参数权重:')
    print(linreg.coef_)

    y_pred = linreg.predict(X_test)
    # Coerce to float first: database aggregates may arrive as Decimal
    # objects, which cannot be subtracted from numpy floats element by element.
    y_true = np.asarray(y_test, dtype=float)
    rmse = np.sqrt(np.mean((y_pred - y_true) ** 2))
    print("RMSE by hand:", rmse)

    # Plot predicted vs. actual values for the test set.
    positions = range(len(y_pred))
    plt.figure()
    plt.plot(positions, y_pred, 'b', label="predict")
    plt.plot(positions, y_true, 'r', label="test")
    plt.legend(loc="upper right")  # show the series labels
    plt.xlabel("the number of sales")
    plt.ylabel('value of sales')
    plt.show()

    # Model: y = a * x1 + b * x2 + c.
    c = linreg.intercept_
    a = float(linreg.coef_[0])
    b = float(linreg.coef_[1])
    return x1_ture * a + x2_ture * b + c
def main():
    """Load the consumption data, fit the regression and print one prediction.

    The query returns day-of-month, hour and the summed ``total_cost`` per
    (day, hour) group; the first two are the features and the sum is the
    target. The prediction is requested for feature values 22 and 24.
    """
    query = (
        "SELECT DATE_FORMAT(create_time,'%d'),DATE_FORMAT(create_time,'%k'),"
        "SUM(CASE WHEN str_tag='total_cost' THEN num_tag ELSE NULL END ) "
        "FROM common.current_consumption_list "
        "WHERE apartment='102087' "
        "AND DATE_FORMAT(create_time,'%Y-%m')='2020-03' "
        "AND DATE_FORMAT(create_time,'%Y-%m-%d')>=DATE_SUB(CURDATE(),INTERVAL 20 day) "
        "GROUP BY 1,2;"
    )
    labels = ['天', '小时', '消耗']
    frame = get_df_from_db(query, labels)
    # For a single-feature regression, use .values.reshape(-1, 1) on the
    # chosen column instead of selecting two columns.
    features = frame[['天', '小时']]
    target = frame['消耗']
    prediction = pridect_cost(features, target, 22, 24)
    print(prediction)


if __name__ == '__main__':
    main()
多元线性回归