日萌社
人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新)
In [1]:
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
In [2]:
#没有用bagging和boosting
#stacking 先用几个不同的模型做预测 输出预测值 然后将这几个模型输出的预测值作为特征来训练一个新的模型
获取数据
In [3]:
data=pd.read_csv("data/onehot_feature.csv")
data_test = pd.read_csv("./data/onehot_feature_test.csv")
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150518 entries, 0 to 150517 Data columns (total 34 columns): Unnamed: 0 150518 non-null int64 时间 150518 non-null int64 小区名 150518 non-null int64 小区房屋出租数量 150518 non-null float64 楼层 150518 non-null int64 总楼层 150518 non-null float64 房屋面积 150518 non-null float64 房屋朝向 150518 non-null object 居住状态 150518 non-null float64 卧室数量 150518 non-null int64 厅的数量 150518 non-null int64 卫的数量 150518 non-null int64 出租方式 150518 non-null float64 区 150518 non-null float64 位置 150518 non-null float64 地铁线路 150518 non-null float64 地铁站点 150518 non-null float64 距离 150518 non-null float64 装修情况 150518 non-null float64 月租金 150518 non-null float64 log_rent 150518 non-null float64 新朝向 150518 non-null object 房+卫+厅 150518 non-null int64 房/总 150518 non-null float64 卫/总 150518 non-null float64 厅/总 150518 non-null float64 卧室面积 150518 non-null float64 楼层比 150518 non-null float64 户型 150518 non-null int64 有地铁 150518 non-null int64 小区线路数 150518 non-null int64 位置线路数 150518 non-null int64 新小区名 150518 non-null int64 小区条数大于100 150518 non-null int64 dtypes: float64(18), int64(14), object(2) memory usage: 39.0+ MB
In [4]:
# 将离散特征转换成字符串类型
colunms = ['时间', '新小区名', '居住状态', '出租方式', '区',
'位置', '地铁线路', '地铁站点', '装修情况', '户型']
for col in colunms:
data[col] = data[col].astype(str)
In [5]:
x_columns=['小区房屋出租数量','新小区名', '楼层', '总楼层', '房屋面积','居住状态', '卧室数量',
'卫的数量', '位置', '地铁站点', '距离', '装修情况',
'新朝向', '房+卫+厅', '房/总', '卫/总', '厅/总', '卧室面积', '楼层比', '户型','有地铁','小区线路数','位置线路数','小区条数大于100',]
y_label='log_rent'
x=data[x_columns]
y=data[y_label]
X_TEST = data_test[x_columns]
In [6]:
# 2.分割数据集
train_x, test_x, train_y, test_y = train_test_split(
x, y, test_size=0.25, random_state=12)
In [7]:
# 1.特征转换
vector = DictVectorizer(sparse=True)
x_train = vector.fit_transform(train_x.to_dict(orient='records'))
x_test = vector.transform(test_x.to_dict(orient='records'))
X_TEST = vector.transform(X_TEST.to_dict(orient="records"))
In [8]:
print(x_train.shape, x_test.shape, X_TEST.shape)
(112888, 826) (37630, 826) (46000, 826)
In [9]:
# 2.降维
pca=PCA(0.98)
pca_x_train=pca.fit_transform(x_train.toarray())
pca_x_test=pca.transform(x_test.toarray())
PCA_X_TEST = pca.transform(X_TEST.toarray())
In [10]:
print(pca_x_train.shape, pca_x_test.shape, PCA_X_TEST.shape)
(112888, 361) (37630, 361) (46000, 361)
In [68]:
def rmse(y_true,y_pred):
y_pred=np.exp(y_pred)-1 # 转换成真实的租金
y_true=np.exp(y_true)-1
return np.sqrt(mean_squared_error(y_true,y_pred))
构建子模型
构建岭回归模型
In [69]:
%%time
# 1.通过参数搜索,确定最优参数alpha的值
ridge = Ridge(normalize=True)
params = {
"alpha": [0.005, 0.01, 1, 5, 10, 20, 50]
}
model1 = GridSearchCV(ridge, param_grid=params, cv=5, n_jobs=-1)
model1.fit(pca_x_train, train_y)
model1.best_params_
#{'alpha': 50, 'fit_intercept': True}
CPU times: user 1.78 s, sys: 705 ms, total: 2.48 s Wall time: 21.5 s
In [70]:
# 利用搜索出的最优参数构建模型
ridge = Ridge(alpha=50, normalize=True)
ridge.fit(pca_x_train, train_y)
Out[70]:
Ridge(alpha=50, copy_X=True, fit_intercept=True, max_iter=None, normalize=True, random_state=None, solver='auto', tol=0.001)
In [71]:
y_pred_test=ridge.predict(pca_x_test)
y_pred_train=ridge.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 6.342657781238426 测试集rmse: 6.493947602276618
构建lasso回归
In [72]:
%%time
# 1.参数搜索
lasso = Lasso(normalize=True)
params = {
"alpha": [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
"fit_intercept": [True, False]
}
model2 = GridSearchCV(lasso, param_grid=params, cv=5, n_jobs=-1)
model2.fit(pca_x_train, train_y)
print(model2.best_params_)
#{'alpha': 0.001, 'fit_intercept': True}
{'alpha': 0.001, 'fit_intercept': True} CPU times: user 1.68 s, sys: 551 ms, total: 2.23 s Wall time: 49.6 s
In [73]:
# 利用搜索出的最优参数构建模型
lasso=Lasso(alpha=0.001, normalize=True)
lasso.fit(pca_x_train,train_y)
Out[73]:
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
In [74]:
%%time
y_pred_test=lasso.predict(pca_x_test)
y_pred_train=lasso.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 6.385065714494761 测试集rmse: 6.53676743372339 CPU times: user 393 ms, sys: 47.4 ms, total: 440 ms Wall time: 87.1 ms
构建随机森林
In [75]:
%%time
# 1.参数搜索
rf = RandomForestRegressor(max_features='sqrt') # 设置max_features='sqrt',不然太耗时间
params = {
"n_estimators": [200], # [200,500,700],
"max_depth": [50], # [40, 50, 60]
"min_samples_split": [20, 50, 100],
"min_samples_leaf": [10, 20, 30]
}
model3 = GridSearchCV(rf, param_grid=params, cv=5, n_jobs=-1, verbose=2)
model3.fit(pca_x_train, train_y)
print(model3.best_params_)
# {'max_depth': 50,
# 'min_samples_leaf': 10,
# 'min_samples_split': 20,
# 'n_estimators': 200}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 55.7min [Parallel(n_jobs=-1)]: Done 45 out of 45 | elapsed: 81.1min finished
{'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 200} CPU times: user 10min 4s, sys: 8.96 s, total: 10min 13s Wall time: 1h 31min 30s
In [76]:
%%time
# 利用搜索出的最优参数构建模型
rf=RandomForestRegressor(n_estimators=200,
max_features=0.8,
max_depth=50,
min_samples_split=20,
min_samples_leaf=10,
n_jobs=-1)
rf.fit(pca_x_train,train_y)
CPU times: user 3h 34min 3s, sys: 1min 29s, total: 3h 35min 32s Wall time: 33min 4s
In [77]:
%%time
y_pred_test=rf.predict(pca_x_test)
y_pred_train=rf.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 2.133144119124377 测试集rmse: 2.7950254213867094 CPU times: user 24.4 s, sys: 465 ms, total: 24.9 s Wall time: 4.53 s
构建决策树
In [78]:
%%time
tree=DecisionTreeRegressor()
params={
"max_depth":[60], # [40,50,60,70],
"min_samples_split":[5], # [5,10,20,30,40,50]
"min_samples_leaf":[5], # [2,3,5,7,9,11]
}
model4=GridSearchCV(tree,param_grid=params,cv=5,n_jobs=-1)
model4.fit(pca_x_train,train_y)
print(model4.best_params_)
# {'max_depth': 60, 'min_samples_leaf': 2, 'min_samples_split': 5}
{'max_depth': 60, 'min_samples_leaf': 5, 'min_samples_split': 5} CPU times: user 1min 34s, sys: 2.06 s, total: 1min 36s Wall time: 3min 26s
In [79]:
%%time
from sklearn.tree import DecisionTreeRegressor
#利用搜索出的最优参数构建模型
tree=DecisionTreeRegressor(max_depth=60,min_samples_leaf=2,min_samples_split=5)
tree.fit(pca_x_train,train_y)
CPU times: user 1min 36s, sys: 1.48 s, total: 1min 38s Wall time: 1min 40s
In [80]:
%%time
y_pred_test=tree.predict(pca_x_test)
y_pred_train=tree.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 0.805142479875888 测试集rmse: 2.6702036461919856 CPU times: user 254 ms, sys: 123 ms, total: 377 ms Wall time: 380 ms
In [81]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10),dpi=100)
plt.scatter(test_y,y_pred_test)
plt.xlabel("真实值")
plt.ylabel("预测值")
plt.show()
构建支持向量机
In [ ]:
# %%time
# # 1.参数搜索----数据量大 svm太耗时,调参几乎不可能
# svr=SVR()
# params={
# "gamma":[0.001,0.01,0.1,0.5,1,5],
# "C":[0.001,0.1,0.5,1,5]
# }
# model5=GridSearchCV(svr,param_grid=params,cv=5,n_jobs=-1,verbose=10)
# # verbose:日志冗长度,int:冗长度,0:不输出训练过程,1:偶尔输出,>1:对每个子模型都输出。
# model5.fit(pca_x_train,train_y)
# model5.best_params_
In [ ]:
# %%time
# # 随意选一组参数 --- 耗时太长 放弃该模型
# svr=SVR(gamma=0.1,C=0.5)
# svr.fit(pca_x_train,train_y)
# y_pred=svr.predict(pca_x_test)
# print(rmse(test_y,y_pred))
构建xgboost模型
In [82]:
%%time
import xgboost as xgb
xgbr = xgb.XGBRegressor(objective='reg:linear', learning_rate=0.1, gamma=0.05, max_depth=45,
min_child_weight=0.5, subsample=0.6, reg_alpha=0.5, reg_lambda=0.8, colsample_bytree=0.5, n_jobs=-1)
xgbr.fit(pca_x_train, train_y)
y_pred = xgbr.predict(pca_x_test)
print(rmse(test_y,y_pred))
/Users/sherwin/anaconda3/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version if getattr(data, 'base', None) is not None and \
[12:23:28] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. 2.1601162492127104 CPU times: user 28min 30s, sys: 24.2 s, total: 28min 54s Wall time: 29min 29s
In [83]:
%%time
y_pred_test=xgbr.predict(pca_x_test)
y_pred_train=xgbr.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 0.9609658477710833 测试集rmse: 2.1601162492127104 CPU times: user 10 s, sys: 427 ms, total: 10.4 s Wall time: 10.6 s
In [84]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10),dpi=100)
plt.scatter(test_y,y_pred_test)
plt.xlabel("真实值")
plt.ylabel("预测值")
plt.show()
Stacking融合
构建Stacking模型需要的数据
In [86]:
%%time
# 获取每个子模型的预测结果作为特征
# 训练特征
train_features=[]
train_features.append(ridge.predict(pca_x_train)) # 将每个模型预测值保存起来
train_features.append(lasso.predict(pca_x_train))
# train_features.append(svr.predict(pca_x_train)) # 这个太慢了 不要了
train_features.append(rf.predict(pca_x_train))
train_features.append(tree.predict(pca_x_train))
train_features.append(xgbr.predict(pca_x_train))
# 测试特征
test_features=[]
test_features.append(ridge.predict(pca_x_test))
test_features.append(lasso.predict(pca_x_test))
# test_features.append(svr.predict(pca_x_test))
test_features.append(rf.predict(pca_x_test))
test_features.append(tree.predict(pca_x_test))
test_features.append(xgbr.predict(pca_x_test))
# 提交结果特征
TEST_FEATURES=[]
TEST_FEATURES.append(ridge.predict(PCA_X_TEST))
TEST_FEATURES.append(lasso.predict(PCA_X_TEST))
# TEST_FEATURES.append(svr.predict(PCA_X_TEST))
TEST_FEATURES.append(rf.predict(PCA_X_TEST))
TEST_FEATURES.append(tree.predict(PCA_X_TEST))
TEST_FEATURES.append(xgbr.predict(PCA_X_TEST))
CPU times: user 42.1 s, sys: 1.49 s, total: 43.6 s Wall time: 20.3 s
In [87]:
train_features
Out[87]:
[array([2.04715431, 2.05232901, 2.04572967, ..., 2.04659472, 2.04508413, 2.05562638]), array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758, 2.05200758]), array([1.67325566, 1.94499122, 1.85460452, ..., 1.92275812, 1.76267895, 2.22438597]), array([1.59023952, 1.84714777, 1.85130219, ..., 1.96150612, 1.77317884, 2.23207518]), array([1.6343094, 1.9145248, 1.8356705, ..., 1.9381661, 1.7626299, 2.2465973], dtype=float32)]
In [88]:
test_features
Out[88]:
[array([2.04925512, 2.04865288, 2.04878586, ..., 2.07295592, 2.05666692, 2.0560697 ]), array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758, 2.05200758]), array([1.93842148, 1.71689679, 1.71233925, ..., 3.7684956 , 2.1988801 , 2.15518207]), array([1.93762954, 1.71991266, 1.59023952, ..., 3.92681962, 2.1296814 , 2.08786427]), array([1.9394264, 1.6995616, 1.8815998, ..., 3.7348156, 2.2026072, 2.1582646], dtype=float32)]
In [89]:
# np.vstack:按垂直方向(行顺序)堆叠数组构成一个新的数组
mx_train=np.vstack(train_features).T
mx_test=np.vstack(test_features).T
MX_TEST=np.vstack(TEST_FEATURES).T
MX_TEST.shape
Out[89]:
(46000, 5)
Stacking模型训练
In [90]:
%%time
stack_model=Ridge(fit_intercept=False)
params={
"alpha":np.logspace(-2,3,20)
}
model=GridSearchCV(stack_model,param_grid=params,cv=5,n_jobs=-1)
model.fit(mx_train,train_y)
print(model.best_params_)
{'alpha': 0.06158482110660264} CPU times: user 580 ms, sys: 439 ms, total: 1.02 s Wall time: 3.47 s
In [91]:
%%time
stack_model=Ridge(alpha=0.379269,fit_intercept=False)
stack_model.fit(mx_train,train_y)
y_pred=stack_model.predict(mx_test)
y_pred_train=stack_model.predict(mx_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred))
训练集rmse: 0.7337935133190991 测试集rmse: 2.3272631885188044 CPU times: user 30.8 ms, sys: 9.28 ms, total: 40.1 ms Wall time: 13.2 ms
In [92]:
stack_model.coef_
Out[92]:
array([-0.1330147 , 0.13235901, -0.15773228, 0.6991465 , 0.45928745])
提交结果输出
In [96]:
Y_PRED_TEST = stack_model.predict(MX_TEST)
Y_PRED_TEST = np.exp(Y_PRED_TEST)-1
print(Y_PRED_TEST)
data = range(1, len(Y_PRED_TEST)+1)
Y_PRED = pd.DataFrame(data=Y_PRED_TEST, columns=["月租金"])
Y_PRED["id"] = range(1, Y_PRED.shape[0]+1)
Y_PRED.head()
[6.2493489 5.12626054 8.64297508 ... 3.59608672 1.05481017 4.8740706 ]
Out[96]:
月租金 | id | |
---|---|---|
0 | 6.249349 | 1 |
1 | 5.126261 | 2 |
2 | 8.642975 | 3 |
3 | 8.885262 | 4 |
4 | 4.482541 | 5 |
In [97]:
Y_PRED.to_csv("./data/Y_PRED_STACK.csv")
模型保存
In [98]:
from sklearn.externals import joblib
joblib.dump(stack_model, "./data/stack_model.kpl")
Out[98]:
['./data/stack_model.kpl']