一文了解深度学习实战——预测篇

本文将从四个案例(房价预测、泰坦尼克号生还预测、股票预测、影评情感预测)入手,让童鞋们从实战角度快速入门深度学习的预测部分!

房价预测

基于决策树回归器(DecisionTreeRegressor)

数据文件在这:
链接:https://pan.baidu.com/s/1mPr60cFUSc5m7pmF8Ju4vw 提取码:j2b0

#基于DecisionTreeRegressor预测北京房价

import numpy 
import pandas as pd
import matplotlib
import seaborn
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
import tensorflow 
import numpy as np


#定义一堆函数
# Scoring helper and grid-search routine that select the best decision tree.
def performance_metric(y_true, y_predict):
    """Return the coefficient of determination (R^2) of a prediction.

    NOTE(review): the original snippet referenced ``performance_metric``
    without defining it anywhere; R^2 is supplied here as the usual
    regression score -- confirm this matches the intended metric.

    Args:
        y_true: Ground-truth target values.
        y_predict: Predicted values aligned with ``y_true``.

    Returns:
        The score computed by ``sklearn.metrics.r2_score``.
    """
    # Local import keeps the helper self-contained; the file-level import
    # block only pulls ``make_scorer`` from ``sklearn.metrics``.
    from sklearn.metrics import r2_score

    return r2_score(y_true, y_predict)


def gridSearchVC_fit_model(X, y):
    """Grid-search ``max_depth`` of a decision-tree regressor and return the best model.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        ``grid.best_estimator_``, the estimator chosen by the search.
    """
    # 10 shuffled splits, 20% of the data held out for testing in each one.
    # random_state is fixed so repeated calls produce the same splits.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Decision-tree regressor whose depth is being tuned.
    regressor = DecisionTreeRegressor(random_state=0)

    # Candidate depths 1..10 inclusive (the original range(1, 10) stopped at 9
    # although its comment promised 1 to 10).
    params = {"max_depth": list(range(1, 11))}

    # Wrap the metric so GridSearchCV can use it as a scoring callable.
    scoring_fnc = make_scorer(score_func=performance_metric)

    # estimator: model to tune; param_grid: values to try;
    # scoring: score function; cv: cross-validation splitter.
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv)

    # Run the search on the supplied data.
    grid = grid.fit(X, y)

    return grid.best_estimator_


# Predict housing prices on a held-out test split.
def PredictHousingPrice(X, y, fitter):
    """Fit a model ten times on the training split and predict the test split.

    Args:
        X: Feature table.
        y: Target prices.
        fitter: Callable ``fitter(X_train, y_train)`` returning an object
            with a ``predict`` method.

    Returns:
        Tuple ``(y_test, predictions)`` where ``predictions`` comes from the
        model fitted in the last round.
    """
    rounds = 10
    # 80/20 split with a fixed seed, so the split is the same on every call.
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.2, random_state=0)

    latest_prediction = None
    for round_no in range(1, rounds + 1):
        # Each round refits from scratch; only the newest prediction is kept.
        fitted = fitter(features_fit, target_fit)
        latest_prediction = fitted.predict(features_eval)
        print("迭代第{}次。".format(round_no))

    return target_eval, latest_prediction
    

# Plot true prices against predicted prices on one figure.
def plotVersusFigure(y_true_price, y_predict_price):
    """Draw the true and the predicted price series on the same axes.

    Args:
        y_true_price: Sequence of true prices.
        y_predict_price: Sequence of predicted prices.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # This script only does ``import matplotlib`` at the top, so ``plt`` is
    # not defined at this point; import pyplot locally to avoid a NameError.
    import matplotlib.pyplot as plt

    # 10x7 inch figure.
    plt.figure(figsize=(10, 7))

    # X positions for the true prices: len(y_true_price) evenly spaced values
    # from 1 to the maximum true price, rounded to integers.
    X_show = np.rint(np.linspace(1,
                                 np.max(y_true_price),
                                 len(y_true_price))
                     ).astype(int)
    # 'o-' = circle markers joined by a solid line; 'c' = cyan.
    plt.plot(X_show, y_true_price, 'o-', color='c')

    # Same construction for the predicted prices, overlaid on the first line.
    X_show_predicted = np.rint(np.linspace(1,
                                           np.max(y_predict_price),
                                           len(y_predict_price))
                               ).astype(int)
    # 'm' = magenta.
    plt.plot(X_show_predicted, y_predict_price, 'o-', color='m')

    plt.title('Housing Prices Prediction')
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()

# Driver script: predict Beijing housing prices.
# Bare expressions such as df.describe() only display something in a
# notebook/REPL; in a plain script their result is discarded.
  
# Load the data set.
df = pd.read_csv('bj_housing.csv')
df.describe()

# 'Value' is the target column; every other column is used as a feature.
bj_prices = df['Value']
bj_prices.head()
bj_features = df.drop('Value', axis=1)
bj_features.head()

# Fit via grid search and predict the held-out test split.
y_true_bj_price, y_predict_bj_price = \
PredictHousingPrice(bj_features, bj_prices, gridSearchVC_fit_model)

# Preview the true prices (index reset and dropped) and the predictions.
y_true_bj_price.reset_index().drop('index', axis=1).head()
pd.Series(y_predict_bj_price).head()

# Comparison chart of true vs. predicted Beijing prices.
plotVersusFigure(y_true_bj_price, y_predict_bj_price)

基于Keras

# 使用Keras来预测波士顿的房价预测

import tensorflow as tf
from tensorflow import keras
import numpy as np

# Load the Boston housing data set bundled with Keras.
(train_data, train_labels), (test_data, test_labels) = \
keras.datasets.boston_housing.load_data()

# Shuffle the training set.
# np.random.random() draws random floats in [0.0, 1.0); np.argsort() returns
# the indices that would sort them, which is used here as a random ordering
# applied to both the features and the labels.
order = np.argsort(np.random.random(train_labels.shape))
train_data = train_data[order]
train_labels = train_labels[order]

# Standardise the features: subtract the mean and divide by the standard
# deviation, per column (axis=0).
# The statistics come from the training data only and are reused for the
# test data.
# The author notes the model can converge without this step, but training is
# harder and the result depends more on the training data.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

print("train_data.shape: {}, train_labels.shape: {}."
      .format(train_data.shape, train_labels.shape)) 
print("test_data.shape: {}, test_labels.shape: {}."
      .format(test_data.shape, test_labels.shape)) 

# Factory for the regression network.
def build_model():
    """Build and compile a three-layer dense regression model.

    Reads the module-level ``train_data`` to size the input layer.

    Returns:
        A compiled ``keras.Sequential`` model (loss 'mse', metric 'mae').
    """
    feature_count = train_data.shape[1]

    # Two 64-unit ReLU layers followed by a single linear output unit.
    first_hidden = keras.layers.Dense(64, activation=tf.nn.relu,
                                      input_shape=(feature_count,))
    second_hidden = keras.layers.Dense(64, activation=tf.nn.relu)
    output_layer = keras.layers.Dense(1)
    network = keras.Sequential([first_hidden, second_hidden, output_layer])

    network.compile(loss='mse',
                    optimizer=tf.train.RMSPropOptimizer(0.001),
                    metrics=['mae'])
    return network

# Build the compiled model.
model = build_model()
# Print the model architecture.
model.summary()

# Progress callback invoked at the end of every epoch.
class PrintDot(keras.callbacks.Callback):
    """Print one dot per epoch, starting a new line every 100 epochs."""

    def on_epoch_end(self, epoch, logs):
        # A line break precedes the dot on epochs 0, 100, 200, ...
        line_break = '\n' if epoch % 100 == 0 else ''
        print(line_break, end='')
        print('.', end='')

# Number of training epochs.
EPOCHS = 500

# Train the model: 20% of the training data is held out for validation,
# Keras' own logging is silenced (verbose=0) and PrintDot reports progress.
history = model.fit(train_data, train_labels, epochs=EPOCHS,
                    validation_split=0.2, verbose=0,
                    callbacks=[PrintDot()])

import matplotlib.pyplot as plt

# Plot the training / validation error history.
def plot_history(history):
    """Plot mean absolute error per epoch for training and validation.

    Args:
        history: Object exposing ``epoch`` and a ``history`` dict containing
            'mean_absolute_error' and 'val_mean_absolute_error'.
    """
    curves = (
        ('mean_absolute_error', 'Train Loss'),
        ('val_mean_absolute_error', 'Val loss'),
    )

    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [1000$]')
    for metric_key, curve_label in curves:
        series = np.array(history.history[metric_key])
        plt.plot(history.epoch, series, label=curve_label)
    plt.legend()
    # Fix the y-axis to the 0..5 range.
    plt.ylim([0, 5])
    plt.show()

# Show the training curves recorded by model.fit().
plot_history(history)



# Evaluate on the test set; evaluate() returns the loss and the 'mae' metric.
[loss, mae] = model.evaluate(test_data, test_labels, verbose=0)
# The axis labels below give the unit as 1000$, hence the factor of 1000.
print("Testing set Mean Abs Error: ${:7.2f}".format(mae * 1000))

# Predict the test set and flatten the output to one dimension.
test_predictions = model.predict(test_data).flatten()

# Scatter plot of predictions against the true values.
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [1000$]')
plt.ylabel('Predictions [1000$]')
plt.axis('equal')
# Re-apply the current limits -- presumably to keep them fixed when the
# diagonal below is drawn.
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
# Diagonal y = x as the reference for a perfect prediction.
plt.plot([-100, 100], [-100, 100])
plt.show()

# Histogram of the prediction error (prediction minus true value).
error = test_predictions - test_labels
plt.hist(error, bins=50)
plt.xlabel("Prediction Error [1000$]")
plt.ylabel("Count")
plt.show()


# Plot true prices against predicted prices on one figure.
def plotVersusFigure(y_true_price, y_predict_price):
    """Overlay the true and the predicted price series in a single chart.

    Args:
        y_true_price: Sequence of true prices (drawn in cyan).
        y_predict_price: Sequence of predicted prices (drawn in magenta).
    """
    def _x_positions(prices):
        # len(prices) evenly spaced values from 1 to max(prices), rounded to
        # integers.
        spaced = np.linspace(1, np.max(prices), len(prices))
        return np.rint(spaced).astype(int)

    # 10x7 inch figure.
    plt.figure(figsize=(10, 7))

    # 'o-' = circle markers joined by a solid line; true prices first, then
    # the predictions on top.
    for prices, colour in ((y_true_price, 'c'), (y_predict_price, 'm')):
        plt.plot(_x_positions(prices), prices, 'o-', color=colour)

    plt.title('Housing Prices Prediction')
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()

# Chart comparing the true test labels with the predicted values.
plotVersusFigure(test_labels, test_predictions)

泰坦尼克号生还预测

提供1309行泰坦尼克号乘客数据,其中891行是训练数据,418行是测试数据,一共有12列,其中有一列表示乘客是否生还。
下面用sklearn(决策树、逻辑回归、梯度提升、多层感知机)和keras(DNN)实现乘客生还预测。

数据文件在这:
链接:https://pan.baidu.com/s/1o_FUa_4VxmqXVBMBGh4rog 提取码:apzg

基于Sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data and split off the target column.
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)

# Preview the first rows (the bare expression only displays in a notebook).
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
X_train.info()

# Age distribution before imputation (missing values dropped for the plot).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm
# the installed version still provides it.
sns.distplot(X_train['Age'].dropna(), hist=True, kde=True)
# Fill missing ages with the median age. The result is assigned back instead of
# calling replace(..., inplace=True) on the column selection: that chained
# in-place call acts on an intermediate object and does not update X_train
# when pandas Copy-on-Write is enabled.
X_train['Age'] = X_train['Age'].replace(np.nan, np.nanmedian(X_train['Age']))
sns.distplot(X_train['Age'], hist=True, kde=True)

# Cabin has too many missing values; the author drops it as not useful.
X_train.drop("Cabin", axis=1, inplace=True)

# Ports of embarkation:
# S: Southampton, England
# C: Cherbourg-Octeville, France
# Q: Queenstown, Ireland
X_train.Embarked.value_counts()

# Embarkation counts; missing ports are filled with 'S'.
sns.countplot(x='Embarked', data=X_train)
X_train['Embarked'] = X_train['Embarked'].replace(np.nan, 'S')
# Locate the row whose Fare is missing.
X_train[np.isnan(X_train["Fare"])]
# Fares of third-class passengers who boarded at Southampton.
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median that ignores NaN. The original replaced NaN with 0 first, which
# injects an artificial 0 fare into the sample and biases the median.
median_fare = np.nanmedian(pclass3_fares)
# Write the median into the passenger with the missing fare.
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare


# Encode Sex numerically: male -> 1, female -> 0.
X_train['Sex'] = X_train['Sex'].replace(['male', 'female'], [1,0])
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# One-hot encode the remaining non-numeric columns.
X_train = pd.get_dummies(X_train)
# Preview the one-hot encoded data.
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# 80/20 shuffled train/test split with a fixed seed.
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))


# 使用决策树预测模型
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Decision-tree model: train, report accuracy and AUC, return ROC points.
def createDecisionTreeClassifier():
    """Train a decision tree on the module-level split and report its scores.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module scope.

    Returns:
        Tuple ``(fpr, tpr)`` from ``metrics.roc_curve`` on the test split.
    """
    classifier = DecisionTreeClassifier()
    classifier.fit(train_X, train_y)

    # Accuracy on both splits.
    accuracy_on_train = accuracy_score(train_y, classifier.predict(train_X))
    accuracy_on_test = accuracy_score(test_y, classifier.predict(test_X))
    print('The training accuracy is {}.'.format(accuracy_on_train))
    print('The test accuracy is {}'.format(accuracy_on_test))

    # ROC curve and AUC use column 1 of predict_proba as the score.
    positive_proba = classifier.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_proba)
    auc_value = metrics.roc_auc_score(test_y, positive_proba)
    print('Decision Tree Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_dt, tpr_dt = createDecisionTreeClassifier()



# 创建逻辑回归预测模型
from sklearn.linear_model import LogisticRegression
def createLogisticRegressionModel():
    """Train a logistic regression on the module-level split and report its scores.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module scope.

    Returns:
        Tuple ``(fpr, tpr)`` from ``metrics.roc_curve`` on the test split.
    """
    classifier = LogisticRegression()
    classifier.fit(train_X, train_y)

    # model.score() is used as the accuracy figure for each split.
    for split_name, split_X, split_y in (('training', train_X, train_y),
                                         ('testing', test_X, test_y)):
        print('Logistic Regression Accuracy for {} data is: {:.3f}'.format(
            split_name, classifier.score(split_X, split_y)))

    # The decision-function values serve as the ranking score for AUC / ROC.
    decision_scores = classifier.decision_function(test_X)
    auc_value = metrics.roc_auc_score(test_y, decision_scores)
    print('Logistic Regression AUC is: {:.3f}'.format(auc_value))

    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, decision_scores)
    return false_pos_rate, true_pos_rate


fpr_lr, tpr_lr = createLogisticRegressionModel()



# 创建梯度提升模型
from sklearn.ensemble import GradientBoostingClassifier
def createGradientBoostingClassifierModel():
    """Train a 500-estimator gradient-boosting classifier and report its scores.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module scope.

    Returns:
        Tuple ``(fpr, tpr)`` from ``metrics.roc_curve`` on the test split.
    """
    booster = GradientBoostingClassifier(n_estimators = 500)
    booster.fit(train_X, train_y)

    # Accuracy on both splits.
    accuracy_on_train = accuracy_score(train_y, booster.predict(train_X))
    accuracy_on_test = accuracy_score(test_y, booster.predict(test_X))
    print('Gradient Boosting Accuracy for training data is: {:.3f}'.format(accuracy_on_train))
    print('Gradient Boosting Accuracy for testing data is: {:.3f}'.format(accuracy_on_test))

    # ROC curve and AUC use column 1 of predict_proba as the score.
    positive_proba = booster.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_proba)
    auc_value = metrics.roc_auc_score(test_y, positive_proba)
    print('Gradient Boosting Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_gb, tpr_gb = createGradientBoostingClassifierModel()



# 创建多层感知器的预测模型
from sklearn.neural_network import MLPClassifier
def createMLPClassifierModel():
    """Train a multi-layer perceptron classifier and report its scores.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module scope.

    Returns:
        Tuple ``(fpr, tpr)`` from ``metrics.roc_curve`` on the test split.
    """
    perceptron = MLPClassifier(hidden_layer_sizes=128, batch_size=64, max_iter=1000, solver="adam")
    perceptron.fit(train_X, train_y)

    # Accuracy on both splits.
    accuracy_on_train = accuracy_score(train_y, perceptron.predict(train_X))
    accuracy_on_test = accuracy_score(test_y, perceptron.predict(test_X))
    print('Neural Network classifier  Accuracy for training data is: {:.3f}'.format(accuracy_on_train))
    print('Neural Network classifier  Accuracy for testing data is: {:.3f}'.format(accuracy_on_test))

    # ROC curve and AUC use column 1 of predict_proba as the score.
    positive_proba = perceptron.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_proba)
    auc_value = metrics.roc_auc_score(test_y, positive_proba)
    print('Neural Network Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_nn, tpr_nn = createMLPClassifierModel()

# Draw the ROC curves (fpr vs. tpr) of all four classifiers on one figure.
fig = plt.figure(figsize = (20,10))
ax = fig.add_subplot(111)
# Each ax.plot() call returns a list of Line2D objects.
ax1 = ax.plot(fpr_dt, tpr_dt, c='c', lw=2, label="Decision Tree")
ax2 = ax.plot(fpr_lr, tpr_lr, c='y', lw=2, label="Logistic Regression")
ax3 = ax.plot(fpr_gb, tpr_gb, c='r', lw=2, label="Gradient Boosting")
ax4 = ax.plot(fpr_nn, tpr_nn, c='b', lw=2, label="Neural Network")

ax.grid()
# Concatenate the four lists into one list of line handles.
lns = ax1 + ax2 + ax3 + ax4
# Pass the lines as handles explicitly. A single positional argument to
# legend() is interpreted as a list of labels, so the original
# ax.legend(lns, loc=0) handed Line2D objects in where strings were expected.
# With handles=, the legend text comes from each line's label.
ax.legend(handles=lns, loc=0)
plt.show()

# Shape of the training features (displayed in a notebook only).
train_X.shape

基于Keras

# Keras的神经网络模型来预测
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import utils as np_utils

# Load the data and split off the target column.
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)

# Fill missing ages with the median age. The result is assigned back instead of
# calling replace(..., inplace=True) on the column selection: that chained
# in-place call acts on an intermediate object and does not update X_train
# when pandas Copy-on-Write is enabled.
X_train['Age'] = X_train['Age'].replace(np.nan, np.nanmedian(X_train['Age']))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm
# the installed version still provides it.
sns.distplot(X_train['Age'], hist=True, kde=True)
# Cabin has too many missing values; the author drops it as not useful.
X_train.drop("Cabin", axis=1, inplace=True)

X_train.Embarked.value_counts()
# Embarkation counts; missing ports are filled with 'S'.
sns.countplot(x='Embarked', data=X_train)
X_train['Embarked'] = X_train['Embarked'].replace(np.nan, 'S')
# Locate the row whose Fare is missing.
X_train[np.isnan(X_train["Fare"])]
# Fares of third-class passengers who boarded at Southampton.
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median that ignores NaN. The original replaced NaN with 0 first, which
# injects an artificial 0 fare into the sample and biases the median.
median_fare = np.nanmedian(pclass3_fares)
# Write the median into the passenger with the missing fare.
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare


# Encode Sex numerically: male -> 1, female -> 0.
X_train['Sex'] = X_train['Sex'].replace(['male', 'female'], [1,0])
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# One-hot encode the remaining non-numeric columns.
X_train = pd.get_dummies(X_train)
# Preview the one-hot encoded data.
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# 80/20 shuffled train/test split with a fixed seed.
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))

def createKerasModel(X, y):
    """Build, compile and train a small dense network for survival prediction.

    Args:
        X: Feature DataFrame; ``X.shape[1]`` sizes the input layer and
            ``X.values`` is fed to ``model.fit``.
        y: Labels, one-hot encoded with ``np_utils.to_categorical`` before
            training.

    Returns:
        The trained ``Sequential`` model.
    """
    # This script imports Sequential/Dense from the standalone ``keras``
    # package but never binds the name ``keras`` itself, so the original
    # ``keras.initializers`` lookup either failed or silently resolved to a
    # different package imported elsewhere. Import the initializers from the
    # same package as the layers.
    from keras import initializers

    model = Sequential()
    # Kernel weights are drawn from a truncated normal distribution.
    kernel_init = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
    # The input dimension equals the number of feature columns.
    model.add(Dense(input_dim=X.shape[1], units=128, kernel_initializer=kernel_init, bias_initializer='zeros'))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Dense(32))
    model.add(Activation("relu"))
    # Two output units, one per one-hot encoded class, each passed through a
    # sigmoid.
    model.add(Dense(2))
    model.add(Activation("sigmoid"))
    # Binary cross-entropy loss with the Adam optimizer.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # One-hot encode the training labels.
    y_train_categorical = np_utils.to_categorical(y)
    # Train for 150 epochs with per-epoch logging (verbose=1).
    model.fit(X.values, y_train_categorical, epochs=150, verbose=1)
    return model


keras_model = createKerasModel(train_X, train_y)


# One-hot encode the test labels and evaluate; the two returned values are
# printed as loss and accuracy.
y_test_categorical = np_utils.to_categorical(test_y)
loss_and_accuracy = keras_model.evaluate(test_X.values, y_test_categorical)
print("Loss={}, Accuracy={}.".format(loss_and_accuracy[0], loss_and_accuracy[1]))

# Predicted class index for every test row.
# NOTE(review): Sequential.predict_classes was removed from newer Keras/TF
# releases -- confirm the installed version still provides it.
predictions_classes = keras_model.predict_classes(test_X.values)

# Pair each passenger id with its predicted class and show the first 15 rows.
submission = pd.DataFrame({
    
    
    "PassengerId": test_X["PassengerId"],
    "Survived": predictions_classes})
print(submission[0:15])

股票预测

根据3000多条的百度股票数据,预测出股票曲线。
数据通过quandl开源库获取,使用Facebook开源的fbprophet库来进行股票价格预测。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install quandl
import quandl 

!pip install fbprophet
import fbprophet



def init_api_key():
    """Register the Quandl API key and print the key held in ``quandl.ApiConfig``.

    NOTE(review): this echoes the API key to stdout -- confirm that is
    acceptable outside of a tutorial.
    """
    placeholder_key = "Your API Key"
    quandl.save_key(placeholder_key)
    active_key = quandl.ApiConfig.api_key
    print(active_key)


init_api_key()

# Read the key back (presumably from where save_key stored it) and print it.
quandl.read_key()
print(quandl.ApiConfig.api_key)

def init_stock(stock_name):
    """Fetch the WIKI data set for ``stock_name`` and return it with a flat index.

    Args:
        stock_name: Ticker symbol appended to the "WIKI/" prefix.

    Returns:
        The fetched table after ``reset_index(level=0)``, i.e. with index
        level 0 moved into a regular column.
    """
    dataset_code = "WIKI/{}".format(stock_name)
    raw_table = quandl.get(dataset_code)
    return raw_table.reset_index(level=0)
  
# Fetch the complete Baidu (BIDU) price history.
stock_name = "BIDU"
baiduStock = init_stock(stock_name)
baiduStock.head()
print("baiduStock共计{}条。".format(len(baiduStock)))

# Earliest and latest dates in the data; reused by later steps of the script.
min_date = min(baiduStock['Date'])
max_date = max(baiduStock['Date'])
print("百度的股票数据从{}到{}。".format(min_date, max_date))

# Save the data to CSV and read it back into a second DataFrame.
print(type(baiduStock))
baiduStock.to_csv("baiduStock.csv", index=False)
baidu_df = pd.read_csv("baiduStock.csv")
baidu_df.head()

# Data visualisation.
def plot_basic_stock_history(df, start_date, end_date, stock_name):
    """Print min / max / latest adjusted close and plot the price history.

    Args:
        df: Table with 'Date' and 'Adj. Close' columns.
        start_date: Accepted for interface compatibility; not used.
        end_date: Date shown in the "current price" message.
        stock_name: Ticker used in the chart title.
    """
    price_column = 'Adj. Close'
    closes = df[price_column]
    lowest = min(closes)
    highest = max(closes)
    _unused_mean = np.mean(closes)  # computed by the original, never used

    def _first_date_with(price):
        # Date of the first row whose adjusted close equals ``price``.
        matching_dates = df.loc[closes == price, 'Date']
        return matching_dates.loc[matching_dates.index[0]].date()

    lowest_on = _first_date_with(lowest)
    highest_on = _first_date_with(highest)
    latest_close = df.loc[df.index[-1], 'Adj. Close']

    print("{}在{}最小,价格是:{}美元。".format(price_column, lowest_on, lowest))
    print("{}在{}最高,价格是:{}美元。".format(price_column, highest_on, highest))
    print("{}在{}当前价格是:{}美元。".format(price_column, end_date.date(), latest_close))

    plt.style.use("default")
    plt.plot(df["Date"], closes, color='r', linewidth=3, label=price_column)
    plt.xlabel("Date")
    plt.ylabel("US $")
    plt.title("{} Stock History".format(stock_name))
    plt.grid()
    plt.show()


start_date = min_date
end_date = max_date
plot_basic_stock_history(baiduStock, start_date, end_date, stock_name)


# Compute and plot the profit of holding the stock.
def plot_potential_profit(df,
                          start_date,
                          end_date,
                          stock_name,
                          line_color,
                          text_color,
                          myshares=1):
    """Plot the profit of buying at the start date's open and holding.

    Side effect: a "profits" column is written into ``df``.

    Args:
        df: Table with 'Date', 'Adj. Open' and 'Adj. Close' columns.
        start_date: Purchase date; must match a value in ``df["Date"]``.
        end_date: Sale date; must match a value in ``df["Date"]``.
        stock_name: Ticker used in the chart title.
        line_color: Matplotlib colour of the profit line.
        text_color: Matplotlib colour of the total-profit annotation.
        myshares: Number of shares held.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` matches no row.
    """
    def _price_on(date, column):
        # The original called float() on the filtered Series. That is
        # deprecated for single-element Series and fails with an opaque
        # TypeError when nothing matches, so extract the first match
        # explicitly and report a missing date clearly.
        matches = df.loc[df["Date"] == date, column]
        if matches.empty:
            raise ValueError("no row with Date == {} in df".format(date))
        return float(matches.iloc[0])

    start_price = _price_on(start_date, "Adj. Open")
    end_price = _price_on(end_date, "Adj. Close")
    # Per-row profit relative to the purchase price.
    df["profits"] = (df["Adj. Close"] - start_price) * myshares
    total_hold_profit = (end_price - start_price) * myshares
    print("从{}到{},购买{}股,总收益是:{}美元。".format(start_date.date(),
                                                  end_date.date(),
                                                  myshares,
                                                  total_hold_profit))
    plt.style.use("default")
    plt.plot(df["Date"], df["profits"], color=line_color, linewidth=3)
    plt.xlabel("Date")
    plt.ylabel("Profit $")
    plt.title("My Shares From {} to {} on {}.".format(start_date.date(), end_date.date(), stock_name))
    # Place the annotation one month before the end date, slightly offset
    # from the final profit value.
    text_location_x = (end_date - pd.DateOffset(months=1)).date()
    text_location_y = total_hold_profit + (total_hold_profit / 40)
    plt.text(text_location_x,
             text_location_y,
             "${}".format(int(total_hold_profit)),
             color=text_color,
             size=15)
    plt.grid()
    plt.show()


start_date = min_date
end_date = max_date
plot_potential_profit(baiduStock, start_date, end_date, stock_name, 'm', 'g', 100)



# The author's point: holding between 2012 and 2013 would have lost roughly
# half the investment, even though the price rose a lot afterwards.
start_date = pd.to_datetime("2012-08-07")
end_date = pd.to_datetime("2013-03-05")
# Compare the 'Date' column with the Timestamps themselves. The original
# compared against start_date.date() / end_date.date(); ordering a datetime64
# column against datetime.date objects is rejected by recent pandas versions.
baiduStockLowerPricePhase = baiduStock[
                            (baiduStock['Date'] >= start_date) &
                            (baiduStock['Date'] <= end_date)
                            ]
plot_potential_profit(baiduStockLowerPricePhase, start_date, end_date, stock_name, 'c', 'r', 100)


# Train the Prophet model and produce its predictions.
def train_model(stock_history, days=0, weekly_seasonality=False, monthly_seasonality=False):
    """Fit a Prophet model and predict over the history plus ``days`` extra periods.

    Args:
        stock_history: Table handed to ``Prophet.fit``.
        days: Number of future periods added by ``make_future_dataframe``.
        weekly_seasonality: Whether Prophet models weekly seasonality.
        monthly_seasonality: Whether to add a custom 30.5-day seasonality.

    Returns:
        Tuple ``(model, future)``: the fitted model and its prediction table.
    """
    # Forward the caller's weekly_seasonality flag; the original hard-coded
    # False here, so the parameter was silently ignored.
    model = fbprophet.Prophet(daily_seasonality=False,
                              weekly_seasonality=weekly_seasonality,
                              yearly_seasonality=True,
                              changepoint_prior_scale=0.05)
    if monthly_seasonality:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(stock_history)
    future = model.make_future_dataframe(periods=days)
    future = model.predict(future)
    return model, future
  
  
def create_prophet_model(df,
                         stock_name,
                         days=0,
                         weekly_seasonality=False,
                         monthly_seasonality=False):
    """Train on the last three years of ``df`` and plot observed vs. modelled prices.

    Reads the module-level ``max_date`` to locate the three-year window.

    Args:
        df: Table with 'Date', 'ds' and 'y' columns.
        stock_name: Ticker used in the chart title.
        days: Number of future periods to predict.
        weekly_seasonality: Forwarded to ``train_model``.
        monthly_seasonality: Forwarded to ``train_model``.

    Returns:
        Tuple ``(model, future)`` as returned by ``train_model``.
    """
    # Keep rows newer than three years before max_date. The cutoff stays a
    # Timestamp: the original converted it with .date(), and ordering a
    # datetime64 column against datetime.date is rejected by recent pandas.
    cutoff = max_date - pd.DateOffset(years=3)
    stock_history = df[df["Date"] > cutoff]
    model, future = train_model(stock_history, days, weekly_seasonality, monthly_seasonality)

    plt.style.use("default")
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 5)
    # Observed values.
    ax.plot(stock_history['ds'],
            stock_history['y'],
            'v-',
            linewidth=1.0,
            alpha=0.8,
            ms=1.8,
            label='Observations')
    # Modelled values.
    ax.plot(future['ds'],
            future['yhat'],
            'o-',
            linewidth=1.,
            label='Modeled')
    # Shaded band between the upper and lower prediction bounds.
    ax.fill_between(future['ds'].dt.to_pydatetime(),
                    future['yhat_upper'],
                    future['yhat_lower'],
                    alpha=0.3,
                    facecolor='g',
                    edgecolor='k',
                    linewidth=1.0,
                    label='Confidence Interval')
    plt.legend(loc=2, prop={'size': 10})
    plt.title("{} Historical and Modeled Stock Price".format(stock_name))
    plt.xlabel('Date')
    plt.ylabel('Price $')
    plt.grid(linewidth=0.6, alpha=0.6)
    plt.show()
    return model, future
  

# Copy the date and adjusted-close columns into 'ds' and 'y', the column names
# read by create_prophet_model.
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
# Model with the custom monthly seasonality enabled.
model, future_data = create_prophet_model(baiduStock, stock_name, monthly_seasonality=True)

# Plot the components of the fitted model.
model.plot_components(future_data)
plt.show()

# Same again, additionally requesting weekly seasonality.
model, future_data = create_prophet_model(baiduStock, stock_name, weekly_seasonality=True, monthly_seasonality=True)

model.plot_components(future_data)
plt.show()


# Forecast: predict the Baidu price for the next 180 days from the time series.
model, future = create_prophet_model(baiduStock, stock_name, days=180)

# Buy-in strategy evaluation.
# prophet_evaluator is not defined in this file -- presumably a local helper
# module shipped with the article; verify it is on the import path.
import prophet_evaluator
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
prophet_evaluator.evaluator(baiduStock, min_date, max_date, train_model, stock_name, 1000)

影评的情感分析

情感分析在自然语言处理(NLP)领域是很复杂的,有主观的,也有客观的。基于当前环境,针对不同的人或物,我们应该做出什么样的情感反应。下面讲解如何通过分析情感文本数据,预测出说话者在当时的情况下的情绪状态是积极的,还是消极的。
生活中就有很多例子,比如在京东、淘宝等电商平台购物后,用户都会被请求对收到的货物进行拍照、点赞、评论和评价星级等。平台收集这些数据后去做情感分析,从而通过了解买家对于产品的喜好和满意度来改善产品和服务。这为平台提供了一些潜在的用户会购买哪些产品的数据。
下面使用循环神经网络(RNN)来编写该神经网络模型的代码,创建此网络模型会使用到长短期记忆网络(LSTM)和嵌入层(Embedding Layers),最后的输出层会使用sigmoid激活函数,因为我们预测的结果要么是积极的,要么是消极的。

数据文件在这:
链接:https://pan.baidu.com/s/1DQdAROwzOT6nXdWBYeT2bw 密码:1rn7

基于TensorFlow

import numpy as np
import tensorflow as tf

# Loader for the raw review and label files.
def loadData():
    """Read 'reviews.txt' and 'labels.txt' from the working directory.

    Returns:
        Tuple ``(reviews, labels)`` holding the full text of each file.
    """
    def _slurp(path):
        # Return the whole file content as one string.
        with open(path, 'r') as handle:
            return handle.read()

    # Reviews are read first, then their positive/negative labels.
    review_text = _slurp('reviews.txt')
    label_text = _slurp('labels.txt')
    return review_text, label_text

# Load the raw review text and label text.
reviews, labels = loadData()

# First 150 characters of the reviews (displayed in a notebook only).
reviews[:150]
# First 150 characters of the labels.
labels[:150]

from string import punctuation 

# Text pre-processing.
def dataPreprocess(reviews_str):
    """Strip punctuation and split the text into reviews and words.

    Args:
        reviews_str: Raw text with one review per line.

    Returns:
        Tuple ``(review_list, all_text, words)``: the punctuation-free
        reviews (split on newlines), those reviews joined by single spaces,
        and the whitespace-separated words of the joined text.
    """
    # Drop every character that appears in string.punctuation.
    without_punctuation = ''.join(
        filter(lambda ch: ch not in punctuation, reviews_str))
    # One list entry per line.
    review_list = without_punctuation.split('\n')
    # Join the reviews into one long string separated by spaces ...
    all_text = ' '.join(review_list)
    # ... and split on whitespace to get the individual words.
    return review_list, all_text, all_text.split()
    
# Pre-process; from here on `reviews` is the list of punctuation-free reviews.
reviews, all_text, words = dataPreprocess(reviews)
reviews[:2]

# First 20 words (displayed in a notebook only).
words[:20]
# First 150 characters of the joined text.
all_text[:150]

# Word encoding.
from collections import Counter
# Count how often each word occurs.
word_counter = Counter(words)
# Words sorted by their count, most frequent first (reverse=True).
sorted_vocab = sorted(word_counter, key=word_counter.get, reverse=True)

# Helper that prints the first ten entries of a mapping.
def showTop10Item(dict_obj):
    """Print up to the first ten ``key:value`` pairs of ``dict_obj``.

    Args:
        dict_obj: Mapping whose ``items()`` are printed in iteration order.
    """
    # zip() stops once range(10) is exhausted, so at most ten pairs are read.
    for _, (key, value) in zip(range(10), dict_obj.items()):
        print("{}:{}".format(key, value))

# Print the first ten words of word_counter with their counts.
showTop10Item(word_counter)
# The 15 most frequent words with their counts.
word_counter.most_common(15)
# First 15 words of the sorted vocabulary.
sorted_vocab[:15]
# Map every word to an integer index; enumeration starts at 1, so index 0 is
# never assigned to a word.
vocab_to_int = {
    
    word: i for i, word in enumerate(sorted_vocab, 1)}
# Print the first ten words with their index.
showTop10Item(vocab_to_int)

# Convert every review into the list of its word indices, so each review is a
# list of integers rather than a string.
reviews_ints = []
for review in reviews:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])
print(reviews_ints[:1])
len(reviews_ints)

# Label encoding: 'positive' -> 1, anything else -> 0.
labels = labels.split('\n')
labels = np.array([1 if label == 'positive' else 0 for label in labels])
# First ten encoded labels (displayed in a notebook only).
labels[:10]

from collections import Counter

# Histogram of review lengths: length -> number of reviews with that length.
review_lens = Counter([len(x) for x in reviews_ints])
# review_lens[0] is the NUMBER of zero-length reviews, not the minimum length,
# so the message now says so (the original labelled it "minimum length").
print("长度为0的评论数量: {}".format(review_lens[0]))
print("评论的最大长度是: {}".format(max(review_lens)))
# Indices of the reviews that contain at least one word.
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]
# Number of reviews left after dropping the empty ones.
print(len(non_zero_idx))
# Keep only the non-empty reviews ...
reviews_ints = [reviews_ints[i] for i in non_zero_idx]
# ... and the labels at the same positions, so both stay aligned.
labels = np.array([labels[i] for i in non_zero_idx])

# Build the feature matrix that is fed to the network from reviews_ints.
# Every row holds exactly 200 integers: reviews shorter than 200 words are
# left-padded with 0, longer reviews are cut to their first 200 words.

# Maximum review length kept per row.
seq_len = 200
# Matrix of zeros, one row per review.
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
# Copy each review into the right-hand end of its row.
for i, row in enumerate(reviews_ints):
    # -len(row): selects the last len(row) columns (the whole row when the
    # review is longer than seq_len); [:seq_len] keeps at most 200 words.
    features[i, -len(row):] = np.array(row)[:seq_len]

# First row and overall shape (displayed in a notebook only).
features[0:1]
features.shape

# Split into training, validation and test sets.
# 80% of the data is used for training.
split_train_ratio = 0.8
# Number of feature rows.
features_len = len(features)
# Number of training rows.
train_len = int(features_len * split_train_ratio)
# First 80% -> training; the remainder is split again below.
train_x, val_x = features[:train_len], features[train_len:]
train_y, val_y = labels[:train_len], labels[train_len:] 
# Half of the remainder.
val_x_half_len = int(len(val_x) / 2)
# First half -> validation set, second half -> test set.
val_x, test_x = val_x[:val_x_half_len], val_x[val_x_half_len:]
val_y, test_y = val_y[:val_x_half_len], val_y[val_x_half_len:]

# Print the resulting shapes.
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# Hyper-parameters.
lstm_size = 256
lstm_layers = 2
batch_size = 512
learning_rate = 0.01


# Vocabulary size; +1 because the word indices start at 1.
n_words = len(vocab_to_int) + 1
# Reset the default computation graph.
tf.reset_default_graph()
# Group the input placeholders under the name scope 'inputs'.
with tf.name_scope('inputs'):
    # Placeholder for the input features.
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Placeholder for the input labels.
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Placeholder for the dropout keep probability.
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    
  
# Size of the embedding vectors.
embed_size = 300 
# Group the embedding variable and lookup under the name scope 'Embeddings'.
with tf.name_scope("Embeddings"):
    # Embedding matrix initialised uniformly in [-1, 1).
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Look up the embedding vectors for the input indices.
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
    
def lstm_cell():
    """Return one basic LSTM cell of ``lstm_size`` units wrapped with output dropout.

    Reads the module-level ``lstm_size`` and ``keep_prob``.
    """
    # The reuse flag is taken from the current variable scope.
    scope_reuse = tf.get_variable_scope().reuse
    return tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=scope_reuse),
        output_keep_prob=keep_prob)

# Group the recurrent layers under the name scope 'RNN_layers'.
with tf.name_scope("RNN_layers"):
    # Stack lstm_layers LSTM cells.
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # All-zero initial state. It is built for exactly batch_size rows, so
    # every batch fed to the graph must contain batch_size examples.
    initial_state = cell.zero_state(batch_size, tf.float32)

with tf.name_scope("RNN_forward"):
    # dynamic_rnn returns the output of every step and the final state.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
with tf.name_scope('predictions'):
    # Output layer: one sigmoid unit applied to the last step's output
    # (outputs[:, -1]), since the prediction is either 1 or 0.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    
with tf.name_scope('cost'):
    # Mean squared error between labels and predictions as training loss.
    cost = tf.losses.mean_squared_error(labels_, predictions)

with tf.name_scope('train'):
    # Adam optimizer minimising the loss.
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
with tf.name_scope('validation'):
    # Accuracy: share of rounded predictions that equal the labels.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Generator that yields aligned, fixed-size mini-batches.
def get_batches(x, y, batch_size=100):
    """Yield ``(x_batch, y_batch)`` slices of exactly ``batch_size`` items.

    Integer division is used to count the batches, so a trailing remainder
    that does not fill a whole batch is dropped.

    Args:
        x: Sliceable sequence of inputs.
        y: Sliceable sequence of targets aligned with ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x[start:stop], y[start:stop])``.
    """
    # Largest multiple of batch_size that fits into x.
    usable_length = (len(x) // batch_size) * batch_size
    for start in range(0, usable_length, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

# Number of training epochs: 8.
epochs = 8
# Saver used to write checkpoints.
saver = tf.train.Saver()

# Training session.
# NOTE(review): checkpoints are written under "checkpoints/" -- confirm the
# directory exists before running.
with tf.Session() as sess:
    # Initialise all global variables.
    sess.run(tf.global_variables_initializer())
    
    iteration = 1
    # Epoch loop.
    for e in range(epochs):
        # Start every epoch from the zero state.
        state = sess.run(initial_state)
        
        # Train on every full batch produced by get_batches().
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            # y[:, None] adds a second axis so the labels match the rank-2
            # labels_ placeholder; dropout keeps 50% of the outputs and the
            # previous batch's final state is fed back in as initial state.
            feed = {
    
    inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            # Log the training loss every 5 iterations (e is zero-based).
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Run a full validation pass every 25 iterations.
            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                # Score every full batch of the validation set; dropout is
                # disabled (keep_prob = 1).
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {
    
    inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = \
                        sess.run([accuracy, final_state], feed_dict=feed)
                    # Collect the accuracy of each validation batch.
                    val_acc.append(batch_acc)
                # Mean accuracy over the validation batches.
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
            
            # A checkpoint is written after every batch.
            saver.save(sess, "checkpoints/sentiment.ckpt")
    # Final checkpoint once all epochs are done.
    saver.save(sess, "checkpoints/sentiment.ckpt")

# Accuracy of every test batch.
test_acc = []
with tf.Session() as sess:
    # Restore the trained model from the checkpoint.
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    # Start from a zero state before scoring the test set.
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    # Iterate over the full batches of the test set.
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        # Dropout is disabled (keep_prob = 1); the state is carried from
        # batch to batch.
        feed = {
    
    inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        # Score the batch.
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        # Collect the batch accuracy.
        test_acc.append(batch_acc)
    # Mean accuracy over all test batches.
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

基于Keras

#基于Keras

import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Seed NumPy's random generator for reproducibility.
numpy.random.seed(7)

# Vocabulary cap passed to imdb.load_data as num_words: per the author, only
# the 5000 most common words are kept and all others become 0.
top_words = 5000

# Load the data set.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Maximum length of a single review.
review_max_length = 500

# Pad reviews shorter than 500 with 0 and truncate longer ones.
X_train = sequence.pad_sequences(X_train, maxlen=review_max_length)
X_test = sequence.pad_sequences(X_test, maxlen=review_max_length)

# Build the model.
embedding_vecor_length = 32
model = Sequential()
# Embedding input layer: top_words indices mapped to 32-dimensional vectors.
model.add(Embedding(top_words, embedding_vecor_length, input_length=review_max_length))
# LSTM hidden layer with 100 units.
model.add(LSTM(100))
# Dense output layer with a sigmoid activation (binary classification).
model.add(Dense(1, activation='sigmoid'))
# Compile with binary cross-entropy loss and the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the model architecture.
model.summary()

# Train for 3 epochs over the whole training set with batches of 64.
model.fit(X_train, y_train, epochs=3, batch_size=64)

# Evaluate; scores[1] is printed as the accuracy, in percent.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {}".format((scores[1]*100)))

猜你喜欢

转载自blog.csdn.net/weixin_45116099/article/details/127714943