练习：科比数据集的处理和预测

import pandas as pd
# Read the CSV file; pandas returns a DataFrame.
raw = pd.read_csv("F:\\skdata\\kobe.csv")
# Merge the remaining minutes and seconds into a single column, in seconds.
raw['remaining_time'] = raw['minutes_remaining']*60 + raw['seconds_remaining']
# Keep only the rows that have a shot_made_flag value (0 = missed, 1 = made).
# An explicit copy is taken so that later column assignments on `kobe` act on
# an independent frame instead of a slice of `raw` (which would raise
# pandas' SettingWithCopyWarning).
kobe = raw[pd.notnull(raw['shot_made_flag'])].copy()
print(kobe.shape)

输出：kobe.shape 打印的是过滤后数据的（行数, 列数）。可以看到pandas读取csv文件后，返回的是一个DataFrame对象

from matplotlib import pyplot as plt
# Marker transparency shared by both scatter plots.
alpha = 0.02
# Create the figure; figsize sets the size of the figure.
plt.figure(figsize=(10,10))
# Select a subplot; the drawing calls that follow apply to it.
# 121 means: treat the figure as a 1-row x 2-column grid and use the first cell.
plt.subplot(121)
# Scatter plot of the loc_x / loc_y columns.
# Single-letter colour codes must be lowercase: uppercase 'R'/'B' are rejected
# by current matplotlib releases.
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
# Set the subplot title.
plt.title('loc_x and loc_y')
# Select the second cell of the grid.
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
# The plotted columns are lon and lat (the original title said 'lag').
plt.title('lon and lat')
# Display the figure; plt.plot() with no arguments draws nothing.
plt.show()

输出：通过用图表的形式观察数据，这里可以看出有两列数据内容不同，实际含义相同，所以数据处理时可以只保留一列

# DataFrame.<column> returns that column as a pandas Series.
# Series.unique() lists the distinct values found in the column.
print(kobe.shot_type.unique())
# Series.value_counts() counts how many times each distinct value occurs.
print(kobe.shot_type.value_counts())
# The same Series can also be fetched dict-style, e.g. kobe['season'].
print(kobe['season'].unique())
# Series.apply(callable) maps every element through the callable. Here each
# season string is split on '-' and the second part is kept as an int.
# DataFrame.assign returns a new frame instead of writing into what may be a
# slice of `raw`, which avoids pandas' SettingWithCopyWarning.
kobe = kobe.assign(season=kobe['season'].apply(lambda x: int(x.split('-')[1])))
print(kobe['season'].unique())

输出：

import matplotlib.cm as cm
import numpy as np
def scatter_plot_by_category(feat):
    """Scatter loc_x against loc_y with one colour per distinct value of ``feat``.

    Reads the module-level ``kobe`` DataFrame and draws on the currently
    selected matplotlib axes.
    """
    marker_alpha = 0.1
    # Split the rows into groups, one per distinct value of the column.
    grouped = kobe.groupby(feat)
    # One evenly spaced rainbow colour per group.
    palette = cm.rainbow(np.linspace(0, 1, len(grouped)))
    for (_, rows), colour in zip(grouped, palette):
        plt.scatter(rows.loc_x, rows.loc_y, color=colour, alpha=marker_alpha)


plt.figure(figsize=(20, 10))

# Draw the three shot-zone columns side by side in a 1 x 3 grid.
for position, column in enumerate(
        ('shot_zone_area', 'shot_zone_basic', 'shot_zone_range'), start=1):
    plt.subplot(1, 3, position)
    scatter_plot_by_category(column)
    plt.title(column)

输出：可以看到，这三列数据其实可以只保留一列

# Columns removed before training because they are judged uninformative for
# the prediction or duplicate information held in other columns.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_basic', 'shot_zone_range', 'matchup',
         'lon', 'lat', 'seconds_remaining', 'minutes_remaining', 'shot_distance', 'game_event_id', 'game_id',
         'game_date']
# Drop by column name. The `columns=` keyword is used because passing the axis
# positionally (`raw.drop(name, 1)`) is no longer accepted by pandas >= 2.0.
raw = raw.drop(columns=drops)
# pd.get_dummies() turns each listed column into several one-hot columns: one
# new column per distinct value, named '<column>_<value>'. The encoded source
# columns are removed and the new ones are appended after the remaining ones.
category_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'season', 'shot_zone_area']
raw = pd.get_dummies(raw, columns=category_vars)

输出：把原数据中对预测作用不大或者内容重复的列删除掉。把原数据中字符串为值的列，转换为one-hot的多列，便于算法学习。后续用sklearn中随机森林训练模型时，数据必须是数字型。

至此数据的整理已经完成，下面开始训练模型，目的是判断科比是否可以进球。

# Rows with a valid shot_made_flag become the training data;
# rows where it is NaN become the test data to be predicted.
train_kobe = raw[pd.notnull(raw.shot_made_flag)]
train_label = train_kobe.shot_made_flag
# `columns=` replaces the positional axis argument (`drop(name, 1)`), which is
# no longer accepted by pandas >= 2.0.
train_kobe = train_kobe.drop(columns='shot_made_flag')
test_kobe = raw[pd.isnull(raw.shot_made_flag)]
test_kobe = test_kobe.drop(columns='shot_made_flag')

输出：准备训练数据和测试数据，测试数据用缺失命中值的样本。

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import time

输出：导入用到的库。

# Grid search over max_depth (m) and n_estimators (n) of a random forest,
# scored by the mean log loss over a shuffled K-fold split of the training set.
min_score = 100000
best_m = 0
best_n = 0
scores = []
# Candidate values for both hyper-parameters: 1, 10, 100.
ranges = np.logspace(0, 2, num=3).astype(int)
kf = KFold(n_splits=10, shuffle=True)
# Average over the actual number of folds instead of a hard-coded 10.
n_folds = kf.get_n_splits()
for m in ranges:
    for n in ranges:
        print('the max depth: ', m)
        t1 = time.time()
        rfc_score = 0
        rfc = RandomForestClassifier(n_estimators=n, max_depth=m)
        for train_k, test_k in kf.split(train_kobe):
            rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
            # log_loss is defined on predicted probabilities; hard 0/1 labels
            # from predict() would be scored as fully confident predictions.
            proba = rfc.predict_proba(train_kobe.iloc[test_k])
            # Pass the class labels explicitly so the columns of `proba` are
            # matched even if a fold happens to contain a single class.
            rfc_score += log_loss(train_label.iloc[test_k], proba,
                                  labels=rfc.classes_) / n_folds
        # The list is named `scores`; `scores_m` was never defined.
        scores.append(rfc_score)
        if rfc_score < min_score:
            min_score = rfc_score
            best_m = m
            best_n = n
        t2 = time.time()
        print('Done processing {0} depth {1} trees ({2:.3f}sec)'.format(m, n, t2-t1))
print(best_m, best_n, min_score)

输出：用随机森林训练模型。简单起见，树的数量（n_estimators）和树的最大深度（max_depth）各只取了三个值（1、10、100）。

# Refit on all training rows using the best max_depth / n_estimators found.
rfc = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
rfc.fit(train_kobe, train_label)
# Keep the predictions: a bare expression is discarded when run as a script.
test_pred = rfc.predict(test_kobe)
print(test_pred)

输出：用上面选出的最优参数在全部训练数据上重新训练模型，再对测试数据进行预测

以上就是对科比数据集的简单的预处理、训练、预测的过程。

练习：科比数据集的处理和预测

猜你喜欢