# Record of the earlier code (记录下之前的代码).
# One import per line: the flattened export had fused several imports onto a
# single line, which is not valid Python.
import numpy as np
import pandas as pds
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training table; every later section works on a copy or a filtered
# view of this frame.
train = pds.read_csv("./train_V2/train_V2.csv")

# Quick look at the first rows and the column summary. head() is printed
# explicitly because a bare expression shows nothing outside a notebook.
print(train.head())
train.info()
# The Killers
# Summary of the "kills" column: mean, 99th percentile and maximum.
print("the average persion kill {:.4f} players, 99% people have {} kills or less, while the most kills ever recorded is {}.".format(train["kills"].mean(),train["kills"].quantile(0.99),train["kills"].max()))

# Plot how many players reached each kill count, folding everything above the
# 99th percentile into one "8+" bucket.
data = train.copy()
# Build the mask while the column is still numeric.
above_cap = data["kills"] > data["kills"].quantile(0.99)
data["kills"] = data["kills"].astype("str")
# Only the "kills" column is relabelled; the original assignment had no column
# selector and overwrote every column of the matching rows with "8+".
data.loc[above_cap, "kills"] = "8+"

plt.figure(figsize=(15, 8), dpi=80)
# Pass the series as x= explicitly; sorting the values first makes the bars
# appear in label order.
sns.countplot(x=data["kills"].sort_values())
plt.title("Kill Count", fontsize=15)
plt.show()
# Most players do not get a single kill -- do they at least deal damage?
data = train.copy()
data = data[data["kills"] == 0]

plt.figure(figsize=(15, 8), dpi=80)
plt.title("Damage Dealth by 0 kill", fontsize=15)
# NOTE(review): distplot is deprecated in newer seaborn releases; kept so the
# script still runs on the seaborn version it was written for.
sns.distplot(data["damageDealt"])
plt.show()

# Most players neither kill anyone nor deal any damage.
# Next: how many players finished with winPlacePerc == 1 without a kill, and
# what share of all players that is.
zero_kill_wins = len(data[data["winPlacePerc"] == 1])
print("{}人一人未杀吃鸡,占总人数的({:.4f}%)".format(zero_kill_wins, 100 * zero_kill_wins / len(train)))

# Same question for players who dealt no damage at all.
data1 = train[train["damageDealt"] == 0].copy()
zero_damage_wins = len(data1[data1["winPlacePerc"] == 1])
print("{}人一点伤害未打出杀吃鸡,占总人数的({:.4f}%)".format(zero_damage_wins, 100 * zero_damage_wins / len(train)))

# Joint distribution of placement and kills over the full table.
sns.jointplot(x="winPlacePerc", y="kills", data=train, height=10, ratio=3, color="r")
plt.show()
# Apparently killing correlates with winning. Group players by kill count
# (0 kills, 1-2 kills, 3-5 kills, 6-10 kills and 10+ kills) and compare the
# placement distribution of each group.
kills = train.copy()
# Bins are right-closed: (-1,0], (0,2], (2,5], (5,10], (10,inf).
# The last edge is open-ended so that no kill count falls outside the bins
# and becomes NaN (the previous fixed edge of 60 dropped anything above it).
# Labels now match the bin contents (3-5 and 6-10 rather than 2-5 and 5-10).
kills["killsCategory"] = pds.cut(
    kills["kills"],
    [-1, 0, 2, 5, 10, np.inf],
    labels=["0_kills", "1-2_kills", "3-5_kills", "6-10_kills", "10+kills"],
)

plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="killsCategory", y="winPlacePerc", data=kills)
plt.show()
# Runners
runner_data = train.copy()
# Mean, 99th percentile and maximum of the walked distance.
print("平均每人跑{}米,99%的人跑了{}米或少于这些,长跑冠军平均跑了{}米".format(runner_data["walkDistance"].mean(),runner_data["walkDistance"].quantile(0.99),runner_data["walkDistance"].max()))

# Drop the top 1% so the long tail does not flatten the distribution plot.
runner_data = runner_data[runner_data["walkDistance"] < train["walkDistance"].quantile(0.99)]
plt.figure(figsize=(15, 8), dpi=80)
plt.title("Walking Distince Distribution", fontsize=15)
# NOTE(review): distplot is deprecated in newer seaborn releases; kept so the
# script still runs on the seaborn version it was written for.
sns.distplot(runner_data["walkDistance"])
plt.show()

# Players who walked exactly 0 m. The count and the percentage now come from
# the same selection; previously the percentage was computed from data1 (the
# zero-damage subset from the kills section), which is a different population.
zero_walk = int((train["walkDistance"] == 0).sum())
print("{}玩家({:.4f}%) 跑了0米,也就意味着他们一步没动就死亡了".format(zero_walk, 100 * zero_walk / len(train)))

# Joint distribution of placement and walked distance over the full table.
sns.jointplot(x="winPlacePerc", y="walkDistance", data=train, height=10, ratio=3, color="lime")
plt.show()
# Drivers
driver_data = train.copy()
# Mean, 99th percentile and maximum of the distance travelled by vehicle.
print("每人平均使用载具运行了{}米,99%的人开了{}米,或少于这些,开车最远开了{}米。".format(driver_data["rideDistance"].mean(),driver_data["rideDistance"].quantile(0.99),driver_data["rideDistance"].max()))

# Drop the top 1% so the long tail does not flatten the distribution plot.
# driver_data stays filtered this way for the vehicle-destroy plot below.
driver_data = driver_data[driver_data["rideDistance"] < train["rideDistance"].quantile(0.99)]
plt.figure(figsize=(15, 8), dpi=80)
# NOTE(review): distplot is deprecated in newer seaborn releases; kept so the
# script still runs on the seaborn version it was written for.
sns.distplot(driver_data["rideDistance"])
plt.title("Ride Distance Distribution")
plt.show()

# Players with a ride distance of exactly 0. The count and the percentage now
# come from the same selection; previously the percentage was computed from
# data1 (the zero-damage subset), which is a different population.
zero_ride = int((train["rideDistance"] == 0).sum())
print("{} players dirve ({:.4f}%) for 0 meters, This means that they don't have a driving yet".format(zero_ride, 100 * zero_ride / len(train)))

# Joint distribution of placement and ride distance over the full table.
sns.jointplot(x="winPlacePerc", y="rideDistance", data=train, height=10, ratio=3, color="y")
plt.show()
# There is a small correlation between rideDistance and winPlacePerc.
# Next: vehicle destroys versus placement.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="vehicleDestroys", y="winPlacePerc", data=driver_data, alpha=0.8)
# Set the labels on the axes object. The previous `plt.xlabel = (...)` form
# rebound the pyplot function to a string instead of labelling the plot.
ax1.set_xlabel("Number of Vehicle Destroys")
ax1.set_ylabel("winPlacePerc")
plt.title("Vehicle Destroys/ Win Ratio", fontsize=20, color="blue")
plt.grid()
plt.show()
# Reading of the plot: the more vehicles a player destroys, the higher the
# chance of winning appears to be.
# Heals and boosts
heal_data = train.copy()
# Mean, 99th percentile and maximum for both item types.
print("average players use the {:.1f} heals, 99% players use {} heals or less, the doctor in the PUBG use {} heals ".format(heal_data["heals"].mean(),heal_data["heals"].quantile(0.99),heal_data["heals"].max()))
print("average players use the {:.1f} boost, 99% players use {} boost or less, the doctor in the PUBG use {} boost ".format(heal_data["boosts"].mean(),heal_data["boosts"].quantile(0.99),heal_data["boosts"].max()))

# Keep only rows below the 99th percentile of both columns.
heal_data = heal_data[heal_data["heals"] < train["heals"].quantile(0.99)]
heal_data = heal_data[heal_data["boosts"] < train["boosts"].quantile(0.99)]

# A single figure is enough: the extra plt.figure() call that used to precede
# plt.subplots() only produced an additional empty figure.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="heals", y="winPlacePerc", data=heal_data, color="lime", alpha=0.8, label="heals")
sns.pointplot(x="boosts", y="winPlacePerc", data=heal_data, color="blue", alpha=0.8, label="boosts")
# Set the labels on the axes object. The previous `plt.xlabel = (...)` form
# rebound the pyplot function to a string instead of labelling the plot.
ax1.set_xlabel("Number of heal/boost items")
ax1.set_ylabel("winPlacePerc")
# Coloured text stands in for a legend.
plt.text(4, 0.6, "Heals", color="lime")
plt.text(4, 0.55, "boosts", color="blue")
# These three calls were swallowed by an inline "#plt.legend()" comment when
# the statements were fused onto one line; they are restored as code.
plt.title("Heals vs Boosts")
plt.grid()
plt.show()
# Swimmers: bucket the swim distance and compare placement per bucket.
# (An earlier top-1% filter was removed: its result was overwritten by a fresh
# train.copy() before it was ever used.)
swim_data = train.copy()
# Bins are right-closed: (-1,0], (0,5], (5,20], (20,inf). The last edge is
# open-ended rather than a hard-coded maximum, so no distance falls outside
# the bins and becomes NaN.
swim_data["swimDistance"] = pds.cut(
    swim_data["swimDistance"],
    [-1, 0, 5, 20, np.inf],
    labels=["0m", "0-5m", "6-20m", "20+"],
)

plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="swimDistance", y="winPlacePerc", data=swim_data)
plt.show()
# Joint distributions of placement against heals and against boosts.
# jointplot creates its own figure, so no plt.figure() call precedes it (the
# earlier ones only left a blank extra figure behind).
sns.jointplot(x="winPlacePerc", y="heals", data=train, ratio=3, height=10, color="lime")
plt.show()

sns.jointplot(x="winPlacePerc", y="boosts", data=train, ratio=3, height=10, color="blue")
plt.show()
# Split the rows by the number of groups in the match: more than 50 groups is
# reported as solos, 26-50 as duos, 25 or fewer as squads.
solos = train[train["numGroups"] > 50]
duos = train[(train["numGroups"] > 25) & (train["numGroups"] <= 50)]
squads = train[train["numGroups"] <= 25]
print("{} players ({:.2f}%) play solos,{} players ({:.2f}%) play duos, {} players ({:.2f}%) play squads".format(len(solos),100*len(solos)/len(train),len(duos),100*len(duos)/len(train),len(squads),100*len(squads)/len(train)))

# Kills versus placement, one line per split, all on the same axes.
# A single figure is enough: the extra plt.figure() call that used to precede
# plt.subplots() only produced an additional empty figure.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="kills", y="winPlacePerc", data=solos, color="black", alpha=0.8)
sns.pointplot(x="kills", y="winPlacePerc", data=duos, color="blue", alpha=0.8)
sns.pointplot(x="kills", y="winPlacePerc", data=squads, color="yellow", alpha=0.8)
plt.title("winPlacePerc in solos vs duos vs squads", fontsize=15)
# Coloured text stands in for a legend.
plt.text(47, 0.6, "solos", color="black", fontsize=12)
plt.text(47, 0.55, "duos", color="blue", fontsize=12)
plt.text(47, 0.5, "squads", color="yellow", fontsize=12)
# Set the labels on the axes object. The previous `plt.ylabel = (...)` form
# rebound the pyplot function to a string instead of labelling the plot.
ax1.set_ylabel("Win Percentage")
ax1.set_xlabel("Number of kills")
plt.grid()
plt.show()
# Correlation matrix of the numeric columns, computed once and reused below.
# Non-numeric columns are dropped explicitly because DataFrame.corr() no
# longer skips them silently in recent pandas releases.
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr()

# Full correlation heatmap.
f, ax = plt.subplots(figsize=(15, 15), dpi=80)
sns.heatmap(corr_matrix, annot=True, linewidths=.5, fmt=".1f", ax=ax)
# The call parentheses were missing here, so the figure was never shown.
plt.show()

# Zoomed heatmap: the k columns with the largest correlation to winPlacePerc
# (the target itself is one of them).
k = 5
f, ax = plt.subplots(figsize=(15, 15))
cols = corr_matrix.nlargest(k, "winPlacePerc")["winPlacePerc"].index
# Slice the matrix already computed instead of recomputing with np.corrcoef:
# same Pearson coefficients, no second pass over the data, and missing values
# are excluded pairwise rather than turning whole rows of the result into NaN.
cm = corr_matrix.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt=".2f", annot_kws={"size": 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
# Reset seaborn styling to its defaults (the heatmap section changed the
# font scale).
sns.set()

# Pairwise scatter plots for the target and a hand-picked set of features.
cols = ["winPlacePerc", "walkDistance", "boosts", "weaponsAcquired", "damageDealt", "killPlace"]
data_new = train[cols]
# Only the first 10000 rows are plotted to keep the pair plot manageable.
data_new = data_new.iloc[:10000]
sns.pairplot(data_new, height=2.5)
plt.show()