前言
下面使用随机森林算法对数据集进行分类，并统计分类精度与特征重要性。
代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 导入随机森林分类器
from sklearn.ensemble import RandomForestClassifier
# 自动进行训练集的划分
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
'''
version:1.0
author:mrx
Method:Random_Forest
'''
# Number of trees in each random forest.
trees = 200
# Path of the Excel workbook to read. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognized escape sequences
# (\p, \c, \d, \g), which newer Python versions warn about.
readFileName = r"python\python Data analysis and mining\class\dataset\german.xls"
def list_add(a, b):
    """Add each element of ``b`` onto the matching element of ``a`` in place.

    ``a`` is modified and nothing is returned. Both arguments must have the
    same length; this is checked with an ``assert``.
    """
    assert len(a) == len(b)
    for position, increment in enumerate(b):
        a[position] += increment
def list_div(a, num):
    """Divide every element of ``a`` by ``num`` in place.

    Returns the same ``a`` object that was passed in, after modification.
    """
    for position in range(len(a)):
        # Augmented assignment updates the existing slot of ``a``.
        a[position] /= num
    return a
# Read the Excel workbook. Every column except the last is used as a
# feature; the last column is used as the target passed to the classifier.
df = pd.read_excel(readFileName)
list_columns = list(df.columns[:-1])
# DataFrame.ix was removed in pandas 1.0. iloc does the purely positional
# slicing intended here (matching the positional df.columns[:-1] above).
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Feature names, used later as tick labels for the importance chart.
names = x.columns
# Running totals over all evaluated splits: summed test accuracy and summed
# per-feature importances (one slot per feature column).
acc_mean = 0
feature_mean = [0] * len(df.columns[:-1])
# Number of train/test splits to try; the aim is to find which split gives
# the highest test accuracy. (An earlier setting was n = 1000.)
n = 896
# Best and worst test accuracy seen so far, together with the split seed
# (random_state) that produced each one.
max_acc = 0
index_of_max = 0
min_acc = 1
index_of_min = 0
for i in range(n):
    # Split seeds are 1-based: iteration i uses random_state = i + 1.
    seed = i + 1
    print('*' * 150)
    print('第 %d 次 test' % seed)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
    # Original author's notes: random_state=896 was found by testing, and
    # n_estimators is the number of trees (100 was reported as sufficient).
    # The forest itself is created without a random_state.
    forest = RandomForestClassifier(n_estimators=trees)
    forest.fit(x_train, y_train)
    test_score = forest.score(x_test, y_test)
    print("random forest with %d trees:" % trees)
    train_score = forest.score(x_train, y_train)
    print("accuracy on the training subset:{:.3f}".format(train_score))
    print("accuracy on the test subset:{:.3f}".format(test_score))
    importances = forest.feature_importances_
    print('Feature importances:{}'.format(importances))
    # Ties update the record, so the latest seed with an equal score wins.
    if test_score >= max_acc:
        max_acc = test_score
        index_of_max = seed
    if test_score <= min_acc:
        min_acc = test_score
        index_of_min = seed
    acc_mean += test_score
    list_add(feature_mean, importances)
print('Final :')
print("avg accuracy on the test subset:{:.3f}".format(acc_mean / n))
# feature_mean holds the summed importances; it is not averaged or printed.
print('max_acc: %f' % max_acc)
print('index : %d' % index_of_max)
print('min_acc: %f' % min_acc)
print('index : %d' % index_of_min)
# Refit one forest on the split whose seed gave the highest test accuracy
# and chart its feature importances as a horizontal bar plot.
# NOTE: the forest is built without a random_state, so this refit is a new
# random forest on the same split rather than the exact model scored above.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=index_of_max
)
forest = RandomForestClassifier(n_estimators=trees)
forest.fit(x_train, y_train)
n_features = x.shape[1]
bar_positions = range(n_features)
plt.barh(bar_positions, forest.feature_importances_, align='center')
# One tick per feature, labelled with the feature column name.
tick_positions = np.arange(n_features)
plt.yticks(tick_positions, names)
plt.title("random forest with %d trees:" % trees)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()