import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
# Data loading: locations of the feature file and the label file.
# NOTE(review): both are absolute paths on one user's Windows desktop --
# presumably they must be adjusted before running anywhere else.
features_path = 'C:\\Users\\zou\\Desktop\\features.dat'
label_path = 'C:\\Users\\zou\\Desktop\\labels.dat'
def data_load(path):
    """Load a comma-separated numeric text file into a float array.

    Each non-blank line is split on ``,`` and every field is converted
    with ``float``.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` built from the list of parsed rows: 2-D
        ``(n_rows, n_columns)`` when all rows have the same length, and an
        empty 1-D array when the file has no data lines.

    Raises:
        ValueError: If a field cannot be parsed as a float.
        OSError: If the file cannot be opened.
    """
    rows = []
    # The context manager closes the handle even when a row fails to parse;
    # previously the file object was never closed.
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at end of file)
                # hold no data and would otherwise crash on float('').
                continue
            rows.append([float(field) for field in line.split(',')])
    return np.array(rows)
# Both arrays are truncated to their first 2556 rows.
# NOTE(review): 2556 is presumably the number of labelled samples -- confirm.
data_x = data_load(features_path)[:2556]
label = data_load(label_path)[:2556]

# Inspect column dtypes and non-null counts to spot missing values.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
temp_data = pd.DataFrame(data_x)
temp_data.info()

# Check class balance: ratio of samples labelled 1 to all remaining samples.
# Count the matches directly instead of summing the matching label values.
label_1 = np.sum(label == 1)
label_0 = len(label) - label_1
print(label_1 / label_0)
# Model wrapper class
class Model(object):
    """Hold one train/test split and fit one of several classifiers on it.

    Every public method fits a fresh model on ``train_x`` / ``train_y`` and
    returns the predicted class labels for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
        test_x: Feature matrix to predict on.
    """

    def __init__(self, train_x, train_y, test_x):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x

    def svm(self):
        """Fit an RBF-kernel SVC and return its predictions for ``test_x``."""
        # Inside a method body the bare name `svm` resolves to the
        # module-level `sklearn.svm` import, not to this method.
        clf = svm.SVC(C=0.8, kernel='rbf', gamma=20)
        clf.fit(self.train_x, self.train_y)
        return clf.predict(self.test_x)

    def rfc(self):
        """Fit a random forest and return its predictions for ``test_x``."""
        rfc_model = RandomForestClassifier(
            n_estimators=30,
            max_features=10,
            max_depth=10,
            min_samples_split=20,
        )
        rfc_model.fit(self.train_x, self.train_y)
        return rfc_model.predict(self.test_x)

    def xgboost(self):
        """Fit xgboost with the fixed sampling ratios 0.75 / 0.75.

        Returns:
            ``int64`` array of 0/1 predictions for ``test_x``.
        """
        return self._xgb_predict(subsample=0.75, colsample_bytree=0.75)

    def xgboost_3d(self, subsample=0.1, colsample_bytree=0.1):
        """Fit xgboost with caller-supplied sampling ratios.

        Args:
            subsample: Value passed as the xgboost ``subsample`` parameter.
            colsample_bytree: Value passed as the xgboost
                ``colsample_bytree`` parameter.

        Returns:
            ``int64`` array of 0/1 predictions for ``test_x``.
        """
        return self._xgb_predict(subsample, colsample_bytree)

    def _xgb_predict(self, subsample, colsample_bytree):
        """Train a gbtree booster for 500 rounds and threshold its output.

        Shared implementation behind ``xgboost`` and ``xgboost_3d``, which
        previously duplicated this code and differed only in the two
        sampling parameters.
        """
        dtrain = xgb.DMatrix(self.train_x, label=self.train_y)
        dtest = xgb.DMatrix(self.test_x)
        # NOTE(review): `silent` is presumably deprecated in favour of
        # `verbosity` in newer xgboost releases -- confirm against the
        # installed version before changing it.
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'mae',
            'max_depth': 8,
            'lambda': 5,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'min_child_weight': 2,
            'eta': 0.025,
            'silent': 1,
        }
        bst = xgb.train(params, dtrain, num_boost_round=500)
        # The booster output is cut at 0.5 to obtain hard 0/1 labels.
        scores = bst.predict(dtest)
        return np.int64(scores >= 0.5)
# Five-fold cross-validation of the models.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)

# Running sums of the per-fold AUC; each is divided by n_splits afterwards.
svm_auc = 0
rfc_auc = 0
xgb_auc = 0

for train_index, test_index in kf.split(data_x):
    train_x, test_x = data_x[train_index], data_x[test_index]
    train_y, test_y = label[train_index], label[test_index]
    # Flatten the label arrays to 1-D before fitting and scoring.
    train_y = train_y.reshape(train_y.shape[0],)
    test_y = test_y.reshape(test_y.shape[0],)

    model = Model(train_x, train_y, test_x)
    # To evaluate the other models, call model.rfc() / model.xgboost() here
    # and accumulate their scores into rfc_auc / xgb_auc the same way.
    svm_predict = model.svm()

    # NOTE: the AUC is computed from hard class predictions, not from
    # continuous decision scores.
    fold_auc = roc_auc_score(test_y, svm_predict)
    # Fraction of correct predictions; the earlier np.sum printed a raw
    # count under the "accuracy" label.
    print('accuracy:', np.mean(test_y == svm_predict))
    print('auc:', fold_auc)
    # Accumulate instead of overwriting, so the division below yields the
    # mean over all folds rather than (last fold's AUC) / 5.
    svm_auc += fold_auc

svm_auc = svm_auc / n_splits
print('svm_auc:%f' % svm_auc)
# Disabled code kept as a bare triple-quoted string: the string is evaluated
# as a no-op expression, so nothing inside it runs.  It holds two snippets:
#   1. a grid search over the xgboost `subsample` / `colsample_bytree`
#      values (10 x 10 points from np.linspace(0.01, 1, 10)), scoring each
#      pair through `kf` and `Model.xgboost_3d` and plotting the resulting
#      AUC values as a 3-D surface;
#   2. a stand-alone matplotlib surface demo plotting sin(sqrt(X**2 + Y**2)).
# NOTE(review): in snippet 1 the AUC rows presumably follow `subsample`
# (outer loop) while np.meshgrid's default indexing lays its first argument
# along columns -- the surface may need transposing if re-enabled; confirm.
# NOTE(review): `fig.gca(projection='3d')` is presumably rejected by recent
# matplotlib releases -- verify against the installed version before
# re-enabling.
'''
#网格搜索法
fig = plt.figure()
ax = fig.gca(projection='3d')
subsample_list = np.linspace(0.01,1,10)
colsample_bytree_list = np.linspace(0.01,1,10)
#auc = np.array(pd.read_csv('result.csv',header = None,index_col = False).iloc[:,1:])
auc = []
for i in subsample_list:
auc_1 = []
for j in colsample_bytree_list:
xgb_auc = 0
for train_index,test_index in kf.split(data_x):
train_x,test_x = data_x[train_index],data_x[test_index]
train_y,test_y = label[train_index],label[test_index]
train_y = train_y.reshape(train_y.shape[0],)
test_y = test_y.reshape(test_y.shape[0],)
model = Model(train_x,train_y,test_x)
xgb_predict = model.xgboost_3d(i,j)
xgb_auc += roc_auc_score(test_y,xgb_predict)
xgb_auc = xgb_auc/5
auc_1.append(xgb_auc)
auc.append(auc_1)
print('i =',i)
subsample_list,colsample_bytree_list = np.meshgrid(subsample_list,colsample_bytree_list)
xgb_auc_result = np.array(auc)
surf = ax.plot_surface(subsample_list,colsample_bytree_list,xgb_auc_result,cmap=cm.coolwarm,
linewidth=0, antialiased=False)
ax.set_zlim(0.2, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
import numpy as np
fig = plt.figure()
ax = fig.gca(projection='3d')
# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)
# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
linewidth=0, antialiased=False)
# Customize the z axis.
ax.set_zlim(-1.01, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
'''
机器学习——简单的数据处理
猜你喜欢
转载自blog.csdn.net/weixin_41908529/article/details/81341284
今日推荐
周排行