本文在adult数据集上进行了实验,运用14个基本属性预测工资状况是否高于50K。采用了随机森林和支持向量机的方法进行实验。对原始数据进行了异常值处理和属性数字化的预处理操作,得到实验数据。实验1对数据进行十折交叉验证取十次结果平均值作为最终结果,度量方法为准确率,分别在两种模型上做对比实验。实验2采用两次留出法划分了训练集、验证集、测试集,以验证集为数据、以准确率作为标准调整模型参数,结果分别列出了两种模型下的查全率、查准率和F1值。经过两次实验验证,在adult数据集上,随机森林较支持向量机有更好的分类效果。
1. Adult数据集预处理(数据预处理见实验2:dealdata函数)
1) 去除缺失值:原始数据中包含缺失数据,需进行清理,采用删除条目的方式去除缺失值。原始数据共计32561条,清理后为30162条。正例共计7508条,反例22654条。
2)属性数字化:原始文件描述属性时使用文字形式,需将文字转化成数字作为模型输入。
2. 实验1:十折交叉方法及准确率度量
十折交叉验证是常用的测试方法。将数据集分成十份,轮流将其中9份作为训练数据,1份作为测试数据,进行试验。
每次试验都会得出相应的正确率(或差错率)。将10次结果的正确率(或差错率)的平均值作为对算法精度的估计。
# Experiment 1: 10-fold cross-validation on the preprocessed Adult dataset,
# comparing Random Forest and SVM; the metric is per-fold accuracy and the
# mean accuracy over the 10 folds.
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import warnings

warnings.filterwarnings("ignore")

# Load the cleaned + digitized data produced by dealdata() in experiment 2.
adult_digitization = pd.read_csv("data_cleaned.csv")

# 14 feature columns and the income label (<=50K / >50K).
# NOTE: 'apital_loss' is a typo for 'capital_loss', but it matches the column
# name written by the preprocessing step, so it must be kept as-is.
X = adult_digitization[[
    'age', 'workclass', 'fnlwgt', 'education', 'education_number',
    'marriage', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'apital_loss', 'hours_per_week', 'native_country']]
Y = adult_digitization[['income']]

preaccrf = []   # per-fold accuracies, random forest
preaccsvm = []  # per-fold accuracies, SVM
kf = KFold(n_splits=10)

# Random forest: one fit/evaluate per fold.
for num, (train, test) in enumerate(kf.split(X), start=1):
    # kf.split yields positional indices, so index with iloc (not loc).
    X_train, X_test = X.iloc[train], X.iloc[test]
    Y_train, Y_test = Y.iloc[train], Y.iloc[test]
    rf = RandomForestClassifier(oob_score=False, random_state=10,
                                criterion='entropy', n_estimators=400)
    rf.fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, rf.predict(X_test))
    preaccrf.append(accuracy)
    print("随机森林" + str(num) + "测试集准确率: %s " % accuracy)

# SVM (RBF kernel, C=1): same 10 folds.
for num, (train, test) in enumerate(kf.split(X), start=1):
    X_train, X_test = X.iloc[train], X.iloc[test]
    Y_train, Y_test = Y.iloc[train], Y.iloc[test]
    clf = svm.SVC(kernel='rbf', C=1)
    clf.fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, clf.predict(X_test))
    preaccsvm.append(accuracy)
    print("支持向量机" + str(num) + "测试集准确率: %s " % accuracy)

print("随机森林十折交叉平均准确率: %s " % np.mean(np.array(preaccrf)))
# BUG FIX: the original printed the random-forest mean (preaccrf) here,
# so the reported SVM average was wrong.
print("支持向量机十折交叉平均准确率: %s " % np.mean(np.array(preaccsvm)))
3. 实验2:留出法及查全查准度量
为了调整模型参数以得到分类的最优模型,实验2采用两次留出法构造出比例为7:2:1的训练集、验证集和测试集。训练集用于训练分类模型,在验证集上运行后比较不同参数下的准确度,选择最优参数作为实验结果。最终评价测试集的准确程度使用查全率查准率及F1值。
# Experiment 2: two successive hold-out splits (~7:2:1 train/validation/test)
# of the Adult dataset; validation accuracy drives parameter tuning, and the
# test set is scored with precision/recall/F1.
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings

# sklearn.externals.joblib was removed in scikit-learn 0.23; fall back to the
# standalone joblib package (the same library sklearn used to vendor).
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

warnings.filterwarnings("ignore")


def dealdata(filename):
    """Preprocess the raw Adult data and write the train/validation/test
    splits to CSV files.

    Steps: drop rows with missing values, map categorical strings to
    integer codes, then split 70/21/9 (two hold-out splits).
    """
    # BUG FIX: the Adult data marks missing values with '?', which pandas
    # does not treat as NaN by default, so dropna() removed nothing.
    # skipinitialspace strips the blank that follows each comma in the raw
    # file so '?' (not ' ?') matches.  After this fix the row count goes
    # from 32561 to 30162, matching the figures quoted in the write-up.
    adult_raw = pd.read_csv(filename, header=None,
                            skipinitialspace=True, na_values='?')

    # Attach column names (index 14 is the income label).
    adult_raw.rename(columns={0: 'age', 1: 'workclass', 2: 'fnlwgt',
                              3: 'education', 4: 'education_number',
                              5: 'marriage', 6: 'occupation',
                              7: 'relationship', 8: 'race', 9: 'sex',
                              10: 'capital_gain', 11: 'apital_loss',
                              12: 'hours_per_week', 13: 'native_country',
                              14: 'income'}, inplace=True)

    # Drop entries with missing values (delete-row strategy).
    adult_cleaned = adult_raw.dropna()

    # Digitize categorical attributes: each unique string value becomes its
    # index in the sorted unique list; numeric columns are copied through.
    adult_digitization = pd.DataFrame()
    target_columns = ['workclass', 'education', 'marriage', 'occupation',
                      'relationship', 'race', 'sex', 'native_country',
                      'income']
    for column in adult_cleaned.columns:
        if column in target_columns:
            unique_value = list(enumerate(np.unique(adult_cleaned[column])))
            dict_data = {key: value for value, key in unique_value}
            adult_digitization[column] = adult_cleaned[column].map(dict_data)
        else:
            adult_digitization[column] = adult_cleaned[column]

    # Features and label.  NOTE: 'apital_loss' is a typo for 'capital_loss'
    # but is kept because the experiment-1 script reads this exact name.
    X = adult_digitization[[
        'age', 'workclass', 'fnlwgt', 'education', 'education_number',
        'marriage', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'apital_loss', 'hours_per_week', 'native_country']]
    Y = adult_digitization[['income']]

    # First split: 70% train / 30% rest.  Second split: the 30% is divided
    # 70/30 into validation (21%) and test (9%) -- roughly 7:2:1 overall.
    X_train, X_t_v, Y_train, Y_t_v = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_t_v, Y_t_v, test_size=0.3, random_state=0)

    X_train.to_csv("X_train.csv", index=None)
    X_validation.to_csv("X_validation.csv", index=None)
    X_test.to_csv("X_test.csv", index=None)
    Y_train.to_csv("Y_train.csv", index=None)
    Y_validation.to_csv("Y_validation.csv", index=None)
    Y_test.to_csv("Y_test.csv", index=None)


def randomforestmodel():
    """Train the random forest with the tuned parameters and persist it.

    Uses the module-level X_train / Y_train loaded below.
    """
    rf = RandomForestClassifier(oob_score=False, random_state=10,
                                criterion='entropy', n_estimators=400)
    rf.fit(X_train, Y_train['income'])
    joblib.dump(rf, "rf.m")
    # Tuning record (validation-set accuracy), kept from the original run:
    # criterion: gini 0.835333, entropy 0.840069
    # n_estimators, gini:    10:0.835333 20:0.841806 30:0.842911
    #                        40:0.842438 50:0.845279 100:0.848437
    #                        150:0.848911 200:0.849069 300:0.848279 400:0.848437
    # n_estimators, entropy: 10:0.840069 20:0.842753 30:0.843701
    #                        40:0.844332 50:0.846227 100:0.848121
    #                        150:0.848753 200:0.849858 300:0.849542 400:0.851595


def svmmodel():
    """Grid-search C for a linear-kernel SVM, persisting each fitted model
    and printing validation accuracy for each C."""
    for C in range(1, 10, 1):
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(X_train, Y_train)
        joblib.dump(clf, "linear" + str(C) + "svm.m")
        validation_predictions = clf.predict(X_validation)
        print("C=" + str(C) + ": 验证集准确率: %s "
              % accuracy_score(Y_validation, validation_predictions))
    # Tuning record: kernel rbf 0.743132, linear 0.775024;
    # C=2 gave 0.743290, all other C in 1..9 gave 0.743132.


# Build the splits, reload them as module-level frames, then train/evaluate.
dealdata("adultdata.csv")
X_train = pd.read_csv('X_train.csv')
X_validation = pd.read_csv("X_validation.csv")
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('Y_train.csv')
Y_validation = pd.read_csv("Y_validation.csv")
Y_test = pd.read_csv('Y_test.csv')
# Class balance: train 0:15890/1:5223, validation 0:4709/1:1625,
# test 0:2055/1:660.

# Random forest: train, persist, then score on the test set.
# randomforestmodel()
# rf = joblib.load("rf.m")
# Y_predictions1 = rf.predict(X_test)
# print(classification_report(Y_test, Y_predictions1))
# print(confusion_matrix(Y_test, Y_predictions1))

# SVM: train, persist, then score on the test set.
svmmodel()
# rbf = joblib.load("rbfsvm.m")
# Y_predictions2 = rbf.predict(X_test)
# print(classification_report(Y_test, Y_predictions2))
# print(confusion_matrix(Y_test, Y_predictions2))

# Reported results (from the original run):
# Random forest:  class 0 P=0.89 R=0.93 F1=0.91 (n=2055)
#                 class 1 P=0.73 R=0.63 F1=0.68 (n=660)   avg 0.85
#                 confusion matrix [[1904 151] [246 414]]
# SVM:            class 0 P=0.76 R=1.00 F1=0.86
#                 class 1 P=1.00 R=0.01 F1=0.01           avg 0.66
#                 confusion matrix [[2055 0] [656 4]]
第一次写博客,肯定有很多不足,希望大家多多指正。