# 0. Generate and split the data
from sklearn.datasets import make_gaussian_quantiles
from sklearn.model_selection import train_test_split
# Generate a 2-class, 2-feature Gaussian-quantiles dataset and split it.
# random_state is fixed on BOTH calls so the generated data and the split
# are reproducible across runs (the note below claims reproducibility, which
# only holds if the generator itself is seeded too).
x_data, y_data = make_gaussian_quantiles(n_samples=100, n_features=2, n_classes=2, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, random_state=1)
# print(x_train, x_test, y_train, y_test)
# print(x_data,y_data)
# random_state seeds the random number generator so repeated runs produce the
# same result — useful when tuning hyperparameters. Same convention below.
# 1. Build the weak classifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner: a decision stump (depth-1 tree). The import is placed here
# because this statement appears before the import block further down.
model = DecisionTreeClassifier(max_depth=1, random_state=1)
# 2. The AdaBoost function
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier
def MyAdaboost(x_train, y_train, x_test, y_test, M=20, weak_clf=None):
    """Hand-rolled AdaBoost for binary {0, 1} labels.

    Trains M copies of a weak classifier on re-weighted samples, combines
    them by a weighted vote, and prints per-round and final accuracies.

    Parameters:
        x_train, y_train: training features and {0, 1} labels.
        x_test, y_test:   test features and {0, 1} labels.
        M:                number of boosting rounds / weak learners.
        weak_clf:         estimator with fit(X, y, sample_weight=...) and
                          predict(X); defaults to a depth-1 decision stump.

    Returns:
        (train_accuracy, test_accuracy) as floats.
    """
    # Avoid a mutable default argument: a single shared estimator instance
    # would be re-fit across calls. Build a fresh stump per call instead.
    if weak_clf is None:
        weak_clf = DecisionTreeClassifier(max_depth=1)

    n_train, n_test = len(x_train), len(x_test)
    # Initialize sample weights uniformly (they stay normalized to sum 1).
    w = np.ones(n_train) / n_train
    # Running weighted votes of the combined classifier.
    pred_train = np.zeros(n_train)
    pred_test = np.zeros(n_test)

    for m in range(M):
        # Fit the weak learner on the current sample weights.
        weak_clf.fit(x_train, y_train, sample_weight=w)
        pred_train_i = weak_clf.predict(x_train)
        pred_test_i = weak_clf.predict(x_test)

        # Indicator vector: 1 where misclassified, 0 where correct.
        # (NumPy's != compares elementwise, unlike plain lists.)
        miss = [int(x) for x in (pred_train_i != y_train)]
        # %02d zero-pads the round number (1 -> 01) so the log aligns.
        print('weak_clf_%02d train acc:%.4f' % (m + 1, 1 - sum(miss) / n_train))

        # Weighted error rate = sum of weights of misclassified samples.
        # np.dot on 1-D arrays is the inner product (multiply then sum).
        err_m = np.dot(w, miss)
        # Guard: a perfect weak learner (err_m == 0) would make the log
        # below divide by zero; clamp to a tiny positive value.
        err_m = max(err_m, 1e-10)

        # Classifier weight: the more accurate the learner, the larger its
        # vote. math.log is the natural log (np.log works equally well).
        alpha = 0.5 * math.log((1 - err_m) / err_m)

        # Update the weight distribution: w <- w * exp(-alpha * y * f(x)).
        # yn_fx is -1 for misclassified samples (their weight grows) and
        # +1 for correct ones (their weight shrinks). It must be an
        # ndarray: a plain list cannot be multiplied by a float.
        yn_fx = np.array([-1 if x == 1 else 1 for x in miss])
        w = np.multiply(w, np.exp(-1 * alpha * yn_fx))
        w = w / sum(w)  # renormalize so the weights sum to 1

        # Combined classifier h(x) = sign(sum_m alpha_m * f_m(x)).
        # Map {0, 1} predictions to {-1, +1} and add this round's
        # alpha-weighted vote to the running totals.
        pred_train_i = np.array([p if p == 1 else -1 for p in pred_train_i])
        pred_test_i = np.array([p if p == 1 else -1 for p in pred_test_i])
        pred_train += np.multiply(alpha, pred_train_i)
        pred_test += np.multiply(alpha, pred_test_i)

    # Take the sign of the weighted vote and map back to {0, 1} labels.
    # (Equivalently: (pred_train > 0) * 1.)
    pred_train = np.array([1 if v > 0 else 0 for v in pred_train])
    pred_test = np.array([1 if v > 0 else 0 for v in pred_test])

    train_acc = sum(pred_train == y_train) / n_train
    test_acc = sum(pred_test == y_test) / n_test
    print(train_acc)  # training accuracy
    print(test_acc)   # test accuracy
    return train_acc, test_acc
# 3. Compare against sklearn's AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

if __name__ == '__main__':
    # Run the hand-rolled AdaBoost with the same stump and round count.
    MyAdaboost(x_train, y_train, x_test, y_test, M=20, weak_clf=model)
    # Reference implementation: sklearn's AdaBoost with the same weak learner.
    model2 = AdaBoostClassifier(model, n_estimators=20)
    model2.fit(x_train, y_train)
    z = model2.predict(x_test)
    # Print the precision/recall/F1 report for the sklearn model.
    print(classification_report(y_test, z))
# Sample output:
# weak_clf_01 train acc:0.7333
# weak_clf_02 train acc:0.5067
# weak_clf_03 train acc:0.6267
# weak_clf_04 train acc:0.5067
# weak_clf_05 train acc:0.6267
# weak_clf_06 train acc:0.5067
# weak_clf_07 train acc:0.5867
# weak_clf_08 train acc:0.5067
# weak_clf_09 train acc:0.7333
# weak_clf_10 train acc:0.5067
# weak_clf_11 train acc:0.6267
# weak_clf_12 train acc:0.5067
# weak_clf_13 train acc:0.6267
# weak_clf_14 train acc:0.5067
# weak_clf_15 train acc:0.5867
# weak_clf_16 train acc:0.5067
# weak_clf_17 train acc:0.7333
# weak_clf_18 train acc:0.5067
# weak_clf_19 train acc:0.6267
# weak_clf_20 train acc:0.5067
# 0.9866666666666667
# 1.0
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        13
#            1       1.00      1.00      1.00        12
#     accuracy                           1.00        25
#    macro avg       1.00      1.00      1.00        25
# weighted avg       1.00      1.00      1.00        25