思路比较简单:将某一列数据作为训练集,将label作为结果,直接训练一个决策树,然后根据决策树的分裂节点的阈值作为分箱的依据。
sklearn的决策树文档:https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
方法:
def decision_tree_binning(x_value: np.ndarray, y_value: np.ndarray, max_bin=10) -> list:
    """Find optimal binning edges for a 1-D feature using a decision tree.

    A DecisionTreeClassifier is fit on the single feature against the labels;
    the split thresholds of the tree's internal nodes become the bin edges.

    Args:
        x_value: 1-D feature array to be binned.
        y_value: class labels, same length as ``x_value``.
        max_bin: maximum number of bins (== max leaf nodes of the tree).

    Returns:
        Sorted list of bin edges (Python floats), bracketed by the feature's
        observed min and max.
    """
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(
        criterion='entropy',      # split by the information-entropy criterion
        max_leaf_nodes=max_bin,   # at most `max_bin` leaves -> at most `max_bin` bins
        min_samples_leaf=0.05)    # each leaf must hold >= 5% of the samples
    clf.fit(x_value.reshape(-1, 1), y_value)  # single-feature training set

    tree = clf.tree_
    # Internal (split) nodes are exactly those whose left and right children
    # differ; for leaves both child ids are -1, so the mask excludes them.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(float(t) for t in tree.threshold[is_split])

    # Bracket the thresholds with the observed feature range so every sample
    # falls inside some bin.  NOTE(review): pandas.cut treats the lowest edge
    # as exclusive by default — callers should pass include_lowest=True (or
    # pad the edges) so the sample at the minimum is not dropped.
    min_x = float(x_value.min())
    max_x = float(x_value.max())
    return [min_x] + boundary + [max_x]
示例代码
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
def decision_tree_binning(x_value: np.ndarray, y_value: np.ndarray, max_bin=10) -> list:
    """Find optimal binning edges for a 1-D feature using a decision tree.

    Fits a DecisionTreeClassifier on the single feature, plots the fitted
    tree (side effect: opens a matplotlib window), then returns the split
    thresholds of the internal nodes as bin edges.

    Args:
        x_value: 1-D feature array to be binned.
        y_value: class labels, same length as ``x_value``.
        max_bin: maximum number of bins (== max leaf nodes of the tree).

    Returns:
        Sorted list of bin edges (Python floats), bracketed by the feature's
        observed min and max.
    """
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(
        criterion='entropy',      # split by the information-entropy criterion
        max_leaf_nodes=max_bin,   # at most `max_bin` leaves -> at most `max_bin` bins
        min_samples_leaf=0.05)    # each leaf must hold >= 5% of the samples
    clf.fit(x_value.reshape(-1, 1), y_value)  # single-feature training set

    # Visualize the fitted tree so the split thresholds can be inspected.
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree
    plt.figure(figsize=(14, 12))  # figure size in inches
    plot_tree(clf)
    plt.show()

    tree = clf.tree_
    # Internal (split) nodes are exactly those whose left and right children
    # differ; for leaves both child ids are -1, so the mask excludes them.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(float(t) for t in tree.threshold[is_split])

    # Bracket the thresholds with the observed feature range so every sample
    # falls inside some bin.  NOTE(review): pandas.cut treats the lowest edge
    # as exclusive by default — callers should pass include_lowest=True (or
    # pad the edges) so the sample at the minimum is not dropped.
    min_x = float(x_value.min())
    max_x = float(x_value.max())
    return [min_x] + boundary + [max_x]
if __name__ == '__main__':
    # Synthetic 4-class dataset; bin its first feature with the tree binner.
    data_x, data_y = make_classification(n_samples=10000, n_classes=4, n_features=10,
                                         n_informative=8, random_state=0)
    bin_result = decision_tree_binning(data_x[:, 0], data_y, max_bin=6)
    # include_lowest=True: pd.cut intervals are open on the left by default,
    # which would assign code -1 (NaN) to the sample equal to the minimum
    # edge; including the lowest edge keeps every sample in a bin.  The
    # rightmost edge equals the feature max and is already included
    # (right=True), so no +0.1 padding of the max edge is needed.
    bin_value = pd.cut(data_x[:, 0], bin_result, include_lowest=True).codes  # bin index per sample
其中 bin_result 的结果是:[-7.098299649843083, -2.204209089279175, -0.8099622428417206, 0.4798355847597122, 1.7192054390907288, 2.9111276865005493, 7.604884316749503]
可以得到决策树的结点图: