思路比较简单:将某一列数据作为训练集,将label作为结果,直接训练一个决策树,然后根据决策树的分裂节点的阈值作为分箱的依据。
sklearn的决策树文档:https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
方法:
def decision_tree_binning(x_value: np.ndarray, y_value: np.ndarray, max_bin=10) -> list:
    """Find optimal binning edges for a 1-D feature using a decision tree.

    A DecisionTreeClassifier is fit on the single feature against the labels;
    the split thresholds of the tree's internal nodes become the bin edges.

    Args:
        x_value: 1-D feature array to be binned.
        y_value: class labels, same length as ``x_value``.
        max_bin: maximum number of bins (== max leaf nodes of the tree).

    Returns:
        Sorted list of bin edges (Python floats), bracketed by the feature's
        observed min and max.
    """
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(
        criterion='entropy',      # split by the information-entropy criterion
        max_leaf_nodes=max_bin,   # at most `max_bin` leaves -> at most `max_bin` bins
        min_samples_leaf=0.05)    # each leaf must hold >= 5% of the samples
    clf.fit(x_value.reshape(-1, 1), y_value)  # single-feature training set

    tree = clf.tree_
    # Internal (split) nodes are exactly those whose left and right children
    # differ; for leaves both child ids are -1, so the mask excludes them.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(float(t) for t in tree.threshold[is_split])

    # Bracket the thresholds with the observed feature range so every sample
    # falls inside some bin.  NOTE(review): pandas.cut treats the lowest edge
    # as exclusive by default — callers should pass include_lowest=True (or
    # pad the edges) so the sample at the minimum is not dropped.
    min_x = float(x_value.min())
    max_x = float(x_value.max())
    return [min_x] + boundary + [max_x]
示例代码
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
def decision_tree_binning(x_value: np.ndarray, y_value: np.ndarray, max_bin=10) -> list:
    """Find optimal binning edges for a 1-D feature using a decision tree.

    Fits a DecisionTreeClassifier on the single feature, plots the fitted
    tree (side effect: opens a matplotlib window), then returns the split
    thresholds of the internal nodes as bin edges.

    Args:
        x_value: 1-D feature array to be binned.
        y_value: class labels, same length as ``x_value``.
        max_bin: maximum number of bins (== max leaf nodes of the tree).

    Returns:
        Sorted list of bin edges (Python floats), bracketed by the feature's
        observed min and max.
    """
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(
        criterion='entropy',      # split by the information-entropy criterion
        max_leaf_nodes=max_bin,   # at most `max_bin` leaves -> at most `max_bin` bins
        min_samples_leaf=0.05)    # each leaf must hold >= 5% of the samples
    clf.fit(x_value.reshape(-1, 1), y_value)  # single-feature training set

    # Visualize the fitted tree so the split thresholds can be inspected.
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree
    plt.figure(figsize=(14, 12))  # figure size in inches
    plot_tree(clf)
    plt.show()

    tree = clf.tree_
    # Internal (split) nodes are exactly those whose left and right children
    # differ; for leaves both child ids are -1, so the mask excludes them.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(float(t) for t in tree.threshold[is_split])

    # Bracket the thresholds with the observed feature range so every sample
    # falls inside some bin.  NOTE(review): pandas.cut treats the lowest edge
    # as exclusive by default — callers should pass include_lowest=True (or
    # pad the edges) so the sample at the minimum is not dropped.
    min_x = float(x_value.min())
    max_x = float(x_value.max())
    return [min_x] + boundary + [max_x]
if __name__ == '__main__':
    # Synthetic 4-class dataset; bin its first feature with the tree binner.
    data_x, data_y = make_classification(n_samples=10000, n_classes=4, n_features=10,
                                         n_informative=8, random_state=0)
    bin_result = decision_tree_binning(data_x[:, 0], data_y, max_bin=6)
    # include_lowest=True: pd.cut intervals are open on the left by default,
    # which would assign code -1 (NaN) to the sample equal to the minimum
    # edge; including the lowest edge keeps every sample in a bin.  The
    # rightmost edge equals the feature max and is already included
    # (right=True), so no +0.1 padding of the max edge is needed.
    bin_value = pd.cut(data_x[:, 0], bin_result, include_lowest=True).codes  # bin index per sample
其中 bin_result 的结果是:[-7.098299649843083, -2.204209089279175, -0.8099622428417206, 0.4798355847597122, 1.7192054390907288, 2.9111276865005493, 7.604884316749503]
可以得到决策树的结点图: