概述
有关熵、条件熵、信息增益及信息增益比的概念可以在网上搜索或者在博客:决策树 Decision Tree 上查看
假设我们有一份CSV文件(以部分为例):car.csv
在读取数据之前,要先将csv文件处理成utf-8编码。一种方法是用“记事本”打开CSV文件,然后另存为时设置编码为utf-8保存即可。我们把最后一列视为标签,其余列视为特征计算其熵、条件熵、信息增益及信息增益比:
import numpy as np
import pandas as pd
import math
class InformationGain():
    """Entropy / conditional-entropy / information-gain statistics for
    continuous features against a discrete label.

    For each feature column the candidate thresholds are the midpoints of
    consecutive sorted values; the threshold giving the highest information
    gain is the one reported for that feature. Entropies use the natural
    logarithm, matching the original implementation.
    """

    def __init__(self, feature, label):
        """Compute the statistics for every feature column.

        feature: 2-D array-like, shape (n_samples, n_features), numeric.
        label:   sequence of hashable class labels, length n_samples.
        """
        feature = np.array(feature)
        num_of_feature = np.shape(feature)[1]
        num_of_label = len(label)
        label = list(label)  # ensure .count() is available

        # Shannon entropy of the label distribution; identical for every
        # feature, so compute it once.
        label_ent = self._entropy(label)

        shanno_ent = []
        condition_ent = []
        information_gain_list = []
        information_gain_ratio_list = []

        for i in range(num_of_feature):
            col = feature[:, i]
            sorted_col = sorted(col)
            # Candidate thresholds: midpoints of consecutive sorted values.
            thresholds = {
                (sorted_col[k - 1] + sorted_col[k]) / 2
                for k in range(1, len(sorted_col))
            }
            # A midpoint that collapses onto the column min/max (possible
            # when duplicates exist) cannot produce a useful split.
            thresholds.discard(float(max(col)))
            thresholds.discard(min(col))

            best_gain = 0
            # With no (useful) split the conditional entropy is simply the
            # label entropy; this is also the correct value when every
            # candidate threshold yields zero gain.
            best_condition_ent = label_ent
            for thre in thresholds:
                # BUGFIX: samples exactly equal to the threshold used to be
                # excluded from BOTH partitions ('<' and '>'); they now go
                # into the lower half so no sample is lost.
                lower = [label[s] for s in range(len(col)) if col[s] <= thre]
                higher = [label[s] for s in range(len(col)) if col[s] > thre]
                cond = (len(lower) / num_of_label * self._entropy(lower)
                        + len(higher) / num_of_label * self._entropy(higher))
                gain = label_ent - cond
                if gain > best_gain:
                    best_gain = gain
                    # BUGFIX: record the conditional entropy of the BEST
                    # threshold (the original kept whichever was tried last,
                    # so conditionEnt disagreed with InformationGain).
                    best_condition_ent = cond

            shanno_ent.append(label_ent)
            condition_ent.append(best_condition_ent)
            information_gain_list.append(best_gain)
            # Guard against a single-class label (zero entropy).
            information_gain_ratio_list.append(
                best_gain / label_ent if label_ent else 0)

        self.shannoEnt = label_ent               # entropy of the label
        self.conditionEnt = condition_ent        # per-feature conditional entropy
        self.InformationGain = information_gain_list          # per-feature gain
        self.InformationGainRatio = information_gain_ratio_list  # per-feature gain ratio

    @staticmethod
    def _entropy(values):
        """Shannon entropy (natural log) of a list of discrete values."""
        total = len(values)
        ent = 0
        for v in set(values):
            p = values.count(v) / total
            ent -= p * math.log(p)
        return ent

    def getEnt(self):
        """Return the entropy of the label."""
        return self.shannoEnt

    def getConditionEnt(self):
        """Return the conditional entropy of each feature (best threshold)."""
        return self.conditionEnt

    def getInformationGain(self):
        """Return the information gain of each feature."""
        return self.InformationGain

    def getInformationGainRatio(self):
        """Return the information-gain ratio of each feature."""
        return self.InformationGainRatio
def read_dataset(fname=u"/car.csv", columns=None):
    """Read a CSV dataset and integer-encode categorical columns.

    Each listed column is replaced by the index of its value in the
    column's order-of-first-appearance unique list (a simple label
    encoding). Missing values are filled with 0 before encoding.

    fname:   path to a UTF-8 encoded CSV file with a header row.
    columns: column names to encode; None (default) encodes every column
             (features and label alike).
    Returns the encoded pandas DataFrame.
    """
    # BUGFIX: the original call had a stray double comma
    # (`encoding="utf-8", , header=0`) — a SyntaxError.
    data = pd.read_csv(fname, encoding="utf-8", header=0, nrows=300)  # nrows limits rows while debugging
    data = data.fillna(0)
    if columns is None:
        # Original template required hand-filling the column list;
        # default to all columns instead.
        columns = list(data.columns)
    for col in columns:
        categories = data[col].unique().tolist()
        data[col] = data[col].apply(lambda n: categories.index(n))
    return data
# NOTE(review): template code — the "" placeholders below must be filled
# with real column names before this script can run.
train = read_dataset()
# Fill "" with the name of the label column
y = train[""].values.tolist()
X = train.drop([""], axis=1).values.tolist()
ig = InformationGain(X, y)
print(ig.getEnt()) # entropy of the label
print(ig.getConditionEnt()) # conditional entropy of each feature
print(ig.getInformationGain()) # information gain of each feature
print(ig.getInformationGainRatio()) # information-gain ratio of each feature
输出:
0.5004024235381879
[0.47402237066078146, 0.4974047292018396, 0.5003219957256428, 0.01700877327081982, 0.49501525883766634, 0.4849811557452448, 0.48958274254946765, 0.49965721932952717, 0.49965721932952717, 0.49890921637157837, 0.496769162198576, 0.49890921637157837, 0.49570780362218425, 0.49965721932952717, 0.5003025880631119, 0.4983744219282678, 0.49888633644322344, 0.5001635072186842, 0.4895266089201389, 0.4965707224581661, 0.4857229010143407, 0.5003025880631119, 0.4975942392452718, 0.49965721932952717, 0.49965721932952717]
[0.19328787090455724, 0.03982075923669709, 0.016307095137454486, 0.483393650267368, 0.483393650267368, 0.03869166516981126, 0.03440958975003933, 0.04275228841357964, 0.04193841009210453, 0.23089594240377442, 0.019548136350570233, 0.07962341285100738, 0.004694619916003617, 0.09934963217845783, 0.36824744710714663, 0.0020280016099200604, 0.32361566193699254, 0.03647188721007644, 0.010875814618048985, 0.0038317010800217877, 0.014679522523847188, 0.2609054406254804, 0.08540895349851685, 0.06672032313842502, 0.007831909930446801]
[0.3862648576677139, 0.07957747077869258, 0.032587961949009266, 0.9660098103631151, 0.9660098103631151, 0.07732109867940824, 0.06876383512841516, 0.08543581406199371, 0.0838093664606403, 0.46142051185760047, 0.03906483148573007, 0.15911875943368803, 0.009381689007038454, 0.19853947044458314, 0.7359026051540377, 0.004052741382786878, 0.6467108205607962, 0.07288511305000327, 0.021734136579814156, 0.007657239253417353, 0.0293354345089956, 0.5213912410349658, 0.17068053526722962, 0.13333333333333328, 0.015651223019804407]
最后
以上就是拼搏龙猫为你收集整理的决策树中熵、条件熵、信息增益及信息增益比的python实现的全部内容,希望文章能够帮你解决决策树中熵、条件熵、信息增益及信息增益比的python实现所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复