概述
有关熵、条件熵、信息增益及信息增益比的概念可以在网上搜索或者在博客:决策树 Decision Tree 上查看
假设我们有一份CSV文件(以部分为例):car.csv
在读取数据之前,要先将csv文件处理成utf-8编码。一种方法是用“记事本”打开CSV文件,然后另存为时设置编码为utf-8保存即可。我们把最后一列视为标签,其余列视为特征计算其熵、条件熵、信息增益及信息增益比:
import numpy as np
import pandas as pd
import math
class InformationGain():
    """Entropy / conditional-entropy / information-gain statistics for
    continuous features against a discrete label.

    For each feature column the candidate thresholds are the midpoints of
    consecutive sorted values; the threshold giving the highest information
    gain is the one reported for that feature. Entropies use the natural
    logarithm, matching the original implementation.
    """

    def __init__(self, feature, label):
        """Compute the statistics for every feature column.

        feature: 2-D array-like, shape (n_samples, n_features), numeric.
        label:   sequence of hashable class labels, length n_samples.
        """
        feature = np.array(feature)
        num_of_feature = np.shape(feature)[1]
        num_of_label = len(label)
        label = list(label)  # ensure .count() is available

        # Shannon entropy of the label distribution; identical for every
        # feature, so compute it once.
        label_ent = self._entropy(label)

        shanno_ent = []
        condition_ent = []
        information_gain_list = []
        information_gain_ratio_list = []

        for i in range(num_of_feature):
            col = feature[:, i]
            sorted_col = sorted(col)
            # Candidate thresholds: midpoints of consecutive sorted values.
            thresholds = {
                (sorted_col[k - 1] + sorted_col[k]) / 2
                for k in range(1, len(sorted_col))
            }
            # A midpoint that collapses onto the column min/max (possible
            # when duplicates exist) cannot produce a useful split.
            thresholds.discard(float(max(col)))
            thresholds.discard(min(col))

            best_gain = 0
            # With no (useful) split the conditional entropy is simply the
            # label entropy; this is also the correct value when every
            # candidate threshold yields zero gain.
            best_condition_ent = label_ent
            for thre in thresholds:
                # BUGFIX: samples exactly equal to the threshold used to be
                # excluded from BOTH partitions ('<' and '>'); they now go
                # into the lower half so no sample is lost.
                lower = [label[s] for s in range(len(col)) if col[s] <= thre]
                higher = [label[s] for s in range(len(col)) if col[s] > thre]
                cond = (len(lower) / num_of_label * self._entropy(lower)
                        + len(higher) / num_of_label * self._entropy(higher))
                gain = label_ent - cond
                if gain > best_gain:
                    best_gain = gain
                    # BUGFIX: record the conditional entropy of the BEST
                    # threshold (the original kept whichever was tried last,
                    # so conditionEnt disagreed with InformationGain).
                    best_condition_ent = cond

            shanno_ent.append(label_ent)
            condition_ent.append(best_condition_ent)
            information_gain_list.append(best_gain)
            # Guard against a single-class label (zero entropy).
            information_gain_ratio_list.append(
                best_gain / label_ent if label_ent else 0)

        self.shannoEnt = label_ent               # entropy of the label
        self.conditionEnt = condition_ent        # per-feature conditional entropy
        self.InformationGain = information_gain_list          # per-feature gain
        self.InformationGainRatio = information_gain_ratio_list  # per-feature gain ratio

    @staticmethod
    def _entropy(values):
        """Shannon entropy (natural log) of a list of discrete values."""
        total = len(values)
        ent = 0
        for v in set(values):
            p = values.count(v) / total
            ent -= p * math.log(p)
        return ent

    def getEnt(self):
        """Return the entropy of the label."""
        return self.shannoEnt

    def getConditionEnt(self):
        """Return the conditional entropy of each feature (best threshold)."""
        return self.conditionEnt

    def getInformationGain(self):
        """Return the information gain of each feature."""
        return self.InformationGain

    def getInformationGainRatio(self):
        """Return the information-gain ratio of each feature."""
        return self.InformationGainRatio
def read_dataset(fname=u"/car.csv", columns=None):
    """Read a CSV dataset and integer-encode categorical columns.

    Each listed column is replaced by the index of its value in the
    column's order-of-first-appearance unique list (a simple label
    encoding). Missing values are filled with 0 before encoding.

    fname:   path to a UTF-8 encoded CSV file with a header row.
    columns: column names to encode; None (default) encodes every column
             (features and label alike).
    Returns the encoded pandas DataFrame.
    """
    # BUGFIX: the original call had a stray double comma
    # (`encoding="utf-8", , header=0`) — a SyntaxError.
    data = pd.read_csv(fname, encoding="utf-8", header=0, nrows=300)  # nrows limits rows while debugging
    data = data.fillna(0)
    if columns is None:
        # Original template required hand-filling the column list;
        # default to all columns instead.
        columns = list(data.columns)
    for col in columns:
        categories = data[col].unique().tolist()
        data[col] = data[col].apply(lambda n: categories.index(n))
    return data
# NOTE(review): template code — the "" placeholders below must be filled
# with real column names before this script can run.
train = read_dataset()
# Fill "" with the name of the label column
y = train[""].values.tolist()
X = train.drop([""], axis=1).values.tolist()
ig = InformationGain(X, y)
print(ig.getEnt()) # entropy of the label
print(ig.getConditionEnt()) # conditional entropy of each feature
print(ig.getInformationGain()) # information gain of each feature
print(ig.getInformationGainRatio()) # information-gain ratio of each feature
输出:
0.5004024235381879
[0.47402237066078146, 0.4974047292018396, 0.5003219957256428, 0.01700877327081982, 0.49501525883766634, 0.4849811557452448, 0.48958274254946765, 0.49965721932952717, 0.49965721932952717, 0.49890921637157837, 0.496769162198576, 0.49890921637157837, 0.49570780362218425, 0.49965721932952717, 0.5003025880631119, 0.4983744219282678, 0.49888633644322344, 0.5001635072186842, 0.4895266089201389, 0.4965707224581661, 0.4857229010143407, 0.5003025880631119, 0.4975942392452718, 0.49965721932952717, 0.49965721932952717]
[0.19328787090455724, 0.03982075923669709, 0.016307095137454486, 0.483393650267368, 0.483393650267368, 0.03869166516981126, 0.03440958975003933, 0.04275228841357964, 0.04193841009210453, 0.23089594240377442, 0.019548136350570233, 0.07962341285100738, 0.004694619916003617, 0.09934963217845783, 0.36824744710714663, 0.0020280016099200604, 0.32361566193699254, 0.03647188721007644, 0.010875814618048985, 0.0038317010800217877, 0.014679522523847188, 0.2609054406254804, 0.08540895349851685, 0.06672032313842502, 0.007831909930446801]
[0.3862648576677139, 0.07957747077869258, 0.032587961949009266, 0.9660098103631151, 0.9660098103631151, 0.07732109867940824, 0.06876383512841516, 0.08543581406199371, 0.0838093664606403, 0.46142051185760047, 0.03906483148573007, 0.15911875943368803, 0.009381689007038454, 0.19853947044458314, 0.7359026051540377, 0.004052741382786878, 0.6467108205607962, 0.07288511305000327, 0.021734136579814156, 0.007657239253417353, 0.0293354345089956, 0.5213912410349658, 0.17068053526722962, 0.13333333333333328, 0.015651223019804407]
最后
以上就是拼搏龙猫为你收集整理的决策树中熵、条件熵、信息增益及信息增益比的python实现的全部内容,希望文章能够帮你解决决策树中熵、条件熵、信息增益及信息增益比的python实现所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复