《统计学习方法》学习笔记（6）-- 决策树-附代码（sklearn）

100 阅读 0 评论 66 点赞

我是靠谱客的博主淡定面包，这篇文章主要介绍《统计学习方法》学习笔记（6）-- 决策树-附代码（sklearn），现在分享给大家，希望可以做个参考。

决策树，特征选择的三个准则：信息增益（ID3），信息增益比（C4.5），基尼系数（CART）。决策树的生成，决策树的剪枝。
这里写图片描述

这里写图片描述

代码：
数据：decision tree.csv

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
RID,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,middle_aged,high,yes,fair,yes
14,senior,medium,no,excellent,no

代码：

复制代码

# coding=utf-8
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO

# Read in the csv file and put features into list of dict and list of class label
allElectronicsData = open(r'decision_tree.csv', 'rb')
reader = csv.reader(allElectronicsData) # 按行读取内容
headers = reader.next() # 内容的第一行
#
# print(headers)

featureList = []
labelList = []

for row in reader:
    labelList.append(row[len(row)-1]) # 取每一行的最后一个值
    rowDict = {}
    for i in range(1, len(row)-1):
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)

# print(featureList) # 如下面每一个字典是数据文件中的一行
# # [{'credit_rating': 'fair', 'age': 'youth', 'student': 'no', 'income': 'high'},
# # {'credit_rating': 'excellent', 'age': 'youth', 'student': 'no', 'income': 'high'},。。。。

# Vetorize features
vec = DictVectorizer() # sklearn 中提供了一个工具，可以将包含字典类型的list直接转化为数值型的数据。
dummyX = vec.fit_transform(featureList) .toarray()

# print("dummyX: " + str(dummyX))
# print(vec.get_feature_names())

# print("labelList: " + str(labelList))
# vectorize class labels
lb = preprocessing.LabelBinarizer()

dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Using decision tree for classification
# clf = tree.DecisionTreeClassifier()
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
# print("clf: " + str(clf))

# Visualize model
with open("decision_tree.dot", 'w') as f:
    f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
# graphviz 打印出这个tree来看看，打印的时候将转化前的feature_names找回来。
# 上面生成dot文件，可以将其转化为pdf文件可视化出来。
# 转化命令：dot -Tpdf decision_tree.dot -o decision_tree.pdf
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))

newRowX = oneRowX
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))

predictedY = clf.predict(newRowX)
print("predictedY: " + str(predictedY))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# coding=utf-8
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO

# Read in the csv file and put features into list of dict and list of class label
allElectronicsData = open(r'decision_tree.csv', 'rb')
reader = csv.reader(allElectronicsData) # 按行读取内容
headers = reader.next() # 内容的第一行
#
# print(headers)

featureList = []
labelList = []

for row in reader:
    labelList.append(row[len(row)-1]) # 取每一行的最后一个值
    rowDict = {}
    for i in range(1, len(row)-1):
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)

# print(featureList) # 如下面每一个字典是数据文件中的一行
# # [{'credit_rating': 'fair', 'age': 'youth', 'student': 'no', 'income': 'high'},
# # {'credit_rating': 'excellent', 'age': 'youth', 'student': 'no', 'income': 'high'},。。。。

# Vetorize features
vec = DictVectorizer() # sklearn 中提供了一个工具，可以将包含字典类型的list直接转化为数值型的数据。
dummyX = vec.fit_transform(featureList) .toarray()

# print("dummyX: " + str(dummyX))
# print(vec.get_feature_names())

# print("labelList: " + str(labelList))
# vectorize class labels
lb = preprocessing.LabelBinarizer()

dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Using decision tree for classification
# clf = tree.DecisionTreeClassifier()
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
# print("clf: " + str(clf))

# Visualize model
with open("decision_tree.dot", 'w') as f:
    f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
# graphviz 打印出这个tree来看看，打印的时候将转化前的feature_names找回来。
# 上面生成dot文件，可以将其转化为pdf文件可视化出来。
# 转化命令：dot -Tpdf decision_tree.dot -o decision_tree.pdf
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))

newRowX = oneRowX
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))

predictedY = clf.predict(newRowX)
print("predictedY: " + str(predictedY))