机器学习：sklearn&pydotplus实现Decision Tree

95 阅读 0 评论 63 点赞

我是靠谱客的博主风趣机器猫，这篇文章主要介绍机器学习：sklearn&pydotplus实现Decision Tree，现在分享给大家，希望可以做个参考。

复制代码

import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
import pydotplus

'''
数据集 play.csv
RID	age	income	student	credit_rating	Class_buys_computer
1	youth	high	no	fair	no
2	youth	high	no	excellent	no
3	middle_aged	high	no	fair	yes
4	senior	medium	no	fair	yes
5	senior	low	yes	fair	yes
6	senior	low	yes	excellent	yes
7	middle_aged	low	yes	excellent	no
8	youth	medium	no	fair	yes
9	youth	low	yes	fair	no
10	senior	medium	yes	fair	yes
11	youth	medium	yes	excellent	yes
12	middle_aged	medium	no	excellent	yes
13	middle_aged	high	yes	fair	yes
14	senior	medium	no	excellent	no
'''

# Load the training table: the first row holds the column names and every
# following row is one sample.  The raw string keeps the backslash in the
# Windows path literal, and the with-block closes the file once it is read.
# NOTE(review): csv.reader splits on commas by default; if play.csv is really
# tab-separated, pass delimiter='\t' -- confirm against the actual file.
with open(r"E:\play.csv", 'rt', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)

    # Python 3: use the built-in next(reader).  reader.next() only existed in
    # Python 2 and raises AttributeError on a '_csv.reader' object.
    headers = next(reader)
    print("表头信息\n" + str(headers))

    feature_list, result_list = [], []
    for row in reader:
        # The last column is the class label.  The first column is skipped
        # and the columns in between become a {column name: value} dict.
        result_list.append(row[-1])
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))

print("结果\n" + str(result_list), "\n特征值\n" + str(feature_list))

# Turn the list of feature dicts into a dense NumPy array.
vec = DictVectorizer()
DummyX = vec.fit_transform(feature_list).toarray()
# Binarize the class labels collected in result_list.
DummyY = preprocessing.LabelBinarizer().fit_transform(result_list)
# Note: the columns of DummyX are ordered alphabetically by feature name.
print("DummyX\n" + str(DummyX), "\nDummyY\n" + str(DummyY))

# Fit a decision tree that uses the entropy split criterion; random_state is
# pinned to 0.  Calling tree.DecisionTreeClassifier() with no arguments would
# use the library defaults instead.
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)
clf = clf.fit(DummyX, DummyY)

print("clf\n" + str(clf))

# Write the fitted tree to a Graphviz .dot file.  The return value is not
# kept, so the file handle name is no longer rebound inside the with-block.
with open(r"E:\play.dot", "w") as f:
    tree.export_graphviz(clf, out_file=f)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name so that older
# installations keep working.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print('特征向量\n', feature_names)

# With out_file=None the DOT source comes back as a string, which is what
# the rest of the script consumes.  See help(tree.export_graphviz).
dot_data = tree.export_graphviz(clf,
                                feature_names=feature_names,
                                special_characters=True,
                                filled=True, rounded=True,
                                out_file=None)
print("dot_data\n" + str(dot_data))

'''
pydotplus 画句子的依存结构树
pip install pydotplus 安装不上
pip install --upgrade --ignore-installed pydotplus 可以安装上
pydotplus.graphviz.InvocationException: GraphViz's executables not found
这是《机器学习升级版III》中“决策树随机森林实践”章节的问题。
解决方法：conda install graphviz ，安装完成，重启IDE集成开发工具
先安装GraphViz软件，将GraphViz解压后的目录添加到环境变量path里，然后pip 安装pydotplus，按照这个顺序
安装，如果还不行，重启一下ide或者电脑就行了
'''

# Render the DOT source to a PDF.  pydotplus shells out to the Graphviz
# executables, so they must be installed and on PATH.  The raw string keeps
# the backslash in the Windows path literal.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf(r"E:\play.pdf")

# Per the feature-name order, 0.0.1.|0.1.|1.0.0.|1.0. stands for
# youth, fair, high, no.
# The encoded matrix is bound as DummyX; the lowercase dummyX used here
# before was undefined and raised NameError.
oneRowX = DummyX[0]
twoRowX = DummyX[1]
print("oneRowX:\n", str(oneRowX), "\ntwoRowX\n", str(twoRowX))

# Run the trained tree on two hand-built one-hot rows; B differs from A only
# in the leading columns.
A = [[0, 0, 1, 0, 1, 1, 0, 0, 1, 0]]
B = [[1, 0, 0, 0, 1, 1, 0, 0, 1, 0]]

predict_A, predict_B = clf.predict(A), clf.predict(B)
print("predict_A", str(predict_A), "predict_B", str(predict_B))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
import pydotplus

'''
数据集 play.csv
RID	age	income	student	credit_rating	Class_buys_computer
1	youth	high	no	fair	no
2	youth	high	no	excellent	no
3	middle_aged	high	no	fair	yes
4	senior	medium	no	fair	yes
5	senior	low	yes	fair	yes
6	senior	low	yes	excellent	yes
7	middle_aged	low	yes	excellent	no
8	youth	medium	no	fair	yes
9	youth	low	yes	fair	no
10	senior	medium	yes	fair	yes
11	youth	medium	yes	excellent	yes
12	middle_aged	medium	no	excellent	yes
13	middle_aged	high	yes	fair	yes
14	senior	medium	no	excellent	no
'''

# Load the training table: the first row holds the column names and every
# following row is one sample.  The raw string keeps the backslash in the
# Windows path literal, and the with-block closes the file once it is read.
# NOTE(review): csv.reader splits on commas by default; if play.csv is really
# tab-separated, pass delimiter='\t' -- confirm against the actual file.
with open(r"E:\play.csv", 'rt', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)

    # Python 3: use the built-in next(reader).  reader.next() only existed in
    # Python 2 and raises AttributeError on a '_csv.reader' object.
    headers = next(reader)
    print("表头信息\n" + str(headers))

    feature_list, result_list = [], []
    for row in reader:
        # The last column is the class label.  The first column is skipped
        # and the columns in between become a {column name: value} dict.
        result_list.append(row[-1])
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))

print("结果\n" + str(result_list), "\n特征值\n" + str(feature_list))

# Turn the list of feature dicts into a dense NumPy array.
vec = DictVectorizer()
DummyX = vec.fit_transform(feature_list).toarray()
# Binarize the class labels collected in result_list.
DummyY = preprocessing.LabelBinarizer().fit_transform(result_list)
# Note: the columns of DummyX are ordered alphabetically by feature name.
print("DummyX\n" + str(DummyX), "\nDummyY\n" + str(DummyY))

# Fit a decision tree that uses the entropy split criterion; random_state is
# pinned to 0.  Calling tree.DecisionTreeClassifier() with no arguments would
# use the library defaults instead.
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)
clf = clf.fit(DummyX, DummyY)

print("clf\n" + str(clf))

# Write the fitted tree to a Graphviz .dot file.  The return value is not
# kept, so the file handle name is no longer rebound inside the with-block.
with open(r"E:\play.dot", "w") as f:
    tree.export_graphviz(clf, out_file=f)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name so that older
# installations keep working.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print('特征向量\n', feature_names)

# With out_file=None the DOT source comes back as a string, which is what
# the rest of the script consumes.  See help(tree.export_graphviz).
dot_data = tree.export_graphviz(clf,
                                feature_names=feature_names,
                                special_characters=True,
                                filled=True, rounded=True,
                                out_file=None)
print("dot_data\n" + str(dot_data))

'''
pydotplus 画句子的依存结构树
pip install pydotplus 安装不上
pip install --upgrade --ignore-installed pydotplus 可以安装上
pydotplus.graphviz.InvocationException: GraphViz's executables not found
这是《机器学习升级版III》中“决策树随机森林实践”章节的问题。
解决方法：conda install graphviz ，安装完成，重启IDE集成开发工具
先安装GraphViz软件，将GraphViz解压后的目录添加到环境变量path里，然后pip 安装pydotplus，按照这个顺序
安装，如果还不行，重启一下ide或者电脑就行了
'''

# Render the DOT source to a PDF.  pydotplus shells out to the Graphviz
# executables, so they must be installed and on PATH.  The raw string keeps
# the backslash in the Windows path literal.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf(r"E:\play.pdf")

# Per the feature-name order, 0.0.1.|0.1.|1.0.0.|1.0. stands for
# youth, fair, high, no.
# The encoded matrix is bound as DummyX; the lowercase dummyX used here
# before was undefined and raised NameError.
oneRowX = DummyX[0]
twoRowX = DummyX[1]
print("oneRowX:\n", str(oneRowX), "\ntwoRowX\n", str(twoRowX))

# Run the trained tree on two hand-built one-hot rows; B differs from A only
# in the leading columns.
A = [[0, 0, 1, 0, 1, 1, 0, 0, 1, 0]]
B = [[1, 0, 0, 0, 1, 1, 0, 0, 1, 0]]

predict_A, predict_B = clf.predict(A), clf.predict(B)
print("predict_A", str(predict_A), "predict_B", str(predict_B))