1 源码下载地址
2 代码实现
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO
import numpy as np
# Load the raw training table from the CSV file.
# The handle is managed by `with` so it is closed as soon as the rows have
# been read (the original left it open for the life of the process);
# newline='' is the mode the csv module asks for when opening its input.
with open(r'D:\workspace\MachineLearning\07stage\1-fundamental\01\DecisionTree\AllElectronics.csv', 'r', newline='') as csvData:
    readerData = csv.reader(csvData)
    # Materialise every parsed row (a list of strings) before the file closes.
    sourceData = list(readerData)

headers = sourceData[0]    # header row: the column names
coreData = sourceData[1:]  # every row after the header

labelList = []    # class label of each row (its last column)
featureList = []  # one {column name: value} dict per row
for data in coreData:
    labelList.append(data[-1])
    # Features are columns 1 .. len-2: the last column is the label, and
    # column 0 is skipped (presumably a row id -- confirm against the CSV).
    rowDict = {headers[i]: data[i] for i in range(1, len(data) - 1)}
    featureList.append(rowDict)
"""
将特征值数据进行向量化
比如:
featureList = [{'age': 'youth'},{'age': 'old'},{'age': 'mid'}]
进行向量化后得到结果:
[[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]]
各维度的解释为:
['age=mid', 'age=old', 'age=youth']
"""
# DictVectorizer turns each {column: value} dict into a one-hot row, one
# output column per distinct "column=value" pair.
vec = DictVectorizer()
# fit_transform() result is densified with toarray() so it can be indexed
# and edited as a plain 2-D array later on.
featureArray = vec.fit_transform(featureList).toarray()
# Meaning of each output column, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back to the old
# method so the script still runs on older installations.
if hasattr(vec, 'get_feature_names_out'):
    featureNames = list(vec.get_feature_names_out())
else:
    featureNames = vec.get_feature_names()
"""
将[[no],[no],[yes],[yes],[yes] ]
转为[ [0],[0],[1],[1],[1] ]
"""
# Encode the textual class labels numerically. Fitting learns the set of
# classes from labelList; transforming then maps every label to its
# binarised form (for a two-class label such as yes/no that is a single
# 0/1 column).
lb = preprocessing.LabelBinarizer()
lb.fit(labelList)
labels = lb.transform(labelList)
"""
采用信息熵来做决策树
"""
# Build the decision tree with information entropy as the split criterion,
# then train it on the one-hot features and the binarised labels.
# fit() trains the estimator in place and returns that same object, so the
# name does not need to be rebound.
classifier = tree.DecisionTreeClassifier(criterion='entropy')
classifier.fit(featureArray, labels)
# Save the fitted tree to a Graphviz .dot file, labelling the split nodes
# with the one-hot column names.
with open('AllElectronicsDot.dot', 'w') as f:
    # export_graphviz writes straight into the open handle and returns
    # nothing useful when out_file is supplied, so its result must not be
    # assigned back to `f` (the original rebound `f` to None inside the
    # with-block).
    tree.export_graphviz(classifier, feature_names=featureNames, out_file=f)
"""
使用graphviz绘图工具将AllElectronicsDot.dot转为pdf格式
使用命令: dot -Tpdf AllElectronicsDot.dot -o AllElectronicsPdf.pdf
"""
"""
预测数据
[ youth,high,no,fair ]
"""
# Start from a real copy of the first training row. Basic slicing such as
# featureArray[0, :] returns a view, so without .copy() the edits below
# would silently overwrite row 0 of the training matrix.
newData = featureArray[0, :].copy()
# Overwrite the first three one-hot columns so that only column 0 is set.
# NOTE(review): this assumes columns 0-2 are the one-hot encoding of a single
# attribute (see featureNames for the actual column order) -- confirm.
newData[0] = 1
newData[1] = 0
newData[2] = 0
# predict() expects a 2-D batch, so wrap the single sample in a list.
predictParam = [newData]
predictValue = classifier.predict(predictParam)
print("predictValue " + str(predictValue))