1.2 决策树代码实现

1  源码下载地址

2  代码实现

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO
import numpy as np

# Load the training data from the CSV file.
# The context manager guarantees the file handle is closed once all rows
# have been read (the handle was previously left open for the whole run).
with open(r'D:\workspace\MachineLearning\07stage\1-fundamental\01\DecisionTree\AllElectronics.csv','r') as csvData:
    # Parse the CSV content and materialise every row as a list of strings.
    readerData = csv.reader(csvData)
    sourceData = list(readerData)

headers = sourceData[0]    # first row: column names
coreData = sourceData[1:]  # remaining rows: one sample per row

# Class label of each sample: the value in the last column of its row.
labelList = [data[-1] for data in coreData]

# One {column name: value} dict per sample. Column 0 and the last column
# (the label) are excluded from the features.
# NOTE(review): column 0 is presumably a row id -- confirm against the CSV.
featureList = [
    {headers[i]: data[i] for i in range(1, len(data) - 1)}
    for data in coreData
]
# Vectorize the feature dicts (one-hot encoding of the categorical values).
# For example:
#   featureList = [{'age': 'youth'}, {'age': 'old'}, {'age': 'mid'}]
# is vectorized into:
#   [[0. 0. 1.]
#    [0. 1. 0.]
#    [1. 0. 0.]]
# where the columns stand for:
#   ['age=mid', 'age=old', 'age=youth']
vec = DictVectorizer()  # dict -> one-hot vector helper
featureArray = vec.fit_transform(featureList).toarray()  # dense feature matrix

# Meaning of each column of featureArray.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); prefer the new method and fall back to the old
# one so the script keeps working on either side of that change.
if hasattr(vec, 'get_feature_names_out'):
    # Convert to a plain list so it matches what get_feature_names() returned.
    featureNames = list(vec.get_feature_names_out())
else:
    featureNames = vec.get_feature_names()

# Encode the class labels in labelList as numbers, e.g.
#   no, no, yes, yes, yes
# becomes
#   [[0], [0], [1], [1], [1]]
lb = preprocessing.LabelBinarizer()
lb.fit(labelList)                 # learn the set of distinct labels
labels = lb.transform(labelList)  # encode every sample's label
# Build the decision tree using information entropy as the split criterion
# and train it on the vectorized features and encoded labels.
# fit() returns the estimator itself, so construction and training chain.
classifier = tree.DecisionTreeClassifier(criterion='entropy').fit(
    featureArray, labels
)

# Save the trained decision tree to a Graphviz .dot file.
# The return value of export_graphviz is deliberately not assigned: binding
# it to the same name as the open file handle only obscured the code.
with open('AllElectronicsDot.dot','w') as f:
    tree.export_graphviz(classifier, feature_names=featureNames, out_file=f)

# Use the graphviz tool to convert AllElectronicsDot.dot to PDF:
#   dot -Tpdf AllElectronicsDot.dot -o AllElectronicsPdf.pdf

# Predict the class of one new sample.
# The original note describes the sample as [ youth,high,no,fair ].
# NOTE(review): which categories indices 0-2 stand for depends on the column
# order in featureNames -- verify that the overrides below describe the
# intended sample.

# Start from a real copy of the first training row. Plain slicing returns a
# view, so without .copy() the assignments below would overwrite the first
# row of featureArray itself.
newData = featureArray[0, :].copy()
newData[0] = 1
newData[1] = 0
newData[2] = 0

# predict() expects a 2-D input: a list holding the single sample.
predictParam = [newData]

predictValue = classifier.predict(predictParam)
print("predictValue " + str(predictValue))

 

上一篇:hdu3722Card Game(KM最大带权匹配)


下一篇:c#连接mysql环境配置