3.1.1信息增益

 1 #计算给定数据集的香农熵
 2 from math import log
 3 
 4 def calcShannonEnt(dataSet):
 5     numEntries = len(dataSet)  #样本条目数
 6     labelCounts = {}
 7     for featVec in dataSet:
 8         currentLabel = featVec[-1]   #取每个样本最后一列值
 9         labelCounts[currentLabel] = labelCounts.get(currentLabel,0)+1
10         #以上得到字典:{‘yes‘:2,‘no‘:3}
11     shannonEnt = 0.0
12     for key in labelCounts:
13         prob = float(labelCounts[key])/numEntries  #求得每个种类的概率
14         shannonEnt -= prob * log(prob,2)  #信息熵公式
15     return shannonEnt   #返回信息熵
'''
        if currentLabel not in labelCounts.keys():   #填充字典:以currentLabel为key
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1   #注意缩进
        #以上得到字典:{'yes':2,'no':3}
'''
'''
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 1
        else:
            labelCounts[currentLabel] += 1
        #以上得到字典:{'yes':2,'no':3}
'''
def createDataSet():
    """Build the small hard-coded sample data set used by the examples.

    Each row holds two 0/1 feature values followed by a string class
    label in the last position.

    Returns:
        A ``(dataSet, labels)`` tuple: ``dataSet`` is a list of five rows
        and ``labels`` is the list of the two feature names, in column
        order. Fresh lists are created on every call.
    """
    dataSet = [
        [1, 1, 'maybe'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

 

3.1.1信息增益

上一篇:「JOI 2020 Final」奥运公交 题解


下一篇:数据库学习网站和linux学习网站