3.1.1信息增益

# Compute the Shannon entropy of a given data set
from collections import Counter
from math import log, log2
 3 
 4 def calcShannonEnt(dataSet):
 5     numEntries = len(dataSet)  #样本条目数
 6     labelCounts = {}
 7     for featVec in dataSet:
 8         currentLabel = featVec[-1]   #取每个样本最后一列值
 9         labelCounts[currentLabel] = labelCounts.get(currentLabel,0)+1
10         #以上得到字典:{'yes':2,'no':3}
11     shannonEnt = 0.0
12     for key in labelCounts:
13         prob = float(labelCounts[key])/numEntries  #求得每个种类的概率
14         shannonEnt -= prob * log(prob,2)  #信息熵公式
15     return shannonEnt   #返回信息熵
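# Alternative way to tally the class labels by hand with a plain dict
# (labelCounts), keyed by each row's label (currentLabel):
#
#     if currentLabel not in labelCounts.keys():   # seed the dict with currentLabel as key
#         labelCounts[currentLabel] = 0
#     labelCounts[currentLabel] += 1   # mind the indentation: runs for every row
#     # the above yields a dict such as {'yes': 2, 'no': 3}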
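# Another equivalent way to tally the class labels with a plain dict,
# using an explicit if/else:
#
#     if currentLabel not in labelCounts.keys():
#         labelCounts[currentLabel] = 1
#     else:
#         labelCounts[currentLabel] += 1
#     # the above yields a dict such as {'yes': 2, 'no': 3}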
def createDataSet():
    """Build the small sample data set used to exercise the entropy code.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of samples, each
        holding two 0/1 feature values followed by a class-label string;
        ``labels`` is the list of the two feature names. A fresh list is
        built on every call.
    """
    # NOTE(review): the first row is labelled 'maybe', giving three classes;
    # nearby notes mention counts {'yes': 2, 'no': 3} -- confirm this is intended.
    dataSet = [
        [1, 1, 'maybe'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

 

上一篇:codeforces1540B


下一篇:独立键盘与矩阵键盘