决策树ID3算法实现

决策树的ID3算法基于信息增益来选择最优特征,于是自己实现了一把,直接上代码。

 """
CreateTime : 2019/3/3 22:19
Author : X
Filename : decision_tree.py
""" import pandas as pd
from math import log2 def create_data_set():
"""Create 8 * 3 data set. two feature."""
data_set = [['long', 'thick', 'man'],
['short', 'thick', 'man'],
['short', 'thick', 'man'],
['long', 'thin', 'woman'],
['short', 'thin', 'woman'],
['short', 'thick', 'woman'],
['long', 'thick', 'woman'],
['long', 'thick', 'woman']]
labels = ['hair', 'sound']
return data_set, labels def calculate_entropy(data_set):
"""Calculate entropy by data set label.
formula: H(X) = -3/8*log(3/8, 2) - -5/8*log(5/8, 2)"""
data_len = data_set.shape[0]
entropy = 0
for size in data_set.groupby(data_set.iloc[:, -1]).size():
p_label = size/data_len
entropy -= p_label * log2(p_label)
return entropy def get_best_feature(data_set):
"""Get the best feature by infoGain.
formula: InfoGain(X, Y) = H(X) - H(X|Y)
H(X|Y) = sum(P(X) * H(Yx))"""
best_feature = -1
base_entropy = calculate_entropy(data_set)
best_info_gain = 0
len_data = data_set.shape[0]
for i in range(data_set.shape[1] - 1):
new_entropy = 0
for _, group in data_set.groupby(data_set.iloc[:, i]):
p_label = group.shape[0]/len_data
new_entropy += p_label * calculate_entropy(group)
info_gain = base_entropy - new_entropy
if info_gain > best_info_gain:
best_feature = i
best_info_gain = info_gain
return best_feature def majority_cnt(class_list):
"""When only class label, return the max label."""
majority_class = class_list.groupby(
class_list.iloc[:, -1]).size().sort_values().index[-1]
return majority_class def create_tree(data_set, labels):
"""data_set: DataFrame"""
class_list = data_set.values[:, -1]
class_list_set = set(class_list)
if len(class_list_set) == 1:
return list(class_list)[0]
if len(data_set.values[0]) == 1:
return majority_cnt(data_set)
best_feature = get_best_feature(data_set)
best_feature_label = labels[best_feature]
del labels[best_feature]
my_tree = {best_feature_label: {}}
for name, group in data_set.groupby(data_set.iloc[:, best_feature]):
group.drop(columns=[best_feature], axis=1, inplace=True)
my_tree[best_feature_label][name] = create_tree(group, labels)
return my_tree def classify(test_data, my_tree):
if not test_data:
return 'Not found class.'
for key, tree in my_tree.items():
if key != test_data[0]:
return classify(test_data, tree)
else:
if isinstance(tree, dict):
del test_data[0]
return classify(test_data, tree)
else:
return tree if __name__ == '__main__':
DATA_SET, LABELS = create_data_set()
TREE = create_tree(pd.DataFrame(DATA_SET), LABELS)
import json
print(json.dumps(TREE, indent=4))
print(classify(["thick", "long"], TREE))

C4.5算法是基于信息增益率来选择最优特征的,即在ID3算法基础上再求出信息增益率即可,将信息增益除以基于label的特征X的熵。

此处就不再给出实现代码,自己实现一遍意在加深理解。

上一篇:第2章Zabbix基础进阶


下一篇:javascript 异常处理和事件处理