背景
最近接到一个项目,使用遗传算法对决策树进行调参;以前都是使用网格搜索来调参,没想到也可以用ga来做这件事情,再加上以前也写过比较多的ga算法,也就接了下来,本来以为要花一点时间来搞,实际上熟悉的话2-3个小时就能搞定。
算法
做项目肯定是要用库的啦(不可能自己写的),选择使用sklearn的决策树,ga算法流程比较清晰,就自己手写了,下面重点介绍ga算法的几个关键步骤是如何实现的。
初始化
选择决策树比较重要的三个参数"max_depth", "min_samples_split", "max_leaf_nodes",穷举这三个参数可能的值进行初始化
def init():
    """Create the initial GA population.

    Enumerates a coarse grid over the three tuned hyper-parameters and
    builds one classifier per grid point via ``make_tree``.
    """
    depth_values = range(5, 31, 3)
    split_values = range(5, 25, 5)
    leaf_values = range(5, 25, 5)
    return [
        make_tree([depth, split, leaves])
        for depth in depth_values
        for split in split_values
        for leaves in leaf_values
    ]
选择
使用准确率作为适应度评分,并据此计算累计概率(用于轮盘赌选择)
def tree_score(X, Y, clf):
    """Fitness of one classifier: mean accuracy over 5-fold cross-validation.

    Note: ``clf`` is re-fitted in place on each fold.
    """
    splitter = KFold(n_splits=5)
    fold_accuracies = []
    for fit_idx, holdout_idx in splitter.split(X):
        clf.fit(X[fit_idx], Y[fit_idx])
        predicted = clf.predict(X[holdout_idx])
        fold_accuracies.append(accuracy_score(y_true=Y[holdout_idx], y_pred=predicted))
    return np.mean(fold_accuracies)
def adaption(X, Y, forest):
    """Score every tree, stash a deep copy of the best one in BEST_TREE,
    and return cumulative (prefix-summed) selection probabilities for
    roulette-wheel selection."""
    global BEST_TREE
    fitness = [tree_score(X, Y, tree) for tree in forest]
    BEST_TREE = copy.deepcopy(forest[int(np.argmax(fitness))])
    total = np.sum(fitness)
    # cumsum accumulates left-to-right, exactly like the manual prefix-sum loop.
    return np.cumsum(np.asarray(fitness) / total)
选择这里可以注意一下,可以使用精英策略,即:把当前这一轮最好的个体,直接送入下一代中。这个策略在提升算法的稳定性上有很大用处
交叉
交叉使用的是参数的交叉,比如clf1和clf2,随机找到一个交换参数的位置p,进行交叉
def _cross_2_tree(t1, t2):
    """Single-point crossover of the tuned hyper-parameters of two trees."""
    gene_a = _dict_get_value_list(t1.__dict__, param)
    gene_b = _dict_get_value_list(t2.__dict__, param)
    cut = random.randint(0, len(param) - 1)
    # Keep the prefix up to and including `cut`, swap the tails.
    child_a = gene_a[:cut + 1] + gene_b[cut + 1:]
    child_b = gene_b[:cut + 1] + gene_a[cut + 1:]
    return [make_tree(child_a), make_tree(child_b)]


def cross(forest):
    """Mate consecutive pairs; each pair contributes two offspring.

    With an odd-sized population the final unpaired tree is dropped.
    """
    offspring = []
    for left in range(0, len(forest) - 1, 2):
        offspring.extend(_cross_2_tree(forest[left], forest[left + 1]))
    return offspring
变异
这一步使用比较简单的策略,直接在参数上进行+1或者-1操作
def variation(forest):
    """Mutation step: with probability 1 - VAR_P, nudge one randomly chosen
    hyper-parameter of a tree by +/-1 (clamped to a minimum of 2).

    NOTE: VAR_P is the probability of *keeping* an individual unchanged.
    Trees are mutated in place.
    """
    mutated = []
    for tree in forest:
        if random.random() < VAR_P:
            # Survives this step untouched.
            mutated.append(tree)
            continue
        target = param[random.randint(0, len(param) - 1)]
        current = tree.__dict__[target]
        step = 1 if random.random() > 0.5 else -1
        # Clamp so sklearn parameters never drop below their legal minimum.
        tree.__dict__[target] = max(2, current + step)
        mutated.append(tree)
    return mutated
完整代码
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import random
import copy
import matplotlib.pyplot as plt

# Decision-tree hyper-parameters tuned by the GA. The order matters:
# it defines the gene layout used by crossover and mutation.
param = ["max_depth", "min_samples_split", "max_leaf_nodes"]
epochs = 300
# NOTE(review): despite the name, VAR_P is the probability that an individual
# is *kept unchanged* by the mutation step (effective mutation rate = 1 - VAR_P).
VAR_P = 0.4
BEST_TREE = None  # elite individual remembered from the current generation


def make_tree(param_value):
    """Build a DecisionTreeClassifier from a gene (list of parameter values)."""
    return DecisionTreeClassifier(**dict(zip(param, param_value)))


def init():
    """Initial population: enumerate a coarse grid of the three parameters."""
    forest = []
    for max_depth in range(5, 31, 3):
        for min_samples_split in range(5, 25, 5):
            for max_leaf_nodes in range(5, 25, 5):
                forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
    return forest


def tree_score(X, Y, clf):
    """Fitness: mean accuracy of `clf` over 5-fold cross-validation."""
    kf = KFold(n_splits=5)
    score = []
    for train_index, valid_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        pred = clf.predict(X[valid_index])
        score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
    return np.mean(score)


def evulate_forest(X, Y, forest):
    """Elitism + reporting: replace the worst individual with the stored
    elite (BEST_TREE), then return (scores sorted desc, mean score).

    (Function name kept for compatibility; `evaluate_forest` was intended.)
    """
    score = [tree_score(X, Y, t) for t in forest]
    worse_pos = int(np.argmin(score))
    global BEST_TREE
    # Deep-copy so later in-place mutation cannot corrupt the stored elite.
    forest[worse_pos] = copy.deepcopy(BEST_TREE)
    score[worse_pos] = tree_score(X, Y, forest[worse_pos])
    score.sort(reverse=True)
    return score, np.mean(score)


def adaption(X, Y, forest):
    """Score the population, remember the best tree in BEST_TREE, and return
    cumulative selection probabilities for roulette-wheel selection."""
    score = [tree_score(X, Y, t) for t in forest]
    global BEST_TREE
    BEST_TREE = copy.deepcopy(forest[int(np.argmax(score))])
    ada = np.asarray(score) / np.sum(score)
    for i in range(1, len(ada)):
        ada[i] += ada[i - 1]
    return ada


def choose_trees(forest, ada):
    """Roulette-wheel selection: draw len(forest) individuals with
    replacement according to the cumulative probabilities `ada`."""
    result = []
    for _ in range(len(forest)):
        r = random.random()
        # FIX: default to the last slot — float rounding can leave ada[-1]
        # slightly below 1.0, which previously shrank the population.
        pos = len(ada) - 1
        for j in range(len(ada)):
            if r <= ada[j]:
                pos = j
                break
        result.append(copy.deepcopy(forest[pos]))
    return result


def _dict_get_value_list(mp, key_list):
    """Values of `mp` for `key_list`, in key order (None when missing)."""
    return [mp.get(key) for key in key_list]


def _cross_2_tree(t1, t2):
    """Single-point crossover on the parameter genes of two trees."""
    sz = len(param)
    t1_param_value = _dict_get_value_list(t1.__dict__, param)
    t2_param_value = _dict_get_value_list(t2.__dict__, param)
    pos = random.randint(0, sz - 1)
    # Keep the prefix up to `pos` (inclusive), swap the tails.
    child1 = t1_param_value[:pos + 1] + t2_param_value[pos + 1:]
    child2 = t2_param_value[:pos + 1] + t1_param_value[pos + 1:]
    return [make_tree(child1), make_tree(child2)]


def cross(forest):
    """Mate consecutive pairs; each pair produces two offspring."""
    result = []
    for i in range(1, len(forest), 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    return result


def variation(forest):
    """Mutation: with probability 1 - VAR_P, add +/-1 to one randomly chosen
    parameter of each tree (clamped to a minimum of 2). Mutates in place."""
    result = []
    for t in forest:
        if random.random() < VAR_P:
            result.append(t)  # kept unchanged
            continue
        pos = random.randint(0, len(param) - 1)
        val = t.__dict__[param[pos]]
        val = val + 1 if random.random() > 0.5 else val - 1
        t.__dict__[param[pos]] = max(2, val)
        result.append(t)
    return result


def main():
    """Run the GA loop and plot the mean population score per generation."""
    df = pd.read_csv("../dataset/data.csv", index_col=0)
    X = df.iloc[:, 1:].values
    Y = df.iloc[:, 0].values
    forest = init()

    mean_score_arr = []
    for i in range(epochs):
        ada = adaption(X, Y, forest)
        forest = choose_trees(forest, ada)
        forest = cross(forest)
        forest = variation(forest)
        score, mean = evulate_forest(X, Y, forest)
        mean_score_arr.append(mean)

        print(i, "/", epochs, ":")
        print("mean:", mean)

    plt.plot(np.arange(len(mean_score_arr)), mean_score_arr)
    plt.show()


# FIX: guard so importing this module no longer triggers file I/O / plotting.
if __name__ == "__main__":
    main()
总结
感觉使用ga进行调参很鸡肋,还不如使用网格搜索来的快,但是作为一种思想可以学习一下的。
最近搞了一个人工智能交流的群:831852635,有兴趣的可以加一下!