Datawhale Ensemble Learning, Task 7: Voting
How voting works
Taking a classification problem as an example: suppose we have $m$ different classifiers $\{C_1, C_2, \dots, C_m\}$. On the same test set they produce the predictions $\{P_1, P_2, \dots, P_m\}$, and voting over these predictions yields the final prediction $P_f$.
Below, absolute-majority voting is used to explain why an ensemble model can outperform a single classifier.
Absolute-majority voting: every classifier votes for a class label; if some label receives more than half of the votes it becomes the final output label, otherwise the ensemble refuses to predict. When an ensemble of $T$ classifiers is used to solve a binary classification problem, the final prediction is correct only when at least $\lfloor T/2 + 1 \rfloor$ classifiers classify correctly. (For example, with $T = 5$, at least $\lfloor 5/2 + 1 \rfloor = 3$ of the 5 classifiers must be correct.)
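Before the probabilistic analysis, here is a minimal sketch of this voting rule in plain Python; the majority_vote helper is hypothetical, written only to illustrate the rule and its "reject" case:

# Minimal sketch of absolute-majority voting with a rejection option (illustrative helper).
from collections import Counter

def majority_vote(labels):
    """Return the label with more than half of the votes, or None (reject)."""
    winner, count = Counter(labels).most_common(1)[0]
    return winner if count > len(labels) / 2 else None

print(majority_vote([1, 1, 0, 1, 0]))   # 1 (3 of 5 votes)
print(majority_vote([0, 1, 2]))         # None: no label has a strict majority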
Here is an example of how the ensemble accuracy can be computed:
Assume the $T$ base classifiers are mutually independent and each has accuracy $p$. Then the probability that the ensemble of these $T$ classifiers classifies correctly can be computed from the binomial distribution:

$$P_{mv} = \sum_{k=\lfloor T/2 + 1 \rfloor}^{T} \dbinom{T}{k} p^k (1-p)^{T-k}$$
from scipy.special import comb   # binomial coefficient C(n, k)
import numpy as np
import matplotlib.pyplot as plt
import math

plt.rcParams['font.sans-serif'] = ['SimHei']   # font that can render Chinese labels (optional here)
plt.rcParams['axes.unicode_minus'] = False     # show the minus sign correctly on the axes

def ensemble_acc(n_classifier, acc):
    """Probability that a majority vote of n_classifier independent
    classifiers, each with accuracy acc, is correct."""
    k_start = math.floor(n_classifier / 2 + 1)   # floor(T/2 + 1)
    probs = [comb(n_classifier, k) * acc ** k * (1 - acc) ** (n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

base_acc = np.arange(0.2, 1, 0.1)      # base-classifier accuracies p
classifiers = np.arange(1, 53, 2)      # odd ensemble sizes T = 1, 3, ..., 51

en_acc_list = []
for acc in base_acc:
    en_acc = [ensemble_acc(T, acc) for T in classifiers]
    en_acc_list.append(en_acc)

plt.figure(figsize=(8, 6))
for i, en_acc in enumerate(en_acc_list):
    plt.plot(classifiers, en_acc, label=round(base_acc[i], 1))
plt.xlabel('ensemble size', fontsize=15)
plt.ylabel('accuracy of the ensemble', fontsize=15)
plt.grid()
plt.legend()
plt.show()
The figure above shows the ensemble accuracy for different values of $p$ and $T$. It can be seen that:
- If $p > 0.5$, then $P_{mv}$ increases monotonically with $T$, and $\lim_{T\rightarrow +\infty} P_{mv} = 1$.
- If $p < 0.5$, then $P_{mv}$ decreases monotonically with $T$, and $\lim_{T\rightarrow +\infty} P_{mv} = 0$.
- If $p = 0.5$, then $P_{mv} = 0.5$ for any $T$.
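As a quick numerical check of these three cases, the ensemble_acc function defined in the code above can be evaluated at a few ensemble sizes (only the trend matters; odd sizes keep the $p = 0.5$ case exact):

# Check the limiting behaviour numerically, reusing ensemble_acc from the code above.
for p in (0.6, 0.5, 0.4):
    accs = [round(ensemble_acc(T, p), 3) for T in (5, 25, 51)]
    print('p = %.1f ->' % p, accs)
# Expected trend: p = 0.6 climbs towards 1, p = 0.5 stays at 0.5,
# and p = 0.4 drops towards 0 as the ensemble grows.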
Classification voting can further be divided into hard voting and soft voting:
- Hard voting: the prediction is the class that appears most often among the individual votes.
- Soft voting: the prediction is the class with the largest sum (or average) of predicted probabilities across the classifiers.
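To make the distinction concrete, here is a minimal toy sketch with made-up probabilities (not from any dataset) for a single sample and three classifiers; hard and soft voting disagree because two classifiers weakly prefer class 0 while the third is very confident in class 1:

import numpy as np

# Hypothetical predicted probabilities of three classifiers for one sample, over classes [0, 1].
probas = np.array([
    [0.55, 0.45],   # classifier 1: weakly prefers class 0
    [0.55, 0.45],   # classifier 2: weakly prefers class 0
    [0.10, 0.90],   # classifier 3: strongly prefers class 1
])

hard_votes = probas.argmax(axis=1)                  # [0, 0, 1]
hard_prediction = np.bincount(hard_votes).argmax()  # class 0 (2 votes vs 1)

soft_scores = probas.mean(axis=0)                   # [0.40, 0.60]
soft_prediction = soft_scores.argmax()              # class 1

print(hard_prediction, soft_prediction)             # 0 1 -> the two rules disagree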
The examples below demonstrate hard and soft voting with scikit-learn's VotingClassifier.
Hard voting:
# Train a majority-rule (hard-voting) classifier:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=1)
# solver: optimisation method for the logistic-regression loss; lbfgs is a quasi-Newton
# method that uses (an approximation of) the Hessian of the loss to iterate.
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: mean %0.2f, std %0.2f [%s]" % (scores.mean(), scores.std(), label))
Accuracy: mean 0.95, std 0.04 [Logistic Regression]
Accuracy: mean 0.94, std 0.04 [Random Forest]
Accuracy: mean 0.91, std 0.04 [Naive Bayes]
Accuracy: mean 0.95, std 0.04 [Ensemble]
Soft voting (change voting='hard' to voting='soft' in the code above):
Accuracy: mean 0.95, std 0.04 [Logistic Regression]
Accuracy: mean 0.94, std 0.04 [Random Forest]
Accuracy: mean 0.91, std 0.04 [Naive Bayes]
Accuracy: mean 0.95, std 0.03 [Ensemble]
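For completeness, here is a minimal sketch of the soft-voting ensemble written out explicitly (it reuses clf1, clf2, clf3, X and y from the hard-voting code above; eclf_soft is just an illustrative name). Soft voting averages the predict_proba outputs, so every base estimator must support probability estimates:

eclf_soft = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='soft',          # average class probabilities instead of counting votes
    # weights=[2, 1, 1],    # optional per-estimator weights for the average
)
eclf_soft.fit(X, y)
# averaged class probabilities for the first three samples
print(eclf_soft.predict_proba(X[:3]))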
Below are examples of ensembling different kinds of models:
First, an ensemble of KNN classifiers of the same type but with different numbers of neighbours:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
# synthetic classification dataset
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# define the dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

# get a voting ensemble of models
def get_voting():
    # define the base models
    models = list()
    models.append(('knn1', KNeighborsClassifier(n_neighbors=1)))
    models.append(('knn3', KNeighborsClassifier(n_neighbors=3)))
    models.append(('knn5', KNeighborsClassifier(n_neighbors=5)))
    models.append(('knn7', KNeighborsClassifier(n_neighbors=7)))
    models.append(('knn9', KNeighborsClassifier(n_neighbors=9)))
    # define the voting ensemble
    ensemble = VotingClassifier(estimators=models, voting='hard')
    return ensemble

# get a list of models to evaluate
def get_models():
    models = dict()
    models['knn1'] = KNeighborsClassifier(n_neighbors=1)
    models['knn3'] = KNeighborsClassifier(n_neighbors=3)
    models['knn5'] = KNeighborsClassifier(n_neighbors=5)
    models['knn7'] = KNeighborsClassifier(n_neighbors=7)
    models['knn9'] = KNeighborsClassifier(n_neighbors=9)
    models['hard_voting'] = get_voting()
    return models

# evaluate a given model using repeated stratified k-fold cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()
# evaluate the models and store the results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.show()
knn1 0.873 (0.030)
knn3 0.889 (0.038)
knn5 0.895 (0.031)
knn7 0.899 (0.035)
knn9 0.900 (0.033)
hard_voting 0.902 (0.034)
Next, a voting ensemble of different types of classifiers:
from sklearn import datasets
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X, y = iris.data[:, [0, 2]], iris.target   # use sepal length (column 0) and petal length (column 2)

# Training
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(gamma='scale', kernel='rbf', probability=True)
# gamma: kernel coefficient
# kernel: kernel type used by the algorithm; 'rbf' is the radial basis function (Gaussian) kernel
# probability: whether to enable probability estimates (required for soft voting)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[1/3, 1/9, 1/3])

clf1 = clf1.fit(X, y)
clf2 = clf2.fit(X, y)
clf3 = clf3.fit(X, y)
eclf = eclf.fit(X, y)

# Plotting decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
# meshgrid builds the grid-point coordinate matrices: each element of xx together with the
# element at the same position in yy gives the full coordinates of one grid point.

f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
# sharex='col' / sharey='row': subplots in the same column/row share the x / y axis

for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [clf1, clf2, clf3, eclf],
                        ['Decision Tree (depth=4)', 'KNN (k=7)', 'Kernel SVM', 'Soft Voting']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # ravel(), flatten() and squeeze() all turn a multi-dimensional array into 1-D:
    #   ravel()   returns a view when possible (no copy of the data),
    #   flatten() always returns a copy,
    #   squeeze() only removes axes of length 1;
    #   reshape(-1) also flattens an array.
    # np.r_ stacks arrays vertically (same number of columns), similar to pandas concat();
    # np.c_ stacks arrays side by side as columns (same number of rows).
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    # contour() draws contour lines only, while contourf() fills the regions between them
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
Finally, a voting ensemble of different types of regressors:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

# load some example data
# (note: load_boston was removed in scikit-learn 1.2; with newer versions use another
#  regression dataset such as fetch_california_housing)
boston = datasets.load_boston()
X, y = boston.data, boston.target

# train the regressors
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression()
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
reg1.fit(X, y)
reg2.fit(X, y)
reg3.fit(X, y)
ereg.fit(X, y)

xt = X[:20]   # first 20 samples for plotting
plt.figure()
plt.plot(reg1.predict(xt), 'gd', label='GradientBoostingRegressor')   # 'gd' = green diamond markers
plt.plot(reg2.predict(xt), 'b^', label='RandomForestRegressor')       # 'b^' = blue triangle markers
plt.plot(reg3.predict(xt), 'ys', label='LinearRegression')            # 'ys' = yellow square markers
plt.plot(ereg.predict(xt), 'r*', label='VotingRegressor')             # 'r*' = red star markers
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)   # hide x ticks
plt.ylabel('predicted')
plt.xlabel('training samples')
plt.legend(loc='best')
plt.title('Comparison of individual predictions with averaged')
plt.show()
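To put a number on the improvement rather than just eyeballing the plot, one option (a sketch reusing reg1, reg2, reg3 and ereg from the code above) is to compare cross-validated R² scores of the individual regressors and the voting ensemble:

from sklearn.model_selection import cross_val_score

for name, reg in [('GradientBoosting', reg1), ('RandomForest', reg2),
                  ('LinearRegression', reg3), ('VotingRegressor', ereg)]:
    scores = cross_val_score(reg, X, y, cv=5, scoring='r2')
    print('%s: mean R^2 = %.3f (std %.3f)' % (name, scores.mean(), scores.std()))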
In summary, the examples above show that voting-based ensembles can help improve the final accuracy over the individual base models.