sklearn学习笔记二——随机森林

随机森林

from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split  # splits data into train / test subsets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Load the wine dataset and hold out 30% of it as a test set.
wine = load_wine()
X_train, X_test, Y_train, Y_test = train_test_split(
    wine.data, wine.target, test_size=0.3
)

# Build a single decision tree.  Fixing random_state pins the tree's
# internal randomness, so the fitted model is reproducible across runs.
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    random_state=0,
    splitter='random',
    max_depth=3,
    # min_samples_leaf=10,
    # min_impurity_decrease=3,
)
clf = clf.fit(X_train, Y_train)
score1 = clf.score(X_train, Y_train)    # training accuracy
score_clf = clf.score(X_test, Y_test)   # test accuracy

# Build a random forest with default settings for comparison.
rfc = RandomForestClassifier()
rfc = rfc.fit(X_train, Y_train)
score_rfc = rfc.score(X_test, Y_test)

print('score_clf:{}'.format(score_clf),
      'score_rfc:{}'.format(score_rfc))

# Compare fresh instances of both models under 20-fold cross-validation
# on the full dataset, and plot the per-fold scores side by side.
rfc = RandomForestClassifier()
clf = tree.DecisionTreeClassifier()
rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=20)
clf_s = cross_val_score(clf, wine.data, wine.target, cv=20)
plt.plot(range(1, 21), rfc_s, label='RandomForestClassifier')
plt.plot(range(1, 21), clf_s, label='DecisionTreeClassifier')
plt.legend()
plt.show()

sklearn学习笔记二——随机森林

import numpy as np

# Repeat the 20-fold cross-validation 20 times, each time with freshly
# initialised models, and record the mean score of every repetition so
# the run-to-run variability of both models can be plotted.
rfc_s1 = []
clf_s1 = []
for _ in range(20):
    forest = RandomForestClassifier()
    single_tree = tree.DecisionTreeClassifier()
    forest_scores = cross_val_score(forest, wine.data, wine.target, cv=20)
    rfc_s1.append(np.mean(forest_scores))
    tree_scores = cross_val_score(single_tree, wine.data, wine.target, cv=20)
    clf_s1.append(np.mean(tree_scores))
plt.plot(range(1, 21), rfc_s1, label='RandomForestClassifier')
plt.plot(range(1, 21), clf_s1, label='DecisionTreeClassifier')
plt.legend()
plt.show()

sklearn学习笔记二——随机森林

# Sweep n_estimators from 1 to 50 and record each forest's test accuracy
# to see how the score evolves with the number of trees.
test = []
for n_trees in range(1, 51):
    rfc = RandomForestClassifier(n_estimators=n_trees)
    rfc = rfc.fit(X_train, Y_train)
    test.append(rfc.score(X_test, Y_test))
plt.plot(range(1, 51), test, label='n_estimators')
plt.legend()
plt.show()

sklearn学习笔记二——随机森林

# Attribute / prediction interfaces of the fitted random forest.
# NOTE: the original bare expressions (e.g. `rfc.feature_importances_`)
# only display output inside a notebook; as plain script statements they
# are no-ops, so print the values explicitly.
score_rfc = rfc.score(X_test, Y_test)
print('score:', score_rfc)
print('feature_importances_:', rfc.feature_importances_)
print('apply:', rfc.apply(X_test))              # leaf indices reached by each sample
print('predict:', rfc.predict(X_test))          # predicted class labels
print('predict_proba:', rfc.predict_proba(X_test))  # per-class probabilities

调参实例

# -*- coding: utf-8 -*-
"""
Created on Mon Aug  5 16:46:32 2019

@author: ZUOQIN
"""

from sklearn.datasets   import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the breast-cancer dataset.
data = load_breast_cancer()
data.data.shape

# Baseline: a default random forest under 20-fold cross-validation.
rfc = RandomForestClassifier()
rfc_s = cross_val_score(rfc, data.data, data.target, cv=20)
np.mean(rfc_s)

# Coarse sweep over n_estimators = 1, 11, 21, ..., 191 with a fixed
# random_state so every candidate is evaluated under the same randomness.
score1 = []
for step in range(0, 200, 10):
    candidate = RandomForestClassifier(n_estimators=step + 1,
                                       random_state=90)
    fold_scores = cross_val_score(candidate, data.data, data.target, cv=20)
    score1.append(np.mean(fold_scores))
# Report the best mean score and the n_estimators that produced it.
print (max(score1),score1.index(max(score1))*10+1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201, 10), score1)
plt.show()

# Fine sweep around the coarse optimum: n_estimators = 71..90.
score1 = []
for offset in range(70, 90):
    candidate = RandomForestClassifier(n_estimators=offset + 1,
                                       random_state=90)
    fold_scores = cross_val_score(candidate, data.data, data.target, cv=20)
    score1.append(np.mean(fold_scores))
# Report the best mean score and the corresponding n_estimators value.
print (max(score1),score1.index(max(score1))+71)
plt.figure(figsize=[20, 5])
plt.plot(range(71, 91), score1)
plt.show()

# Grid-search max_depth with n_estimators fixed at its optimum.
# max_depth can only make the model simpler, while max_features can move
# complexity in either direction.
param_grid = {'max_depth': range(1, 20, 1)}
rfc = RandomForestClassifier(n_estimators=81,
                             random_state=90)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# NOTE: bare attribute access only displays in a notebook; print the
# results so the script produces visible output.
print(GS.best_params_)
print(GS.best_score_)

# Grid-search max_features with the best max_depth now fixed.
param_grid = {'max_features': range(1, 30, 1)}
rfc = RandomForestClassifier(n_estimators=81,
                             random_state=90,
                             max_depth=9)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
print(GS.best_params_)
print(GS.best_score_)
# The score no longer improves, so basic tuning ends here.

可调参数

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        max_depth=9, max_features='auto', max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=81, n_jobs=1,
        oob_score=False, random_state=90, verbose=0, warm_start=False)

上一篇:gateway调用后台系统时选择RFC的决定逻辑


下一篇:WebSocket协议RFC中文翻译版[有原文RFC 6455英文版链接]