from sklearn.ensemble import RandomForestRegressor  # import the random forest regressor
import pandas as pd

# Load the data; the California housing dataset is used here
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

# Split the data; test_size is the fraction held out for testing
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = \
    train_test_split(housing.data, housing.target, test_size=0.1, random_state=42)

# Build the random forest
rfr = RandomForestRegressor(random_state=42)  # fixing random_state makes every run reproducible
rfr.fit(data_train, target_train)             # fit the model
rfr_predict = rfr.predict(data_test)          # predictions on the test set
rfr.score(data_train, target_train)           # default score (R^2) on the training data

# Tuning the random forest parameters
from sklearn.model_selection import GridSearchCV
tree_param_grid = {'min_samples_split': [3, 6, 9],
                   'n_estimators': [10, 50, 100]}
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
# RandomForestRegressor() is the estimator to tune, param_grid holds the candidate
# parameter values, and cv=5 is the number of cross-validation folds
grid.fit(data_train, target_train)
print(grid.cv_results_, grid.best_params_, grid.best_score_)
# print the cross-validation results for each parameter combination, the best
# combination, and the best score
# here the best result is obtained with min_samples_split=3 and n_estimators=100

# Rebuild the forest with the best parameters and predict again
rfr = RandomForestRegressor(min_samples_split=3, n_estimators=100, random_state=42)
rfr.fit(data_train, target_train)                 # fit the tuned model
new_rfr_predict = rfr.predict(data_test)          # predictions on the test set

# Show the feature importances, sorted from most to least important
pd.Series(rfr.feature_importances_, index=housing.feature_names).sort_values(ascending=False)
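
# A minimal follow-up sketch (not in the original): the code above only scores the
# forest on the training data, so as an assumption-labeled extra step the tuned model
# can also be evaluated on the held-out test set. grid.best_estimator_ is the forest
# that GridSearchCV refit on data_train with the best parameters (refit=True is the
# default); variable names reuse those defined above.
best_rfr = grid.best_estimator_
print(best_rfr.score(data_test, target_test))  # R^2 of the grid-search model on the test set
print(rfr.score(data_test, target_test))       # R^2 of the manually rebuilt forest, for comparison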