相信许多人对调库充满了恐惧,各种各样的库让人看得眼花缭乱。本次代码分享以波士顿房价数据为例建立预测模型:先用网格搜索确定每种算法的最优参数,再用最优参数训练最优模型并打印其评估指标,从而帮助大家对比记忆相关库的用法。
# 1.回归算法
# -数据:boston房价
from sklearn.datasets import load_boston#导入波士顿房价数据集
from sklearn.pipeline import Pipeline#导入管道机制
from sklearn.preprocessing import PolynomialFeatures,StandardScaler#导入多项式特征,特征缩放
from sklearn.decomposition import PCA#导入PCA降维
from sklearn.linear_model import Ridge,Lasso#导入L1,L2正则
from sklearn.neighbors import KNeighborsRegressor#导入KNN
from sklearn.tree import DecisionTreeRegressor#导入回归树
from sklearn.ensemble import RandomForestRegressor#导入随机森林
from sklearn.model_selection import GridSearchCV,train_test_split#导入网格搜索,数据切分
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error#导入评估指标
# Load the Boston housing data and hold part of it out for testing.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script needs an older scikit-learn release to run as written.
data=load_boston()
x,y=data.data,data.target
# train_size=0.7 keeps 70% of the samples for training. No random_state is
# passed, so the split (and every score printed below) changes between runs.
trainx,testx,trainy,testy=train_test_split(x,y,train_size=0.7)
# Feature pipeline, applied in this order:
#   1. PolynomialFeatures(degree=3) - polynomial expansion of the raw features
#   2. StandardScaler               - standardise the expanded features
#   3. PCA(n_components=2)          - project down to two components
poly=PolynomialFeatures(degree=3)
std=StandardScaler()
pca=PCA(n_components=2)
pip=Pipeline([('poly',poly),('std',std),('pca',pca)])
# Fit the pipeline on the training data only.
trainx=pip.fit_transform(trainx)
# Re-use the already fitted pipeline for the test data. Calling fit_transform
# here would re-estimate the scaler statistics and PCA axes from the test set,
# so the test features would live in a different space than the one the
# models are trained on.
testx=pip.transform(testx)
# -再利用Ridge/Lasso/LinearRegression/KNN/决策树/随机森林等算法,建立网格搜索模型,得到最好模型
# -输出最好模型的测试集的R2值,rmse, mse等评估指标
# Ridge (L2-regularised) regression: grid-search alpha, refit a fresh model
# with the best value, then report its metrics on the test set.
param_l2=dict(alpha=[0.01,0.1,1,10,100])
l2=Ridge()
grid_search_l2=GridSearchCV(l2,param_grid=param_l2)
grid_search_l2.fit(trainx,trainy)
l2_best=grid_search_l2.best_params_
print('岭回归最优参数',l2_best)
# The grid only contains alpha, so unpacking best_params_ sets exactly that.
l2best=Ridge(**l2_best)
l2best.fit(trainx,trainy)
testh_l2=l2best.predict(testx)
# Compute the MSE once; the RMSE is its square root.
mse_l2=mean_squared_error(testy,testh_l2)
print('岭回归r方',r2_score(testy,testh_l2))
print('岭回归均方误差',mse_l2)
print('岭回归均方误差根',mse_l2**0.5)
print('岭回归均绝对值误差',mean_absolute_error(testy,testh_l2))
# Lasso (L1-regularised) regression: grid-search alpha, refit a fresh model
# with the best value, then report its metrics on the test set.
param_l1=dict(alpha=[0.01,0.1,1,10,100])
l1=Lasso()
grid_search_l1=GridSearchCV(l1,param_grid=param_l1)
grid_search_l1.fit(trainx,trainy)
l1_best=grid_search_l1.best_params_
print('套索回归最优参数',l1_best)
# The grid only contains alpha, so unpacking best_params_ sets exactly that.
l1best=Lasso(**l1_best)
l1best.fit(trainx,trainy)
testh_l1=l1best.predict(testx)
# Compute the MSE once; the RMSE is its square root.
mse_l1=mean_squared_error(testy,testh_l1)
print('套索回归r方',r2_score(testy,testh_l1))
print('套索回归均方误差',mse_l1)
print('套索回归均方误差根',mse_l1**0.5)
print('套索回归均绝对值误差',mean_absolute_error(testy,testh_l1))
# K-nearest-neighbours regression: grid-search n_neighbors, refit a fresh
# model with the best value, then report its metrics on the test set.
param_knn=dict(n_neighbors=[3,4,5,6,7,8])
knn=KNeighborsRegressor()
grid_search_knn=GridSearchCV(knn,param_grid=param_knn)
grid_search_knn.fit(trainx,trainy)
knn_best=grid_search_knn.best_params_
print('KNN最优参数',knn_best)
# The grid only contains n_neighbors, so unpacking best_params_ sets exactly that.
knnBest=KNeighborsRegressor(**knn_best)
knnBest.fit(trainx,trainy)
testh_knn=knnBest.predict(testx)
# Compute the MSE once; the RMSE is its square root.
mse_knn=mean_squared_error(testy,testh_knn)
print('KNNr方',r2_score(testy,testh_knn))
print('KNN均方误差',mse_knn)
print('KNN均方误差根',mse_knn**0.5)
print('KNN均绝对值误差',mean_absolute_error(testy,testh_knn))
# Decision-tree regression: grid-search max_depth, refit a fresh tree with
# the best value, then report its metrics on the test set.
param_dtr=dict(max_depth=[3,4,5,6,7])
dtr=DecisionTreeRegressor()
grid_search_dtr=GridSearchCV(dtr,param_grid=param_dtr)
grid_search_dtr.fit(trainx,trainy)
dtr_best=grid_search_dtr.best_params_
print('回归树最优参数',dtr_best)
# The grid only contains max_depth, so unpacking best_params_ sets exactly that.
dtrBest=DecisionTreeRegressor(**dtr_best)
dtrBest.fit(trainx,trainy)
testh_dtr=dtrBest.predict(testx)
# Compute the MSE once; the RMSE is its square root.
mse_dtr=mean_squared_error(testy,testh_dtr)
print('回归树r方',r2_score(testy,testh_dtr))
print('回归树均方误差',mse_dtr)
print('回归树均方误差根',mse_dtr**0.5)
print('回归树均绝对值误差',mean_absolute_error(testy,testh_dtr))
# Random-forest regression: grid-search n_estimators, refit a fresh forest
# with the best value, then report its metrics on the test set.
param_forest={'n_estimators':[5,10,50,100]}
forest=RandomForestRegressor()
grid_search_forest=GridSearchCV(forest,param_grid=param_forest)
grid_search_forest.fit(trainx,trainy)
forest_best=grid_search_forest.best_params_
print('随机森林最优参数',forest_best)
forestBest=RandomForestRegressor(n_estimators=forest_best['n_estimators'])
forestBest.fit(trainx,trainy)
testh_forest=forestBest.predict(testx)
# Score the forest's own predictions. The metrics were previously computed
# from the decision tree's predictions (testh_dtr), so the forest results
# simply repeated the tree's numbers.
print('随机森林r方',r2_score(testy,testh_forest))
print('随机森林均方误差',mean_squared_error(testy,testh_forest))
print('随机森林均方误差根',mean_squared_error(testy,testh_forest)**0.5)
print('随机森林均绝对值误差',mean_absolute_error(testy,testh_forest))
其最终运行效果如下:
岭回归最优参数 {'alpha': 100}
岭回归r方 0.2815626620061432
岭回归均方误差 60.2892053502206
岭回归均方误差根 7.764612376044319
岭回归均绝对值误差 5.515834261907906
套索回归最优参数 {'alpha': 0.1}
套索回归r方 0.28164723100519473
套索回归均方误差 60.28210856184332
套索回归均方误差根 7.764155366930992
套索回归均绝对值误差 5.5153861879821
KNN最优参数 {'n_neighbors': 8}
KNNr方 0.17824876248539345
KNN均方误差 68.95901212993421
KNN均方误差根 8.304156316564267
KNN均绝对值误差 5.608141447368421
回归树最优参数 {'max_depth': 3}
回归树r方 0.15175727360697133
回归树均方误差 71.18210206215309
回归树均方误差根 8.436948622704366
回归树均绝对值误差 5.820938902402186
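随机森林最优参数 {'n_estimators': 100}
随机森林r方 0.15175727360697133
随机森林均方误差 71.18210206215309
随机森林均方误差根 8.436948622704366
随机森林均绝对值误差 5.820938902402186
(注:上面随机森林的四项指标与回归树的指标逐位相同,是因为这次运行在计算随机森林指标时使用的是回归树的预测值 testh_dtr,而不是随机森林的预测值 testh_forest,因此这些数字并不代表随机森林的真实表现。)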