import os
import tarfile
import urllib.request
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from scipy import stats
# Remote location of the dataset archive and the local directory it is
# downloaded to and unpacked in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    The archive is saved as ``housing.tgz`` inside ``housing_path`` (the
    directory is created if missing) and then extracted in place.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives the archive and its contents.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters without extraction filters.
            housing_tgz.extractall(path=housing_path)
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` with pandas and return it."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# Column positions used when indexing the feature array, in order:
# "total_rooms", "total_bedrooms", "population", "households"
rooms_ix = 3
bedroom_ix = 4
population_ix = 5
households_ix = 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Custom transformer that appends ratio columns to a feature array."""

    def __init__(self, add_bedroom_per_room=True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X[:, rooms_ix]`` selects every row of column ``rooms_ix``
        (columns are counted from 0).

        Always appended: rooms per household and population per household.
        Appended only when ``add_bedroom_per_room`` is true: bedrooms per room.

        ``np.r_`` stacks arrays on top of each other (column counts must
        match); ``np.c_`` places arrays side by side (row counts must match),
        which is how the new columns are attached here. Example::

            a = np.array([[1, 2, 3],
                          [7, 8, 9]])
            b = np.array([[4, 5, 6],
                          [1, 2, 3]])
            np.c_[a, b]
            # array([[1, 2, 3, 4, 5, 6],
            #        [7, 8, 9, 1, 2, 3]])
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        rooms_ratio = rooms / households
        population_ratio = X[:, population_ix] / households
        if not self.add_bedroom_per_room:
            return np.c_[X, rooms_ratio, population_ratio]
        bedrooms_ratio = X[:, bedroom_ix] / rooms
        return np.c_[X, rooms_ratio, population_ratio, bedrooms_ratio]
def display_scores(scores):
    """Print the scores, then their mean, then their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    print("分 数:", scores)
    average = scores.mean()
    print("平均值:", average)
    spread = scores.std()
    print("标准差:", spread)
def _fit_and_report(model, features, labels, cv=10):
    """Fit ``model``, print its training RMSE, then print its CV RMSE scores."""
    model.fit(features, labels)
    train_mse = mean_squared_error(labels, model.predict(features))
    print(np.sqrt(train_mse))
    # The scorer returns negated MSE, so flip the sign before the root.
    neg_mse_scores = cross_val_score(model, features, labels,
                                     scoring="neg_mean_squared_error", cv=cv)
    display_scores(np.sqrt(-neg_mse_scores))


def _print_search_results(search):
    """Print the mean CV RMSE and parameter set of every search candidate."""
    results = search.cv_results_
    for mean_score, params in zip(results["mean_test_score"], results["params"]):
        print(np.sqrt(-mean_score), params)


def main():
    """Run the housing workflow: data, preprocessing, models, tuning, test."""
    # Download and load the data.
    fetch_housing_data()
    housing = load_housing_data()

    # Bucket median income into five categories used for stratification.
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0, 1.5, 3, 4.5, 6, np.inf],
                                   labels=[1, 2, 3, 4, 5])

    # Stratified 80% / 20% train / test split on the income category.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(housing, housing["income_cat"]))
    # Drop the helper column again so both sets have the original columns.
    strat_train_set = housing.loc[train_index].drop("income_cat", axis=1)
    strat_test_set = housing.loc[test_index].drop("income_cat", axis=1)

    # Correlation of each numeric attribute with the target. Non-numeric
    # columns are excluded explicitly: corr() raises on them in pandas >= 2.0.
    corr_matrix = housing.select_dtypes(include=[np.number]).corr()
    print(corr_matrix["median_house_value"].sort_values(ascending=False))

    # Split the training set into predictors and labels, and the predictor
    # columns into numeric attributes and the text attribute.
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Numeric pipeline: median imputation -> derived ratio columns -> scaling.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    num_attribs = list(housing_num)  # numeric column names
    cat_attribs = ["ocean_proximity"]

    # Full pipeline: numeric pipeline plus one-hot encoding of the text column.
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)

    # Linear regression, decision tree and random forest: training RMSE
    # followed by 10-fold cross-validation for each.
    _fit_and_report(LinearRegression(), housing_prepared, housing_labels)
    _fit_and_report(DecisionTreeRegressor(), housing_prepared, housing_labels)
    _fit_and_report(RandomForestRegressor(), housing_prepared, housing_labels)

    # Hyper-parameter tuning of the random forest: grid search.
    param_grid = [
        {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
        {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
    ]
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    _print_search_results(grid_search)

    # Hyper-parameter tuning of the random forest: randomized search.
    param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
    rnd_search = RandomizedSearchCV(RandomForestRegressor(),
                                    param_distributions=param_distribs,
                                    n_iter=10, cv=5,
                                    scoring='neg_mean_squared_error')
    rnd_search.fit(housing_prepared, housing_labels)
    _print_search_results(rnd_search)

    # Evaluate the best model from the randomized search on the test set.
    final_model = rnd_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    # transform only: the pipeline must not be re-fitted on test data.
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    print(np.sqrt(final_mse))

    # 95% confidence interval for the generalization RMSE.
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    print(np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                                   loc=squared_errors.mean(),
                                   scale=stats.sem(squared_errors))))


if __name__ == '__main__':
    main()