一、缺失值的处理方法
1.删除含缺失值的列
# Names of the training columns that contain at least one missing value.
cols_with_missing = X_train.columns[X_train.isnull().any().values].tolist()
# Remove those columns from both the training and the validation frame.
reduced_X_train, reduced_X_valid = (
    frame.drop(columns=cols_with_missing) for frame in (X_train, X_valid)
)
2.插补缺失值
from sklearn.impute import SimpleImputer
# Fill missing values with a SimpleImputer using its default strategy.
my_imputer = SimpleImputer()
# Fit on the training data only, then reuse the fitted imputer for validation.
# The imputer returns a bare array, so the column names AND the row index are
# restored at construction time; otherwise the result gets a fresh 0..n-1
# index that no longer lines up with the original frames.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)
3.插补的扩展(插补的同时,用新列标记哪些值原本缺失)
# Work on copies so the original frames are left untouched by the new columns.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column with missing values, add a boolean "<col>_was_missing"
# indicator column recording which rows are about to be imputed.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation: fit on training data, apply the same fitted imputer to validation.
my_imputer = SimpleImputer()
# The imputer returns a bare array; restore both the column names and the row
# index so the result stays aligned with the original frames.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))
二、分类变量的处理方法
# Identify the categorical variables: columns whose dtype is 'object'.
s = X_train.dtypes.eq('object')
object_cols = s.index[s.values].tolist()
1.删除分类变量
# Drop the categorical variables by keeping every column that is not of
# 'object' dtype, applied identically to the training and validation frames.
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)
2.标签编码
from sklearn.preprocessing import LabelEncoder
# Work on copies so the original frames keep their raw categorical values.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Encode each categorical column with its own encoder. Keeping one fitted
# encoder per column (instead of re-fitting a single shared instance) preserves
# every column's category -> integer mapping for later reuse, e.g. to encode
# further data or to call inverse_transform.
label_encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    # Fit on the training column only, then apply the same mapping to validation.
    # NOTE(review): transform() is expected to fail on categories that never
    # appeared in the training column - confirm the validation data has none.
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
    label_encoders[col] = label_encoder
3.独热编码(One-Hot)创建新列(一般不用于不同取值超过15个的变量)
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoding to the categorical columns.
# handle_unknown='ignore' avoids errors on validation categories that were not
# seen during fit. The dense-output flag is deliberately not passed: its name
# changed between scikit-learn releases (sparse -> sparse_output), so the
# result is densified explicitly with .toarray(), which works on all versions.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
# The encoder returns a bare matrix, so the row index is restored at
# construction time to keep the rows aligned for the concat below.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]).toarray(),
    index=X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]).toarray(),
    index=X_valid.index,
)

# Remove the categorical columns; they are replaced by their one-hot encoding.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Append the one-hot encoded columns to the remaining numerical features.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot frames carry integer column labels while the numerical columns
# are named with strings; make all labels strings so estimators that validate
# feature names accept the combined frame.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
三、使用管道预处理
1.定义预处理步骤
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Preprocessing for numerical data: constant-value imputation.
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Bundle the two preprocessing branches, each bound to its own column list.
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)
2.定义模型
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor with 100 trees and a fixed random_state.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)
3.创建和评估管道
from sklearn.metrics import mean_absolute_error
# Bundle the preprocessing and modelling steps into a single pipeline.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)
###
# The pipeline can be used exactly like a plain model.
# Train: preprocess the training data and fit the model.
my_pipeline.fit(X_train, y_train)
# Predict: preprocess the validation data and get predictions.
preds = my_pipeline.predict(X_valid)
# Evaluate the predictions with mean absolute error.
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
###
四、交叉验证
1.定义一个管道
2.使用scikit-learn中的cross_val_score()函数获得交叉验证分数,用cv参数设置折叠数。
from sklearn.model_selection import cross_val_score
# The scoring argument selects the model-quality metric. The metric requested
# here is the *negative* MAE, so the sign is flipped to get per-fold MAE values.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -neg_mae_per_fold
五、XGBoost
1.导入xgboost,训练模型
from xgboost import XGBRegressor
# Build a gradient-boosted regressor with default settings and train it.
my_model = XGBRegressor()
my_model.fit(X=X_train, y=y_train)
2.设置参数
- n_estimators 指定迭代次数
- early_stopping_rounds 早停法,如果验证集得分连续n轮没有提升,则停止迭代
- learning_rate 学习率,较小的学习率配合较大的n_estimators通常能得到更精确的xgboost模型,但训练也花费更多时间,默认0.1
- n_jobs 并行构建模型时使用的CPU核心数量
例子:
# Tuned example: many boosting rounds with a small learning rate, built on 4
# cores, stopping early once the validation score stops improving.
# early_stopping_rounds is configured on the estimator itself: current xgboost
# releases no longer accept it as a fit() argument.
# NOTE(review): very old xgboost releases only accept it in fit() - confirm the
# installed version before applying.
my_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    early_stopping_rounds=5,
)
# eval_set supplies the held-out data that early stopping monitors.
my_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)
六、数据泄漏
1.目标泄露
2.测试集污染