kaggle intermediate machine learning笔记

一、缺失值的处理方法

1.删除含缺失值的列

# Flag every column of the training data that holds at least one missing
# value, then collect the labels of the flagged columns.
missing_mask = X_train.isnull().any(axis=0)
cols_with_missing = list(missing_mask[missing_mask].index)

# Remove those columns from both splits so they keep the same feature set.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

2.插补缺失值

from sklearn.impute import SimpleImputer

# Fit the imputer (default settings) on the training data only, then apply
# the fill values it learned to the validation data.
my_imputer = SimpleImputer()
train_filled = my_imputer.fit_transform(X_train)
valid_filled = my_imputer.transform(X_valid)

# The imputer output carries no column names, so reattach the original
# labels while wrapping the results in DataFrames.
imputed_X_train = pd.DataFrame(train_filled, columns=X_train.columns)
imputed_X_valid = pd.DataFrame(valid_filled, columns=X_valid.columns)

3.插补的拓展(先用新列标记哪些值缺失,再进行插补)

# Work on copies so the original frames stay untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For each column known to contain missing values, add a boolean companion
# column recording which rows are about to be imputed.
for col in cols_with_missing:
    flag_name = col + '_was_missing'
    X_train_plus[flag_name] = X_train_plus[col].isnull()
    X_valid_plus[flag_name] = X_valid_plus[col].isnull()

# Impute: fit on the training data, reuse the fitted imputer for validation.
my_imputer = SimpleImputer()
train_plus_filled = my_imputer.fit_transform(X_train_plus)
valid_plus_filled = my_imputer.transform(X_valid_plus)

# The imputer output has no column names; restore them on construction.
imputed_X_train_plus = pd.DataFrame(train_plus_filled,
                                    columns=X_train_plus.columns)
imputed_X_valid_plus = pd.DataFrame(valid_plus_filled,
                                    columns=X_valid_plus.columns)

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

二、分类变量的处理方法

# Boolean Series over the columns: True where the dtype is 'object'.
# These columns are treated as the categorical variables.
s = X_train.dtypes == 'object'
# Keep only the labels of the columns flagged True.
object_cols = s.index[s.values].tolist()

1.删除分类变量

# Drop the categorical variables by keeping every column whose dtype is
# not 'object' (select_dtypes accepts a single dtype as well as a list).
drop_X_train = X_train.select_dtypes(exclude='object')
drop_X_valid = X_valid.select_dtypes(exclude='object')

2.编码标签

from sklearn.preprocessing import OrdinalEncoder

# Copy first so the original frames are not modified.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Encode every categorical column as integer-valued codes.
# OrdinalEncoder is scikit-learn's encoder for input features (LabelEncoder
# is documented as intended for the target y) and it handles all columns in
# one call instead of refitting a single encoder per column.
# NOTE: the encoded columns come back as floats, and with default settings
# transform() raises if the validation data contains a category that was
# not seen during fitting.
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

3.独热编码(one-hot):为每个类别创建新列(一般不用于取值超过15种的变量)

from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns.
# The encoder is left at its default (sparse) output and the result is
# densified with .toarray(). The `sparse=False` keyword was renamed to
# `sparse_output` and later removed from scikit-learn, so avoiding the
# keyword keeps this snippet working on both old and new releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]).toarray(),
    index=X_train.index,  # encoding drops the index; restore it
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]).toarray(),
    index=X_valid.index,
)

# Remove the categorical columns; the encoded columns replace them.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Append the numeric one-hot features to the remaining columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The encoded columns are labelled with integers while the original columns
# have string names. Newer scikit-learn versions refuse frames whose column
# names mix types, so make every label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

三、使用管道预处理

1.定义预处理步骤

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing entries using the 'constant' strategy.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent value,
# then one-hot encode (categories unseen during fitting are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

2.定义模型

from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees and a fixed seed so that repeated
# runs give the same result.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

3.创建和评估管道

from sklearn.metrics import mean_absolute_error

# Bundle the preprocessing and modelling steps into a single pipeline.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# The pipeline is used exactly like a plain model:
# fit on the training data (preprocessing is applied automatically) ...
my_pipeline.fit(X_train, y_train)

# ... predict on the validation data ...
preds = my_pipeline.predict(X_valid)

# ... and score the predictions with mean absolute error.
score = mean_absolute_error(y_true=y_valid, y_pred=preds)

四、交叉验证

1.定义一个管道
2.使用scikit-learn中的cross_val_score()函数获得交叉验证分数,用cv参数设置折叠数。

from sklearn.model_selection import cross_val_score

# `scoring` selects the quality metric. This scorer reports the negated
# MAE, so flip the sign to get positive per-fold MAE values (5 folds).
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -neg_mae_per_fold

五、XGBoost

1.导入xgboost,训练模型

from xgboost import XGBRegressor

# Gradient-boosted regressor with default hyperparameters, trained on the
# training split.
my_model = XGBRegressor()
my_model.fit(X=X_train, y=y_train)

2.设置参数

  • n_estimators 指定迭代次数
  • early_stopping_rounds 早停法:如果验证集上的得分连续 n 轮没有提升,则停止迭代
  • learning_rate 学习率:较小的学习率配合较大的 n_estimators(更多棵树)通常能得到更精确的xgboost模型,但训练也花费更多时间,默认0.1
  • n_jobs 并行构建模型时使用的CPU核心数量
    例子:
# Up to 1000 boosting rounds with a small learning rate, built on 4 cores.
# early_stopping_rounds is given to the constructor: passing it to fit() is
# deprecated and has been removed in recent xgboost releases (requires
# xgboost >= 1.6). Training stops once the score on eval_set has not
# improved for 5 consecutive rounds.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

六、数据泄漏

1.目标泄露

2.测试集污染

上一篇:2020了你还不会java8新特性?(八)流源构造代码分析与总结


下一篇:Ubuntu安装中文字体