Feature Engineering
Learning objectives
- Learn methods for feature interaction, encoding, and selection
- Learn feature-processing methods such as preprocessing, missing-value and outlier handling, and data binning
Handling object-type columns
First convert the date column to datetime format, then split it apart to build year/month/day features.
import datetime
import numpy as np
import pandas as pd

# Convert to datetime. issueDateDT is the number of days between each record's
# issueDate and the earliest date in the dataset (2007-06-01).
train_data['issueDate'] = pd.to_datetime(train_data['issueDate'], format='%Y-%m-%d')
test_data['issueDate'] = pd.to_datetime(test_data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
train_data['issueDateDT'] = (train_data['issueDate'] - startdate).dt.days
test_data['issueDateDT'] = (test_data['issueDate'] - startdate).dt.days
train_data['issueDate_year'] = train_data['issueDate'].dt.year
test_data['issueDate_year'] = test_data['issueDate'].dt.year
train_data['issueDate_month'] = train_data['issueDate'].dt.month
test_data['issueDate_month'] = test_data['issueDate'].dt.month
# Optional day-of-week feature:
# train_data['issueDate_weekday'] = train_data['issueDate'].dt.weekday
# test_data['issueDate_weekday'] = test_data['issueDate'].dt.weekday
Convert employmentLength from object to int
def employmentLength_to_int(s):
    # Keep NaN as-is; otherwise take the leading number, e.g. '3 years' -> 3
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])

for data in [train_data, test_data]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
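A minimal sanity check on a toy Series (the values are hypothetical, just to illustrate the mapping):
sample = pd.Series(['10+ years', '< 1 year', '3 years', np.nan])
sample = sample.replace({'10+ years': '10 years', '< 1 year': '0 years'})
print(sample.apply(employmentLength_to_int).tolist())  # [10, 0, 3, nan]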
1 Feature Encoding
1.1 Target Encoding
# Target encoding: out-of-fold mean of the target for each category level
from sklearn.model_selection import StratifiedKFold

def kfold_risk_feature(train, test, feats, k, seed):
    # Ideally use the same K-fold setup as the downstream model's cross-validation
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
        train.loc[val_idx, 'fold'] = fold_
    kfold_features = []
    for feat in feats:
        nums_columns = ['isDefault']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
                # Compute category means on the training folds only,
                # then map them onto the held-out fold
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # Categories unseen in the training folds fall back to the global mean
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            # For the test set, use category means computed on the full training set
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # Unseen categories fall back to the global mean
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test
# postCode and regionCode could arguably be dropped instead
target_encode_cols = ['grade', 'subGrade', 'employmentLength', 'earliesCreditLine', 'issueDate',
                      'postCode',
                      'employmentTitle',
                      'purpose', 'title',
                      # 'postCode_regionCode',
                      'homeOwnership',
                      'regionCode'
                      ]
train, test = kfold_risk_feature(train_data, test_data, target_encode_cols, 5, 32)
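A quick look at the columns this run produces (a minimal check, assuming the call above completed; the column names follow the feat + '_isDefault_kfold_mean' pattern from the function):
encoded_cols = [c for c in train.columns if c.endswith('_isDefault_kfold_mean')]
print(train[encoded_cols].describe().T[['mean', 'min', 'max']])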
1.2 Label Encoding
def trans_month(x):
    # earliesCreditLine looks like 'Aug-2001'; map the month abbreviation to its number
    month_dict = {'Aug': 8, 'May': 5, 'Jul': 7, 'Oct': 10, 'Dec': 12, 'Apr': 4, 'Jan': 1, 'Nov': 11, 'Feb': 2,
                  'Mar': 3, 'Jun': 6, 'Sep': 9}
    tmp = str(x.split('-')[0])
    return month_dict[tmp]

train['earliesCreditLine_year'] = train.earliesCreditLine.apply(lambda x: int(str(x).split('-')[1]))
test['earliesCreditLine_year'] = test.earliesCreditLine.apply(lambda x: int(str(x).split('-')[1]))
train['earliesCreditLine_month'] = train.earliesCreditLine.apply(trans_month)
test['earliesCreditLine_month'] = test.earliesCreditLine.apply(trans_month)

# grade is ordinal, so map it directly to integers
grade_list = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
train['grade_encode'] = train.grade.map(grade_list)
test['grade_encode'] = test.grade.map(grade_list)
1.3 Frequency Encoding
f_en_list = ['employmentTitle', 'title']
for col in f_en_list:
    # Map each category to how often it appears in the training set
    temp1 = train[col].value_counts().to_dict()
    train['{}_freq_encode'.format(col)] = train[col].map(temp1)
    test['{}_freq_encode'.format(col)] = test[col].map(temp1)
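One thing to watch: categories that appear only in the test set have no training-set count and therefore map to NaN. A conservative fallback (assumption: unseen effectively means rare) is to fill them with 0:
for col in f_en_list:
    # Treat categories unseen in train as having zero observed frequency
    test['{}_freq_encode'.format(col)] = test['{}_freq_encode'.format(col)].fillna(0)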
2 Business feature construction
# Business cross features
# Gap (in years) between opening the earliest credit line and loan issuance
train['earliesCreditLine_issueDate_diff'] = train.issueDate_year - train.earliesCreditLine_year
test['earliesCreditLine_issueDate_diff'] = test.issueDate_year - test.earliesCreditLine_year
# Ratio of annual income to loan amount: a proxy for repayment capacity;
# the larger the value, the stronger the capacity
train['dti_cal'] = train.annualIncome/train.loanAmnt
test['dti_cal'] = test.annualIncome/test.loanAmnt
# Share of open (unsettled) credit lines among all credit lines
train['openAcc_totalAcc_rate'] = train.openAcc / train.totalAcc
test['openAcc_totalAcc_rate'] = test.openAcc / test.totalAcc
# Optional: number of missing values per row
# train['null_num'] = train.isnull().sum(axis=1)
# test['null_num'] = test.isnull().sum(axis=1)
# A rough proxy for accumulated earnings
train['money_total'] = train.annualIncome * train.employmentLength
test['money_total'] = test.annualIncome * test.employmentLength
# Year in which the loan term ends
train['end_year'] = train.issueDate_year + train.term
test['end_year'] = test.issueDate_year + test.term
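To get a rough feel for whether these hand-crafted features carry signal, one can eyeball their linear correlation with the target. This is only a crude check; tree models can exploit non-linear relationships these numbers miss:
for col in ['dti_cal', 'openAcc_totalAcc_rate', 'earliesCreditLine_issueDate_diff', 'money_total', 'end_year']:
    # Pearson correlation with the default label (NaNs are dropped pairwise)
    print(col, round(train[[col, 'isDefault']].corr().iloc[0, 1], 4))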
3 User profiling: WOE encoding
# Bin continuous variables with a decision tree, then compute WOE per bin
from sklearn.tree import DecisionTreeClassifier

def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    '''
    Use a decision tree to obtain a near-optimal list of bin boundaries.
    '''
    boundary = []  # list of bin boundaries to return
    x = x.fillna(nan).values  # fill missing values
    y = y.values
    clf = DecisionTreeClassifier(criterion='entropy',   # split by minimizing entropy
                                 max_leaf_nodes=6,      # at most 6 leaves (bins)
                                 min_samples_leaf=0.05) # each leaf holds >= 5% of samples
    clf.fit(x.reshape(-1, 1), y)  # fit the tree
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    threshold = clf.tree_.threshold
    for i in range(n_nodes):
        if children_left[i] != children_right[i]:  # internal node: record its split threshold
            boundary.append(threshold[i])
    boundary.sort()
    min_x = x.min()
    max_x = x.max() + 0.1  # +0.1 so the later binning includes the sample with the maximum value
    boundary = [min_x] + boundary + [max_x]
    return boundary

def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    '''
    Compute per-bin WOE and IV for a variable; returns a DataFrame.
    '''
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)  # optimal bin boundaries
    df = pd.concat([x, y], axis=1)  # combine x and y into one DataFrame for the group-by below
    df.columns = ['x', 'y']  # rename feature and target columns
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # assign each x value to its bin
    grouped = df.groupby('bins')['y']  # count good, bad, and total customers per bin
    result_df = grouped.agg([('good', lambda y: (y == 0).sum()),
                             ('bad', lambda y: (y == 1).sum()),
                             ('total', 'count')])
    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()    # share of good customers
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()       # share of bad customers
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum() # share of all customers
    result_df['bad_rate'] = result_df['bad'] / result_df['total']          # bad rate within the bin
    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])             # WOE
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe'] # IV
    print(f"IV of this variable = {result_df['iv'].sum()}")
    return result_df
WOE encoding is widely used in credit risk modeling. Continuous features can be binned first and then WOE-encoded; discrete features can be encoded directly and then screened by their IV values. Having tested the IV values, only interestRate achieves a reasonably good IV score among these features.
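For reference, the quantities that feature_woe_iv computes for each bin $i$ are:

$$\mathrm{WOE}_i = \ln\frac{\text{good\_pct}_i}{\text{bad\_pct}_i}, \qquad \mathrm{IV} = \sum_i \left(\text{good\_pct}_i - \text{bad\_pct}_i\right) \times \mathrm{WOE}_i$$

A larger IV means the binned variable separates good and bad customers more strongly.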
cut = ['interestRate']
for col in cut:
    boundary = optimal_binning_boundary(x=train[col], y=train['isDefault'])
    labels = list(range(1, len(boundary)))
    name = col + 'cut'
    # Use numeric bin labels (cast out of Categorical) so the WOE table below can be joined on them
    train[name] = pd.cut(x=train[col], bins=boundary, labels=labels, right=False).astype(float)
    test[name] = pd.cut(x=test[col], bins=boundary, labels=labels, right=False).astype(float)

# Build a (bin id, woe) lookup table: the first reset_index exposes the bin
# interval, the second turns the 0-based row position into a bin id
a = feature_woe_iv(x=train['interestRate'], y=train['isDefault']).reset_index().reset_index().rename(
    columns={'index': 'interestRatecut'})[['interestRatecut', 'woe']]
a['interestRatecut'] = a['interestRatecut'] + 1  # align with the 1-based labels from pd.cut
train = pd.merge(a, train, on='interestRatecut')
test = pd.merge(a, test, on='interestRatecut')
4 Third-round derived features (group statistics)
# Mean income-to-loan ratio per term
a1 = train.groupby('term')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'term_dti'})
a2 = test.groupby('term')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'term_dti'})
agg_train = pd.merge(a1, train, on='term')
agg_test = pd.merge(a2, test, on='term')

# Mean income-to-loan ratio per issue date
b1 = train.groupby('issueDateDT')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'issueDateDT_dti'})
b2 = test.groupby('issueDateDT')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'issueDateDT_dti'})
agg_train = pd.merge(b1, agg_train, on='issueDateDT')
agg_test = pd.merge(b2, agg_test, on='issueDateDT')

# Mean income-to-loan ratio per target-encoded postCode
c1 = train.groupby('postCode_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'postCode_dti'})
c2 = test.groupby('postCode_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'postCode_dti'})
agg_train = pd.merge(c1, agg_train, on='postCode_isDefault_kfold_mean')
agg_test = pd.merge(c2, agg_test, on='postCode_isDefault_kfold_mean')

# Mean income-to-loan ratio per target-encoded employmentTitle
d1 = train.groupby('employmentTitle_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'employmentTitle_dti'})
d2 = test.groupby('employmentTitle_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'employmentTitle_dti'})
agg_train = pd.merge(d1, agg_train, on='employmentTitle_isDefault_kfold_mean')
agg_test = pd.merge(d2, agg_test, on='employmentTitle_isDefault_kfold_mean')
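Every step in this section (and the WOE merge above) uses pd.merge's default inner join, so mismatched keys would silently drop rows. A final check of the row counts against the original train/test is cheap insurance before modeling:
print(agg_train.shape, agg_test.shape)  # compare with the original train/test row counts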
That wraps up the feature engineering!