Alibaba Cloud Tianchi Financial Risk Control - Task 3


Feature Engineering

Learning Objectives

  1. Learn the methods for feature interaction, feature encoding, and feature selection
  2. Learn feature processing methods such as feature preprocessing, missing-value handling, outlier handling, and data binning

Handling object-type columns
First convert the dates to datetime format, then split them to construct year, month, and day features.

import datetime
# Convert to datetime format. issueDateDT is the number of days between each record's
# issueDate and the earliest date in the dataset (2007-06-01).
train_data['issueDate'] = pd.to_datetime(train_data['issueDate'], format='%Y-%m-%d')
test_data['issueDate'] = pd.to_datetime(test_data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
train_data['issueDateDT'] = train_data['issueDate'].apply(lambda x: x - startdate).dt.days
test_data['issueDateDT'] = test_data['issueDate'].apply(lambda x: x - startdate).dt.days
train_data['issueDate_year'] = train_data['issueDate'].dt.year
test_data['issueDate_year'] = test_data['issueDate'].dt.year
train_data['issueDate_month'] = train_data['issueDate'].dt.month
test_data['issueDate_month'] = test_data['issueDate'].dt.month

# train_data['issueDate_weekday'] = train_data['issueDate'].dt.weekday
# test_data['issueDate_weekday'] = test_data['issueDate'].dt.weekday

Convert employmentLength from object to int

def employmentLength_to_int(s):
    # Keep missing values as NaN; otherwise take the leading number, e.g. '5 years' -> 5
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])

for data in [train_data, test_data]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
    
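A quick check of the conversion (a minimal sketch of my own, assuming train_data has been processed as above): '10+ years' should now map to 10, '< 1 year' to 0, and missing values stay as NaN.

print(train_data['employmentLength'].value_counts(dropna=False).sort_index())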

1 Feature Encoding

1.1 Target Encoding

#Target Encoding
from sklearn.model_selection import StratifiedKFold
def kfold_risk_feature(train, test, feats, k, seed):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)  # best kept consistent with the K-fold CV used for the model later

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in feats:
        nums_columns = ['isDefault']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # fillna
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fillna
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test
# consider dropping postCode and regionCode
target_encode_cols = ['grade', 'subGrade', 'employmentLength', 'earliesCreditLine', 'issueDate',
                      'postCode',
                      'employmentTitle',
                      'purpose', 'title',
                      # 'postCode_regionCode',
                      'homeOwnership',
                      'regionCode'
                     ]
train, test = kfold_risk_feature(train_data, test_data, target_encode_cols, 5, 32)
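As a quick sanity check (my own sketch, assuming the call above has run), the new out-of-fold columns follow the feat + '_isDefault_kfold_mean' naming pattern and should contain no missing values after the global-mean fill:

te_cols = [f'{c}_isDefault_kfold_mean' for c in target_encode_cols]
print(train[te_cols].isnull().sum())  # expected to be all zeros
print(train[te_cols].describe().T[['mean', 'min', 'max']])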

1.2 Label Encoding

# earliesCreditLine looks like 'Aug-2001': extract the year and map the month abbreviation to a number
def trans_month(x):
    month_dict = {'Aug': 8, 'May': 5, 'Jul': 7, 'Oct': 10, 'Dec': 12, 'Apr': 4, 'Jan': 1, 'Nov': 11, 'Feb': 2,
                  'Mar': 3, 'Jun': 6, 'Sep': 9}
    tmp = str(x.split('-')[0])
    return month_dict[tmp]

train['earliesCreditLine_year'] = train.earliesCreditLine.apply(lambda x: int(str(x).split('-')[1]))
test['earliesCreditLine_year'] = test.earliesCreditLine.apply(lambda x: int(str(x).split('-')[1]))
train['earliesCreditLine_month'] = train.earliesCreditLine.apply(trans_month)
test['earliesCreditLine_month'] = test.earliesCreditLine.apply(trans_month)

grade_list = {'E': 5, 'D': 4, 'A': 1, 'C': 3, 'B': 2, 'F': 6, 'G': 7}
train['grade_encode'] = train.grade.map(grade_list)
test['grade_encode'] = test.grade.map(grade_list)

1.3 Frequency Encoding

f_en_list = ['employmentTitle', 'title']
for col in f_en_list:
    temp1 = train[col].value_counts().to_dict()
    train['{}_freq_encode'.format(col)] = train[col].map(temp1)
    test['{}_freq_encode'.format(col)] = test[col].map(temp1)
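One caveat (a hedged note of my own): categories that appear only in the test set are missing from the train-based counts, so map leaves them as NaN; a simple option is to fill those with 0.

for col in f_en_list:
    test['{}_freq_encode'.format(col)] = test['{}_freq_encode'.format(col)].fillna(0)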

2 Business Feature Construction

# Business cross features
# Gap in years between opening the earliest credit line and the loan issue date
train['earliesCreditLine_issueDate_diff'] = train.issueDate_year - train.earliesCreditLine_year
test['earliesCreditLine_issueDate_diff'] = test.issueDate_year - test.earliesCreditLine_year
# Ratio of annual income to loan amount -> proxy for repayment capacity; the higher the value, the stronger the capacity
train['dti_cal'] = train.annualIncome/train.loanAmnt
test['dti_cal'] = test.annualIncome/test.loanAmnt
# Ratio of open credit lines to total credit lines
train['openAcc_totalAcc_rate'] = train.openAcc / train.totalAcc
test['openAcc_totalAcc_rate'] = test.openAcc / test.totalAcc
# train['null_num'] = train.isnull().sum(axis=1)
# test['null_num'] = test.isnull().sum(axis=1)

# A rough proxy for accumulated wealth
train['money_total'] = train.annualIncome * train.employmentLength
test['money_total'] = test.annualIncome * test.employmentLength
# Year in which the loan ends
train['end_year'] = train.issueDate_year + train.term
test['end_year'] = test.issueDate_year + test.term
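If totalAcc can be zero for some borrowers (an assumption to verify, not a fact from the original), the openAcc/totalAcc division above produces inf values; a hedged cleanup sketch:

# numpy is assumed to be imported as np, as in the rest of this post
for df in [train, test]:
    df['openAcc_totalAcc_rate'] = df['openAcc_totalAcc_rate'].replace([np.inf, -np.inf], np.nan)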

3 User Profiling: WOE Encoding

# Binning of continuous variables + WOE encoding
from sklearn.tree import DecisionTreeClassifier
def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    '''
        Use a decision tree to obtain the list of optimal bin boundary values
    '''
    boundary = []  # list of bin boundaries to return
    
    x = x.fillna(nan).values  # fill missing values
    y = y.values
    
    clf = DecisionTreeClassifier(criterion='entropy',    # split by minimizing information entropy
                                 max_leaf_nodes=6,       # maximum number of leaf nodes
                                 min_samples_leaf=0.05)  # minimum fraction of samples per leaf

    clf.fit(x.reshape(-1, 1), y)  # fit the decision tree
    
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    threshold = clf.tree_.threshold
    
    for i in range(n_nodes):
        if children_left[i] != children_right[i]:  # internal node -> collect its split threshold
            boundary.append(threshold[i])

    boundary.sort()

    min_x = x.min()
    max_x = x.max() + 0.1  # +0.1 so that the later binning includes samples at the feature's maximum
    boundary = [min_x] + boundary + [max_x]

    return boundary

def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    '''
        Compute WOE and IV for each bin of a variable and return them as a DataFrame
    '''
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)        # optimal bin boundaries
    df = pd.concat([x, y], axis=1)                        # combine x and y into one DataFrame for later steps
    df.columns = ['x', 'y']                               # rename feature and target columns
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # bin interval for each x value
    
    grouped = df.groupby('bins')['y']                     # count good, bad, and total customers per bin
    result_df = grouped.agg([('good',  lambda y: (y == 0).sum()), 
                             ('bad',   lambda y: (y == 1).sum()),
                             ('total', 'count')])

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()       # share of good customers
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()          # share of bad customers
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()    # share of all customers

    result_df['bad_rate'] = result_df['bad'] / result_df['total']             # bad rate
    
    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])              # WOE
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']  # IV
    
    print(f"IV of this variable = {result_df['iv'].sum()}")
    
    return result_df

WOE encoding is widely used in risk control. For continuous features, bin them first and then apply WOE encoding; for discrete features, you can encode them directly and then check the IV value. Having tested the IV values, only interestRate among the features here has a reasonably good IV score.
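For the discrete case just mentioned, here is a minimal sketch of my own (not from the original; it reuses train and the isDefault target) that computes WOE/IV per raw category instead of per bin:

def category_woe_iv(x: pd.Series, y: pd.Series) -> pd.DataFrame:
    # Group by the raw category value instead of a binned interval
    df = pd.DataFrame({'x': x, 'y': y})
    g = df.groupby('x')['y'].agg(good=lambda s: (s == 0).sum(),
                                 bad=lambda s: (s == 1).sum())
    g['good_pct'] = g['good'] / g['good'].sum()
    g['bad_pct'] = g['bad'] / g['bad'].sum()
    # Categories with no good or no bad samples yield infinite WOE
    g['woe'] = np.log(g['good_pct'] / g['bad_pct'])
    g['iv'] = (g['good_pct'] - g['bad_pct']) * g['woe']
    print(f"IV = {g['iv'].sum()}")
    return g

# Example: IV of the grade feature
category_woe_iv(train['grade'], train['isDefault'])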

# Bin interestRate with the tree-derived boundaries; bins are labelled 1..n
cut = ['interestRate']
for col in cut:
    boundary = optimal_binning_boundary(x=train[col], y=train['isDefault'])
    labels = list(range(1, len(boundary)))
    name = col + 'cut'
    train[name] = pd.cut(x=train[col], bins=boundary, labels=labels, right=False)
    test[name] = pd.cut(x=test[col], bins=boundary, labels=labels, right=False)
    


# Map each bin to its WOE value: the first reset_index turns the bin-interval index into
# a column, the second adds a 0-based position that is shifted by 1 to match the pd.cut labels above
a = feature_woe_iv(x=train['interestRate'], y=train['isDefault']).reset_index().reset_index().rename(
    columns={'index':'interestRatecut'})[['interestRatecut','woe']]
a['interestRatecut'] = a['interestRatecut'] + 1
train = pd.merge(a, train, on='interestRatecut')
test = pd.merge(a, test, on='interestRatecut')
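One caveat (a hedged note of my own): pd.merge defaults to an inner join, so any test row whose interestRate falls outside the train-derived boundaries gets a NaN bin and is silently dropped. An alternative to the two merge lines above that keeps every row:

# train = pd.merge(train, a, on='interestRatecut', how='left')
# test = pd.merge(test, a, on='interestRatecut', how='left')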

4 Third-Round Derived Features (Aggregate Statistics)

a1 = train.groupby('term')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'term_dti'})
a2 = test.groupby('term')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'term_dti'})
agg_train = pd.merge(a1, train, on='term')
agg_test = pd.merge(a2, test, on='term')

b1 = train.groupby('issueDateDT')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'issueDateDT_dti'})
b2 = test.groupby('issueDateDT')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'issueDateDT_dti'})
agg_train = pd.merge(b1, agg_train, on='issueDateDT')
agg_test = pd.merge(b2, agg_test, on='issueDateDT')

c1 = train.groupby('postCode_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'postCode_dti'})
c2 = test.groupby('postCode_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'postCode_dti'})
agg_train = pd.merge(c1, agg_train, on='postCode_isDefault_kfold_mean')
agg_test = pd.merge(c2, agg_test, on='postCode_isDefault_kfold_mean')

d1 = train.groupby('employmentTitle_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'employmentTitle_dti'})
d2 = test.groupby('employmentTitle_isDefault_kfold_mean')['dti_cal'].mean().reset_index().rename(columns={'dti_cal': 'employmentTitle_dti'})
agg_train = pd.merge(d1, agg_train, on='employmentTitle_isDefault_kfold_mean')
agg_test = pd.merge(d2, agg_test, on='employmentTitle_isDefault_kfold_mean')
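The four blocks above repeat the same pattern; a more compact loop version (a sketch of my own, assuming the same column names) could look like this:

group_keys = {
    'term': 'term_dti',
    'issueDateDT': 'issueDateDT_dti',
    'postCode_isDefault_kfold_mean': 'postCode_dti',
    'employmentTitle_isDefault_kfold_mean': 'employmentTitle_dti',
}
agg_train, agg_test = train.copy(), test.copy()
for key, new_col in group_keys.items():
    # Per-key mean of dti_cal, computed separately on train and test as in the original
    tr_stat = train.groupby(key)['dti_cal'].mean().rename(new_col).reset_index()
    te_stat = test.groupby(key)['dti_cal'].mean().rename(new_col).reset_index()
    agg_train = pd.merge(agg_train, tr_stat, on=key, how='left')
    agg_test = pd.merge(agg_test, te_stat, on=key, how='left')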

That's all for the feature engineering part!
