电商推荐系统架构

电商推荐系统架构

## 数据工具包
import numpy as np
import pandas as pd
from tqdm import tqdm

## 字符串处理工具包
import string
import re
import gensim
from collections import Counter
import pickle
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence 

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
from functools import partial

import os 
import gc
from scipy.sparse import vstack  
import time
import datetime

import joblib

import multiprocessing as mp
import seaborn as sns 
%matplotlib inline
# Load the raw user-action log (one row per user x item x action event).
action_data = pd.read_csv('./data.csv')
action_data.head()
user_log_acct item_sku_id action_time action_type brand_code shop_id item_third_cate_cd vender_id shop_score age sex user_level province city county
0 937922 357022 2020-02-04 08:28:15 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0
1 937922 73 2020-02-04 08:27:07 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0
2 937922 29583 2020-02-04 08:26:31 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0
3 937922 108763 2020-02-04 08:26:10 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0
4 1369473 331139 2020-02-03 21:55:49 1 9985.0 6367.0 73.0 3666.0 0.000000 5.0 1.0 5 1.0 41.0 2058.0
# Overall size of the log (~37.2M rows x 15 columns per the recorded output).
action_data.shape
(37214269, 15)

数据分析

# Column dtypes and memory footprint overview.
action_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37214269 entries, 0 to 37214268
Data columns (total 15 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_log_acct       int64  
 1   item_sku_id         int64  
 2   action_time         object 
 3   action_type         int64  
 4   brand_code          float64
 5   shop_id             float64
 6   item_third_cate_cd  float64
 7   vender_id           float64
 8   shop_score          float64
 9   age                 float64
 10  sex                 float64
 11  user_level          int64  
 12  province            float64
 13  city                float64
 14  county              float64
dtypes: float64(10), int64(4), object(1)
memory usage: 4.2+ GB
# Row counts per month (first 7 chars of the timestamp, 'YYYY-MM').
action_data['action_time'].apply(lambda x:x[:7]).value_counts()
2020-02    15109221
2020-03    15004111
2020-04     7100937
Name: action_time, dtype: int64
# Distribution of action types; action_type == 2 is used as the purchase
# label later in this file.
action_data['action_type'].value_counts()
1    33151074
2     2193489
4      826761
5      600979
3      441966
Name: action_type, dtype: int64
# Gender distribution; -1.0 looks like a missing-value sentinel — unverified.
action_data['sex'].value_counts()
 0.0    24293534
 1.0    12824478
-1.0       79207
Name: sex, dtype: int64
# User-level distribution.
action_data['user_level'].value_counts()
7    14440505
5     9062005
1     8763570
6     4910432
4       24432
3       13022
2         303
Name: user_level, dtype: int64

数据预处理

# Timestamp length check: most are 19 chars, some 21 (malformed tail).
action_data['action_time'].apply(lambda x: len(str(x))).value_counts()
19    34578820
21     2635449
Name: action_time, dtype: int64
# A subset of timestamps is 21 chars long (trailing garbage); keep the
# standard 19-char 'YYYY-MM-DD HH:MM:SS' prefix before parsing.
# (The original also built and immediately deleted an unused 'dd_len'
# helper column; that dead code is removed here.)
action_data['action_time'] = action_data['action_time'].apply(lambda x: str(x)[:19])
action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')
# Calendar features used below to slice label / candidate / feature windows.
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
action_data['month_day'].value_counts()
204    890713
206    861885
205    799023
201    795076
203    783515
        ...  
217    300989
216    252465
215    218483
328     65396
327     52573
Name: month_day, Length: 74, dtype: int64

训练集切分

def _label_trans(x, dic_):
    try:
        return dic_[x]
    except:
        return 0
def get_label(df, label_st = (4,11), label_en = (4,15),candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10)):
    """Split the action log into candidates, purchase labels and a feature frame.

    All window arguments are (month, day) tuples resolved against df's
    month/day/month_day columns. df must also contain user_log_acct,
    item_sku_id, item_third_cate_cd, shop_id, action_type and day.

    Returns
    -------
    (df_candidates, data_fea):
        df_candidates — one row per (user, item) seen in the candidate window,
            with label_cnt / label_days (orders in the label window) and the
            user_item / user_cate / user_cate_shop_id join keys.
        data_fea — all actions up to fea_en, used to build features.
    """
    # Resolve (month, day) boundaries to month_day codes (month*100 + day).
    lb_st = df.loc[(df['month'] == label_st[0]) & (df['day'] == label_st[1]), 'month_day'].values[0]
    lb_en = df.loc[(df['month'] == label_en[0]) & (df['day'] == label_en[1]), 'month_day'].values[0]

    cand_st = df.loc[(df['month'] == candidate_st[0]) & (df['day'] == candidate_st[1]), 'month_day'].values[0]
    cand_en = df.loc[(df['month'] == candidate_en[0]) & (df['day'] == candidate_en[1]), 'month_day'].values[0]

    fea_position = df.loc[(df['month'] == fea_en[0]) & (df['day'] == fea_en[1]), 'month_day'].values[0]
    ind_label = (df['month_day'] >= lb_st) & (df['month_day'] <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (df['month_day'] >= cand_st) & (df['month_day'] <= cand_en)
    ind_fea = (df['month_day'] <= fea_position)
    data_label = df.loc[ind_label].copy()
    data_fea = df.loc[ind_fea].copy()  # feature-construction window
    data_candidates = df.loc[ind_candidate].copy()

    # Candidate set: one row per (user, item) seen in the candidate window.
    # BUG FIX: item_third_cate_cd and shop_id must be carried along, because
    # the user_cate / user_cate_shop_id keys below are built from them — the
    # original selected only user/item columns and raised KeyError.
    df_candidates = data_candidates[['user_log_acct', 'item_sku_id', 'item_third_cate_cd', 'shop_id']].copy()
    df_candidates = df_candidates.drop_duplicates(subset = ['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[(df_candidates.item_sku_id.isnull() == False)]

    # Orders inside the label window.
    label = data_label[['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')

    # Default labels; overwritten below for pairs that were actually ordered.
    df_candidates['label_cnt'] = 0
    df_candidates['label_days'] = 0

    # Join keys for the user/item, user/cate and user/cate/shop feature tables.
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    df_candidates['user_cate'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str)
    df_candidates['user_cate_shop_id'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str) + '_' + df_candidates['shop_id'].astype(str)

    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    # dict.get inlines the _label_trans helper: missing pairs -> 0.
    df_candidates['label_cnt'] = df_candidates['user_item'].apply(lambda x: dic_cnt.get(x, 0)).values
    df_candidates['label_days'] = df_candidates['user_item'].apply(lambda x: dic_days.get(x, 0)).values

    # Drop the helper columns so the returned frame keeps the original schema
    # (they would otherwise collide with the jd_item merge downstream).
    df_candidates = df_candidates.drop(columns=['item_third_cate_cd', 'shop_id'])

    return df_candidates, data_fea
%%time
# Validation split: label window 4/11-4/15, candidates 4/6-4/10, features up to 4/10.
df_valid_label,data_valid_fea = get_label(action_data, label_st = (4,11), label_en = (4,15), candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10))
get label
CPU times: user 5.43 s, sys: 871 ms, total: 6.3 s
Wall time: 6.3 s
%%time
# Training split: label window 4/6-4/10, candidates 4/1-4/5, features up to 4/5.
df_train_label1,data_train_fea1 = get_label(action_data, label_st = (4,6), label_en = (4,10), candidate_st = (4,1), candidate_en = (4,5), fea_en = (4,5))
get label
CPU times: user 4.69 s, sys: 640 ms, total: 5.33 s
Wall time: 5.32 s

特征构建

原始特征

## Raw (static) profile tables: first occurrence per user / per item.
jd_user = action_data[['user_log_acct','age','sex','user_level','province','city','county']].drop_duplicates(['user_log_acct'], keep='first')
jd_item = action_data[['item_sku_id','brand_code','shop_id','item_third_cate_cd','vender_id','shop_score']].drop_duplicates(['item_sku_id'], keep='first')

user特征

def gen_action_freq_feats(df, start_date):
    """Per-user action-frequency features over trailing windows ending at start_date.

    For each window w in {1,3,5,7,15,30} days the output holds one count
    column per action type (user_log_acct_last{w}_days_action_<t>) plus three
    ratio columns action_{1,3,4}_rt = action_2 / (1 + action_{1,3,4});
    action_type 2 is the purchase action used as the label in this file.

    df must contain user_log_acct, action_type, action_time (datetime).
    Returns one row per distinct user; users with no actions in a window get
    NaN for that window's columns.
    """
    try:
        # tqdm is only a progress bar; degrade gracefully when unavailable.
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())

    for w in tqdm([1, 3, 5, 7, 15, 30]):
        bef_start_date = start_date - datetime.timedelta(days=w)
        prefix = '_'.join(key) + '_last{}_days_action'.format(w)

        action_cl = action[action['action_time'] >= bef_start_date].copy()
        dummies = pd.get_dummies(action_cl['action_type'], prefix=prefix)
        # BUG FIX: in a short window some action types may be absent, which
        # made the ratio lines below raise KeyError; guarantee types 1-4 exist.
        for t in [1, 2, 3, 4]:
            col = '{}_{}'.format(prefix, t)
            if col not in dummies.columns:
                dummies[col] = 0
        # BUG FIX: keep only key + indicator columns before summing — the
        # original carried the datetime action_time column into the groupby
        # sum, which raises on recent pandas versions.
        window_counts = pd.concat([action_cl[key], dummies], axis=1)
        window_counts = window_counts.groupby(key, as_index=False).sum()

        base = window_counts['{}_2'.format(prefix)]
        for t in [1, 3, 4]:
            window_counts['{}_{}_rt'.format(prefix, t)] = base / (1 + window_counts['{}_{}'.format(prefix, t)])

        feats = feats.merge(window_counts, on=key, how='left')
    return feats

# User action-frequency features, anchored at each split's feature-window end.
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1   = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 6/6 [00:04<00:00,  1.21it/s]
100%|██████████| 6/6 [00:05<00:00,  1.08it/s]
# Sanity-check the generated user features.
u_fea_train1.head()
user_log_acct user_log_acct_last1_days_action_1 user_log_acct_last1_days_action_2 user_log_acct_last1_days_action_3 user_log_acct_last1_days_action_4 user_log_acct_last1_days_action_1_rt user_log_acct_last1_days_action_3_rt user_log_acct_last1_days_action_4_rt user_log_acct_last3_days_action_1 user_log_acct_last3_days_action_2 ... user_log_acct_last15_days_action_1_rt user_log_acct_last15_days_action_3_rt user_log_acct_last15_days_action_4_rt user_log_acct_last30_days_action_1 user_log_acct_last30_days_action_2 user_log_acct_last30_days_action_3 user_log_acct_last30_days_action_4 user_log_acct_last30_days_action_1_rt user_log_acct_last30_days_action_3_rt user_log_acct_last30_days_action_4_rt
0 270769 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.000000 0.000000 0.000000 22.0 1.0 0.0 0.0 0.043478 1.000000 1.000000
1 492809 11.0 0.0 0.0 0.0 0.0 0.0 0.0 17.0 0.0 ... 0.000000 0.000000 0.000000 30.0 0.0 1.0 0.0 0.000000 0.000000 0.000000
2 438196 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.000000 0.000000 0.000000 4.0 0.0 0.0 0.0 0.000000 0.000000 0.000000
3 49368 NaN NaN NaN NaN NaN NaN NaN 7.0 0.0 ... 0.015873 0.166667 1.000000 275.0 1.0 23.0 0.0 0.003623 0.041667 1.000000
4 715642 36.0 0.0 0.0 0.0 0.0 0.0 0.0 54.0 0.0 ... 0.003125 0.500000 0.333333 834.0 2.0 1.0 6.0 0.002395 1.000000 0.285714

5 rows × 43 columns

item_sku_id 特征

def gen_item_feats(df_item):
    """Static item-side features derived from the item profile table.

    Adds frequency encodings for category / vendor, the category-level mean
    shop_score, and per-shop / per-category nunique counts of items and
    brands, then drops the raw item_third_cate_cd column.
    """
    feats = df_item.copy()

    # Frequency encodings for categorical id columns.
    for cat_col in ('item_third_cate_cd', 'vender_id'):
        freq = df_item[cat_col].value_counts()
        feats['{}_cnt'.format(cat_col)] = feats[cat_col].map(freq).values

    # Mean shop_score within each third-level category.
    for num_col in ('shop_score',):
        cate_mean = df_item.groupby('item_third_cate_cd')[num_col].mean()
        feats['cate_{}_mean'.format(num_col)] = feats['item_third_cate_cd'].map(cate_mean).values

    # How many distinct items / brands each shop carries.
    for id_col in ('item_sku_id', 'brand_code'):
        per_shop = df_item.groupby('shop_id')[id_col].nunique()
        feats['shop_id_{}_nunique'.format(id_col)] = feats['shop_id'].map(per_shop).values

    # How many distinct items / brands each category contains.
    for id_col in ('item_sku_id', 'brand_code'):
        per_cate = df_item.groupby('item_third_cate_cd')[id_col].nunique()
        feats['item_third_cate_cd_{}_nunique'.format(id_col)] = feats['item_third_cate_cd'].map(per_cate).values

    feats = feats.drop(columns=['item_third_cate_cd'])

    return feats

# Static item features. NOTE(review): item_feats is never merged into
# df_train / df_val below (only the raw jd_item is) — possibly an oversight.
item_feats = gen_item_feats(jd_item)

user + cate 特征

def get_uc_feats(df, start_date):  
    """Build user x category behaviour features keyed by 'user_cate'.

    df: action log with user_log_acct, item_third_cate_cd, action_time,
        action_type, day. NOTE(review): mutated in place — the user_cate,
        action_time_diff and action_time_to_now columns are added to the
        caller's frame. Consecutive-action gaps assume df is sorted by
        action_time (done once after loading) — confirm for other callers.
    start_date: datetime anchor; windows extend backwards from it.

    Returns one row per user_cate with recency / frequency / rhythm features.
    """
    df['user_cate']         = df['user_log_acct'].astype(str)+'_' + df['item_third_cate_cd'].astype(str) 
    df_fea                  = df[['user_cate']].copy()
    df_fea                  = df_fea.drop_duplicates(subset = ['user_cate'])  
    # 1. Macro features: regardless of action type, shift each user_cate's
    #    action_time series one step to get the previous-action timestamp.
    df['action_time_diff']  = df.groupby('user_cate')['action_time'].shift().values   
    df['action_time_diff']    = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds keeps only the within-day remainder of the
    # timedelta (full days are dropped), so gaps longer than 24h wrap around.
    # Probably .dt.total_seconds() was intended — confirm before changing,
    # since it alters every *_timediff_* feature value.
    df['action_time_diff']    = df['action_time_diff'].dt.seconds // 60  
    df['action_time_to_now']  = start_date - df['action_time']
    df['action_time_to_now']  = df['action_time_to_now'].dt.seconds // 60  
    # Minutes from the most recent action in this category to start_date
    # (same .dt.seconds caveat as above).
    dic_ = df.groupby('user_cate')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_action_time_to_now_last']  = df_fea['user_cate'].map(dic_).values 
    # Trailing windows anchored at start_date: stats over the last N days.
    #for days in tqdm([1,3,7,14,21,30]): 
    for days in tqdm([1,3,7,14]): 
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))  # &(df['action_time'] <= st_time  
        # Window-level counts and consecutive-action gap statistics #
        df_tmp = df[tmp_ind].copy()   
        dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict() 
        df_fea['user_cate_{}_day_cnt'.format(days)] = df_fea['user_cate'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
        df_fea['user_cate_{}_day_nunique_pct'.format(days)] = df_fea['user_cate'].map(dic_).values  * 1.0 / days 
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_{}_timediff_mean'.format(days)] = df_fea['user_cate'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
        df_fea['user_cate_{}_timediff_std'.format(days)] = df_fea['user_cate'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
        df_fea['user_cate_{}_timediff_median'.format(days)] = df_fea['user_cate'].map(dic_).values 
        
        # Same window restricted to a single action type (2 = order, per the
        # label definition; meaning of 1/3/4 is not documented here).
        for type_ in [1,2,3,4]:
            
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type 
            df_tmp = df[ind].copy()  
            
            dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict() 
            df_fea['type_{}_user_cate_{}_day_cnt'.format(type_,days)] = df_fea['user_cate'].map(dic_).values  
            if days > 1 and type_ == 2: 
                # Statistics of the gap between consecutive orders (type 2).
                df_tmp['action_time_diff']  = df_tmp.groupby('user_cate')['action_time'].shift().values 
                df_tmp['action_time_diff']  = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff']  = df_tmp['action_time_diff'].dt.seconds // 60  
                dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_cate'].map(dic_).values  * 1.0 / days
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_mean'.format(type_,days)] = df_fea['user_cate'].map(dic_).values 
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_std'.format(type_,days)] = df_fea['user_cate'].map(dic_).values 
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_median'.format(type_,days)] = df_fea['user_cate'].map(dic_).values 
                
    return df_fea  

# User x category features for each split, anchored at the feature-window end.
uc_fea_train = get_uc_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
uc_fea_val = get_uc_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 4/4 [01:42<00:00, 25.70s/it]
100%|██████████| 4/4 [01:40<00:00, 25.25s/it]

user + cate + shop 特征

def get_ucs_feats(df, start_date):
    """Build user x category x shop features keyed by 'user_cate_shop_id'.

    Same structure as get_uc_feats but grouped at the finer
    user/category/shop granularity. NOTE(review): mutates the caller's df
    in place (adds user_cate_shop_id, action_time_diff, action_time_to_now).
    """
    df['user_cate_shop_id'] = df['user_log_acct'].astype(str)+'_' + df['item_third_cate_cd'].astype(str)+ '_' + df['shop_id'].astype(str) 
    df_fea                  = df[['user_cate_shop_id']].copy()
    df_fea                  = df_fea.drop_duplicates(subset = ['user_cate_shop_id'])    
    # 1. Macro features: gap between consecutive actions within each group #
    df['action_time_diff']  = df.groupby('user_cate_shop_id')['action_time'].shift().values
    df['action_time_diff']    = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds drops the days component of the timedelta
    # (gaps > 24h wrap); .dt.total_seconds() was likely intended — confirm.
    df['action_time_diff']    = df['action_time_diff'].dt.seconds // 60     
    df['action_time_to_now']  = start_date - df['action_time']
    df['action_time_to_now']  = df['action_time_to_now'].dt.seconds // 60     
  
    # Minutes from the most recent action in this group to start_date #
    dic_ = df.groupby('user_cate_shop_id')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_shop_id_action_time_to_now_last']  = df_fea['user_cate_shop_id'].map(dic_).values      
    # Trailing windows anchored at start_date #
    # for days in tqdm([0,1,3,7,14,21]): 
    for days in tqdm([0,3,7]): 
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))  # &(df['action_time'] <= st_time 
        # Window-level counts and gap statistics #
        df_tmp = df[tmp_ind].copy()   
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict() 
        df_fea['user_cate_shop_id_{}_day_cnt'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
        # NOTE(review): for the days == 0 window this divides by zero,
        # yielding inf/NaN (warnings are suppressed globally) — verify intent.
        df_fea['user_cate_shop_id_{}_day_nunique_pct'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values  * 1.0 / days       
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_mean'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_std'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values 
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_median'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values 
        
        # Same window restricted to one action type (2 = order).
        for type_ in [1,2,3,4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type 
            df_tmp = df[ind].copy()  
            
            dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict() 
            df_fea['type_{}_user_cate_shop_id_{}_day_cnt'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values              
            if days > 1 and type_ == 2: 
                # Statistics of the gap between consecutive orders (type 2) #
                df_tmp['action_time_diff']  = df_tmp.groupby('user_cate_shop_id')['action_time'].shift().values 
                df_tmp['action_time_diff']  = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff']  = df_tmp['action_time_diff'].dt.seconds // 60 
                dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values  * 1.0 / days 
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_mean'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values 
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_std'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values 
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict() 
                df_fea['type_{}_user_cate_shop_id_{}_timediff_median'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values               
                
    return df_fea 

# User x category x shop features for each split.
ucs_fea_train = get_ucs_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ucs_fea_val = get_ucs_feats(data_valid_fea, datetime.datetime(2020, 4, 10))

user + item 特征

def get_ui_feats(df, start_date): 
    """Build user x item features keyed by 'user_item'.

    Same structure as get_uc_feats but at user/item granularity.
    NOTE(review): mutates the caller's df in place (adds user_item,
    action_time_diff, action_time_to_now columns).
    """
    df['user_item']         = df['user_log_acct'].astype(str)+'_' + df['item_sku_id'].astype(str) 
    df_fea                  = df[['user_item']].copy()
    df_fea                  = df_fea.drop_duplicates(subset = ['user_item'])  
    # 1. Macro features: gap between consecutive actions on the same item
    df['action_time_diff']  = df.groupby('user_item')['action_time'].shift().values
       
    df['action_time_diff']    = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds drops the days component of the timedelta
    # (gaps > 24h wrap); .dt.total_seconds() was likely intended — confirm.
    df['action_time_diff']    = df['action_time_diff'].dt.seconds // 60  
    df['action_time_to_now']  = start_date - df['action_time']
    df['action_time_to_now']  = df['action_time_to_now'].dt.seconds // 60  
    # Minutes from the most recent action on this item to start_date
    dic_ = df.groupby('user_item')['action_time_to_now'].min().to_dict()
    df_fea['user_item_action_time_to_now_last']  = df_fea['user_item'].map(dic_).values 
    # Trailing windows anchored at start_date
    #for days in tqdm([1,3,7,14,21]): 
    for days in tqdm([1,3,7]): 
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))  # &(df['action_time'] <= st_time  
        # Window-level counts and gap statistics #
        df_tmp = df[tmp_ind].copy()   
        dic_ = df_tmp.groupby('user_item')['day'].count().to_dict() 
        df_fea['user_item_{}_day_cnt'.format(days)] = df_fea['user_item'].map(dic_).values 
        dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
        df_fea['user_item_{}_day_nunique_pct'.format(days)] = df_fea['user_item'].map(dic_).values  * 1.0 / days 
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
        df_fea['user_item_{}_timediff_mean'.format(days)] = df_fea['user_item'].map(dic_).values 
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
        df_fea['user_item_{}_timediff_std'.format(days)] = df_fea['user_item'].map(dic_).values 
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
        df_fea['user_item_{}_timediff_median'.format(days)] = df_fea['user_item'].map(dic_).values 
        
        # Same window restricted to one action type (2 = order).
        for type_ in [1,2,3,4]:
            
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type 
            df_tmp = df[ind].copy()  
            
            dic_ = df_tmp.groupby('user_item')['day'].count().to_dict() 
            df_fea['type_{}_user_item_{}_day_cnt'.format(type_,days)] = df_fea['user_item'].map(dic_).values  
            if days > 1 and type_ == 2: 
                # Statistics of the gap between consecutive orders (type 2).
                df_tmp['action_time_diff']  = df_tmp.groupby('user_item')['action_time'].shift().values 
                df_tmp['action_time_diff']  = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff']  = df_tmp['action_time_diff'].dt.seconds // 60  
                dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
                df_fea['type_{}_user_item_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_item'].map(dic_).values  * 1.0 / days
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_item_{}_timediff_mean'.format(type_,days)] = df_fea['user_item'].map(dic_).values 
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_item_{}_timediff_std'.format(type_,days)] = df_fea['user_item'].map(dic_).values 
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_item_{}_timediff_median'.format(type_,days)] = df_fea['user_item'].map(dic_).values 
                
    return df_fea  

# User x item features for each split.
ui_fea_train = get_ui_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ui_fea_val = get_ui_feats(data_valid_fea, datetime.datetime(2020, 4, 10))

合并特征集

# Feature-column lists per feature family (join keys excluded).
u_fea_cols1    = [col for col in u_fea_train1.columns if col not in ['user_log_acct']]
u_fea_cols2    = [col for col in jd_user.columns if col not in ['user_log_acct']]
i_fea_cols     = [col for col in jd_item.columns if col not in ['item_sku_id']]
ui_fea_cols    = [col for col in ui_fea_train.columns if col not in ['user_item']]

# user + cate features
uc_fea_cols    = [col for col in uc_fea_train.columns if col not in ['user_cate']]
# user + cate + shop features
ucs_fea_cols   = [col for col in ucs_fea_train.columns if col not in ['user_cate_shop_id']]
# BUG FIX: the original appended uc_fea_cols and ucs_fea_cols twice, which
# selects duplicate columns from the DataFrame when indexing df[train_cols].
train_cols     = ['user_log_acct','item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols + uc_fea_cols + ucs_fea_cols + ui_fea_cols

训练集&验证集

# Training set: attach user / item / interaction features to the candidates.
df_train = df_train_label1.merge(u_fea_train1, on='user_log_acct', how='left')
df_train = df_train.merge(jd_user, on='user_log_acct', how='left')
df_train = df_train.merge(jd_item, on='item_sku_id', how='left')
df_train = df_train.merge(ui_fea_train, on='user_item', how='left')
df_train = df_train.merge(uc_fea_train, on='user_cate', how='left')
df_train = df_train.merge(ucs_fea_train, on='user_cate_shop_id', how='left')

# Binary target: the pair was ordered at least once in the label window.
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)

# Validation set: same joins against the validation-window feature tables.
df_val = df_valid_label.merge(u_fea_val1, on='user_log_acct', how='left')
df_val = df_val.merge(jd_user, on='user_log_acct', how='left')
df_val = df_val.merge(jd_item, on='item_sku_id', how='left')
df_val = df_val.merge(ui_fea_val, on='user_item', how='left')
# BUG FIX: the original merged the uc/ucs validation features onto df_train,
# silently replacing df_val with training rows; merge onto df_val instead.
df_val = df_val.merge(uc_fea_val, on='user_cate', how='left')
df_val = df_val.merge(ucs_fea_val, on='user_cate_shop_id', how='left')

df_val['label'] = (df_val['label_cnt'] > 0).astype(int)

模型训练

# Train a binary LightGBM classifier on the candidate pairs; early stopping
# is driven by AUC on the validation fold.
eval_set = [(df_train[train_cols], df_train['label']), (df_val[train_cols], df_val['label'])]

lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
                               max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
                               colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
                               learning_rate=0.01, random_state=2021, metric="None", n_jobs=20)

lgb_model.fit(df_train[train_cols], df_train['label'], eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)

# Score validation candidates and assemble the submission frame.
df_submit = pd.DataFrame()
pred_proba = lgb_model.predict_proba(df_val[train_cols])[:, 1]
df_submit['user_log_acct'] = df_val['user_log_acct'].values
df_submit['item_sku_id'] = df_val['item_sku_id'].values
df_submit['prob'] = pred_proba
df_submit = df_submit.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
# BUG FIX: the original tested item_sku_id.isnull() twice; the second clause
# was presumably meant to guard user_log_acct as well.
df_submit = df_submit.loc[df_submit['item_sku_id'].notnull() & df_submit['user_log_acct'].notnull()]

# Keep pairs above the probability threshold as the final recommendations.
df_submit_ = df_submit.loc[df_submit.prob >= 0.06].copy()
df_submit_['item_sku_id'] = df_submit_['item_sku_id'].astype(int)
df_submit_.to_csv('df_submit_all.csv', index=False)
上一篇:attention - 0 - Deep Networks with Internal Selective Attention through Feedback Connections - 1 - 论


下一篇:midas fea基本操作