E-commerce Recommendation System Architecture
## data toolkit
import numpy as np
import pandas as pd
from tqdm import tqdm
## string-processing toolkit
import string
import re
import gensim
from collections import Counter
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from functools import partial
import os
import gc
from scipy.sparse import vstack
import time
import datetime
import joblib
import multiprocessing as mp
import seaborn as sns
%matplotlib inline
action_data = pd.read_csv('./data.csv')
action_data.head()
   user_log_acct  item_sku_id          action_time  action_type  brand_code  shop_id  item_third_cate_cd  vender_id  shop_score  age  sex  user_level  province   city  county
0         937922       357022  2020-02-04 08:28:15            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
1         937922           73  2020-02-04 08:27:07            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
2         937922        29583  2020-02-04 08:26:31            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
3         937922       108763  2020-02-04 08:26:10            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
4        1369473       331139  2020-02-03 21:55:49            1      9985.0   6367.0                73.0     3666.0    0.000000  5.0  1.0           5       1.0   41.0  2058.0
action_data.shape
(37214269, 15)
Data Analysis
action_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37214269 entries, 0 to 37214268
Data columns (total 15 columns):
# Column Dtype
--- ------ -----
0 user_log_acct int64
1 item_sku_id int64
2 action_time object
3 action_type int64
4 brand_code float64
5 shop_id float64
6 item_third_cate_cd float64
7 vender_id float64
8 shop_score float64
9 age float64
10 sex float64
11 user_level int64
12 province float64
13 city float64
14 county float64
dtypes: float64(10), int64(4), object(1)
memory usage: 4.2+ GB
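The raw frame occupies over 4 GB. An optional memory-saving step (not part of the original pipeline, and assuming the float columns tolerate float32 precision) is to downcast the numeric dtypes in place:

# optional: downcast numerics to roughly halve memory usage
for col in action_data.select_dtypes('float64').columns:
    action_data[col] = pd.to_numeric(action_data[col], downcast='float')
for col in action_data.select_dtypes('int64').columns:
    action_data[col] = pd.to_numeric(action_data[col], downcast='integer')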
action_data['action_time'].apply(lambda x:x[:7]).value_counts()
2020-02 15109221
2020-03 15004111
2020-04 7100937
Name: action_time, dtype: int64
action_data['action_type'].value_counts()
1 33151074
2 2193489
4 826761
5 600979
3 441966
Name: action_type, dtype: int64
action_data['sex'].value_counts()
0.0 24293534
1.0 12824478
-1.0 79207
Name: sex, dtype: int64
action_data['user_level'].value_counts()
7 14440505
5 9062005
1 8763570
6 4910432
4 24432
3 13022
2 303
Name: user_level, dtype: int64
Data Preprocessing
action_data['action_time'].apply(lambda x: len(str(x))).value_counts()
19 34578820
21 2635449
Name: action_time, dtype: int64
# Some timestamps carry two extra trailing characters (length 21 instead of 19);
# truncate everything to the standard 'YYYY-MM-DD HH:MM:SS' form
action_data['action_time'] = action_data['action_time'].apply(lambda x: str(x)[:19])
action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
action_data['month_day'].value_counts()
204 890713
206 861885
205 799023
201 795076
203 783515
...
217 300989
216 252465
215 218483
328 65396
327 52573
Name: month_day, Length: 74, dtype: int64
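Before splitting, it can help to eyeball the daily action volume. A small sketch using the seaborn import above (hypothetical, not in the original notebook; note that month_day is an integer code month*100+day, so the x-axis is ordinal rather than a true date axis):

daily = action_data.groupby('month_day').size().reset_index(name='cnt')
sns.lineplot(x='month_day', y='cnt', data=daily)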
Train/Validation Split

The split uses a sliding-window scheme: features come from all actions up to fea_en, candidates are the deduplicated user-item pairs active between candidate_st and candidate_en, and a candidate is positive if the pair produces an order (action_type == 2) inside the label window [label_st, label_en].
def _label_trans(x, dic_):
    # look up the label for a key; pairs with no purchase in the window get 0
    try:
        return dic_[x]
    except KeyError:
        return 0
def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10)):
    lb_st = df.loc[(df['month'] == label_st[0]) & (df['day'] == label_st[1]), 'month_day'].values[0]
    lb_en = df.loc[(df['month'] == label_en[0]) & (df['day'] == label_en[1]), 'month_day'].values[0]
    cand_st = df.loc[(df['month'] == candidate_st[0]) & (df['day'] == candidate_st[1]), 'month_day'].values[0]
    cand_en = df.loc[(df['month'] == candidate_en[0]) & (df['day'] == candidate_en[1]), 'month_day'].values[0]
    fea_position = df.loc[(df['month'] == fea_en[0]) & (df['day'] == fea_en[1]), 'month_day'].values[0]
    # action_type == 2 is an order, so the label window only counts purchases
    ind_label = (df['month_day'] >= lb_st) & (df['month_day'] <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (df['month_day'] >= cand_st) & (df['month_day'] <= cand_en)
    ind_fea = (df['month_day'] <= fea_position)
    data_label = df.loc[ind_label].copy()
    data_fea = df.loc[ind_fea].copy()  # used to build the feature set
    data_candidates = df.loc[ind_candidate].copy()
    # Build the candidate set; keep item_third_cate_cd and shop_id so the
    # user_cate and user_cate_shop_id join keys below can be derived
    df_candidates = data_candidates[['user_log_acct', 'item_sku_id', 'item_third_cate_cd', 'shop_id']].copy()
    df_candidates = df_candidates.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[df_candidates.item_sku_id.notnull()]
    # Build the labels
    label = data_label[['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')
    # Attach the labels
    df_candidates['label_cnt'] = 0
    df_candidates['label_days'] = 0
    # user + item join key
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    # user + category join key
    df_candidates['user_cate'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str)
    # user + category + shop join key
    df_candidates['user_cate_shop_id'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str) + '_' + df_candidates['shop_id'].astype(str)
    # once the keys exist the helper columns can go; dropping them avoids
    # suffix collisions when item features are merged back in later
    df_candidates = df_candidates.drop(columns=['item_third_cate_cd', 'shop_id'])
    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    df_candidates['label_cnt'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_cnt)).values
    df_candidates['label_days'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_days)).values
    return df_candidates, data_fea
%%time
df_valid_label,data_valid_fea = get_label(action_data, label_st = (4,11), label_en = (4,15), candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10))
get label
CPU times: user 5.43 s, sys: 871 ms, total: 6.3 s
Wall time: 6.3 s
%%time
df_train_label1,data_train_fea1 = get_label(action_data, label_st = (4,6), label_en = (4,10), candidate_st = (4,1), candidate_en = (4,5), fea_en = (4,5))
get label
CPU times: user 4.69 s, sys: 640 ms, total: 5.33 s
Wall time: 5.32 s
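As a sanity check on the two candidate sets, one can verify that positives are rare but present (a hypothetical check, not in the original code):

for name, lab in [('train', df_train_label1), ('valid', df_valid_label)]:
    print(name, lab.shape[0], 'pos rate: {:.4f}'.format((lab['label_cnt'] > 0).mean()))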
Feature Engineering
Raw features
## raw user and item attribute tables
jd_user = action_data[['user_log_acct','age','sex','user_level','province','city','county']].drop_duplicates(['user_log_acct'], keep='first')
jd_item = action_data[['item_sku_id','brand_code','shop_id','item_third_cate_cd','vender_id','shop_score']].drop_duplicates(['item_sku_id'], keep='first')
User features
def gen_action_freq_feats(df, start_date):
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())
    for w in tqdm([1, 3, 5, 7, 15, 30]):
        bef_start_date = start_date - datetime.timedelta(days=w)
        action_cl = action[action['action_time'] >= bef_start_date].copy()
        # one-hot the action types, then sum per user to get per-window counts
        dummies = pd.get_dummies(action_cl['action_type'], prefix='_'.join(key) + '_last{}_days_action'.format(w))
        action_cl = pd.concat([action_cl, dummies], axis=1)
        del action_cl['action_time']  # datetimes cannot be summed; drop before the groupby
        action_cl = action_cl.groupby(key, as_index=False).sum()
        # order count (type 2) relative to each of the other action types
        action_cl['_'.join(key) + '_last{}_days_action_1_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_1'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_3_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_3'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_4_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_4'.format(w)])
        del action_cl['action_type']
        feats = feats.merge(action_cl, on=key, how='left')
    return feats
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1 = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 6/6 [00:04<00:00, 1.21it/s]
100%|██████████| 6/6 [00:05<00:00, 1.08it/s]
u_fea_train1.head()
   user_log_acct  user_log_acct_last1_days_action_1  user_log_acct_last1_days_action_2  user_log_acct_last1_days_action_3  user_log_acct_last1_days_action_4
0         270769                                1.0                                0.0                                0.0                                0.0
1         492809                               11.0                                0.0                                0.0                                0.0
2         438196                                NaN                                NaN                                NaN                                NaN
3          49368                                NaN                                NaN                                NaN                                NaN
4         715642                               36.0                                0.0                                0.0                                0.0

   user_log_acct_last1_days_action_1_rt  user_log_acct_last1_days_action_3_rt  user_log_acct_last1_days_action_4_rt  user_log_acct_last3_days_action_1  user_log_acct_last3_days_action_2  ...
0                                   0.0                                   0.0                                   0.0                                1.0                                0.0  ...
1                                   0.0                                   0.0                                   0.0                               17.0                                0.0  ...
2                                   NaN                                   NaN                                   NaN                                NaN                                NaN  ...
3                                   NaN                                   NaN                                   NaN                                7.0                                0.0  ...
4                                   0.0                                   0.0                                   0.0                               54.0                                0.0  ...

   user_log_acct_last15_days_action_1_rt  user_log_acct_last15_days_action_3_rt  user_log_acct_last15_days_action_4_rt
0                               0.000000                               0.000000                               0.000000
1                               0.000000                               0.000000                               0.000000
2                               0.000000                               0.000000                               0.000000
3                               0.015873                               0.166667                               1.000000
4                               0.003125                               0.500000                               0.333333

   user_log_acct_last30_days_action_1  user_log_acct_last30_days_action_2  user_log_acct_last30_days_action_3  user_log_acct_last30_days_action_4
0                                22.0                                 1.0                                 0.0                                 0.0
1                                30.0                                 0.0                                 1.0                                 0.0
2                                 4.0                                 0.0                                 0.0                                 0.0
3                               275.0                                 1.0                                23.0                                 0.0
4                               834.0                                 2.0                                 1.0                                 6.0

   user_log_acct_last30_days_action_1_rt  user_log_acct_last30_days_action_3_rt  user_log_acct_last30_days_action_4_rt
0                               0.043478                               1.000000                               1.000000
1                               0.000000                               0.000000                               0.000000
2                               0.000000                               0.000000                               0.000000
3                               0.003623                               0.041667                               1.000000
4                               0.002395                               1.000000                               0.285714

[5 rows x 43 columns]
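The NaNs above simply mean the user had no actions inside that window. LightGBM handles NaN natively, so the pipeline leaves them as-is; if a dense representation were ever needed, a minimal, assumed fill for the count columns (optional, not part of the original flow) could look like:

# fill missing window counts with 0; leave the _rt ratio columns as NaN
cnt_cols = [c for c in u_fea_train1.columns
            if c != 'user_log_acct' and not c.endswith('_rt')]
u_fea_train1[cnt_cols] = u_fea_train1[cnt_cols].fillna(0)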
item_sku_id features
def gen_item_feats(df_item):
    df_item_fea = df_item.copy()
    # frequency encodings: how common each category / vendor is
    for col in ['item_third_cate_cd', 'vender_id']:
        dic_ = df_item[col].value_counts().to_dict()
        df_item_fea['{}_cnt'.format(col)] = df_item_fea[col].map(dic_).values
    # mean shop score per third-level category
    for col in ['shop_score']:
        dic_ = df_item.groupby('item_third_cate_cd')[col].mean().to_dict()
        df_item_fea['cate_{}_mean'.format(col)] = df_item_fea['item_third_cate_cd'].map(dic_).values
    # distinct SKUs and brands per shop
    for col in ['item_sku_id', 'brand_code']:
        dic_ = df_item.groupby('shop_id')[col].nunique()
        df_item_fea['shop_id_{}_nunique'.format(col)] = df_item_fea['shop_id'].map(dic_).values
    # distinct SKUs and brands per category
    for col in ['item_sku_id', 'brand_code']:
        dic_ = df_item.groupby('item_third_cate_cd')[col].nunique()
        df_item_fea['item_third_cate_cd_{}_nunique'.format(col)] = df_item_fea['item_third_cate_cd'].map(dic_).values
    del df_item_fea['item_third_cate_cd']
    return df_item_fea
item_feats = gen_item_feats(jd_item)
User + category features
def get_uc_feats(df, start_date):
    df['user_cate'] = df['user_log_acct'].astype(str) + '_' + df['item_third_cate_cd'].astype(str)
    df_fea = df[['user_cate']].copy()
    df_fea = df_fea.drop_duplicates(subset=['user_cate'])
    # 1. Macro features: regardless of action type, shift one row down to get the
    #    previous action time per key; total_seconds() avoids wrapping gaps > 1 day
    df['action_time_diff'] = df.groupby('user_cate')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    df['action_time_diff'] = df['action_time_diff'].dt.total_seconds() // 60
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.total_seconds() // 60
    # time since the last action on this key
    dic_ = df.groupby('user_cate')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_action_time_to_now_last'] = df_fea['user_cate'].map(dic_).values
    # look back from the anchor date over several windows
    # for days in tqdm([1,3,7,14,21,30]):
    for days in tqdm([1, 3, 7, 14]):
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        # stats over consecutive actions
        df_tmp = df[tmp_ind].copy()
        dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict()
        df_fea['user_cate_{}_day_cnt'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
        df_fea['user_cate_{}_day_nunique_pct'.format(days)] = df_fea['user_cate'].map(dic_).values * 1.0 / days
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_{}_timediff_mean'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
        df_fea['user_cate_{}_timediff_std'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
        df_fea['user_cate_{}_timediff_median'.format(days)] = df_fea['user_cate'].map(dic_).values
        for type_ in [1, 2, 3, 4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict()
            df_fea['type_{}_user_cate_{}_day_cnt'.format(type_, days)] = df_fea['user_cate'].map(dic_).values
            if days > 1 and type_ == 2:
                # stats of the gap between consecutive orders
                df_tmp['action_time_diff'] = df_tmp.groupby('user_cate')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.total_seconds() // 60
                dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_{}_day_nunique_pct'.format(type_, days)] = df_fea['user_cate'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_mean'.format(type_, days)] = df_fea['user_cate'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_std'.format(type_, days)] = df_fea['user_cate'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_median'.format(type_, days)] = df_fea['user_cate'].map(dic_).values
    return df_fea
uc_fea_train = get_uc_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
uc_fea_val = get_uc_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 4/4 [01:42<00:00, 25.70s/it]
100%|██████████| 4/4 [01:40<00:00, 25.25s/it]
User + category + shop features
def get_ucs_feats(df, start_date):
    df['user_cate_shop_id'] = df['user_log_acct'].astype(str) + '_' + df['item_third_cate_cd'].astype(str) + '_' + df['shop_id'].astype(str)
    df_fea = df[['user_cate_shop_id']].copy()
    df_fea = df_fea.drop_duplicates(subset=['user_cate_shop_id'])
    # 1. Macro features: regardless of action type, shift one row down to get the
    #    previous action time per key; total_seconds() avoids wrapping gaps > 1 day
    df['action_time_diff'] = df.groupby('user_cate_shop_id')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    df['action_time_diff'] = df['action_time_diff'].dt.total_seconds() // 60
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.total_seconds() // 60
    # time since the last action on this key
    dic_ = df.groupby('user_cate_shop_id')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_shop_id_action_time_to_now_last'] = df_fea['user_cate_shop_id'].map(dic_).values
    # look back from the anchor date over several windows
    # for days in tqdm([0,1,3,7,14,21]):
    for days in tqdm([0, 3, 7]):
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        # stats over consecutive actions
        df_tmp = df[tmp_ind].copy()
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict()
        df_fea['user_cate_shop_id_{}_day_cnt'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
        # max(days, 1) guards the days == 0 window against division by zero
        df_fea['user_cate_shop_id_{}_day_nunique_pct'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values * 1.0 / max(days, 1)
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_mean'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_std'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_median'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        for type_ in [1, 2, 3, 4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict()
            df_fea['type_{}_user_cate_shop_id_{}_day_cnt'.format(type_, days)] = df_fea['user_cate_shop_id'].map(dic_).values
            if days > 1 and type_ == 2:
                # stats of the gap between consecutive orders
                df_tmp['action_time_diff'] = df_tmp.groupby('user_cate_shop_id')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.total_seconds() // 60
                dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_day_nunique_pct'.format(type_, days)] = df_fea['user_cate_shop_id'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_mean'.format(type_, days)] = df_fea['user_cate_shop_id'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_std'.format(type_, days)] = df_fea['user_cate_shop_id'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_median'.format(type_, days)] = df_fea['user_cate_shop_id'].map(dic_).values
    return df_fea
ucs_fea_train = get_ucs_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ucs_fea_val = get_ucs_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
User + item features
def get_ui_feats(df, start_date):
    df['user_item'] = df['user_log_acct'].astype(str) + '_' + df['item_sku_id'].astype(str)
    df_fea = df[['user_item']].copy()
    df_fea = df_fea.drop_duplicates(subset=['user_item'])
    # 1. Macro features: regardless of action type, shift one row down to get the
    #    previous action time per key; total_seconds() avoids wrapping gaps > 1 day
    df['action_time_diff'] = df.groupby('user_item')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    df['action_time_diff'] = df['action_time_diff'].dt.total_seconds() // 60
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.total_seconds() // 60
    # time since the last action on this key
    dic_ = df.groupby('user_item')['action_time_to_now'].min().to_dict()
    df_fea['user_item_action_time_to_now_last'] = df_fea['user_item'].map(dic_).values
    # look back from the anchor date over several windows
    # for days in tqdm([1,3,7,14,21]):
    for days in tqdm([1, 3, 7]):
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        # stats over consecutive actions
        df_tmp = df[tmp_ind].copy()
        dic_ = df_tmp.groupby('user_item')['day'].count().to_dict()
        df_fea['user_item_{}_day_cnt'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
        df_fea['user_item_{}_day_nunique_pct'.format(days)] = df_fea['user_item'].map(dic_).values * 1.0 / days
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
        df_fea['user_item_{}_timediff_mean'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
        df_fea['user_item_{}_timediff_std'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
        df_fea['user_item_{}_timediff_median'.format(days)] = df_fea['user_item'].map(dic_).values
        for type_ in [1, 2, 3, 4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_item')['day'].count().to_dict()
            df_fea['type_{}_user_item_{}_day_cnt'.format(type_, days)] = df_fea['user_item'].map(dic_).values
            if days > 1 and type_ == 2:
                # stats of the gap between consecutive orders
                df_tmp['action_time_diff'] = df_tmp.groupby('user_item')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.total_seconds() // 60
                dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
                df_fea['type_{}_user_item_{}_day_nunique_pct'.format(type_, days)] = df_fea['user_item'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_item_{}_timediff_mean'.format(type_, days)] = df_fea['user_item'].map(dic_).values
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_item_{}_timediff_std'.format(type_, days)] = df_fea['user_item'].map(dic_).values
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_item_{}_timediff_median'.format(type_, days)] = df_fea['user_item'].map(dic_).values
    return df_fea
ui_fea_train = get_ui_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ui_fea_val = get_ui_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
Merging Feature Sets
u_fea_cols1 = [col for col in u_fea_train1.columns if col not in ['user_log_acct']]
u_fea_cols2 = [col for col in jd_user.columns if col not in ['user_log_acct']]
# item features come from the engineered item_feats table built above
i_fea_cols = [col for col in item_feats.columns if col not in ['item_sku_id']]
ui_fea_cols = [col for col in ui_fea_train.columns if col not in ['user_item']]
# user + category feature columns
uc_fea_cols = [col for col in uc_fea_train.columns if col not in ['user_cate']]
# user + category + shop feature columns
ucs_fea_cols = [col for col in ucs_fea_train.columns if col not in ['user_cate_shop_id']]
train_cols = ['user_log_acct','item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols + ui_fea_cols + uc_fea_cols + ucs_fea_cols
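Since train_cols is assembled from several lists, a defensive, order-preserving de-duplication is cheap insurance (an assumed guard, not part of the original pipeline):

train_cols = list(dict.fromkeys(train_cols))
assert len(train_cols) == len(set(train_cols))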
Training & Validation Sets
# training set
df_train = df_train_label1.merge(u_fea_train1, on='user_log_acct', how='left')
#del u_fea_train1
df_train = df_train.merge(jd_user, on='user_log_acct', how='left')
#del jd_user
df_train = df_train.merge(item_feats, on='item_sku_id', how='left')
#del item_feats
df_train = df_train.merge(ui_fea_train, on='user_item', how='left')
#del ui_fea_train
df_train = df_train.merge(uc_fea_train, on='user_cate', how='left')
df_train = df_train.merge(ucs_fea_train, on='user_cate_shop_id', how='left')
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)
# validation set (note: every merge chains off df_val, not df_train)
df_val = df_valid_label.merge(u_fea_val1, on='user_log_acct', how='left')
#del u_fea_val1
df_val = df_val.merge(jd_user, on='user_log_acct', how='left')
#del jd_user
df_val = df_val.merge(item_feats, on='item_sku_id', how='left')
#del item_feats
df_val = df_val.merge(ui_fea_val, on='user_item', how='left')
#del ui_fea_val
df_val = df_val.merge(uc_fea_val, on='user_cate', how='left')
df_val = df_val.merge(ucs_fea_val, on='user_cate_shop_id', how='left')
df_val['label'] = (df_val['label_cnt'] > 0).astype(int)
Model Training
eval_set = [(df_train[train_cols], df_train['label']), (df_val[train_cols], df_val['label'])]
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
learning_rate=0.01, random_state=2021, metric="None", n_jobs=20)
lgb_model.fit(df_train[train_cols], df_train['label'] , eval_set = eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)
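After training, it is worth a quick look at which features the model actually leans on. plot_importance is part of LightGBM's plotting API and renders via matplotlib (available here through %matplotlib inline above):

lgb.plot_importance(lgb_model, max_num_features=30, importance_type='gain', figsize=(8, 10))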
df_submit = pd.DataFrame()
pred_proba = lgb_model.predict_proba(df_val[train_cols])[:,1]
df_submit['user_log_acct'] = df_val['user_log_acct'].values
df_submit['item_sku_id'] = df_val['item_sku_id'].values
df_submit['prob'] = pred_proba
df_submit = df_submit.drop_duplicates(subset=['user_log_acct','item_sku_id'])
df_submit = df_submit.loc[df_submit['item_sku_id'].notnull()]
# keep only pairs whose predicted purchase probability clears the chosen threshold
df_submit_ = df_submit.loc[df_submit.prob >= 0.06].copy()
df_submit_['item_sku_id'] = df_submit_['item_sku_id'].astype(int)
df_submit_.to_csv('df_submit_all.csv',index=False)
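Finally, a simple offline readout of the validation scores, reusing the roc_auc_score import from the top of the notebook and treating df_val['label'] as ground truth (a sketch, not part of the original submission flow):

val_auc = roc_auc_score(df_val['label'], pred_proba)
picked = pred_proba >= 0.06
print('AUC: {:.4f}'.format(val_auc))
print('precision@0.06: {:.4f}'.format(df_val.loc[picked, 'label'].mean()))
print('recall@0.06: {:.4f}'.format(df_val.loc[picked, 'label'].sum() / df_val['label'].sum()))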