今日份:感谢各位大神们!
下面我们直接上代码
def _count_user_merchant_rows(records, mask, count_name):
    """Count rows of ``records`` per (user_id, merchant_id) pair.

    Args:
        records: DataFrame containing ``user_id`` and ``merchant_id`` columns.
        mask: Boolean Series selecting the rows to count, or ``None`` to
            count every row.
        count_name: Name of the resulting count column.

    Returns:
        DataFrame with columns ``user_id``, ``merchant_id`` and
        ``count_name``, one row per pair that has at least one selected row.
    """
    keys = ['user_id', 'merchant_id']
    selected = records[keys] if mask is None else records.loc[mask, keys]
    # assign() builds a new frame, so no slice of the caller's data is
    # ever written to (avoids pandas chained-assignment problems).
    counted = selected.assign(**{count_name: 1})
    return counted.groupby(keys)[count_name].sum().reset_index()


def GetUserMerchantRelateInfo(feature):
    """Build per (user, merchant) interaction counts and ratios.

    The string ``'null'`` is treated as the missing-value marker in the
    ``date``, ``coupon_id`` and ``date_received`` columns.

    Args:
        feature: DataFrame with columns ``user_id``, ``merchant_id``,
            ``coupon_id``, ``date_received`` and ``date``. It is not
            modified.

    Returns:
        DataFrame with one row per distinct (user_id, merchant_id) pair and
        the columns:

        * ``user_merchant_buy_total`` - rows with a purchase date.
        * ``user_merchant_received`` - rows with a coupon id.
        * ``user_merchant_buy_use_coupon`` - rows with both a purchase date
          and a coupon-received date (missing counts become 0).
        * ``user_merchant_any`` - all rows for the pair.
        * ``user_merchant_buy_common`` - rows with a purchase date and no
          coupon id (missing counts become 0).
        * ``user_merchant_coupon_transfer_rate`` - buy_use_coupon / received.
        * ``user_merchant_coupon_buy_rate`` - buy_use_coupon / buy_total.
        * ``user_merchant_rate`` - buy_total / any.
        * ``user_merchant_common_buy_rate`` - buy_common / buy_total.

        ``user_merchant_buy_total`` and ``user_merchant_received`` are left
        as NaN when the pair has no such rows, so ratios that use them as a
        denominator are NaN for those pairs.
    """
    keys = ['user_id', 'merchant_id']

    purchased = feature['date'] != 'null'
    has_coupon = feature['coupon_id'] != 'null'
    no_coupon = feature['coupon_id'] == 'null'
    coupon_received = feature['date_received'] != 'null'

    # (output column, row filter); the order fixes the output column order.
    count_specs = [
        ('user_merchant_buy_total', purchased),
        ('user_merchant_received', has_coupon),
        ('user_merchant_buy_use_coupon', purchased & coupon_received),
        ('user_merchant_any', None),
        ('user_merchant_buy_common', purchased & no_coupon),
    ]

    # Start from every distinct pair, then left-join each count so that
    # pairs without matching rows are kept (with NaN counts).
    user_merchant = feature[keys].drop_duplicates()
    for count_name, mask in count_specs:
        counts = _count_user_merchant_rows(feature, mask, count_name)
        user_merchant = pd.merge(user_merchant, counts, on=keys, how='left')

    # Only the two numerator-only counts are zero-filled here.
    for count_name in ('user_merchant_buy_use_coupon',
                       'user_merchant_buy_common'):
        user_merchant[count_name] = user_merchant[count_name].fillna(0)

    used_coupon = user_merchant['user_merchant_buy_use_coupon'].astype('float')
    common_buys = user_merchant['user_merchant_buy_common'].astype('float')
    received = user_merchant['user_merchant_received'].astype('float')
    buy_total = user_merchant['user_merchant_buy_total'].astype('float')
    any_rows = user_merchant['user_merchant_any'].astype('float')

    # Coupon conversion rate: coupons used / coupons received.
    user_merchant['user_merchant_coupon_transfer_rate'] = used_coupon / received
    # Share of purchases at this merchant that used a coupon.
    user_merchant['user_merchant_coupon_buy_rate'] = used_coupon / buy_total
    # Share of all records at this merchant that are purchases.
    user_merchant['user_merchant_rate'] = buy_total / any_rows
    # Share of purchases at this merchant made without a coupon.
    user_merchant['user_merchant_common_buy_rate'] = common_buys / buy_total
    return user_merchant
训练集和测试集的构造
def _parse_yyyymmdd(text):
    """Convert a ``YYYYMMDD`` string into a ``datetime.date``."""
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))


def get_label(s, window_days=15):
    """Derive a label from a ``'<date>:<date_received>'`` string.

    Args:
        s: Two ``YYYYMMDD`` dates joined by ``':'``; the first is the
            purchase date and may be the literal ``'null'``.
        window_days: Largest allowed gap, in days, between the received
            date and the purchase date for a positive label.

    Returns:
        ``0`` when the purchase date is ``'null'``, ``1`` when the purchase
        happened at most ``window_days`` days after the received date, and
        ``-1`` otherwise.

    Raises:
        ValueError: If a date that has to be parsed is not numeric
            ``YYYYMMDD`` text (for example a ``'null'`` received date
            paired with a real purchase date).
    """
    parts = s.split(':')
    # Check the purchase date first: the received date is only parsed when
    # a purchase actually exists.
    if parts[0] == 'null':
        return 0
    elapsed = _parse_yyyymmdd(parts[0]) - _parse_yyyymmdd(parts[1])
    if elapsed.days <= window_days:
        return 1
    return -1
def GenerateData(dataset, feature, label=True):
    """Join every feature table onto the samples and finalize the columns.

    Args:
        dataset: Sample rows to build features for; passed to
            ``GetCouponFeature`` and ``GetOtherFeature``.
        feature: Records the merchant, user and user-merchant statistics
            are computed from.
        label: When true, a ``label`` column is derived from ``date`` and
            ``date_received`` with ``get_label`` and both source columns
            are dropped. When false, no label is added and neither of those
            columns is dropped.

    Returns:
        DataFrame of merged features with duplicate rows removed, an
        ``is_weekend`` flag, one indicator column per weekday, and every
        ``'null'`` string replaced by NaN.
    """
    # Compute each feature group.
    coupon_feature = GetCouponFeature(dataset, feature)
    merchant_feature = GetMerchantFeature(feature)
    user_feature = GetUserRelateInfo(feature)
    user_merchant = GetUserMerchantRelateInfo(feature)
    other_feature = GetOtherFeature(dataset)

    # Left-join everything onto the coupon features so no sample is lost.
    dataset = pd.merge(coupon_feature, merchant_feature,
                       on='merchant_id', how='left')
    dataset = pd.merge(dataset, user_feature, on='user_id', how='left')
    dataset = pd.merge(dataset, user_merchant,
                       on=['user_id', 'merchant_id'], how='left')
    dataset = pd.merge(dataset, other_feature,
                       on=['user_id', 'coupon_id', 'date_received'],
                       how='left')
    dataset = dataset.drop_duplicates()

    # Samples whose (user, merchant) pair has no matching row get zero counts.
    for column in ('user_merchant_buy_total',
                   'user_merchant_any',
                   'user_merchant_received'):
        dataset[column] = dataset[column].fillna(0)

    dataset['is_weekend'] = dataset['day_of_week'].apply(
        lambda x: 1 if x in (6, 7) else 0)

    # Name each indicator after the day value it encodes. Naming by column
    # position mislabels the columns whenever a weekday is absent from the
    # data. The weekend test above uses 6 and 7, so day_of_week is taken to
    # be 1-based; with all seven days present the names are weekday1..7.
    weekday_dummies = pd.get_dummies(dataset['day_of_week'])
    weekday_dummies.columns = [
        'weekday' + str(int(day)) for day in weekday_dummies.columns]
    dataset = pd.concat([dataset, weekday_dummies], axis=1)

    # Only labelled data carries a purchase date to build the label from.
    if label:
        dataset['label'] = (dataset['date'].astype('str') + ':'
                            + dataset['date_received'].astype('str'))
        dataset['label'] = dataset['label'].apply(get_label)
        dataset = dataset.drop(
            columns=['merchant_id', 'day_of_week', 'date', 'date_received',
                     'coupon_count'])
    else:
        dataset = dataset.drop(
            columns=['merchant_id', 'day_of_week', 'coupon_count'])

    # Normalise the 'null' marker to NaN across the whole table.
    dataset = dataset.replace('null', np.nan)
    return dataset
特征处理及保存
# Build the three feature tables; the first two are labelled, the third is
# built without a label column.
GenerateData1 = GenerateData(dataset1, feature1)
GenerateData2 = GenerateData(dataset2, feature2)
GenerateData3 = GenerateData(dataset3, feature3, label=False)

# Write each table to CSV in the working directory, without the index.
for _table, _csv_path in (
        (GenerateData1, './GenerateData1.csv'),
        (GenerateData2, './GenerateData2.csv'),
        (GenerateData3, './GenerateData3.csv'),
):
    _table.to_csv(_csv_path, index=None)