优惠券预测——数据探索2

# Delimiter used when splitting compound text fields in the helpers below
# (e.g. "full:reduction" coupon values and "date:date_received" pairs).
separator = ":"
def get_discount_rate(s):
    """Convert a raw coupon value into a single numeric discount rate.

    Plain rates and full-reduction coupons are mapped onto the same scale:

    * a plain rate such as ``"0.9"`` is returned as a float;
    * a full-reduction value ``"<full><separator><reduction>"`` such as
      ``"100:10"`` becomes ``1 - reduction / full``.

    The literal ``"null"`` returns the sentinel ``-1`` (not a rate of 1), so
    such rows can be excluded with a ``discount_rate >= 0`` filter.
    """
    text = str(s)
    if text == 'null':
        return -1

    parts = text.split(separator)
    if len(parts) != 1:
        full, reduction = parts[0], parts[1]
        return 1.0 - float(reduction) / float(full)
    return float(parts[0])

def get_if_fd(s):
    """Flag whether a coupon value is a full-reduction promotion.

    Returns 1 when the text form of *s* contains ``separator`` (for example
    ``"100:10"``) and 0 otherwise, which includes plain rates and ``"null"``.
    """
    parts = str(s).split(separator)
    return 0 if len(parts) == 1 else 1
        
def get_full_value(s):
    """Return the spending threshold of a full-reduction coupon.

    For ``"<full><separator><reduction>"`` the ``full`` part is returned as
    an int. Values without a separator (plain rates, ``"null"``) return -1.
    """
    pieces = str(s).split(separator)
    if len(pieces) > 1:
        return int(pieces[0])
    return -1
        
def get_reduction_value(s):
    """Return the amount taken off by a full-reduction coupon.

    For ``"<full><separator><reduction>"`` the ``reduction`` part is returned
    as an int. Values without a separator (plain rates, ``"null"``) return -1.
    """
    tokens = str(s).split(separator)
    return -1 if len(tokens) == 1 else int(tokens[1])


def get_month(s):
    """Return the month encoded in a date value, or -1 for ``'null'``.

    The month is read from characters 4-5 of the text form of *s*, i.e. the
    value is expected to look like ``YYYYMMDD`` (the layout the other date
    helpers in this file slice as well).

    The whole value is compared against ``'null'``. Comparing only its first
    character can never match, which made ``'null'`` fall through to
    ``int('')`` and raise ``ValueError``.

    Raises:
        ValueError: if the month slice is not a valid integer.
    """
    # Coerce first so non-string cells are handled like in the coupon helpers.
    text = str(s)
    if text == 'null':
        return -1
    return int(text[4:6])

def get_day(s):
    """Return the day of month encoded in a date value, or -1 for ``'null'``.

    The day is read from characters 6-7 of the text form of *s*, i.e. the
    value is expected to look like ``YYYYMMDD`` (the layout the other date
    helpers in this file slice as well).

    The whole value is compared against ``'null'``. Comparing only its first
    character can never match, which made ``'null'`` fall through to
    ``int('')`` and raise ``ValueError``.

    Raises:
        ValueError: if the day slice is not a valid integer.
    """
    # Coerce first so non-string cells are handled like in the coupon helpers.
    text = str(s)
    if text == 'null':
        return -1
    return int(text[6:8])
    
def get_day_gap(s):
    """Return the number of days between the two dates of a joined pair.

    *s* is ``"<date><separator><date_received>"`` with both dates written as
    ``YYYYMMDD``. The result is ``date - date_received`` in days, or -1 when
    either part is the literal ``'null'``.
    """
    fields = s.split(separator)
    # Short-circuit keeps the second field untouched when the first is null.
    if fields[0] == 'null' or fields[1] == 'null':
        return -1

    def as_date(stamp):
        # Slice a YYYYMMDD string into a datetime.date.
        return date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))

    return (as_date(fields[0]) - as_date(fields[1])).days

def get_label(s):
    """Derive the training label from a joined date pair.

    *s* is ``"<date><separator><date_received>"`` with both dates written as
    ``YYYYMMDD``.

    Returns:
        0  when the first part (date) is ``'null'``;
        -1 when the second part (date_received) is ``'null'``;
        1  when ``date - date_received`` is at most 15 days;
        -1 when the gap is larger than 15 days.
    """
    pieces = s.split(separator)
    if pieces[0] == 'null':
        return 0
    if pieces[1] == 'null':
        return -1

    used_on = date(int(pieces[0][0:4]), int(pieces[0][4:6]), int(pieces[0][6:8]))
    received_on = date(int(pieces[1][0:4]), int(pieces[1][4:6]), int(pieces[1][6:8]))
    return 1 if (used_on - received_on).days <= 15 else -1
def add_feature(df):
    """Derive the coupon features on *df* in place and return it.

    Reads the ``discount_rate`` and ``distance`` columns. Adds ``if_fd``,
    ``full_value`` and ``reduction_value`` from the raw ``discount_rate``
    values, then replaces ``discount_rate`` with its numeric rate and turns
    ``distance`` into an int column with ``'null'`` mapped to -1.
    """
    # discount_rate has to stay last: the other extractors parse the raw
    # value that this final step overwrites.
    extractors = (
        ('if_fd', get_if_fd),
        ('full_value', get_full_value),
        ('reduction_value', get_reduction_value),
        ('discount_rate', get_discount_rate),
    )
    for column, extractor in extractors:
        df[column] = df['discount_rate'].apply(extractor)

    distance = df['distance'].replace('null', -1)
    df['distance'] = distance.astype(int)
    # Month features are currently disabled:
    #df['month_received'] = df['date_received'].apply(get_month)
    #df['month'] = df['date'].apply(get_month)
    return df
    
def add_label(df):
    """Add ``day_gap`` and ``label`` columns to *df* in place and return it.

    The ``date`` and ``date_received`` columns are cast to text and joined
    into one ``"<date><separator><date_received>"`` string per row. That
    string is passed to ``get_label`` to fill ``label`` and to ``get_day_gap``
    to fill ``day_gap``.
    """
    # Join with the module-level separator that get_label / get_day_gap split
    # on, so the two sides of the pair format cannot drift apart.
    # day_gap briefly holds the joined text; it is overwritten with the
    # numeric gap once the label has been derived from it.
    df['day_gap'] = df['date'].astype('str') + separator + df['date_received'].astype('str')
    df['label'] = df['day_gap'].apply(get_label)
    df['day_gap'] = df['day_gap'].apply(get_day_gap)
    return df
# Work on copies so the raw frames do not have to be re-read while debugging.
# The training frame gets features and labels; the test frame gets features only.
dftrain = add_label(add_feature(off_train.copy()))
dftest = add_feature(off_test.copy())
# Data analysis
# NOTE: the bare expressions below are displayed only by an interactive
# session (REPL / notebook); a plain script evaluates and discards them.
dftrain.head()

dftrain.describe()

# Share of each distance value among rows with a known (non-negative) distance.
_train_distance = dftrain[dftrain.distance >= 0]['distance']
_train_distance.value_counts() / _train_distance.count()

_test_distance = dftest[dftest.distance >= 0]['distance']
_test_distance.value_counts() / _test_distance.count()

# Same ratio, restricted to training rows whose label is non-negative.
_labelled_distance = dftrain[(dftrain.label >= 0) & (dftrain.distance >= 0)]['distance']
_labelled_distance.value_counts() / _labelled_distance.count()
# Share of full-reduction coupons (if_fd) in the training and test sets.
print('Offline 训练集满减情况')
_train_if_fd = dftrain.if_fd
_train_if_fd.value_counts() / _train_if_fd.count()
print('测试集满减情况')
_test_if_fd = dftest.if_fd
_test_if_fd.value_counts() / _test_if_fd.count()
# Box plots of each feature's distribution, one figure per feature.
# Only training rows with label >= 0 and a non-negative feature value are drawn.
for _column in ('distance', 'discount_rate'):
    fig = plt.figure(figsize=(4, 6))  # figure width and height
    _values = dftrain[(dftrain.label >= 0) & (dftrain[_column] >= 0)][_column]
    sns.boxplot(_values, orient="v", width=0.5)
# Histogram (with a fitted normal curve) and probability (Q-Q) plot,
# side by side, one figure per feature.
for _column in ('distance', 'discount_rate'):
    plt.figure(figsize=(10, 5))
    _values = dftrain[(dftrain.label >= 0) & (dftrain[_column] >= 0)][_column]

    ax = plt.subplot(1, 2, 1)
    sns.distplot(_values, fit=stats.norm)
    ax = plt.subplot(1, 2, 2)
    res = stats.probplot(_values, plot=plt)
# Compare the train and test distributions of each feature with KDE curves
# (train in red, test in blue), one figure per feature.
# Training rows are limited to label >= 0; both sets drop negative sentinel values.
for _column in ('discount_rate', 'distance', 'full_value', 'reduction_value'):
    # Start a fresh figure: kdeplot draws on the current Axes, so without it
    # every comparison would pile onto whichever Axes was used last.
    plt.figure()
    _train_values = dftrain[(dftrain.label >= 0) & (dftrain[_column] >= 0)][_column]
    _test_values = dftest[(dftest[_column] >= 0)][_column]
    ax = sns.kdeplot(_train_values, color="Red", shade=True)
    ax = sns.kdeplot(_test_values, color="Blue", shade=True)
    ax.set_xlabel(_column)
    ax.set_ylabel("Frequency")
    # Keep `ax` bound to the Axes rather than rebinding it to the Legend.
    ax.legend(["train", "test"])
# Visualise the linear relationship between each feature and the label:
# left panel is a regression plot of feature vs. label, right panel is the
# feature's distribution. One figure per feature.
# fcols / frows are not read in this section; the assignments are kept as in
# the original in case later code relies on them.
fcols = 2
frows = 1
for _column in ('distance', 'discount_rate'):
    _subset = dftrain[(dftrain.label >= 0) & (dftrain[_column] >= 0)]

    plt.figure(figsize=(8, 4))
    ax = plt.subplot(1, 2, 1)
    sns.regplot(
        x=_column,
        y='label',
        data=_subset[[_column, 'label']],
        ax=ax,
        scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
        line_kws={'color': 'k'},
    )
    plt.xlabel(_column)
    plt.ylabel('label')

    ax = plt.subplot(1, 2, 2)
    sns.distplot(_subset[_column].dropna())
    plt.xlabel(_column)
    plt.show()

 

上一篇:Codeforces Round #715 (Div. 2)赛后补题


下一篇:07 监控100台服务器磁盘利用率