一、导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openpyxl
import datetime as dt
二、导入数据
df=pd.read_excel(os.getcwd() + os.path.sep + "data.xlsx")
三、实现
df.sort_values('memberid').head(5)
def retention(data,n,dur=False):
result = {}
dates = sorted(set(data['购买日期']))
new_member = data[['memberid','购买日期']].groupby('memberid').min()
for date in dates:
date_pay_p = set(new_member[new_member['购买日期']==date].index) # 当日首次下单用户
daten = date + dt.timedelta(days=n)
if dur == True:
daten_pay_p = set(df[(df['购买日期'] > date) & (df['购买日期']<=daten)]['memberid']) # 首次下单后n日内(含第n日)再次下单用户
else:
daten_pay_p = set(df[df['购买日期']==daten]['memberid']) # 首次下单第n日再次下单用户
day_count = len(date_pay_p)
nday_count = len(daten_pay_p)
retention_count = len(date_pay_p.intersection(daten_pay_p))
if len(date_pay_p) == 0:
retention_rate = np.nan
else:
retention_rate = retention_count/day_count
result[date] = [day_count,nday_count,retention_count,retention_rate]
result = pd.DataFrame.from_dict(result).T
result.columns=['day_count','nday_count','retention_count','retention_rate']
return(result)
四、结果呈现
retention_table = retention(df,3)
retention_table['Month'] = pd.to_datetime(retention_table.index).month
retention_table.groupby(retention_table['Month'])['retention_rate'].mean().plot()
五、数据及源码
阿里云盘:https://www.aliyundrive.com/s/R1y61iU6zdT