1.背景及问题
现某IT产品销售公司,有一定量的小公司水平的用户,这些用户在做出购买时,会接触到销售公司的多个营销渠道,不同的渠道上投入怎样分配,以实现营销效益的最大化,便成为了很多公司的市场营销部门亟需解决的问题。
即:找出转化率最高的渠道路径或方式
2.思路步骤
线性模型分析
马尔科夫链分析
可视化马尔科夫链
转换率计算
第一次点击 用户访问路径上的第一个触点获取所有贡献值
最后一次点击 用户购买之前最后一个触点获取所有贡献值
线性模型分析 用户访问路径上的所有触点平分贡献值
马尔科夫链 马尔科夫链的转移矩阵 -> 每个触点的移除效应-> 触点贡献值
3.数据集介绍
Id: 某IT产品销售公司的客户,客户类型是小公司
Segment: 客户的画像
Channel:客户生命周期中触及过的渠道; DM(直邮),EM(电子邮件), PHONE(电话)和 WEB(产品销售官网浏览记录)
Date: 客户触及渠道的日期,触及时间长度为1年
Pur_flag: 等于1表示该客户在接触完相应渠道后,完成了IT产品的购买
4.代码及具体步骤
导入模块
import numpy as np
import pandas as pd
import networkx as nx
from pprint import pprint
import os
import matplotlib.pyplot as plt
os.chdir(r'C:/Users/pc/Desktop/数据分析项目/客户转化分析/')
# 导入数据,将第四列解析为日期格式
df = pd.read_csv(r'./ChannelAttribute.csv', parse_dates=[3])
df.head()
id | segment | channel | date | pur_flag | |
---|---|---|---|---|---|
0 | 20398764672 | Tier 2 | DM | 2018-03-19 | 0 |
1 | 20408399343 | Tier 2 | WEB | 2017-09-27 | 0 |
2 | 20438922645 | Tier 2 | WEB | 2017-11-15 | 0 |
3 | 20225918468 | Tier 2 | DM | 2017-05-24 | 0 |
4 | 20278581048 | Tier 3 | DM | 2018-04-23 | 0 |
创建路径数据
def create_path_with_value(data, element):
path = []
path.append('start')
df2 = data.loc[data['id'] == element, :].sort_values(['id', 'date'], ascending=[False, True])
for i in range(len(df2)):
path.append(df2.iloc[i]['channel'])
if df2['pur_flag'].unique() == 1:
path.append('conversion')
conv = 1
conv_null = 0
else:
path.append('null')
conv = 0
conv_null = 1
return [path, conv, conv_null]
final_path, conv, conv_null = [], [], []
for element in df['id'].unique():
rst = create_path_with_value(df, element)
final_path.append(rst[0])
conv.append(rst[1])
conv_null.append(rst[2])
# 路径数据
path_data = pd.DataFrame({'path': final_path, 'conv': conv, 'conv_nulls': conv_null})
path_data.head(10)
path | conv | conv_nulls | |
---|---|---|---|
0 | [start, WEB, EM, DM, null] | 0 | 1 |
1 | [start, EM, WEB, DM, null] | 0 | 1 |
2 | [start, WEB, EM, DM, null] | 0 | 1 |
3 | [start, DM, EM, WEB, null] | 0 | 1 |
4 | [start, EM, WEB, DM, null] | 0 | 1 |
5 | [start, PHONE, EM, DM, WEB, null] | 0 | 1 |
6 | [start, PHONE, WEB, DM, null] | 0 | 1 |
7 | [start, DM, PHONE, WEB, null] | 0 | 1 |
8 | [start, WEB, EM, DM, conversion] | 1 | 0 |
9 | [start, PHONE, WEB, DM, null] | 0 | 1 |
归因分析(最后一次点击、第一次点击和线性模型)
def create_last_click_stats_pair(data):
temp_path, temp_conv = [], []
for i in range(len(data)):
temp_path.append(data.iloc[i]['path'][-2])
temp_conv.append(data.iloc[i]['conv'])
return pd.DataFrame({'touch': temp_path, 'Last_Conv': temp_conv})
def create_first_order_states_pair(data):
temp_path, temp_conv = [], []
for i in range(len(data)):
temp_path.append(data.iloc[i]['path'][1])
temp_conv.append(data.iloc[i]['conv'])
return pd.DataFrame({'touch': temp_path, 'First_Conv': temp_conv})
def create_linear_click_stats_pair(data):
temp_path, temp_conv = [], []
for i in range(len(data)):
if len(data.iloc[i]['path'])==6:
for j in range(1,5):
temp_path.append(data.iloc[i]['path'][j])
temp_conv.append(data.iloc[i]['conv'] / (6 - 2))
elif len(data.iloc[i]['path'])==5:
for j in range(1,4):
temp_path.append(data.iloc[i]['path'][j])
temp_conv.append(data.iloc[i]['conv'] / (5 - 2))
elif len(data.iloc[i]['path'])==4:
for j in range(1,3):
temp_path.append(data.iloc[i]['path'][j])
temp_conv.append(data.iloc[i]['conv'] / (4 - 2))
else:
for j in range(1, 2):
temp_path.append(data.iloc[i]['path'][j])
temp_conv.append(data.iloc[i]['conv'] / (3 - 2))
return pd.DataFrame({'touch': temp_path, 'Linear_Conv': temp_conv})
last_touch = create_last_click_stats_pair(path_data).groupby('touch')['Last_Conv'].sum().reset_index()
linear_touch = create_linear_click_stats_pair(path_data).groupby('touch')['Linear_Conv'].sum().reset_index()
first_touch = create_first_order_states_pair(path_data).groupby('touch')['First_Conv'].sum().reset_index()
lst = last_touch.set_index('touch').iloc[:, 0:].apply(lambda x: x / x.sum())
li = linear_touch.set_index('touch').iloc[:, 0:].apply(lambda x: x / x.sum())
fst = first_touch.set_index('touch').iloc[:, 0:].apply(lambda x: x / x.sum())
dfs = [fst, lst, li]
dfs = [df for df in dfs]
dfs[0].join(dfs[1:])
First_Conv | Last_Conv | Linear_Conv | |
---|---|---|---|
touch | |||
DM | 0.341152 | 0.744850 | 0.504964 |
EM | 0.278233 | 0.097915 | 0.198478 |
PHONE | 0.094440 | 0.014768 | 0.050488 |
WEB | 0.286175 | 0.142467 | 0.246070 |
线性模型分析结论: 相比于其他渠道,DM(直邮)是转化率较优的渠道
马尔科夫链
# 手动计算状态转移矩阵
def split_states(data):
temp_data = []
for i in range(len(data)):
path = data.iloc[i]['path']
state_pairs, values = [], []
for j in range(len(path)-1):
state_pairs.append((path[j], path[j+1]))
values.append(1)
temp_data.append([state_pairs, values])
return temp_data
temps = split_states(path_data)
temps[0:3]
[[[('start', 'WEB'), ('WEB', 'EM'), ('EM', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]],
[[('start', 'EM'), ('EM', 'WEB'), ('WEB', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]],
[[('start', 'WEB'), ('WEB', 'EM'), ('EM', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]]]
def transition_maxtrix(data):
state_pairs, values = [], []
for i in range(len(data)):
for j, z in zip(data[i][0], data[i][1]):
state_pairs.append(j)
values.append(z)
temp_df = pd.DataFrame({'state_pairs': state_pairs, 'values': values})
grp_df = temp_df.groupby('state_pairs')['values'].sum().reset_index()
grp_df[['start', 'end']] = grp_df['state_pairs'].apply(pd.Series)
table = pd.crosstab(grp_df['end'], grp_df['start'], values=grp_df['values'], aggfunc=np.sum, normalize='columns')\
.applymap(lambda x: "{:3.2f}".format(x))
return table
# 输出状态转移矩阵
tmp = transition_maxtrix(temps)
tmp1 = tmp.transpose()
tmp1
end | DM | EM | PHONE | WEB | conversion | null |
---|---|---|---|---|---|---|
start | ||||||
DM | 0.00 | 0.09 | 0.01 | 0.12 | 0.21 | 0.57 |
EM | 0.43 | 0.00 | 0.02 | 0.41 | 0.04 | 0.09 |
PHONE | 0.18 | 0.11 | 0.00 | 0.69 | 0.01 | 0.01 |
WEB | 0.58 | 0.22 | 0.03 | 0.00 | 0.05 | 0.12 |
start | 0.14 | 0.29 | 0.32 | 0.25 | 0.00 | 0.00 |
计算渠道的移除效应
def channel_remove(data,channel_removed):
state_pairs, values = [], []
for i in range(len(data)):
for j, z in zip(data[i][0], data[i][1]):
state_pairs.append(j)
values.append(z)
temp_df = pd.DataFrame({'state_pairs': state_pairs, 'values': values})
grp_df = temp_df.groupby('state_pairs')['values'].sum().reset_index()
grp_df[['start', 'end']] = grp_df['state_pairs'].apply(pd.Series)
temp = grp_df.copy()
grp_df['start'] = grp_df['start'].replace(channel_removed, 'unknown')
grp_df['end'] = grp_df['end'].replace(channel_removed, 'unknown')
return [grp_df, temp]
# 筛选出成功转化路径
path_data_pur = path_data[path_data['conv']==1]
temps = split_states(path_data_pur)
conversion =[]
columns = ['start', 'end', 'values_x', 'values_y', 'perct']
# 所有渠道
channels_list = list(df['channel'].unique())
df_dummy1 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
'end': ['start', 'conversion', 'null'],
'values_x': [0, 0, 0],
'values_y': [0, 0, 0],
'perct': [0, 1, 1]})
dy_dummy = pd.DataFrame(df_dummy1, columns=columns)
df_dummy2 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
'end': ['start', 'conversion', 'null']})
# 逐个计算移除单个渠道后的总转化数
for chnl in channels_list:
df_remove = channel_remove(temps, chnl)[0]
df_noremove = channel_remove(temps, chnl)[1]
df_temp = df_remove.groupby('start')['values'].sum().reset_index()
df_temp = pd.merge(df_remove, df_temp, on='start', how='left')
df_temp['perct'] = df_temp['values_x']/df_temp['values_y']
df_temp = pd.DataFrame(df_temp, columns=columns)
df_temp = pd.concat([df_temp, dy_dummy], axis=0)
df_ini = pd.DataFrame(df_noremove, columns=['start', 'end'])
df_temp2 = pd.concat([df_ini, df_dummy2], axis=0)
df_temp = pd.merge(df_temp2, df_temp, on=['start', 'end'], how='left')
# 用0填充由于左连接出现的NaN
df_temp['values_x'].fillna(0, inplace=True)
df_temp['values_y'].fillna(0, inplace=True)
df_temp['perct'].fillna(0, inplace=True)
df_trans1 = pd.crosstab(df_temp['start'], df_temp['end'], values=df_temp['perct'], aggfunc=np.sum)
df_trans1.update(df_trans1[['DM', 'EM', 'PHONE', 'WEB', 'conversion', 'null', 'start']].fillna(0))
# 转化为numpy矩阵
df_trans_mat = np.matrix(df_trans1)
inist_n1 = pd.crosstab(df_temp['start'], df_temp['end'], values=df_temp['values_x'], aggfunc=np.sum)
inist_n1.update(inist_n1[['DM', 'EM', 'PHONE', 'WEB', 'conversion', 'null', 'start']].fillna(0))
inist_mat = np.matrix(inist_n1.iloc[-1])
# 矩阵乘积
mat = inist_mat*df_trans_mat
# 取出估计出来的转化数
conversion.append(mat[0,4])
# 计算单个渠道的移除效应
chnl_conversion = pd.DataFrame({'channel': channels_list, 'conv': conversion})
df_remove = channel_remove(temps, chnl)[0]
df_noremove = channel_remove(temps, chnl)[1]
tot_conv = df_remove['values'].sum()
chnl_conversion['impact'] = (tot_conv-chnl_conversion['conv'])/tot_conv
tot_impact = chnl_conversion['impact'].sum()
chnl_conversion['convet_rate']= chnl_conversion['impact']/tot_impact
chnl_conversion
channel | conv | impact | convet_rate | |
---|---|---|---|---|
0 | DM | 1003.039274 | 0.961468 | 0.265241 |
1 | WEB | 2724.928034 | 0.895320 | 0.246993 |
2 | EM | 2856.954317 | 0.890248 | 0.245594 |
3 | PHONE | 3179.825240 | 0.877845 | 0.242172 |
可视化马尔科夫链
states = ['start', 'DM', 'EM', 'PHONE', 'WEB', 'conversion', 'null']
def _get_markov_edges(Q):
edges = {}
for col in Q.columns:
for idx in Q.index:
edges[(idx,col)] = Q.loc[idx,col]
return edges
edges_wts = _get_markov_edges(tmp1)
edges_wts
{('DM', 'DM'): '0.00',
('EM', 'DM'): '0.43',
('PHONE', 'DM'): '0.18',
('WEB', 'DM'): '0.58',
('start', 'DM'): '0.14',
('DM', 'EM'): '0.09',
('EM', 'EM'): '0.00',
('PHONE', 'EM'): '0.11',
('WEB', 'EM'): '0.22',
('start', 'EM'): '0.29',
('DM', 'PHONE'): '0.01',
('EM', 'PHONE'): '0.02',
('PHONE', 'PHONE'): '0.00',
('WEB', 'PHONE'): '0.03',
('start', 'PHONE'): '0.32',
('DM', 'WEB'): '0.12',
('EM', 'WEB'): '0.41',
('PHONE', 'WEB'): '0.69',
('WEB', 'WEB'): '0.00',
('start', 'WEB'): '0.25',
('DM', 'conversion'): '0.21',
('EM', 'conversion'): '0.04',
('PHONE', 'conversion'): '0.01',
('WEB', 'conversion'): '0.05',
('start', 'conversion'): '0.00',
('DM', 'null'): '0.57',
('EM', 'null'): '0.09',
('PHONE', 'null'): '0.01',
('WEB', 'null'): '0.12',
('start', 'null'): '0.00'}
# 移除0转化概率的边
for key, value in list(edges_wts.items()):
if value == '0.00':
edges_wts.pop(key)
pprint(edges_wts)
{('DM', 'EM'): '0.09',
('DM', 'PHONE'): '0.01',
('DM', 'WEB'): '0.12',
('DM', 'conversion'): '0.21',
('DM', 'null'): '0.57',
('EM', 'DM'): '0.43',
('EM', 'PHONE'): '0.02',
('EM', 'WEB'): '0.41',
('EM', 'conversion'): '0.04',
('EM', 'null'): '0.09',
('PHONE', 'DM'): '0.18',
('PHONE', 'EM'): '0.11',
('PHONE', 'WEB'): '0.69',
('PHONE', 'conversion'): '0.01',
('PHONE', 'null'): '0.01',
('WEB', 'DM'): '0.58',
('WEB', 'EM'): '0.22',
('WEB', 'PHONE'): '0.03',
('WEB', 'conversion'): '0.05',
('WEB', 'null'): '0.12',
('start', 'DM'): '0.14',
('start', 'EM'): '0.29',
('start', 'PHONE'): '0.32',
('start', 'WEB'): '0.25'}
# 用networkx绘制马尔科夫链
os.environ["PATH"] += os.pathsep + './graphviz-2.38/release/bin/'
G = nx.MultiDiGraph()
# 增加节点状态
G.add_nodes_from(states)
print('Nodes:\n{G.nodes()}\n')
# 边表示转换概率
for k, v in edges_wts.items():
tmp_origin, tmp_destination = k[0], k[1]
G.add_edge(tmp_origin, tmp_destination, weight=v, label=v)
print('Edges:')
pprint(G.edges(data=True))
pos = nx.drawing.nx_pydot.graphviz_layout(G, prog='dot')
nx.draw_networkx(G, pos)
# 创建边标签
edge_labels = {(n1,n2):d['label'] for n1,n2,d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G , pos, edge_labels=edge_labels)
nx.drawing.nx_pydot.write_dot(G, 'customer_markov.dot')
Nodes:
{G.nodes()}
Edges:
OutMultiEdgeDataView([('DM', 'WEB', {'label': '0.12', 'weight': '0.12'}), ('DM', 'EM', {'label': '0.09', 'weight': '0.09'}), ('DM', 'null', {'label': '0.57', 'weight': '0.57'}), ('DM', 'PHONE', {'label': '0.01', 'weight': '0.01'}), ('DM', 'conversion', {'label': '0.21', 'weight': '0.21'}), ('start', 'EM', {'label': '0.29', 'weight': '0.29'}), ('start', 'DM', {'label': '0.14', 'weight': '0.14'}), ('start', 'PHONE', {'label': '0.32', 'weight': '0.32'}), ('start', 'WEB', {'label': '0.25', 'weight': '0.25'}), ('EM', 'DM', {'label': '0.43', 'weight': '0.43'}), ('EM', 'WEB', {'label': '0.41', 'weight': '0.41'}), ('EM', 'null', {'label': '0.09', 'weight': '0.09'}), ('EM', 'PHONE', {'label': '0.02', 'weight': '0.02'}), ('EM', 'conversion', {'label': '0.04', 'weight': '0.04'}), ('WEB', 'DM', {'label': '0.58', 'weight': '0.58'}), ('WEB', 'EM', {'label': '0.22', 'weight': '0.22'}), ('WEB', 'null', {'label': '0.12', 'weight': '0.12'}), ('WEB', 'PHONE', {'label': '0.03', 'weight': '0.03'}), ('WEB', 'conversion', {'label': '0.05', 'weight': '0.05'}), ('PHONE', 'WEB', {'label': '0.69', 'weight': '0.69'}), ('PHONE', 'DM', {'label': '0.18', 'weight': '0.18'}), ('PHONE', 'null', {'label': '0.01', 'weight': '0.01'}), ('PHONE', 'EM', {'label': '0.11', 'weight': '0.11'}), ('PHONE', 'conversion', {'label': '0.01', 'weight': '0.01'})])