淘宝大数据的游戏,我重新提高自己的思维方式,
插件和代码前前后后写在六个版本,但最好的结果其实是我的第一次2第二码。这让我很惊讶,
但它也说明了一个问题。当你更熟悉的语言,当一方,你缺少的是其他的知识,
- 首先是我的数学知识,在分析用户行为时,我们知道浏览次数和购买次数是由一定规律的,这个方面找了数学系的同学问了一些,得到的结论是:你能够进行线性拟合。这是最简单的,可是得到的结果不一定真实,于是推荐我使用高斯分布来做。可是由于自己单枪匹马,所以选了比較简单的线性拟合
- 心理学,我们能够从数据中发现。那些常常在淘宝买东西的假设是时间间隔一段就买了同一种商品的,那说明这个人的属于死宅之类的,由于这些东西一般我们旁边就有,还有,就是浏览次数和购买之间的关系,用数学来解答,心理学来分析。多天浏览和购买的关系,
先意淫这些吧,下来上三个版本号的代码:
第一版本号,简单推測浏览十五次购买一次:
import time u_id=[]
b_id=[]
t_id=[]
b_time=[]
t_num0=0
t_num1=0
t_num2=0
t_num3=0
a=True
i=0
j=0
fileread=open('t_alibaba_data.csv','r')
while True:
fileline=fileread.readline()
# print type(fileline)
# print fileline,
# print i
filedian =fileline.find(r',')
filedian1=fileline.rfind(r',') b_id1=fileline[filedian+1:filedian1-2]
b_id.append(b_id1) u_id1=fileline[:filedian]
u_id.append(u_id1) t_id1=fileline[filedian1-1:filedian1]
t_id.append(t_id1) b_time1=fileline[filedian1:-2]
b_time.append(b_time1) if not fileline:
break
output=open('taobao.txt','a')
#print u_id
#print b_id
#print t_id
print b_time ff=0
while True:
if u_id[i]==u_id[i+1]: if ff==0: output.write(u_id[i])
output.write(' ')
ff=ff+1
if b_id[i]==b_id[i+1]: if int(t_id[i])==0:
t_num0=t_num0+1
elif int(t_id[i])==1:
t_num1=t_num1+1
elif int(t_id[i])==2:
t_num2=t_num2+1
else:
t_num3=t_num3+1
else:
j=j+1
print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3
if t_num0>=15 or t_num1>=1 :
output.write(b_id[i])
output.write(",")
# else:
# output.write(b_id[i])
# output.write(',')
t_num0=0
t_num1=0
t_num2=0
t_num3=0 # else:
else:
output.write('\n')
ff=0
# print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3
i=i+1
第二版本号,观察时间和购买行为
#coding:utf-8
import time u_id=[]
b_id=[]
t_id=[]
b_time=[]
t_num0=0
t_num1=0
t_num2=0
t_num3=0
b_num1=0
b_time4=0
a=True
i=0
j=0
fileread=open('t_alibaba_data.csv','r')
while True:
fileline=fileread.readline()
# print type(fileline)
# print fileline,
# print i
filedian =fileline.find(r',')
filedian1=fileline.rfind(r',') b_id1=fileline[filedian+1:filedian1-2]
b_id.append(b_id1) u_id1=fileline[:filedian]
u_id.append(u_id1) t_id1=fileline[filedian1-1:filedian1]
t_id.append(t_id1) b_time1=fileline[filedian1:-2]
b_time.append(b_time1) if not fileline:
break
output=open('taobao.txt','a')
#print u_id
#print b_id
#print t_id
#print b_time ff=0
while True:
if u_id[i]==u_id[i+1]: if ff==0: output.write(u_id[i])
output.write(' ')
ff=ff+1
if b_id[i]==b_id[i+1]: if int(t_id[i])==0:
t_num0=t_num0+1
elif int(t_id[i])==1:
t_num1=t_num1+1
elif int(t_id[i])==2:
t_num2=t_num2+1
elif b_time[i]!=b_time[i+1]:
b_time4=b_time4+1
else:
t_num3=t_num3+1
else:
j=j+1
b_num1=b_num1+1
print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3,b_time4
if t_num0>=15 and t_num1==0:
output.write(b_id[i])
output.write(",")
if b_time4>=2 and t_num1==0:
output.write(b_id[i])
output.write(',')
if t_num0>15 and t_num1>=2:
output.write(b_id[i])
output.write(',')
if t_num2>=1 and t_num1==0:
output.write(b_id[i])
output.write(',')
if len(b_id)<=3:
output.write(b_id[i])
output.write(',') # if b_num1<=3:
# output.write(b_id[i])
# output.write(',')
#
#
t_num0=0
t_num1=0
t_num2=0
t_num3=0
b_time4=0 # else:
else:
output.write('\n')
b_num1=b_num1+1
# print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3
i=i+1
b_num1=0
第三版本号,使用数学分析
#coding:utf-8
import time
import numpy as np
from scipy import optimize
from math import sqrt u_id=[]
b_id=[]
t_id=[]
b_time=[]
t_num0=0 #类型
t_num1=0
t_num2=0
t_num3=0
b_num1=0 #品牌个数
b_time4=0 #时间
a=True
i=0
j=0
fileread=open('t_alibaba_data.csv','r')
while True:
fileline=fileread.readline()
# print type(fileline)
# print fileline,
# print i
filedian =fileline.find(r',')
filedian1=fileline.rfind(r',') b_id1=fileline[filedian+1:filedian1-2]
b_id.append(b_id1) u_id1=fileline[:filedian]
u_id.append(u_id1) t_id1=fileline[filedian1-1:filedian1]
t_id.append(t_id1) b_time1=fileline[filedian1:-2]
b_time.append(b_time1) if not fileline:
break
output=open('taobao.txt','a')
#print u_id
#print b_id
#print t_id
#print b_time
t_num00=[]
t_num11=[]
t_num22=[]
t_num33=[]
t_time44=[]
cc=0
ff=0
pp=0
while True:
if u_id[i]==u_id[i+1]: if ff==0: output.write(u_id[i])
output.write(' ')
ff=ff+1
if b_id[i]==b_id[i+1]:
# cc=cc+1
if int(t_id[i])==0:
t_num0=t_num0+1
elif int(t_id[i])==1:
t_num1=t_num1+1
elif int(t_id[i])==2:
t_num2=t_num2+1
else:
t_num3=t_num3+1
if b_time[i]!=b_time[i+1]:
# print b_time4
b_time4=b_time4+1
else:
j=j+1
# b_num1=b_num1+1
# print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3,b_time4
# if b_time4>=3:
# print b_time4
# pp=pp+1
# 数据拟合分析部分 t_num00.append(t_num0)
t_num11.append(t_num1)
t_num22.append(t_num2)
t_num33.append(t_num3)
t_time44.append(b_time4) # if t_num0>=10 :
# output.write(b_id[i]) #看了15次的没有买的
# output.write(",")
# elif b_time4>=3 :
# output.write(b_id[i]) #多天看的,没有买
# output.write(',')
# # if t_num0>15 and t_num1>=2:
# # output.write(b_id[i])
# # output.write(',')
# elif t_num2>=1 :
# output.write(b_id[i]) #收藏出可是没有买
# output.write(',')
# elif t_num3>=1 : #放进购物车可是没有买
# output.write(b_id[i])
# output.write(',')
# # if b_time4>=2 and t_num1>=2:
# # output.write(b_id[i])
# # output.write(',')
# #
# elif t_num1>=1:
# output.write(b_id[i]) #买过两次
# output.write(',')
# # if len(b_id)<=3:
# output.write(b_id[i])
# output.write(',') # if b_num1<=3:
# output.write(b_id[i])
# output.write(',')
#
#
t_num0=0
t_num1=0
t_num2=0
t_num3=0
b_time4=0
# elif b_num1<=3 and ff!=0:
# print b_id[i]
# output.write(b_id[i])
# output.write('\n')
# ff=0
## else:
elif not u_id[i+1]:
break
# else:
#
# output.write('\n')
# ff=0
# print u_id[i],b_id[i],t_num0,t_num1,t_num2,t_num3
i=i+1
b_num1=0 #分析浏览次数和购买的关系
y=np.array(t_num00)
x=np.array(t_num11) def residuals(p):
k,b=p
return y-(k*x-b) r=optimize.leastsq(residuals,[1,0])
k,b=r[0]
print "K=",k,"b=",b #分析收藏和购买的关系
x22=np.array(t_num22) def residuals(p):
k,b=p
return y-(k*x22-b) r=optimize.leastsq(residuals,[1,0])
k22,b22=r[0]
print "Kt_num22=",k22,"b22=",b22 #分析购物车和购买的关系
x33=np.array(t_num33) def residuals(p):
k,b=p
return y-(k*x33-b) r=optimize.leastsq(residuals,[1,0])
k33,b33=r[0]
print "kt_num33=",k33,"b33=",b33
#查看天数和购物关系
x44=np.array(t_time44) def residuals(p):
k,b=p
return y-(k*x44-b) r=optimize.leastsq(residuals,[1,0])
k44,b44=r[0]
print "ktime=",k44,"b44=",b44 print pp
#def sim_pearson()