hive练习

1、查看数据

$ head orders.csv
order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2539329,1,prior,1,2,08,
2398795,1,prior,2,3,07,15.0
473747,1,prior,3,3,12,21.0
2254736,1,prior,4,4,07,29.0
431534,1,prior,5,4,15,28.0
3367565,1,prior,6,2,07,19.0
550135,1,prior,7,1,09,20.0
3108588,1,prior,8,1,14,14.0
2295261,1,prior,9,1,16,0.0


$ head order_products__prior.csv
order_id,product_id,add_to_cart_order,reordered
2,33120,1,1
2,28985,2,1
2,9327,3,0
2,45918,4,1
2,30035,5,0
2,17794,6,1
2,40141,7,1
2,1819,8,1
2,43668,9,0

 

2、创建表结构

create table orders(

order_id string,
user_id string,
eval_set string,
order_number bigint,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
)row format delimited fields terminated by ','
lines terminated by '\n';

create table order_products__prior(
order_id string,
product_id string,
add_to_cart_order string,
reordered string
)row format delimited fields terminated by ','
lines terminated by '\n';

3、每个用户有多少个订单[orders](聚合count)

select user_id,count(*) from orders group by user_id 
order by user_id desc limit 3;

99999    35
99998    6
99997    4

4、每个用户买了多少商品(表连接)

select user_id,count(*) from 
orders join order_products__prior
on(orders.order_id=order_products__prior.order_id) 
group by user_id
order by user_id desc limit 3;
99999    329
99998    21
99997    38

5、每个用户平均每个订单是多少商品(聚合函数avg)

select user_id,avg(cnt)
from orders join
(
select order_id,count(*) as cnt from order_products__prior group by order_id)t
on (orders.order_id=t.order_id)
group by user_id
order by user_id desc limit 3;

99999    9.676470588235293
99998    4.2
99997    12.666666666666666

6、每个用户一周中的订单分布(行装列,case when)

select user_id,
sum(case when order_dow=='0' then '1' else '0' end )as dow_0,
sum(case when order_dow=='1' then '1' else '0' end )as dow_1,
sum(case when order_dow=='2' then '1' else '0' end )as dow_2,
sum(case when order_dow=='3' then '1' else '0' end )as dow_3,
sum(case when order_dow=='4' then '1' else '0' end )as dow_4,
sum(case when order_dow=='5' then '1' else '0' end )as dow_5,
sum(case when order_dow=='6' then '1' else '0' end )as dow_6
from orders
group by user_id
order by user_id desc
limit 10;

7、每个用户平均每个购买天中,购买的商品数量(if补充缺失值)

--方法1
select user_id,avg(s_cnt) as avg_cnt
from
(select user_id,
if(days_since_prior_order="",0,days_since_prior_order) as days_since_prior_order,
sum(cnt) as s_cnt from 
orders
join 
(select order_id,count(*) as cnt from order_products__prior group by order_id)t
on (orders.order_id=t.order_id)
group by user_id,days_since_prior_order)t2
group by user_id
order by user_id desc limit 3;

--方法二
select user_id,sum(cnt)/count(distinct days_since_prior_order)
from orders 
join
(select order_id,count(*) as cnt from order_products__prior group by order_id)t
on 
(orders.order_id=t.order_id)
group by user_id
order by user_id desc limit 3;

8、每个用户最喜欢的3个商品(字段链接concat_ws,列转行concat_list或者concat_set,排序row_number() over(partition by x order by y) as z)

select user_id,collect_list(concat_ws("_",product_id,cast(r as string))) as top3
from
(
select user_id, product_id,
row_number() over(partition by user_id order by cnt desc) as r,
cnt
from
(select user_id,product_id,count(*) as cnt
from orders
join order_products__prior
on(orders.order_id=order_products__prior.order_id)
group by user_id,product_id
)t1
)t2
where r<=3
group by user_id
order by user_id desc
limit 10;

 总结: 

1、函数的使用:

2、hive优化:

上一篇:Image Processing Using Multi-Code GAN Prior, CVPR2020


下一篇:线性表练习之ZUMA