第一题
需求:
我们有如下的用户访问数据:
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
要求使用SQL统计出每个用户的累积访问次数,如下表所示: 用户id 月份 小计 累积实现
数据准备*/
CREATE TABLE test_sql.test1 (
userId string,
visitDate string,
visitCount INT )
ROW format delimited FIELDS TERMINATED BY "\t";
INSERT INTO TABLE test_sql.test1
VALUES
( 'u01', '2017/1/21', 5 ),
( 'u02', '2017/1/23', 6 ),
( 'u03', '2017/1/22', 8 ),
( 'u04', '2017/1/20', 3 ),
( 'u01', '2017/1/23', 6 ),
( 'u01', '2017/2/21', 8 ),
( 'u02', '2017/1/23', 6 ),
( 'u01', '2017/2/22', 4 );
查询SQL:
SELECT t2.userid,
t2.visitmonth,
subtotal_visit_cnt,
sum(subtotal_visit_cnt) over (partition BY userid
ORDER BY visitmonth) AS total_visit_cnt
FROM
(SELECT userid,
visitmonth,
sum(visitcount) AS subtotal_visit_cnt
FROM
(SELECT userid,
date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') AS visitmonth,
visitcount
FROM test_sql.test1) t1
GROUP BY userid,
visitmonth)t2
ORDER BY t2.userid,
t2.visitmonth;
第二题
需求:
有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,
访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,数据如下:
请统计:
(1)每个店铺的UV(访客数)
(2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
数据准备:
CREATE TABLE test_sql.test2 (
user_id string,
shop string )
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE test_sql.test2 VALUES
( 'u1', 'a' ),
( 'u2', 'b' ),
( 'u1', 'b' ),
( 'u1', 'a' ),
( 'u3', 'c' ),
( 'u4', 'b' ),
( 'u1', 'a' ),
( 'u2', 'c' ),
( 'u5', 'b' ),
( 'u4', 'b' ),
( 'u6', 'c' ),
( 'u2', 'c' ),
( 'u1', 'b' ),
( 'u2', 'a' ),
( 'u2', 'a' ),
( 'u3', 'a' ),
( 'u5', 'a' ),
( 'u5', 'a' ),
( 'u5', 'a' );
查询SQL实现
方式1:
每个店铺的UV(访客数)
SELECT shop,count(DISTINCT user_id)
FROM test_sql.test2
GROUP BY shop;
方式2:
每个店铺的UV(访客数)
SELECT t.shop,count()
FROM
(SELECT user_id,shop
FROM test_sql.test2
GROUP BY user_id,shop) t
GROUP BY t.shop;
//每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
SELECT t2.shop,t2.user_id,t2.cnt
FROM
(SELECT t1.,
row_number() over(partition BY t1.shop ORDER BY t1.cnt DESC) rank
FROM
(SELECT user_id,shop,count(*) AS cnt
FROM test_sql.test2
GROUP BY user_id,shop) t1)t2
WHERE rank <= 3;
第三题
需求
已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。
数据样例:2017-01-01,10029028,1000003251,33.57。
请给出sql进行统计:
(1)给出 2017年每个月的订单数、用户数、总成交金额。
(2)给出2017年11月的新客数(指在11月才有第一笔订单)
实现
数据准备*/
CREATE TABLE test_sql.test3 (
dt string,
order_id string,
user_id string,
amount DECIMAL ( 10, 2 ) )
ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','10029028','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','10029029','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','100290288','1000003252',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','10029088','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','100290281','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-02-02','100290282','1000003253',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('2017-11-02','10290282','100003253',234);
INSERT INTO TABLE test_sql.test3 VALUES ('2018-11-02','10290284','100003243',234);
查询SQL
(1)给出 2017年每个月的订单数、用户数、总成交金额。
SELECT t1.mon,
count(t1.order_id) AS order_cnt,
count(DISTINCT t1.user_id) AS user_cnt,
sum(amount) AS total_amount
FROM
(SELECT order_id,
user_id,
amount,
date_format(dt,'yyyy-MM') mon
FROM test_sql.test3
WHERE date_format(dt,'yyyy') = '2017') t1
GROUP BY t1.mon;
(2)给出2017年11月的新客数(指在11月才有第一笔订单)
第三题第二问
select count(user_id)
from (
SELECT user_id
FROM test_sql.test3
GROUP BY user_id
HAVING date_format(min(dt), 'yyyy-MM') = '2017-01'
);
第三题 引申 第三问,每个月的新用户
select min_mon, count(user_id)
from (
SELECT user_id,
date_format(min(dt), 'yyyy-MM') min_mon
FROM test_sql.test3
GROUP BY user_id
) a
group by min_mon
order by min_mon;
第四题
需求
有一个5000万的用户文件(user_id,name,age),
一个2亿记录的用户看电影的记录文件(user_id,url),根据年龄段(每10岁)观看电影的次数进行排序?
数据准备:
CREATE TABLE test_sql.test4user
(
user_id string,
name string,
age int
);
CREATE TABLE test_sql.test4log
(
user_id string,
url string
);
INSERT INTO TABLE test_sql.test4user
VALUES ('001', 'u1', 10);
INSERT INTO TABLE test_sql.test4user
VALUES ('002', 'u2', 15);
INSERT INTO TABLE test_sql.test4user
VALUES ('003', 'u3', 15);
INSERT INTO TABLE test_sql.test4user
VALUES ('004', 'u4', 20);
INSERT INTO TABLE test_sql.test4user
VALUES ('005', 'u5', 25);
INSERT INTO TABLE test_sql.test4user
VALUES ('006', 'u6', 35);
INSERT INTO TABLE test_sql.test4user
VALUES ('007', 'u7', 40);
INSERT INTO TABLE test_sql.test4user
VALUES ('008', 'u8', 45);
INSERT INTO TABLE test_sql.test4user
VALUES ('009', 'u9', 50);
INSERT INTO TABLE test_sql.test4user
VALUES ('0010', 'u10', 65);
INSERT INTO TABLE test_sql.test4log
VALUES ('001', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('002', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('003', 'url2');
INSERT INTO TABLE test_sql.test4log
VALUES ('004', 'url3');
INSERT INTO TABLE test_sql.test4log
VALUES ('005', 'url3');
INSERT INTO TABLE test_sql.test4log
VALUES ('006', 'url1');
INSERT INTO TABLE test_sql.test4log
VALUES ('007', 'url5');
INSERT INTO TABLE test_sql.test4log
VALUES ('008', 'url7');
INSERT INTO TABLE test_sql.test4log
VALUES ('009', 'url5');
INSERT INTO TABLE test_sql.test4log
VALUES ('0010', 'url1');
查询SQL 思路一
SELECT t2.age_phase,
sum(t1.cnt) as view_cnt
FROM (SELECT user_id,
count(*) cnt
FROM test_sql.test4log
GROUP BY user_id) t1
JOIN(SELECT user_id,
CASE
WHEN age <= 10 AND age > 0 THEN '0-10'
WHEN age <= 20 AND age > 10 THEN '10-20'
WHEN age > 20 AND age <= 30 THEN '20-30'
WHEN age > 30 AND age <= 40 THEN '30-40'
WHEN age > 40 AND age <= 50 THEN '40-50'
WHEN age > 50 AND age <= 60 THEN '50-60'
WHEN age > 60 AND age <= 70 THEN '60-70'
ELSE '70以上' END as age_phase
FROM test_sql.test4user) t2 ON t1.user_id = t2.user_id
GROUP BY t2.age_phase;
思路二:
SELECT t2.age_phase,
sum(t1.cnt) as view_cnt
FROM (SELECT user_id,
count(*) cnt
FROM test_sql.test4log
GROUP BY user_id) t1
JOIN(SELECT user_id,
age,
concat(floor(age / 10) * 10, '-', (floor(age / 10) + 1) * 10) as age_phase
FROM test_sql.test4user) t2
ON t1.user_id = t2.user_id
GROUP BY t2.age_phase;
第五题
需求:
有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有
访问记录的用户)
--日期 用户 年龄
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
数据准备:
CREATE TABLE test5(
dt string,
user_id string,
age int)
ROW format delimited fields terminated BY ',';
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_3',39);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-12','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-13','test_1',23);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-15','test_2',19);
INSERT INTO TABLE test_sql.test5 VALUES ('2019-02-16','test_2',19);
查询SQL:
SELECT sum(total_user_cnt) total_user_cnt,
sum(total_user_avg_age) total_user_avg_age,
sum(two_days_cnt) two_days_cnt,
sum(avg_age) avg_age
FROM (SELECT 0 total_user_cnt,
0 total_user_avg_age,
count() AS two_days_cnt,
cast(sum(age) / count() AS decimal(5, 2)) AS avg_age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, age, dt, rank, date_sub(dt, rank) flag
FROM (SELECT dt,
user_id,
max(age) age,
row_number() over (PARTITION BY user_id
ORDER BY dt) rank
FROM test_sql.test5
GROUP BY dt, user_id) t1) t2
GROUP BY user_id, flag
HAVING count() >= 2) t3
GROUP BY user_id) t4
UNION ALL
SELECT count() total_user_cnt,
cast(sum(age) / count(*) AS decimal(5, 2)) total_user_avg_age,
0 two_days_cnt,
0 avg_age
FROM (SELECT user_id,
max(age) age
FROM test_sql.test5
GROUP BY user_id) t5) t6;
思路二:
SELECT b.total_user_cnt total_user_cnt,
b.total_user_avg_age total_user_avg_age,
a.two_days_cnt two_days_cnt,
a.avg_age avg_age
FROM (SELECT count() AS two_days_cnt,
cast(sum(age) / count() AS decimal(5, 2)) AS avg_age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, max(age) age
FROM (SELECT user_id, age, dt, rank, date_sub(dt, rank) flag
FROM (SELECT dt,
user_id,
max(age) age,
row_number() over (PARTITION BY user_id
ORDER BY dt) rank
FROM test_sql.test5
GROUP BY dt, user_id) t1) t2
GROUP BY user_id, flag
HAVING count() >= 2) t3
GROUP BY user_id) t4) a
CROSS JOIN
(SELECT count() total_user_cnt,
cast(sum(age) / count(*) AS decimal(5, 2)) total_user_avg_age
FROM (SELECT user_id,
max(age) age
FROM test_sql.test5
GROUP BY user_id) t5) b on 1 = 1