一、hive sql 的练习
1)数据结构
字段 备注 详细描述
video id 视频唯一id(String) 11位字符串
uploader 视频上传者(String) 上传视频的用户名String
age 视频年龄(int) 视频在平台上的整数天
category 视频类别(Array<String>) 上传视频指定的视频分类
length 视频长度(Int) 整形数字标识的视频长度
views 观看次数(Int) 视频被浏览的次数
rate 视频评分(Double) 满分5分
Ratings 流量(Int) 视频的流量,整型数字
conments 评论数(Int) 一个视频的整数评论数
related ids 相关视频id(Array<String>) 相关视频的id,最多20个
2)准备工作
对数据创建外部表,数据存放在hdfs的/gulivideo/video目录下
create external table video_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
location ‘/gulivideo/video‘;
一般不会在文本文件上直接处理数据,一般会把我们建好的外部表的数据导入到内部表中
创建内部表
create table video_orc stored as orc tblproperties("orc.compress"="SNAPPY") as select * from video_ori;
3)统计视频观看数Top10
思路:使用order by按照views字段做一个全局排序即可,同时我们设置只显示前10条。
select
videoId,
views
from
video_orc
order by
views desc
limit 10;
4)统计视频类别热度Top10
--1. 将类别拆开 因为 类别是数组类型的
select
videoId,
categories,
views
from
video_orc
lateral view
explode(category) tbl as categories;
--2. 按照类别统计观看数综合,并排序取Top10
select
categories,
sum(views) hot
from
t1
goup by
categories
order by
hot desc
limit 10;
方法二:
select
categories,
sum(views) hot
from
video_orc
lateral view
explode(category) tbl as categories
group by
categories
order by
hot desc
limit 10;
5)统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
--1. 求Top20视频的类别
select
videoId,
category,
views
from
video_orc
order by
views desc
limit 20;
--2. 将类别炸开
select
videoId,
categories
from
t1
lateral view explode(category) tbl as categories;
--3. 统计个数
select
categories,
count(videoId) cnt
from
t2
group by
categories
order by cnt desc (可以省略这句);
6) 统计视频观看数Top50所关联视频的所属类别热度排序
--1. 统计Top50观看的视频以及其关联视频
select
videoId,
views,
relatedid
from
video_orc
order by
views desc
limit 50;
--2. 将关联视频炸开
select
explode(relatedid) videoId
from
t1;
--3. 将t2和video_orc表join,获取其种类
select
v.videoId,
v.category
from
video_orc v
join
t2
on v.videoId = t2.videoId;
--4. 炸开类别
select
explode(category) categories
from
t3
--类别热度表
select
categories,
sum(views) hot
from
video_orc lateral view explode(category) tbl as categories
group by
categories;
--5. 将炸开的类别和类别热度表关联,并按热度排序
select
distinct t4.categories,
ch.hot
from
t4
join
ch
on
t4.categories = ch.categories
order by
hot desc;
7)统计每个类别中的视频热度Top10,以Music为例
--1.生成中间表格
create table video_category stored as orc as
select
videoid,
uploader,
age,
categories,
length,
views,
rate,
ratings,
comments,
relatedid
from
video_orc
lateral view explode(category) tbl as categories;
--2. 统计Music类视频热度Top10
select
videoid,
views
from
video_category
where
categories="Music"
order by
views desc
limit 10;
8)统计每个类别视频观看数Top10
--1. 给分类内部视频按照观看数排名
select
categories,
videoid,
views,
rank() over(partition by categories order by views desc) rk
from
video_category;
--2. 取前十
select
categories,
videoid,
views
from
t1
where
rk<=10;
HIve sql的练习