create table user
(
id BIGINT,
active_time STRING,
status BIGINT,
amount double
)
窗口函数
1.排序:
i.Row_number()
select row_number() over(partition by id order by active_time) as rn,id from user
1,2,3,4 active_time相同时,行数不同
ii.rank()
select rank() over(partition by id order by active_time) as rn,id from user
1,2,3,4 active_time相同时,行数相同,有多少数有多少行
iii.dense_rank()
select dense_rank() over(partition by id order by active_time) as rn,id from user
1,2,2,3 active_time相同时,行数相同,有多少去重数有多少行
2.分组:
i.ntile(n) over (partition by id order by active_time) as group_num,id from user
ii.随机分堆:ntile(n) over (partition by id order by dbms_random.value) as group_num,id from user
3.取值:
i.lag(id, n, default)迟滞 对某一列往后错行
select lag(active_time,1,’1970-01-01 00:00:00’) over(partition by id order by active_time) as last_time from user
ii.lead(id, n, default)超前 对某一列往前错行
select lead(active_time,1,’2077-01-01 00:00:00’) over(partition by id order by active_time) as next_time from user
4.聚合:
i.至今累计花费
select id,sum(amount) over(partition by id) from user
ii.之前3次平均花费
select id,avg(amount) over (partition by id order by active_time rows between 3 preceding and current row) from user
iii.之前两次和之后两次的最大开销
select id,max(amount) over (partition by id order by active_time rows between 2 preceding and 2 following) from user
iv.未来的最小开销
select id,min(amount) over (partition by id order by active_time rows between current row and unbounded following ) from user
5.window字句:
preceding 向前移动,following 向后移动,current row 当前行, unbounded 起点
增强聚合
1.grouping sets:可以用于代替多个union all
select nvl(id,’all’),nvl(status,’all’),sum(amount) from user group by id ,status grouping
sets((id, status),id,status,())
=
select id, status,sum(amount) from user group by id, status
union all
select id,’all’,sum(amount) from user group by id
union all
select ‘all’,status,sum(amount) from user group by status
union all
select ‘all’,’all’,sum(amount) from user
2.with cube:列出所有可能
select nvl(id,’all’),nvl(status,’all’),nvl(active_time,’all’),sum(amount) from user group by id, status,active_time with cube
=
select nvl(id,’all’),nvl(status,’all’),nvl(active_time,’all’),sum(amount) from user group by id, status,active_time sets((id, status,active_time),(id, status),(id,active_time),(status,active_time),id, status,active_time,())
3.with rollup:根据group by 顺序逐层上钻
select nvl(id,’all’),nvl(status,’all’),nvl(active_time,’all’),sum(amount) from user group by id, status,active_time with rollup
=
select nvl(id,’all’),nvl(status,’all’),nvl(active_time,’all’),sum(amount) from user group by id, status,active_time sets((id, status,active_time),(id, status),id,())
列转行
1.concat():多个字符连接 输出成String
2.concat_ws():用分隔符分割数组中的数据
3.collect_set :把聚合的数据根据group by合成一个数组 同时去重
4.collect_list:同上 但不去重
5.select status,concat_ws(‘,’,collect_set(user_id)) from user group by status
行转列
1.explode:将hive一行中负责的array或者map结构拆分成多行。
2.lateral view 侧视图 一个语句生成把单行数据拆解成多行后的数据结果集
3.例子:
排序
1.order by:全局排序 只用一个reducer
2.sort by:部分排序(每个reducer上排序)
分类
1.group by
2.distribute by 指定分配到reducer的数据
i.示例:
select * from tab cluster by id 相同id的数据分配到同一个reducer上排序
3.cluster by 指定分配到reducer的数据后 在reducer上排序 只能升序
i.示例:
select * from tab cluster by id
=
select* from tab distribute by id sort by id
ii.先分区再排序 (分区数量小于字段种类时,就有意义)