Hive指标案例
准备数据
先创建txt表导入数据,然后将表数据导入ORC表中
-- createtable.hql
drop database sale cascade;
create database if not exists sale;
create table sale.dimdate_ori(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
)
row format delimited
fields terminated by ",";
create table sale.sale_ori(
orderid string,
locationid string,
dt date
)
row format delimited
fields terminated by ",";
create table sale.saledetail_ori(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)
row format delimited
fields terminated by ",";
create table sale.dimdate(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
) stored as orc;
create table sale.sale(
orderid string,
locationid string,
dt date
) stored as orc;
create table sale.saledetail(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)stored as orc;
hive -f createtable.hql
-- 加载数据
use sale;
load data local inpath "/home/hadoop/data/hive/tbDate.txt" overwrite into table dimdate_ori;
load data local inpath "/home/hadoop/data/hive/tbSale.txt" overwrite into table sale_ori;
load data local inpath "/home/hadoop/data/hive/tbSaleDetail.txt" overwrite into table saledetail_ori;
-- 导入数据
insert into table dimdate select * from dimdate_ori;
insert into table sale select * from sale_ori;
insert into table saledetail select * from saledetail_ori;
hive -f loaddata.hql
表结构
日期表(dimdate)
dt date 日期
yearmonth int 年月
year smallint 年
month tinyint 月
day tinyint 日
week tinyint 周几
weeks tinyint 第几周
quat tinyint 季度
tendays tinyint 旬
halfmonth tinyint 半月
订单表(sale)
orderid string 订单号
locationid string 交易位置
dt date 交易日期
订单销售明细表(saledetail)
orderid string 订单号
rownum int 行号
itemid string 货品
num int 数量
price double 单价
amount double 金额
指标计算
1.按年统计销售额
SELECT year(B.dt) year, round(sum(A.amount)/10000, 2) amount
FROM saledetail A join sale B on A.orderid=B.orderid
group by year(B.dt);
2.销售金额在 10W 以上的订单
SELECT orderid, round(sum(amount), 2) amount
FROM saledetail
group by orderid
having sum(amount) > 100000
3.每年销售额的差值
思路:先算出每年的销售额,再算出每年销售额的差值
select
year,
round(amount,2) amount,
round(lag(amount) over (order by year),2) prioramount,
round(amount - lag(amount) over (order by year),2) diff
from (
SELECT year(B.dt) year, sum(A.amount) amount
from saledetail A
join sale B on A.orderid=B.orderid
group by year(B.dt)
) tmp;
4.年度订单金额前10位(年度、订单号、订单金额、排名)
思路:
1. 第一层先求出对应的年度和订单号,订单金额,然后排序
2. 第二层选出rank < 11 的订单条数
SELECT dt, orderid, amount,rank from (
SELECT dt, orderid, amount,dense_rank() over(PARTITION BY dt ORDER BY amount desc) rank
from(SELECT year(B.dt) dt,A.orderid, sum(A.amount) amount
from saledetail A
join sale B on A.orderid=B.orderid
group by year(B.dt),A.orderid) tmp1) tmp2
where rank < 11;
5.季度订单金额前10位(年度、季度、订单id、订单金额、排名)
思路:Join dimdate 表获取quat信息
with tmp as (
select C.year, C.quat, A.orderid, round(sum(B.amount), 2)
amount
from sale A join saledetail B on A.orderid=B.orderid
join dimdate C on A.dt=C.dt
group by C.year, C.quat, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order
by amount desc) rank
from tmp
) tmp1
where rank <= 10;
6.求所有交易日中订单金额最高的前10位
with tmp as (
select A.dt, A.orderid, round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by A.dt, A.orderid
)
select dt, orderid, amount, rank
from (
select dt, orderid, amount, dense_rank() over(order by
amount desc) rank
from tmp
) tmp1
where rank <= 10;
7.每年度销售额最大的交易日
with tmp as (
select dt, amount, dense_rank() over (partition by year(dt)
order by amount desc) as rank
from (select A.dt, round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by A.dt) tab1
)
select year(dt) as year, dt, amount
from tmp
where rank=1;
8.年度最畅销的商品(即每年销售金额最大的商品)
with tmp as (
select year(B.dt) year, goods, round(sum(amount),2) amount
from saledetail A join sale B on A.orderid=B.orderid
group by year(B.dt), goods
)
select year, goods, amount
from (select year, goods, amount, dense_rank() over
(partition by year order by amount desc) rank
from tmp) tmp1
where rank = 1;