Note:
Hive version: 2.1.1
Test Data
-- create table
create table dept
(
deptno int,
dname varchar(14),
loc varchar(13)
);
insert into dept(deptno, dname, loc)
values ('10', 'accounting', 'new york');
insert into dept(deptno, dname, loc)
values ('20', 'research', 'dallas');
insert into dept(deptno, dname, loc)
values ('30', 'sales', 'chicago');
insert into dept(deptno, dname, loc)
values ('40', 'operations', 'boston');
-- create table
create table emp
(
empno int,
ename varchar(10),
job varchar(9),
mgr int,
hiredate date,
sal decimal(7,2),
comm decimal(7,2),
deptno int
) ;
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7369', 'smith', 'clerk', '7902','1980-12-17', '800', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7499', 'allen', 'salesman', '7698', '1981-02-20', '1600', '300', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7521', 'ward', 'salesman', '7698', '1981-02-22', '1250', '500', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7566', 'jones', 'manager', '7839', '1981-04-02', '2975', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7654', 'martin', 'salesman', '7698', '1981-09-28', '1250', '1400', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7698', 'blake', 'manager', '7839', '1981-05-01', '2850', null, '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7782', 'clark', 'manager', '7839', '1981-06-09', '2450', null, '10');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7788', 'scott', 'analyst', '7566', '1987-06-13', '3000', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7839', 'king', 'president', null, '1981-11-17', '5000', null, '10');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7844', 'turner', 'salesman', '7698', '1981-09-08', '1500', '0', '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7876', 'adams', 'clerk', '7788', '1987-06-13', '1100', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7900', 'james', 'clerk', '7698', '1981-12-03', '950', null, '30');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7902', 'ford', 'analyst', '7566', '1981-12-03', '3000', null, '20');
insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7934', 'miller', 'clerk', '7782', '1982-01-23', '1300', null, '10');
create table salgrade
(
grade int,
losal int,
hisal int
) ;
insert into salgrade(grade, losal, hisal)
values ('1', '700', '1200');
insert into salgrade(grade, losal, hisal)
values ('2', '1201', '1400');
insert into salgrade(grade, losal, hisal)
values ('3', '1401', '2000');
insert into salgrade(grade, losal, hisal)
values ('4', '2001', '3000');
insert into salgrade(grade, losal, hisal)
values ('5', '3001', '9999');
create table bonus
(
ename varchar(10),
job varchar(9),
sal int,
comm int
);
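(Note: the quoted numeric and date literals in the inserts above are implicitly cast by Hive to the declared int, decimal, and date column types.)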
1. Overview of the Hive WITH Statement
When developing HQL, complex reporting logic often forces you to nest temporary tables several levels deep, and the code quickly becomes hard to follow. In such cases the WITH clause can be used to encapsulate those temporary tables as named subqueries.
Advantages:
– Modular code
– Better readability
– A query used in several places is defined only once
Syntax:
with subquery_name1 as (subquery_body1),
subquery_name2 as (subquery_body2)
...
select * from subquery_name1 a, subquery_name2 b
where a.col = b.col
...
I strongly recommend using the WITH clause.
Without it, a complex query piles inline temporary tables into the FROM clause, and for particularly involved requirements those inline tables may themselves nest further inline tables, which makes the code very hard to read and maintain.
With the WITH clause you define each temporary result set up front, and later CTEs can reference the ones defined before them in order.
This greatly reduces code complexity and improves readability and maintainability.
In addition, when several parts of a query need the same intermediate result, wrapping it in WITH gives you one definition with multiple references. Note that by default Hive simply inlines each CTE reference into the plan; if hive.optimize.cte.materialize.threshold is set to a positive value, a CTE referenced at least that many times is materialized and computed only once, which can reduce the number of scans of the underlying table.
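For example, a later CTE can build on an earlier one. Below is a minimal sketch against the emp table above (the CTE names dept_avg and high_avg are invented for this illustration): the second CTE filters the output of the first.
-- dept_avg: average salary per department
-- high_avg: reuses dept_avg, keeping only departments whose average exceeds 2000
with dept_avg as
(select deptno, round(avg(nvl(sal, 0)), 2) avg_sal
from emp
group by deptno),
high_avg as
(select deptno, avg_sal
from dept_avg
where avg_sal > 2000)
select * from high_avg;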
2. Hive WITH Statement Test Case
Requirement: for each department, compute the average salary, as well as the average salary after excluding interns who earn less than 1000.
Code:
-- Average salary per department, and the average after excluding interns earning less than 1000
-- The main query's FROM clause carries two inline temporary tables, so readability suffers
select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
from dept d
left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
from emp e1
group by e1.deptno) tmp1
on d.deptno = tmp1.deptno
left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
from emp e1
where e1.sal > 1000
group by e1.deptno) tmp2
on d.deptno = tmp2.deptno;
-- Average salary per department, and the average after excluding interns earning less than 1000
-- The two temp-table definitions are now encapsulated as WITH subqueries, so readability improves
with tmp1 as
(select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
from emp e1
group by e1.deptno),
tmp2 as
(select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
from emp e1
where e1.sal > 1000
group by e1.deptno)
select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
from dept d
left join tmp1
on d.deptno = tmp1.deptno
left join tmp2
on d.deptno = tmp2.deptno;
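One thing to keep in mind: a WITH clause is scoped to the single statement that follows it. tmp1 and tmp2 exist only for the duration of this query; nothing is created in the metastore.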
Test log:
hive>
> select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
> from dept d
> left join (select e1.deptno, round(avg(ifnull(e1.sal, 0)), 2) avg_sal
> from emp e1
> group by e1.deptno) tmp1
> on d.deptno = tmp1.deptno
> left join (select e1.deptno, round(avg(ifnull(e1.sal, 0)), 2) avg_sal
> from emp e1
> where e1.sal > 1000
> group by e1.deptno) tmp2
> on d.deptno = tmp2.deptno;
FAILED: SemanticException [Error 10011]: Line 3:41 Invalid function 'ifnull'
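(ifnull is a MySQL function that Hive does not provide; the Hive equivalents are nvl and coalesce, which the retry below uses.)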
hive>
> select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
> from dept d
> left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
> from emp e1
> group by e1.deptno) tmp1
> on d.deptno = tmp1.deptno
> left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
> from emp e1
> where e1.sal > 1000
> group by e1.deptno) tmp2
> on d.deptno = tmp2.deptno;
Query ID = root_20201217112303_9a1a8138-9748-4a61-8030-173f0b23aaa7
Total jobs = 4
Launching Job 1 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0263, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0263/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0263
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 11:23:10,448 Stage-1 map = 0%, reduce = 0%
2020-12-17 11:23:16,613 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 2.94 sec
2020-12-17 11:23:17,644 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 5.76 sec
2020-12-17 11:23:23,819 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 9.1 sec
MapReduce Total cumulative CPU time: 9 seconds 100 msec
Ended Job = job_1606698967173_0263
Launching Job 2 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0264, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0264/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0264
Hadoop job information for Stage-3: number of mappers: 2; number of reducers: 1
2020-12-17 11:23:35,851 Stage-3 map = 0%, reduce = 0%
2020-12-17 11:23:42,132 Stage-3 map = 100%, reduce = 0%, Cumulative CPU 6.93 sec
2020-12-17 11:23:47,275 Stage-3 map = 100%, reduce = 100%, Cumulative CPU 10.38 sec
MapReduce Total cumulative CPU time: 10 seconds 380 msec
Ended Job = job_1606698967173_0264
Stage-7 is selected by condition resolver.
Stage-2 is filtered out by condition resolver.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/log4j-slf4j-impl-2.8.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
2020-12-17 11:23:54 Starting to launch local task to process map join; maximum memory = 1908932608
2020-12-17 11:23:55 Dump the side-table for tag: 1 with group count: 3 into file: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile01--.hashtable
2020-12-17 11:23:55 Uploaded 1 File to: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile01--.hashtable (331 bytes)
2020-12-17 11:23:55 Dump the side-table for tag: 2 with group count: 3 into file: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile02--.hashtable
2020-12-17 11:23:55 Uploaded 1 File to: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile02--.hashtable (331 bytes)
2020-12-17 11:23:55 End of local task; Time Taken: 0.924 sec.
Execution completed successfully
MapredLocal task succeeded
Launching Job 4 out of 4
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1606698967173_0265, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0265/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0265
Hadoop job information for Stage-5: number of mappers: 2; number of reducers: 0
2020-12-17 11:24:03,551 Stage-5 map = 0%, reduce = 0%
2020-12-17 11:24:10,761 Stage-5 map = 100%, reduce = 0%, Cumulative CPU 4.07 sec
MapReduce Total cumulative CPU time: 4 seconds 70 msec
Ended Job = job_1606698967173_0265
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 9.1 sec HDFS Read: 17286 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-3: Map: 2 Reduce: 1 Cumulative CPU: 10.38 sec HDFS Read: 17206 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-5: Map: 2 Cumulative CPU: 4.07 sec HDFS Read: 14497 HDFS Write: 288 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 23 seconds 550 msec
OK
20 2175.00 2518.75
30 1566.67 1690.00
40 NULL NULL
10 2916.67 2916.67
Time taken: 68.583 seconds, Fetched: 4 row(s)
hive>
> with tmp1 as
> (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
> from emp e1
> group by e1.deptno),
> tmp2 as
> (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
> from emp e1
> where e1.sal > 1000
> group by e1.deptno)
> select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
> from dept d
> left join tmp1
> on d.deptno = tmp1.deptno
> left join tmp2
> on d.deptno = tmp2.deptno;
Query ID = root_20201217112424_4e872f85-1ab0-4e2b-ab5a-64fa9a231730
Total jobs = 4
Launching Job 1 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0266, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0266/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0266
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 11:24:30,690 Stage-1 map = 0%, reduce = 0%
2020-12-17 11:24:37,886 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 6.36 sec
2020-12-17 11:24:44,067 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 9.65 sec
MapReduce Total cumulative CPU time: 9 seconds 650 msec
Ended Job = job_1606698967173_0266
Launching Job 2 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0267, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0267/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0267
Hadoop job information for Stage-3: number of mappers: 2; number of reducers: 1
2020-12-17 11:24:56,091 Stage-3 map = 0%, reduce = 0%
2020-12-17 11:25:03,307 Stage-3 map = 50%, reduce = 0%, Cumulative CPU 3.41 sec
2020-12-17 11:25:04,335 Stage-3 map = 100%, reduce = 0%, Cumulative CPU 6.89 sec
2020-12-17 11:25:09,486 Stage-3 map = 100%, reduce = 100%, Cumulative CPU 10.29 sec
MapReduce Total cumulative CPU time: 10 seconds 290 msec
Ended Job = job_1606698967173_0267
Stage-7 is selected by condition resolver.
Stage-2 is filtered out by condition resolver.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/log4j-slf4j-impl-2.8.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
2020-12-17 11:25:16 Starting to launch local task to process map join; maximum memory = 1908932608
Execution completed successfully
MapredLocal task succeeded
Launching Job 4 out of 4
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1606698967173_0268, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0268/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job -kill job_1606698967173_0268
Hadoop job information for Stage-5: number of mappers: 2; number of reducers: 0
2020-12-17 11:25:25,836 Stage-5 map = 0%, reduce = 0%
2020-12-17 11:25:32,009 Stage-5 map = 50%, reduce = 0%, Cumulative CPU 2.13 sec
2020-12-17 11:25:33,038 Stage-5 map = 100%, reduce = 0%, Cumulative CPU 4.17 sec
MapReduce Total cumulative CPU time: 4 seconds 170 msec
Ended Job = job_1606698967173_0268
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 9.65 sec HDFS Read: 17509 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-3: Map: 2 Reduce: 1 Cumulative CPU: 10.29 sec HDFS Read: 17429 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-5: Map: 2 Cumulative CPU: 4.17 sec HDFS Read: 14519 HDFS Write: 288 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 24 seconds 110 msec
OK
20 2175.00 2518.75
30 1566.67 1690.00
40 NULL NULL
10 2916.67 2916.67
Time taken: 69.438 seconds, Fetched: 4 row(s)
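Both versions return the same four rows, and both compile to the same four MapReduce jobs, which suggests that in this version the CTEs are simply inlined rather than materialized. deptno 40 (operations) shows NULL averages because it has no employees; the left join from dept preserves the row.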
hive>