大数据开发之Hive篇10-Hive高级查询with语句

备注:
Hive 版本 2.1.1

文章目录

测试数据

-- create table
create table dept
(
  deptno int,
  dname  varchar(14),
  loc    varchar(13)
);

insert into dept(deptno, dname, loc)
values ('10', 'accounting', 'new york');

insert into dept(deptno, dname, loc)
values ('20', 'research', 'dallas');

insert into dept(deptno, dname, loc)
values ('30', 'sales', 'chicago');

insert into dept(deptno, dname, loc)
values ('40', 'operations', 'boston');


-- create table
create table emp
(
  empno    int,
  ename    varchar(10),
  job      varchar(9),
  mgr      int,
  hiredate date,
  sal      decimal(7,2),
  comm     decimal(7,2),
  deptno   int
) ;

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7369', 'smith', 'clerk', '7902','1980-12-17', '800', null, '20');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7499', 'allen', 'salesman', '7698', '1981-02-20', '1600', '300', '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7521', 'ward', 'salesman', '7698', '1981-02-22', '1250', '500', '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7566', 'jones', 'manager', '7839', '1981-04-02', '2975', null, '20');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7654', 'martin', 'salesman', '7698', '1981-09-28', '1250', '1400', '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7698', 'blake', 'manager', '7839', '1981-05-01', '2850', null, '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7782', 'clark', 'manager', '7839', '1981-06-09', '2450', null, '10');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7788', 'scott', 'analyst', '7566', '1987-06-13', '3000', null, '20');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7839', 'king', 'president', null, '1981-11-17', '5000', null, '10');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7844', 'turner', 'salesman', '7698', '1981-09-08', '1500', '0', '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7876', 'adams', 'clerk', '7788', '1987-06-13', '1100', null, '20');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7900', 'james', 'clerk', '7698', '1981-12-03', '950', null, '30');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7902', 'ford', 'analyst', '7566', '1981-12-03', '3000', null, '20');

insert into emp(empno, ename, job, mgr, hiredate, sal, comm, deptno)
values ('7934', 'miller', 'clerk', '7782', '1982-01-23', '1300', null, '10');


create table salgrade
(
  grade int,
  losal int,
  hisal int
) ;


insert into salgrade(grade, losal, hisal)
values ('1', '700', '1200');

insert into salgrade(grade, losal, hisal)
values ('2', '1201', '1400');

insert into salgrade(grade, losal, hisal)
values ('3', '1401', '2000');

insert into salgrade(grade, losal, hisal)
values ('4', '2001', '3000');

insert into salgrade(grade, losal, hisal)
values ('5', '3001', '9999');


create table bonus
(
  ename varchar(10),
  job   varchar(9),
  sal   int,
  comm  int
);

一.Hive with语句概述

在进行HQL开发的过程中,对于复杂的报表逻辑,经常需要嵌套多层临时表,代码逻辑复杂,此时可以使用with将临时表进行封装。

优势
– 代码模块化
– 代码可读性增强
– 相同查询唯一化

语法:

with subquery_name1 as (subquery_body1),
        subquery_name2 as (subquery_body2)
...
select * from subquery_name1 a, subquery_name2 b
where a.col = b.col
...

个人强烈推荐大家使用with语句。
没有with语句,进行复杂查询的时候,from子句后面一堆的临时表,而且面临特别复杂的需求的时候,from里的临时表里面可能还会嵌套临时表,代码量可读性、可维护性特别差。
with语句推出后,可以通过with语句封装好各个临时表,临时表直接可以根据顺序相互引用。
大大简化了代码的复杂程度,提升代码的可读性、可维护性。

另外,如果查询中多个临时表都用到某个表,可以通过with进行封装,达到一次查询多次引用,减少访问表的次数,在一定程度上也可以达到优化的效果。

二.Hive with语句测试案例

需求:求每个部门的平均工资,以及剔除薪资低于1000的实习人员之后的平均工资

代码:

-- 求每个部门的平均工资,以及剔除薪资低于1000的实习人员之后的平均工资
-- 主查询的from后面跟了2个临时表,程序可读性不佳
select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
  from dept d
  left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
               from emp e1
              group by e1.deptno) tmp1
    on d.deptno = tmp1.deptno
  left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
               from emp e1
              where e1.sal > 1000
              group by e1.deptno) tmp2
    on d.deptno = tmp2.deptno;
    
    
-- 求每个部门的平均工资,以及剔除薪资低于1000的实习人员之后的平均工资
-- 2个临时表的定时语句通过with封装成子查询了,程序可读性增强
with tmp1 as
 (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    from emp e1
   group by e1.deptno),
tmp2 as
 (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    from emp e1
   where e1.sal > 1000
   group by e1.deptno)
select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
  from dept d
  left join tmp1
    on d.deptno = tmp1.deptno
  left join tmp2
    on d.deptno = tmp2.deptno;

测试记录:

hive> 
    > 
    > 
    > 
    > select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
    >   from dept d
    >   left join (select e1.deptno, round(avg(ifnull(e1.sal, 0)), 2) avg_sal
    >                from emp e1
    >               group by e1.deptno) tmp1
    >     on d.deptno = tmp1.deptno
    >   left join (select e1.deptno, round(avg(ifnull(e1.sal, 0)), 2) avg_sal
    >                from emp e1
    >               where e1.sal > 1000
    >               group by e1.deptno) tmp2
    >     on d.deptno = tmp2.deptno;
FAILED: SemanticException [Error 10011]: Line 3:41 Invalid function 'ifnull'
hive> 
    > select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
    >   from dept d
    >   left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    >                from emp e1
    >               group by e1.deptno) tmp1
    >     on d.deptno = tmp1.deptno
    >   left join (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    >                from emp e1
    >               where e1.sal > 1000
    >               group by e1.deptno) tmp2
    >     on d.deptno = tmp2.deptno;
Query ID = root_20201217112303_9a1a8138-9748-4a61-8030-173f0b23aaa7
Total jobs = 4
Launching Job 1 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0263, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0263/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0263
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 11:23:10,448 Stage-1 map = 0%,  reduce = 0%
2020-12-17 11:23:16,613 Stage-1 map = 50%,  reduce = 0%, Cumulative CPU 2.94 sec
2020-12-17 11:23:17,644 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 5.76 sec
2020-12-17 11:23:23,819 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 9.1 sec
MapReduce Total cumulative CPU time: 9 seconds 100 msec
Ended Job = job_1606698967173_0263
Launching Job 2 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0264, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0264/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0264
Hadoop job information for Stage-3: number of mappers: 2; number of reducers: 1
2020-12-17 11:23:35,851 Stage-3 map = 0%,  reduce = 0%
2020-12-17 11:23:42,132 Stage-3 map = 100%,  reduce = 0%, Cumulative CPU 6.93 sec
2020-12-17 11:23:47,275 Stage-3 map = 100%,  reduce = 100%, Cumulative CPU 10.38 sec
MapReduce Total cumulative CPU time: 10 seconds 380 msec
Ended Job = job_1606698967173_0264
Stage-7 is selected by condition resolver.
Stage-2 is filtered out by condition resolver.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/log4j-slf4j-impl-2.8.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
2020-12-17 11:23:54     Starting to launch local task to process map join;      maximum memory = 1908932608
2020-12-17 11:23:55     Dump the side-table for tag: 1 with group count: 3 into file: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile01--.hashtable
2020-12-17 11:23:55     Uploaded 1 File to: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile01--.hashtable (331 bytes)
2020-12-17 11:23:55     Dump the side-table for tag: 2 with group count: 3 into file: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile02--.hashtable
2020-12-17 11:23:55     Uploaded 1 File to: file:/tmp/root/ce61dccc-9c42-4d5a-bbbe-bbd070e6720d/hive_2020-12-17_11-23-03_261_732316802191620401-1/-local-10005/HashTable-Stage-5/MapJoin-mapfile02--.hashtable (331 bytes)
2020-12-17 11:23:55     End of local task; Time Taken: 0.924 sec.
Execution completed successfully
MapredLocal task succeeded
Launching Job 4 out of 4
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1606698967173_0265, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0265/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0265
Hadoop job information for Stage-5: number of mappers: 2; number of reducers: 0
2020-12-17 11:24:03,551 Stage-5 map = 0%,  reduce = 0%
2020-12-17 11:24:10,761 Stage-5 map = 100%,  reduce = 0%, Cumulative CPU 4.07 sec
MapReduce Total cumulative CPU time: 4 seconds 70 msec
Ended Job = job_1606698967173_0265
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 2  Reduce: 1   Cumulative CPU: 9.1 sec   HDFS Read: 17286 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-3: Map: 2  Reduce: 1   Cumulative CPU: 10.38 sec   HDFS Read: 17206 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-5: Map: 2   Cumulative CPU: 4.07 sec   HDFS Read: 14497 HDFS Write: 288 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 23 seconds 550 msec
OK
20      2175.00 2518.75
30      1566.67 1690.00
40      NULL    NULL
10      2916.67 2916.67
Time taken: 68.583 seconds, Fetched: 4 row(s)
hive> 
    > with tmp1 as
    >  (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    >     from emp e1
    >    group by e1.deptno),
    > tmp2 as
    >  (select e1.deptno, round(avg(nvl(e1.sal, 0)), 2) avg_sal
    >     from emp e1
    >    where e1.sal > 1000
    >    group by e1.deptno)
    > select d.deptno, tmp1.avg_sal avg_sal1, tmp2.avg_sal avg_sal2
    >   from dept d
    >   left join tmp1
    >     on d.deptno = tmp1.deptno
    >   left join tmp2
    >     on d.deptno = tmp2.deptno;
Query ID = root_20201217112424_4e872f85-1ab0-4e2b-ab5a-64fa9a231730
Total jobs = 4
Launching Job 1 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0266, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0266/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0266
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2020-12-17 11:24:30,690 Stage-1 map = 0%,  reduce = 0%
2020-12-17 11:24:37,886 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 6.36 sec
2020-12-17 11:24:44,067 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 9.65 sec
MapReduce Total cumulative CPU time: 9 seconds 650 msec
Ended Job = job_1606698967173_0266
Launching Job 2 out of 4
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1606698967173_0267, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0267/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0267
Hadoop job information for Stage-3: number of mappers: 2; number of reducers: 1
2020-12-17 11:24:56,091 Stage-3 map = 0%,  reduce = 0%
2020-12-17 11:25:03,307 Stage-3 map = 50%,  reduce = 0%, Cumulative CPU 3.41 sec
2020-12-17 11:25:04,335 Stage-3 map = 100%,  reduce = 0%, Cumulative CPU 6.89 sec
2020-12-17 11:25:09,486 Stage-3 map = 100%,  reduce = 100%, Cumulative CPU 10.29 sec
MapReduce Total cumulative CPU time: 10 seconds 290 msec
Ended Job = job_1606698967173_0267
Stage-7 is selected by condition resolver.
Stage-2 is filtered out by condition resolver.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/log4j-slf4j-impl-2.8.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/jars/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
2020-12-17 11:25:16     Starting to launch local task to process map join;      maximum memory = 1908932608
Execution completed successfully
MapredLocal task succeeded
Launching Job 4 out of 4
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1606698967173_0268, Tracking URL = http://hp1:8088/proxy/application_1606698967173_0268/
Kill Command = /opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/hadoop/bin/hadoop job  -kill job_1606698967173_0268
Hadoop job information for Stage-5: number of mappers: 2; number of reducers: 0
2020-12-17 11:25:25,836 Stage-5 map = 0%,  reduce = 0%
2020-12-17 11:25:32,009 Stage-5 map = 50%,  reduce = 0%, Cumulative CPU 2.13 sec
2020-12-17 11:25:33,038 Stage-5 map = 100%,  reduce = 0%, Cumulative CPU 4.17 sec
MapReduce Total cumulative CPU time: 4 seconds 170 msec
Ended Job = job_1606698967173_0268
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 2  Reduce: 1   Cumulative CPU: 9.65 sec   HDFS Read: 17509 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-3: Map: 2  Reduce: 1   Cumulative CPU: 10.29 sec   HDFS Read: 17429 HDFS Write: 164 HDFS EC Read: 0 SUCCESS
Stage-Stage-5: Map: 2   Cumulative CPU: 4.17 sec   HDFS Read: 14519 HDFS Write: 288 HDFS EC Read: 0 SUCCESS
Total MapReduce CPU Time Spent: 24 seconds 110 msec
OK
20      2175.00 2518.75
30      1566.67 1690.00
40      NULL    NULL
10      2916.67 2916.67
Time taken: 69.438 seconds, Fetched: 4 row(s)
hive> 
上一篇:Oracle SQL语句练习题


下一篇:【数据分析师_02_SQL+MySQL】016_MySQL的数据汇聚AVG,COUNT,MAX,MIN,SUM