hive sql
create table sql
create table student(s_id string,s_name string,s_birth string,s_sex string) row format delimited fields terminated by ‘\t‘;
create table course(c_id string,c_name string,t_id string) row format delimited fields terminated by ‘\t‘;
create table teacher(t_id string,t_name string) row format delimited fields terminated by ‘\t‘;
create table score(s_id string,c_id string,s_score int) row format delimited fields terminated by ‘\t‘;
student.csv
01 赵雷 1990-01-01 男
02 钱电 1990-12-21 男
03 孙风 1990-05-20 男
04 李云 1990-08-06 男
05 周梅 1991-12-01 女
06 吴兰 1992-03-01 女
07 郑竹 1989-07-01 女
08 王菊 1990-01-20 女
course.csv
01 语文 02
02 数学 01
03 英语 03
teacher.csv
01 张三
02 李四
03 王五
score.csv
01 01 80
01 02 90
01 03 99
02 01 70
02 02 60
02 03 80
03 01 80
03 02 80
03 03 80
04 01 50
04 02 30
04 03 20
05 01 76
05 02 87
06 01 31
06 03 34
07 02 89
07 03 98
load data into hive
load data local inpath ‘/root/decent_sql_test/student.csv‘ into table student;
load data local inpath ‘/root/decent_sql_test/course.csv‘ into table course;
load data local inpath ‘/root/decent_sql_test/teacher.csv‘ into table teacher;
load data local inpath ‘/root/decent_sql_test/score.csv‘ into table score;
– 1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数:
select student.*,
s.s_score as 01_score,
b.s_score as 02_score
from student
join score s
on student.s_id = s.s_id and s.c_id = 01
left join score b on student.s_id = b.s_id and b.c_id = 02
where s.s_score > b.s_score;
– 3、查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩:
select s.s_id,
s.s_name,
avg(s2.s_score) avg
from student s
left join score s2 on s.s_id = s2.s_id
group by s.s_id, s.s_name
having avg > 60;
– 4、查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:
– (包括有成绩的和无成绩的)
select s.s_id, --平均成绩小于60的学生信息
s.s_name,
round(avg(s2.s_score), 2) avg
from student s
left join score s2 on s.s_id = s2.s_id
group by s.s_id, s.s_name
having round(avg(s2.s_score), 2) < 60
union all --拼接关联在一起
select t.s_id, t.s_name, 0 as avgScore --无成绩的学生的信息
from student t
where s_id not in (select distinct sc.s_id from score sc);
– 5、查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:
select student.s_id,
student.s_name,
count(distinct s.c_id), --聚合函数
sum(s.s_score) --聚合函数
from student
left join score s on student.s_id = s.s_id
group by student.s_id,student.s_name --前面出现聚合函数,非聚合函数字段要进行分组
– 6、查询"李"姓老师的数量:
--慎用count(distinct),count(distinct)容易产生倾斜问题。
--尽量使用group by 代替 count(distinct)
select count(distinct t.t_name) from teacher t where t.t_name like ‘李%‘;
select count(t.t_name) from teacher t where t.t_name like ‘李%‘ group by t.t_name ;
– 7、查询学过"张三"老师授课的同学的信息:
--思维误区,嵌套查询,
select c.c_id from course c where c_id = (select t_id from teacher where t_name=‘张三‘);
--正常逻辑
select student.* from student
join score s on student.s_id = s.s_id
join course c on s.c_id = c.c_id
join teacher t on c.t_id = t.t_id and t_name=‘张三‘;
– 8、查询没学过"张三"老师授课的同学的信息:
select student.*
from student
left join (
select s_id
from score
join course c on score.c_id = c.c_id
join teacher t2 on c.t_id = t2.t_id and t2.t_name = ‘张三‘
) t on t.s_id = student.s_id
where t.s_id is null;
;
– 9、查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息:
select *
from student
join (select s_id from score where c_id = ‘01‘) t1
on student.s_id = t1.s_id --前面是一个整体,一个表 join 就是保留两边都有的数据
join (select s_id from score where c_id = ‘02‘) t2
on student.s_id = t2.s_id;
– 10、查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:
select student.* from student
join (select s_id from score where c_id =1 )tmp1
on student.s_id=tmp1.s_id
left join (select s_id from score where c_id =2 )tmp2 --前面是一个整体,一个表此处用left join 保留左表符合条件的数据
on student.s_id =tmp2.s_id
where tmp2.s_id is null;
– 11、查询没有学全所有课程的同学的信息:
select s.*
from student s
join
(select count(c_id) cn1 from course) t1
left join
(select s_id, count(c_id) cn2 from score s group by s.s_id) t on t.cn2 = t1.cn1 and s.s_id = t.s_id
where t.s_id is null;
– 12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息:
select student.*
from student
join (select c_id from score where score.s_id = 01) temp
join (select s_id, c_id from score) temp2
on temp.c_id = temp2.c_id and student.s_id = temp2.s_id
where student.s_id not in (‘01‘)
group by student.s_id, s_name, s_birth, s_sex;
– 13、查询和"01"号的同学学习的课程完全相同的其他同学的信息:
select student.*, tmp1.course_id
from student
join (select s_id, concat_ws(‘|‘, collect_set(c_id)) course_id
from score
group by s_id
having s_id not in (1)) tmp1
on student.s_id = tmp1.s_id
join (select concat_ws(‘|‘, collect_set(c_id)) course_id2
from score
where s_id = 1) tmp2
on tmp1.course_id = tmp2.course_id2;
?
union all and union 区别
UNION去重且排序
UNION ALL不去重不排序
hiving的用法
sql中having子句与where子句类似,都是表示条件的设定,它们的区别在于,
having子句在查询过程中慢于聚合语句(sum,min,max,avg,count);
而where子句在查询过程中则快于聚合语句(sum,min,max,avg,count)。
如果你对何时应该使用WHERE,何时使用HAVING仍旧很迷惑,请遵照下面的说明:
WHERE语句在GROUP BY语句之前;SQL会在分组之前计算WHERE语句。
HAVING语句在GROUP BY语句之后;SQL会在分组之后计算HAVING语句。
hive join 用法
内关联(JOIN)
只返回能关联上的结果。
左外关联(LEFT [OUTER] JOIN)
以LEFT [OUTER] JOIN关键字前面的表作为主表,和其他表进行关联,返回记录和主表的记录数一致,关联不上的字段置为NULL。
是否指定OUTER关键字,貌似对查询结果无影响。
右外关联(RIGHT [OUTER] JOIN)
和左外关联相反,以RIGTH [OUTER] JOIN关键词后面的表作为主表,和前面的表做关联,返回记录数和主表一致,关联不上的字段为NULL。
是否指定OUTER关键字,貌似对查询结果无影响
全外关联(FULL [OUTER] JOIN)
以两个表的记录为基准,返回两个表的记录去重之和,关联不上的字段为NULL。
是否指定OUTER关键字,貌似对查询结果无影响。
注意:FULL JOIN时候,Hive不会使用MapJoin来优化。
LEFT SEMI JOIN
以LEFT SEMI JOIN关键字前面的表为主表,返回主表的KEY也在副表中的记录。
?
?