41 Hive DDL、DML、查询

2023-10-08 20:34:16

DDL

创建数据库

create database [if not exists] database_name
[comment ""]
[location hdfs_path]
[with dbproperties (property_name=property_value)];

创建一个数据库，数据库在HDFS上默认存储路径是/user/hive/warehouse/*.db
为了避免要创建的数据库已经存在的错误，要加if not exists
创建一个数据库，指定数据库在HDFS上存放的位置
```
create database db_hive2 location '/db_hive2.db';
```

查询数据库

显示数据库
```
show databases;
```
过滤显示查询的数据库
```
show databases like 'db_hive*';
```

显示数据库信息

desc database db_hive;

# 显示数据库详细信息
desc database extended db_hive;

切换当前数据库
```
use db_hive;
```

修改数据库

# 修改用来描述这个数据库的属性信息
alter database db_hive set dbproperties('createtime'='20220201');

删除数据库

# 如果删除的数据库不存在，利用if exists 判断数据库是否存在
drop database if exists db_hive2;

# 如果数据库不为空，可以采用cascade命令强制删除
drop database db_hive cascade;

创建表

create [external] table [if not exists] table_name
[col_name data_type]
[comment ""]
[partitioned by col_name data_type]
[clustered by]
[row format row_format]
[stored as file_format]
[location '']
[tabproperties (name=value)]
[as select_statement]
[like ]

external关键字可以让用户创建一个外部表，见表同时可以指定一个指向实际数据的路径（Location）
在删除表的时候，内部表的元数据和数据会被一起删除，而外部表只删除元数据，不删除数据

外部表

每天将收集到的网站日志定期流入HDFS文件
在外部表（原始日志表）的基础上做大量的统计分析
用到的中间表、结果表使用内部表存储
数据通过select + insert进入内部表

管理表(内部表)与外部表的互相转换

查询表的类型
```
desc formatted student2;
```

修改内部表student2为外部表

alter table student2 set tblproperties('EXTERNAL'='TRUE');

# 'EXTERNAL'='TRUE'为固定写法,区分大小写

修改表

重命名表

alter table table_name rename to new_name;

增加/修改/替换列信息

# 添加列
alter table dept add columns(deptdesc string);

# 查询表结构
desc dept;

# 更新列
alter table dept change column deptdesc desc string;

# 替换列
alter table dept replace columns(deptno string, dname string, loc string);

删除表

drop table dept;

DML

数据导入

向表中装载数据

load data [local] inpath '数据的path' [overwrite] into table student [partition ()];

local:表示从本地加载数据到Hive表,否则从HDFS加载数据到Hive表
overwrite:表示覆盖表中已有数据,否则表示追加
partition:表示上传到指定分区

# 创建一张表
create table student(id string, name string) row format delimited fields terminated by '\t';

# 加载本地文件到hive
load data local inpath '/opt/moudule/hive/datas/student.txt' into table default.student;

# 加载HDFS上数据
load data inpath '/user/atguigu/hive/student.txt' into table default.student;

通过查询语句向表中插入数据(insert)

# 创建一张表
create table student_par(id int, name string) row format delimited fields terminated by '\t';

# 基本插入数据
insert into table student_par values(1,'wangwu'),(2,'zhaoliu');

# 根据表单查询结果插入
insert overwrite table student_par select id, name from student;

# 多分区插入
from stuednt
insert overwrite table student partition(month='202201')
select id, name where month='202201'
insert overwrite table student partition(month='202202')
select id, name where month='202202'

# 查询语句中创建表并加载数据(as select)
create table if not exists student3 as select id, name from student;

创建表时通过location指定加载数据路径

# 创建表并指定在hdfs上的位置
create external table if not exists student5(id int, name string)
row format delimited fields terminated by '\t'
location '/student';

import数据到指定Hive表中

import table student2 from '/user/hive/warehouse/export/student';

数据导出

insert 导出

# 将查询的结果格式化导出到本地
insert overwrite local directory
'/opt/module/hive/datas/export/student1'
row format delimited fields terminated by '\t'
select * from student;

# 将查询结果导出到HDFS上(没有local)
insert overwrite directory '/user/atguigu/student2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
select * from student;

用Hadoop命令导出到本地

dfs -get /user/hive/warehouse/student/student.txt /opt/module/datas/export/student3.txt;

Hive Shell命令导出

bin/hive -e 'select * from default.stuent;' > /opt/module/hive/datas/export/student4.txt;

export导出到HDFS上

export default.student to '/user/hive/warehouse/export/student';

sqoop导出

清除表中数据(保留表结构)

# truncate 只能删除管理表，不能删除外部表中的数据
truncate table student;

查询

原始数据

dept

10	ACCOUNTING	1700
20	RESEARCH	1800
30	SALES	1900
40	OPERATIONS	1700

emp

7369	SMITH	CLERK	7902	1980-12-17	800.00		20
7499	ALLEN	SALESMAN	7698	1981-2-20	1600.00	300.00	30
7521	WARD	SALESMAN	7698	1981-2-22	1250.00	500.00	30
7566	JONES	MANAGER	7839	1981-4-2	2975.00		20
7654	MARTIN	SALESMAN	7698	1981-9-28	1250.00	1400.00	30
7698	BLAKE	MANAGER	7839	1981-5-1	2850.00		30
7782	CLARK	MANAGER	7839	1981-6-9	2450.00		10
7788	SCOTT	ANALYST	7566	1987-4-19	3000.00		20
7839	KING	PRESIDENT		1981-11-17	5000.00		10
7844	TURNER	SALESMAN	7698	1981-9-8	1500.00	0.00	30
7876	ADAMS	CLERK	7788	1987-5-23	1100.00		20
7900	JAMES	CLERK	7698	1981-12-3	950.00		30
7902	FORD	ANALYST	7566	1981-12-3	3000.00		20
7934	MILLER	CLERK	7782	1982-1-23	1300.00		10

创建部门表

create table if not exists dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by '\t';

创建员工表

create table if not exists emp(
empno int,
ename string,
job string,
mgr int,
hiredate string, 
sal double, 
comm double,
deptno int)
row format delimited fields terminated by '\t';

导入数据

load data local inpath '/opt/module/datas/dept.txt' into table
dept;
load data local inpath '/opt/module/datas/emp.txt' into table emp;

案例实操

# 查询出薪水等于5000的所有员工
select * from emp where sal =5000;

# 查询工资在500到1000的员工信息
select * from emp where sal between 500 and 1000;

# 查询comm为空的所有员工信息
select * from emp where comm is null;

# 查询工资是1500或5000的员工信息
select * from emp where sal IN (1500, 5000);

# 查找名字以A开头的员工信息
select * from emp where ename LIKE 'A%';

# 查找名字中第二个字母为A的员工信息
select * from emp where ename LIKE '_A%';

# 查找名字中带有A的员工信息
select * from emp where ename  RLIKE '[A]';

# 查询薪水大于1000，部门是30
select * from emp where sal>1000 and deptno=30;

# 查询薪水大于1000，或者部门是30
select * from emp where sal>1000 or deptno=30;

# 查询除了20部门和30部门以外的员工信息
select * from emp where deptno not IN(30, 20);

# 计算emp表每个部门的平均工资
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;

# 计算emp每个部门中每个岗位的最高薪水
select t.deptno, t.job, max(t.sal) max_sal from emp t group by
 t.deptno, t.job;

# 求每个部门的平均薪水大于2000的部门
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;

多表连接

数据准备

1700	Beijing
1800	London
1900	Tokyo

创建位置表

create table if not exists location(
loc int,
loc_name string
)
row format delimited fields terminated by '\t';

导入数据

load data local inpath '/opt/module/datas/location.txt' into table location;

多表连接查询

SELECT e.ename, d.dname, l.loc_name
FROM   emp e 
JOIN   dept d
ON     d.deptno = e.deptno 
JOIN   location l
ON     d.loc = l.loc;

排序

全局排序（Order By）

# 按照员工薪水的2倍排序
select ename, sal*2 twosal from emp order by twosal;

# 按照部门和工资升序排序
select ename, deptno, sal from emp order by deptno, sal;

每个Reduce内部排序（Sort By）

# 设置reduce个数
set mapreduce.job.reduces=3;

# 根据部门编号降序查看员工信息
select * from emp sort by deptno desc;

# 将查询结果导入到文件中
insert overwrite local directory '/opt/module/hive/datas/sortby-result'
select * from emp sort by deptno desc;

分区（Distribute By）

distribute by的分区规则是根据分区字段的hash码与reduce的个数进行模除后，余数相同的分到一个区
Hive要求DISTRIBUTE BY语句要写在SORT BY语句之前

# 先按照部门编号分区，再按照员工编号降序排序
set mapreduce.job.reduces=3;
insert overwrite local directory '/opt/module/hive/datas/distribute-result' select * from emp distribute by deptno sort by empno desc;

Cluster By

当distribute by 和sort by字段相同时，可以使用cluster by方式
cluster排序只能是升序排序，不能指定排序规则为asc或desc

# 以下两种写法等价
select * from emp cluster by deptno;
select * from emp distribute by deptno sort by deptno;

码农公寓

DDL

创建数据库

查询数据库

修改数据库

删除数据库

创建表

外部表

管理表(内部表)与外部表的互相转换

修改表

删除表

DML

数据导入

数据导出

清除表中数据(保留表结构)

查询

原始数据

创建部门表

创建员工表

导入数据

案例实操

多表连接

数据准备

创建位置表

导入数据

多表连接查询

排序

全局排序（Order By）

每个Reduce内部排序（Sort By）

分区（Distribute By）

Cluster By

相关文章