Hive DDL


DDL (Data Definition Language): its main commands are CREATE, ALTER, and DROP. DDL is used primarily to define or modify the structure and data types of database objects.

1. Database

1.1 Creating a database

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
[COMMENT database_comment] 
[LOCATION hdfs_path]
[MANAGEDLOCATION hdfs_path]
[WITH DBPROPERTIES (property_name=property_value, ...)];
  • COMMENT: a comment describing the database
  • LOCATION: the database's storage path on HDFS; the default is /user/hive/warehouse/*.db
  • MANAGEDLOCATION: storage path for managed-table data; not used yet, reserved for later Hive versions
  • DBPROPERTIES: attaches additional properties to the database
hive (default)>
              > CREATE  DATABASE  IF NOT EXISTS  my_db
              > COMMENT  'my hive db'
              > LOCATION '/user/hive/my_db.db'
              > WITH DBPROPERTIES('date'='2021-10-1','city'= 'shenzhen');
OK
Time taken: 0.154 seconds
hive (default)>
              > show databases;
OK
database_name
default
my_db
Time taken: 0.076 seconds, Fetched: 2 row(s)

1.2 Viewing databases

desc database my_db;
desc database extended my_db;
describe database extended my_db;
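The extended forms additionally show the DBPROPERTIES set at creation. Against the my_db created above, the output would carry roughly the following fields (illustrative, not a captured session; the exact layout varies by Hive version):

desc database extended my_db;
-- roughly: my_db | my hive db | hdfs://master:9000/user/hive/my_db.db | root | USER | {date=2021-10-1, city=shenzhen}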

Switch to a database

use my_db;

Show the current database

select current_database();

Show the CREATE DATABASE statement

show create database my_db;

1.3 Dropping a database

-- Drop an empty database
drop database databasename;
-- If the database is not empty, use CASCADE to force the drop
drop database databasename cascade;
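
Both forms raise an error if the database does not exist; adding IF EXISTS makes the statement a no-op instead. A minimal sketch:

-- No error even if my_db is already gone; CASCADE also removes its tables
drop database if exists my_db cascade;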

2. Table

2.1 CREATE TABLE syntax

create [external] table [IF NOT EXISTS] table_name
[(colName colType [comment 'comment'], ...)]
[comment table_comment]
[partitioned by (colName colType [comment col_comment], ...)]
[clustered by (colName, colName, ...)
             [sorted by (colName [ASC|DESC], ...)]
             into num_buckets buckets]
[row format row_format]
[stored as file_format]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
[AS select_statement];


CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS]
[db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];

  • EXTERNAL: creates an external table; without it a managed (internal) table is created.
  • partitioned by: partitions the table's data; specifies the partition columns.
  • clustered by: creates a bucketed table; specifies the bucketing columns.
  • sorted by: sorts rows within each bucket by one or more columns; rarely used.
  • row format row_format
ROW FORMAT DELIMITED
[FIELDS TERMINATED BY char]
[COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char]
[LINES TERMINATED BY char] | SERDE serde_name
[WITH SERDEPROPERTIES (property_name=property_value,
property_name=property_value, ...)]
  • stored as
     SEQUENCEFILE // sequence (binary) file
      | TEXTFILE // plain text file
      | RCFILE  // combined row/columnar storage file
      | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname // custom file format. If the data is plain text, use STORED AS TEXTFILE; if it needs compression, use STORED AS SEQUENCEFILE.

  • AS: followed by a query; the table is created from that query's result (see the sketch below).

  • LIKE: followed by an existing table name; copies that table's structure without copying its data (see the sketch below).
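
A minimal sketch of both forms, assuming the t1 table created in the next example (t1_ctas and t1_like are hypothetical names):

-- CTAS: structure and data come from the query result
create table t1_ctas as select id, name from t1;
-- LIKE: copy only the structure of t1, no data
create table t1_like like t1;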

Create table t1

create table t1(
id int,
name string,
hobby array<string>,
addr map<string, string>
)
row format delimited
fields terminated by ";"           -- field delimiter
collection items terminated by "," -- delimiter between collection items
map keys terminated by ":";        -- delimiter between map keys and values

# The data file
[root@master hive]# cat t1.dat
2;zhangsan;book,TV,code;beijing:chaoyang,shagnhai:pudong
3;lishi;book,code;nanjing:jiangning,*:taibei
4;wangwu;music,book;heilongjiang:haerbin
[root@master hive]#

# Load the data
0: jdbc:hive2://master:10000> load data local inpath '/root/bigdata/test_data/hive/t1.dat' into table t1;
No rows affected (1.57 seconds)
0: jdbc:hive2://master:10000> select *  from t1;
+--------+-----------+-----------------------+---------------------------------------------+
| t1.id  |  t1.name  |       t1.hobby        |                   t1.addr                   |
+--------+-----------+-----------------------+---------------------------------------------+
| 2      | zhangsan  | ["book","TV","code"]  | {"beijing":"chaoyang","shagnhai":"pudong"}  |
| 3      | lishi     | ["book","code"]       | {"nanjing":"jiangning","*":"taibei"}        |
| 4      | wangwu    | ["music","book"]      | {"heilongjiang":"haerbin"}                  |
+--------+-----------+-----------------------+---------------------------------------------+



[root@master hive]# hadoop fs -ls  /user/hive/warehouse/t1
Found 1 items
-rwxr-xr-x   1 root supergroup        148 2021-10-01 12:41 /user/hive/warehouse/t1/t1.dat
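
Since hobby is an array<string> and addr is a map<string,string>, individual elements can be addressed directly in a query. A minimal sketch (output omitted):

-- Index into the array and look up a map key
select name, hobby[0] as first_hobby, addr['beijing'] as district
from t1;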

2.2 Viewing a table

0: jdbc:hive2://master:10000> desc formatted t1;
+-------------------------------+----------------------------------------------------+-----------------------+
|           col_name            |                     data_type                      |        comment        |
+-------------------------------+----------------------------------------------------+-----------------------+
| # col_name                    | data_type                                          | comment               |
|                               | NULL                                               | NULL                  |
| id                            | int                                                |                       |
| name                          | string                                             |                       |
| hobby                         | array<string>                                      |                       |
| addr                          | map<string,string>                                 |                       |
|                               | NULL                                               | NULL                  |
| # Detailed Table Information  | NULL                                               | NULL                  |
| Database:                     | default                                            | NULL                  |
| Owner:                        | root                                               | NULL                  |
| CreateTime:                   | Fri Oct 01 12:35:31 CST 2021                       | NULL                  |
| LastAccessTime:               | UNKNOWN                                            | NULL                  |
| Retention:                    | 0                                                  | NULL                  |
| Location:                     | hdfs://master:9000/user/hive/warehouse/t1          | NULL                  |
| Table Type:                   | MANAGED_TABLE                                      | NULL                  |
| Table Parameters:             | NULL                                               | NULL                  |
|                               | numFiles                                           | 1                     |
|                               | numRows                                            | 0                     |
|                               | rawDataSize                                        | 0                     |
|                               | totalSize                                          | 148                   |
|                               | transient_lastDdlTime                              | 1633063314            |
|                               | NULL                                               | NULL                  |
| # Storage Information         | NULL                                               | NULL                  |
| SerDe Library:                | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL                  |
| InputFormat:                  | org.apache.hadoop.mapred.TextInputFormat           | NULL                  |
| OutputFormat:                 | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL                  |
| Compressed:                   | No                                                 | NULL                  |
| Num Buckets:                  | -1                                                 | NULL                  |
| Bucket Columns:               | []                                                 | NULL                  |
| Sort Columns:                 | []                                                 | NULL                  |
| Storage Desc Params:          | NULL                                               | NULL                  |
|                               | colelction.delim                                   | ,                     |
|                               | field.delim                                        | ;                     |
|                               | mapkey.delim                                       | :                     |
|                               | serialization.format                               | ;                     |
+-------------------------------+----------------------------------------------------+-----------------------+

2.3 Dropping a table

drop table t1;
  • Dropping a managed (internal) table deletes both the table metadata and its data; dropping an external table removes only the table definition, leaving the data intact

Converting between managed and external tables

-- Convert a managed (internal) table to an external table
alter table t1 set tblproperties('EXTERNAL'='TRUE');
-- Check the table info (Table Type) to confirm the conversion
desc formatted t1;
-- Convert an external table back to a managed table.
-- The key 'EXTERNAL' must be uppercase; the value is case-insensitive
alter table t1 set tblproperties('EXTERNAL'='FALSE');
-- Check the table info to confirm the conversion
desc formatted t1;

3. Partitioned Tables

When Hive executes a query, it normally scans the entire table. For large tables a full scan is slow and inefficient, yet many queries only need a subset of the data. Hive therefore supports partitioned tables: a table's data is stored in separate subdirectories, one per partition. A query that touches only some partitions avoids the full table scan and runs faster. In practice, tables are usually partitioned by time, region, and similar attributes.

– Create the table


create table if not exists t3(
id int
,name string
,hobby array<string>
,addr map<string,string>
)
partitioned by (dt string)
row format delimited
fields terminated by ';'
collection items terminated by ','
map keys terminated by ':';

– Load data

load data local inpath '/root/bigdata/test_data/hive/t1.dat' into table t3 partition(dt="2020-06-01");

load data local inpath '/root/bigdata/test_data/hive/t1.dat' into table t3 partition(dt="2020-06-02");

load data local inpath '/root/bigdata/test_data/hive/t1.dat' into table t3 partition(dt="2020-06-03");

Note: the partition column is not a column stored in the table's data files; it can be thought of as a pseudo-column.
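
Each partition maps to its own subdirectory under the table's HDFS path, which a plain listing confirms. A minimal sketch (only the directory names expected after the three loads above are shown):

hadoop fs -ls /user/hive/warehouse/t3
# expected subdirectories:
#   /user/hive/warehouse/t3/dt=2020-06-01
#   /user/hive/warehouse/t3/dt=2020-06-02
#   /user/hive/warehouse/t3/dt=2020-06-03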

View partitions

0: jdbc:hive2://master:10000> show partitions t3;
+----------------+
|   partition    |
+----------------+
| dt=2020-06-01  |
| dt=2020-06-02  |
| dt=2020-06-03  |
+----------------+

Add a partition and set its data

0: jdbc:hive2://master:10000> alter table t3 add partition(dt='2020-06-04');
No rows affected (2.174 seconds)
0: jdbc:hive2://master:10000> show partitions t3;
+----------------+
|   partition    |
+----------------+
| dt=2020-06-01  |
| dt=2020-06-02  |
| dt=2020-06-03  |
| dt=2020-06-04  |
+----------------+
4 rows selected (0.655 seconds)
0: jdbc:hive2://master:10000> select * from t3 where t3.dt = '2020-06-04';
+--------+----------+-----------+----------+--------+
| t3.id  | t3.name  | t3.hobby  | t3.addr  | t3.dt  |
+--------+----------+-----------+----------+--------+
+--------+----------+-----------+----------+--------+



-- Add multiple partitions without loading data
alter table t3
add partition(dt='2020-06-05') partition(dt='2020-06-06');

Load data into a partition by copying files

0: jdbc:hive2://master:10000> select * from t3 where dt='2020-06-04';
+--------+----------+-----------+----------+--------+
| t3.id  | t3.name  | t3.hobby  | t3.addr  | t3.dt  |
+--------+----------+-----------+----------+--------+
+--------+----------+-----------+----------+--------+

# Copy the data file into the new partition's directory
 hadoop fs -cp /user/hive/warehouse/t3/dt=2020-06-01/t1.dat    /user/hive/warehouse/t3/dt=2020-06-04

0: jdbc:hive2://master:10000> select * from t3 where dt='2020-06-04';
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| t3.id  |  t3.name  |       t3.hobby        |                   t3.addr                   |    t3.dt    |
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| 2      | zhangsan  | ["book","TV","code"]  | {"beijing":"chaoyang","shagnhai":"pudong"}  | 2020-06-04  |
| 3      | lishi     | ["book","code"]       | {"nanjing":"jiangning","*":"taibei"}        | 2020-06-04  |
| 4      | wangwu    | ["music","book"]      | {"heilongjiang":"haerbin"}                  | 2020-06-04  |
+--------+-----------+-----------------------+---------------------------------------------+-------------+


-- Add multiple partitions and load data (each partition points at an HDFS path)
alter table t3 add
partition(dt='2020-06-07') location
'/user/hive/warehouse/mydb.db/t3/dt=2020-06-07'
partition(dt='2020-06-08') location
'/user/hive/warehouse/mydb.db/t3/dt=2020-06-08';
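
These locations are only registered in the metastore; rows show up only if data files actually exist at those paths. A minimal sketch of staging a file there with standard hadoop fs commands (reusing the local t1.dat from earlier):

hadoop fs -mkdir -p /user/hive/warehouse/mydb.db/t3/dt=2020-06-07
hadoop fs -put /root/bigdata/test_data/hive/t1.dat /user/hive/warehouse/mydb.db/t3/dt=2020-06-07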

Change a partition's HDFS path

0: jdbc:hive2://master:10000> select * from t3 where dt='2020-06-05';
+--------+----------+-----------+----------+--------+
| t3.id  | t3.name  | t3.hobby  | t3.addr  | t3.dt  |
+--------+----------+-----------+----------+--------+
+--------+----------+-----------+----------+--------+
No rows selected (0.614 seconds)
0: jdbc:hive2://master:10000> alter table t3 partition(dt='2020-06-05') set location '/user/hive/warehouse/t3/dt=2020-06-01';
No rows affected (0.894 seconds)
0: jdbc:hive2://master:10000>
0: jdbc:hive2://master:10000> select * from t3 where dt='2020-06-05';
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| t3.id  |  t3.name  |       t3.hobby        |                   t3.addr                   |    t3.dt    |
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| 2      | zhangsan  | ["book","TV","code"]  | {"beijing":"chaoyang","shagnhai":"pudong"}  | 2020-06-05  |
| 3      | lishi     | ["book","code"]       | {"nanjing":"jiangning","*":"taibei"}        | 2020-06-05  |
| 4      | wangwu    | ["music","book"]      | {"heilongjiang":"haerbin"}                  | 2020-06-05  |
+--------+-----------+-----------------------+---------------------------------------------+-------------+
3 rows selected (0.732 seconds)
0: jdbc:hive2://master:10000> select * from t3 where dt='2020-06-01';
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| t3.id  |  t3.name  |       t3.hobby        |                   t3.addr                   |    t3.dt    |
+--------+-----------+-----------------------+---------------------------------------------+-------------+
| 2      | zhangsan  | ["book","TV","code"]  | {"beijing":"chaoyang","shagnhai":"pudong"}  | 2020-06-01  |
| 3      | lishi     | ["book","code"]       | {"nanjing":"jiangning","*":"taibei"}        | 2020-06-01  |
| 4      | wangwu    | ["music","book"]      | {"heilongjiang":"haerbin"}                  | 2020-06-01  |
+--------+-----------+-----------------------+---------------------------------------------+-------------+

Drop partitions

-- One or more partitions can be dropped at once, separated by commas
alter table t3 drop partition(dt='2020-06-03'),
partition(dt='2020-06-04');
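
For a managed table such as t3, dropping a partition also removes its data on HDFS. Adding IF EXISTS makes the statement a no-op when the partition is already gone; a minimal sketch:

-- No error even if this partition was already dropped
alter table t3 drop if exists partition(dt='2020-06-04');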

4. Bucketed Tables

When a single partition or the whole table holds too much data and partitioning cannot split it any finer, bucketing divides the data at a finer granularity: rows are distributed into multiple buckets, i.e. into multiple files, according to a designated column. The bucketing rule (see the sketch after this list):

  • In MapReduce: key.hashCode % numReduceTasks
  • In Hive: bucketing column's hashCode % number of buckets
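
The bucket assignment can be previewed in HiveQL itself: hash() is a built-in Hive UDF, and for an int column it returns the value unchanged. A minimal sketch over the ids in the data below:

-- Which bucket does each id land in, with 3 buckets?
select id, hash(id) % 3 as bucket
from (select explode(array(1, 2, 3, 5, 6, 7, 8)) as id) ids;
-- ids 3 and 6 both give bucket 0, which is why they end up together
-- in file 000000_0 in the listing further below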

Data (buckets.dat)

1,java,90
1,c,78
1,python,91
1,hadoop,80
2,java,75
2,c,76
2,python,80
2,hadoop,93
3,java,98
3,c,74
3,python,89
3,hadoop,91
5,java,93
6,c,76
7,python,87
8,hadoop,88

– Create the bucketed table

create table b_course(
id int,
name string,
score int
)
clustered by (id) into 3 buckets
row format delimited fields terminated by ",";


– Create a regular table

create table course_common(
id int,
name string,
score int
)
row format delimited fields terminated by ",";

– Load data into the regular table

load data local inpath '/root/bigdata/test_data/hive/buckets.dat' into
table course_common;

– Load the bucketed table with insert ... select ...

insert into table b_course select * from course_common;
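
Note: on Hive 1.x the bucket count is honored on insert only when enforcement is switched on (from Hive 2.0 it is always on), so older setups may need:

-- Required on Hive 1.x before inserting into a bucketed table;
-- removed (always enabled) as of Hive 2.0
set hive.enforce.bucketing = true;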

[root@master hive]# hadoop fs -ls /user/hive/warehouse/b_course
Found 3 items
-rwxr-xr-x   1 root supergroup         48 2021-10-02 21:43 /user/hive/warehouse/b_course/000000_0
-rwxr-xr-x   1 root supergroup         53 2021-10-02 21:43 /user/hive/warehouse/b_course/000001_0
-rwxr-xr-x   1 root supergroup         63 2021-10-02 21:43 /user/hive/warehouse/b_course/000002_0
[root@master hive]#
[root@master hive]# hadoop fs -ls /user/hive/warehouse/b_course/000000_0
-rwxr-xr-x   1 root supergroup         48 2021-10-02 21:43 /user/hive/warehouse/b_course/000000_0
[root@master hive]#
[root@master hive]# hadoop fs -text /user/hive/warehouse/b_course/000000_0
3,hadoop,91
3,python,89
3,c,74
3,java,98
6,c,76

5. Altering & Dropping Tables

-- Rename a table: rename to
alter table course_common
rename to course_common1;

-- Rename a column: change column
alter table course_common1
change column id cid int;

-- Change a column's type: change column
alter table course_common1
change column cid cid string;
-- The following columns have types incompatible with the
-- existing columns in their respective positions
-- A type change must be a permitted conversion: int can be changed to
-- string, but string cannot be changed to int (that raises the error above)

-- Add columns: add columns
alter table course_common1
add columns (common string);

-- Drop columns: replace columns
-- This only removes the columns from the metadata;
-- the data files on HDFS are left untouched
alter table course_common1
replace columns(
id string, cname string, score int);

-- Drop the table
drop table course_common1;