hive系列(二) 常用语句操作

hive sql 官方文档:https://cwiki.apache.org/confluence/display/Hive/LanguageManual

# 展示所有数据库
show databases;

# 切换数据库
use database_name;

# 查看数据库结构信息
desc demo;

# 查看数据库结构明细信息
desc formatted demo;

create table


CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                  -- (Note: Available in Hive 0.10.0 and later)]
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
     [STORED AS DIRECTORIES]
  [
   [ROW FORMAT row_format] 
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]   -- (Note: Available in Hive 0.6.0 and later)
  [AS select_statement];   -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
 
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];
 
data_type
  : primitive_type
  | array_type
  | map_type
  | struct_type
  | union_type  -- (Note: Available in Hive 0.7.0 and later)
 
primitive_type
  : TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
  | STRING
  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
  | DATE        -- (Note: Available in Hive 0.12.0 and later)
  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
  | CHAR        -- (Note: Available in Hive 0.13.0 and later)
 
array_type
  : ARRAY < data_type >
 
map_type
  : MAP < primitive_type, data_type >
 
struct_type
  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
 
union_type
   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)
 
row_format
  : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
        [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
        [NULL DEFINED AS char]   -- (Note: Available in Hive 0.13 and later)
  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
 
file_format:
  : SEQUENCEFILE
  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
  | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
  | ORC         -- (Note: Available in Hive 0.11.0 and later)
  | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
  | AVRO        -- (Note: Available in Hive 0.14.0 and later)
  | JSONFILE    -- (Note: Available in Hive 4.0.0 and later)
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
 
column_constraint_specification:
  : [ PRIMARY KEY|UNIQUE|NOT NULL|DEFAULT [default_value]|CHECK  [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
 
default_value:
  : [ LITERAL|CURRENT_USER()|CURRENT_DATE()|CURRENT_TIMESTAMP()|NULL ] 
 
constraint_specification:
  : [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
    [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
    [, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE 
    [, CONSTRAINT constraint_name UNIQUE (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
    [, CONSTRAINT constraint_name CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]

1. 创建普通hive表(不包含行定义格式)

create table demo
(
id int,
name string,
likes array<string>,
address map<string,string>
);

2. 创建自定义行格式的hive表(指定列的分割符)

create table demo2
(
	id int,
	name string,
	likes array<string>,
	address map<string,string>
)
	row format delimited
	fields terminated by ','
	collection items terminated by '-'
	map keys terminated by ':';

hive系列(二) 常用语句操作

 3. 创建默认分隔符的hive表(^A、^B、^C)

create table psn3
(
id int,
name string,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003';


# -------------------------------------
create table psn3
(
id int,
name string,
likes array<string>,
address map<string,string>
)

4. 创建hive的外部表(需要添加external和location的关键字)

create external table demo4
(
id int,
name string,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/data';

    在之前创建的表都属于hive的内部表(psn,psn2,psn3),而psn4属于hive的外部表,
    内部表跟外部表的区别:
        1、hive内部表创建的时候数据存储在hive的默认存储目录中,外部表在创建的时候需要制定额外的目录
        2、hive内部表删除的时候,会将元数据和数据都删除,而外部表只会删除元数据,不会删除数据
    应用场景:
        内部表:需要先创建表,然后向表中添加数据,适合做中间表的存储
        外部表:可以先创建表,再添加数据,也可以先有数据,再创建表,本质上是将hdfs的某一个目录的数据跟    

5. 分区表

hive的分区表:
        hive默认将表的数据保存在某一个hdfs的存储目录下,当需要检索符合条件的某一部分数据的时候,需要全量

create table demo5
(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by(gender string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

#--创建多分区表
create table psn6
(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by(gender string,age int)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

注意:

  1. 当创建完分区表之后,在保存数据的时候,会在hdfs目录中看到分区列会成为一个目录,以多级目录的形式存在
  2. 当创建多分区表之后,插入数据的时候不可以只添加一个分区列,需要将所有的分区列都添加值
  3. 多分区表在添加分区列的值得时候,与顺序无关,与分区表的分区列的名称相关,按照名称就行匹配

注意:

  1. 添加分区列的值的时候,如果定义的是多分区表,那么必须给所有的分区列都赋值
  2. 删除分区列的值的时候,无论是单分区表还是多分区表,都可以将指定的分区进行删除

修复分区:
        在使用hive外部表的时候,可以先将数据上传到hdfs的某一个目录中,然后再创建外部表建立映射关系,如果在上传数据的时候,参考分区表的形式也创建了多级目录,那么此时创建完表之后,是查询不到数据的,原因是分区的元数据没有保存在mysql中,因此需要修复分区,将元数据同步更新到mysql中,此时才可以查询到元数据。具体操作如下:

# 准备外部数据
hdfs dfs -mkdir /data
hdfs dfs -mkdir /data/age=10
hdfs dfs -mkdir /data/age=20
hdfs dfs -put /root/data/data /data/age=10
hdfs dfs -put /root/data/data /data/age=20


# 创建外部表
create external table demo7
(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by(age int)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/data';


# 查询结果(没有数据)
select * from demo7;

# 修复分区
msck repair table demo7;

# 查询结果(有数据)
select * from demo7;

以上面的方式创建hive的分区表会存在问题,每次插入的数据都是人为指定分区列的值,我们更加希望能够根据记录中的某一个字段来判断将数据插入到哪一个分区目录下,此时利用我们上面的分区方式是无法完成操作的,需要使用动态分区来完成相关操作,现在学的知识点无法满足,后续讲解。

上一篇:error C4430: missing type specifier - int assumed. Note: C++ does not support default-int 解决方法


下一篇:Note -「Dijkstra 求解 MCMF」