Hive SQL
Table Types
Hive Managed (Internal) Tables
CREATE TABLE [IF NOT EXISTS] table_name
When a managed table is dropped, both its metadata and its data are deleted.
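A minimal managed-table sketch (demo0 and its columns are hypothetical names):
create table if not exists demo0 (
id int,
name string
)
row format delimited
fields terminated by ',';
-- drop table demo0;  -- would delete both the metadata and the data under the warehouse directory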
Hive External Tables
CREATE EXTERNAL TABLE [IF NOT EXISTS] table_name LOCATION hdfs_path
LOCATION must be an HDFS directory path, not a file.
Dropping an external table removes only the metadata in the metastore; the table data in HDFS is not deleted.
create external table demo1
(
id int,
name string,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/usr/';
Quick Table Creation
Create Table Like
-- copies only the table structure
CREATE TABLE empty_key_value_store LIKE key_value_store;
Create Table As Select (CTAS)
-- copies both structure and data
CREATE TABLE new_key_value_store
AS
SELECT columnA, columnB FROM key_value_store;
Altering Tables
ALTER TABLE name RENAME TO new_name
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
(Hive has no DROP COLUMN statement; remove a column with REPLACE COLUMNS by listing only the columns to keep — see the sketch after this list)
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
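A few hedged examples against the demo1 table created above (the new names are illustrative):
ALTER TABLE demo1 RENAME TO demo1_new;
ALTER TABLE demo1_new ADD COLUMNS (age int);
ALTER TABLE demo1_new CHANGE name full_name string;
-- drop the age column again by listing only the columns to keep
ALTER TABLE demo1_new REPLACE COLUMNS (id int, full_name string, likes array<string>, address map<string,string>);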
Viewing Table Information
desc table_name;
-- show more detailed information
desc formatted table_name;
Static Partitions
Partition information is stored in the metastore.
Create a partitioned table:
create table demo2
(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by(age int,sex string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';
Adding partitions
-- the table already exists; add partitions on top of it:
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location1'] partition_spec [LOCATION 'location2'] ...;
alter table demo3 add if not exists partition(month_id='201805',day_id='20180509') location '/user/tuoming/part/201805/20180509';
Dropping partitions
For a managed table, the corresponding metadata and data are deleted together.
ALTER TABLE table_name DROP partition_spec, partition_spec,...
ALTER TABLE day_hour_table DROP PARTITION (dt='2008-08-08', hour='09');
Loading data into a specific partition
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA INPATH '/user/pv.txt' INTO TABLE day_hour_table PARTITION(dt='2008-08-08', hour='08');
LOAD DATA LOCAL INPATH '/user/hua/*' INTO TABLE day_hour PARTITION(dt='2010-07-07');
No transformation is applied when data is loaded: LOAD simply copies a LOCAL file (or moves an HDFS file) to the location backing the Hive table, and loading into a partition automatically creates that partition's directory under the table.
Viewing partitions
show partitions table_name;
Querying with a partition filter
SELECT day_table.* FROM day_table WHERE day_table.dt>= '2008-08-08';
Partitioned tables exist to speed up queries, so filter on the partition columns whenever possible; a query that does not use them scans the entire table.
If partition directories for an external table already exist in HDFS but Hive does not see the data, repair the metastore:
MSCK REPAIR TABLE tablename;
Alternatively, use ADD PARTITION to register the partitions directly.
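A hedged sketch against the demo2 table above (the partition directory is assumed to have been created outside Hive):
MSCK REPAIR TABLE demo2;
-- or register a single partition explicitly:
ALTER TABLE demo2 ADD IF NOT EXISTS PARTITION (age=13, sex='female');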
Dynamic Partitions
The main difference between static and dynamic partitions is that static partition values are specified manually, while dynamic partition values are derived from the data. More precisely, static partition columns are fixed at compile time by what the user passes in; dynamic partition values are only determined while the SQL executes.
Dynamic partition inserts require a MapReduce job.
Enabling dynamic partition support
set hive.exec.dynamic.partition=true;
Default: true
set hive.exec.dynamic.partition.mode=nonstrict;
Default: strict (at least one partition column must be static)
First, create a source (non-partitioned) table:
create table t_original(
id int,
age int,
sex string,
name string,
likes array<string>,
address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';
The local data file contains:
2,13,female,小明2,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
3,12,male,小明3,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
4,13,female,小明4,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
5,12,male,小明5,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
6,13,female,小明6,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
7,12,male,小明7,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
8,13,female,小明8,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
9,12,female,小明9,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
10,13,male,小明10,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
Load the data into the source table:
load data local inpath '/var/demo.txt' into table t_original;
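As a quick hedged sanity check (standard collection-access syntax; the map key comes from the sample data):
select id, name, likes[0], address['zhejiang'] from t_original limit 3;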
Create the partitioned table:
create table t_dynamic(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by (age int,sex string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';
Load data into the partitioned table:
from t_original
insert into t_dynamic partition(age,sex)
select id,name,likes,address,age,sex;
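The dynamically generated partitions can then be listed:
show partitions t_dynamic;
-- with the sample data, expect partitions such as age=12/sex=male and age=13/sex=female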
Inserting Data
Loading a local or HDFS file into a table
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
Table to table
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]
select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]]
select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
FROM psn
INSERT OVERWRITE TABLE psn10
SELECT id, name
INSERT INTO TABLE psn11
SELECT id, likes;
Table to local directory
insert overwrite local directory '/root/result'
select * from psn;
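A hedged variant (available since Hive 0.11) that sets the delimiter of the exported files so the result is easier to parse:
insert overwrite local directory '/root/result'
row format delimited fields terminated by ','
select * from psn;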
Bucketing
Enabling bucketing
(On my Hive 3.1 the parameter below no longer exists; older versions still have it.)
set hive.enforce.bucketing=true;
Default: false. When set to true, the MapReduce job automatically sets the number of reduce tasks to match the bucket count. (You can also set the reduce task count yourself via mapred.reduce.tasks, but this is not recommended when bucketing.)
Note: the number of buckets (output files) produced by one job equals the number of reduce tasks; each row goes to the bucket given by the hash of the clustering column modulo the bucket count.
Creating a bucketed table
create table t_bucket(
id int,
name string,
likes array<string>,
address map<string,string>
)
clustered by(id) into 4 buckets
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';
Insert data into the table
insert into t_bucket select id,name,likes,address from t_original;
The bucket files can be inspected in HDFS, or run desc formatted t_bucket to view the table details.
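One hedged way to list the bucket files from the Hive CLI (the path assumes the default warehouse location):
dfs -ls /user/hive/warehouse/t_bucket;
-- expect 4 files, one per bucket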
Sampling a bucketed table
TABLESAMPLE syntax:
TABLESAMPLE(BUCKET x OUT OF y)
x: the bucket to start sampling from
y: must be a multiple or a factor of the table's total bucket count
Example: for a table with 32 buckets,
select * from table_name TABLESAMPLE(BUCKET 3 OUT OF 16 ON id);
reads buckets 3 and 19.
Formula: buckets sampled = total buckets / y = 32 / 16 = 2; start at bucket x (3), then take every y-th bucket after it (3 + 16 = 19).
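Applying the same formula to the 4-bucket t_bucket table above as a hedged illustration:
select * from t_bucket TABLESAMPLE(BUCKET 1 OUT OF 2 ON id);
-- reads 4 / 2 = 2 buckets: bucket 1 and bucket 1 + 2 = 3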
Hive Regex
Suppose we have a log file like this:
192.168.57.4 - - [29/Feb/2016:18:14:35 +0800] "GET /bg-upper.png HTTP/1.1" 304 -
192.168.57.4 - - [29/Feb/2016:18:14:35 +0800] "GET /bg-nav.png HTTP/1.1" 304 -
192.168.57.4 - - [29/Feb/2016:18:14:35 +0800] "GET /asf-logo.png HTTP/1.1" 304 -
192.168.57.4 - - [29/Feb/2016:18:14:35 +0800] "GET /bg-button.png HTTP/1.1" 304 -
192.168.57.4 - - [29/Feb/2016:18:14:35 +0800] "GET /bg-middle.png HTTP/1.1" 304 -
192.168.57.4 - - [29/Feb/2016:18:14:36 +0800] "GET / HTTP/1.1" 200 11217
192.168.57.4 - - [29/Feb/2016:18:14:36 +0800] "GET / HTTP/1.1" 200 11217
Create a table whose regex groups match the log fields (the last two groups capture the HTTP status code and the response size):
CREATE TABLE logtb1 (
host STRING,
identity STRING,
t_user STRING,
time1 STRING,
request STRING,
status STRING,
size STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) \\[(.*)\\] \"(.*)\" (-|[0-9]*) (-|[0-9]*)"
)
STORED AS TEXTFILE;
Load the data into logtb1:
load data local inpath '/usr/local/log.txt' into table logtb1;
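A hedged spot check that the SerDe parsed the fields:
select host, time1, request, status from logtb1 limit 5;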
LATERAL VIEW
LATERAL VIEW is used together with UDTFs such as explode() (often combined with split(), which produces the array that explode() expands).
The UDTF first splits the input into multiple rows; LATERAL VIEW then joins those rows back into a virtual table that supports column aliases.
**Main problem solved:** when a UDTF appears in a SELECT, the query may contain only that single UDTF, with no other columns and no additional UDTFs; LATERAL VIEW lifts this restriction (illustrated after the sample data below).
Syntax:
LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*
The table person (id int, name string, likes array<string>, address map<string,string>) contains:
1,小明1,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
2,小明2,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
3,小明3,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
4,小明4,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
5,小明5,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
7,小明1,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
8,小明2,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
9,小明3,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
10,小明4,lol-book-moive,zhejiang:hangzhou-shanghai:pudong
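Before the LATERAL VIEW query, a hedged illustration of the single-UDTF restriction it solves:
-- fails: a UDTF cannot be combined with other select columns
select id, explode(likes) from person;
-- runs on its own, but the id column is lost
select explode(likes) from person;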
Query how many distinct hobbies and how many distinct cities there are:
select count(distinct(myCol1)), count(distinct(myCol2)) from person
LATERAL VIEW explode(likes) myTable1 AS myCol1
LATERAL VIEW explode(address) myTable2 AS myCol2, myCol3;
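With the sample data above, this returns 3 and 2: three distinct hobbies (lol, book, moive) and two distinct map keys (zhejiang, shanghai).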
LATERAL VIEW OUTER
What if the array produced by the UDTF is empty?
Hive 0.12 added the outer keyword. By default, when the UDTF produces no output, the input row is dropped from the result.
With outer, the behavior is like a LEFT OUTER JOIN: the selected columns are still emitted, and the UDTF's output columns are NULL.
select * from person lateral view explode(array()) test as t1;
This returns no rows.
Now add the outer keyword:
select * from person lateral view outer explode(array()) test as t1;
Output:
2 小明2 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
3 小明3 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
4 小明4 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
5 小明5 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
6 小明6 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
7 小明7 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
8 小明8 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
9 小明9 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL
10 小明10 ["lol","book","moive"] {"zhejiang":"hangzhou","shanghai":"pudong"} NULL