Day16_分区表创建及加载数据

房屋建筑时间分布
    year_group
        5年以内        10年以内    15年以内    20年以内    20年以上 
分析:
        build_date  获取  房龄
        1987年建 截取字符串

use second_house;
SELECT 
  t.year_group, COUNT(1) AS total
FROM(
  SELECT 
    build_date,
    CASE 
      WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 0 AND 5 THEN '5年以内' 
      WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 5 AND 10 THEN '10年以内'
      WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 10 AND 15 THEN '15年以内'
      WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 15 AND 20 THEN '20年以内'
      ELSE '20年以上'
    END AS year_group
  FROM 
      2nd_sh_house
) t
GROUP BY 
  t.year_group
ORDER BY 
  total DESC ;

查看函数

show functions;
desc function substring;

/*
    分区表:
        以链家二手房信息为主
        分区字段:
            城市、月份
*/  
#按照月份和城市进行分区
#PARTITIONED BY (month_str STRING COMMENT 'First Partition Month', city STRING COMMENT 'Second Partition city')
#标的注释
#COMMENT 'LianJia ErShouFang Price'
CREATE TABLE  tb_second_price(
village_name string,
house_type string,
house_area double,
region string,
house_floor string,
direction string,
total_price double,
single_price double,
build_date string
)
COMMENT 'LianJia ErShouFang Price'
PARTITIONED BY (month_str STRING COMMENT 'First Partition Month', city STRING COMMENT 'Second Partition city')
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
  

-- 向分区表中 加载数据的时候,一定要指定分区字段的值
-- 语法:
--    LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
 #加载BJ,4月份二手房数据  SH,04月份二手房数据
[rdedu@bigdata-training01 datas]$ cp 2nd_house_price.csv  04bj_2nd_house_price.csv 
[rdedu@bigdata-training01 datas]$ cp 2nd_house_price.csv  04sh_2nd_house_price.csv    

LOAD DATA LOCAL INPATH '/opt/datas/04bj_2nd_house_price.csv' INTO TABLE tb_second_price PARTITION (month_str='04', city='bj') ;
LOAD DATA LOCAL INPATH '/opt/datas/04sh_2nd_house_price.csv' INTO TABLE tb_second_price PARTITION (month_str='04', city='sh');

》验证
/user/hive/warehouse/second_house.db/tb_second_price/month_str=04
drwxr-xr-x    rdedu    supergroup    0 B    2019/9/7 下午10:42:25    0    0 B    city=bj
drwxr-xr-x    rdedu    supergroup    0 B    2019/9/7 下午10:42:37    0    0 B    city=sh

  
  
-- 对分区表的数据进行查询
SELECT COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'bj' ;
SELECT COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sh' ;

-- 需求1:分析上海 4 和 3 月份数据结果对比
SELECT '03' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '03' AND city = 'sh'
UNION ALL
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sh' ;


-- 需求2:分析所有的4月份的数据
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' ;


-- 需求3:分析所有的sh的数据
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE city = 'sh' ;

/*
    直接使用put命令 将  nj 和 sz 的4月份数据加载到 分区表中
    nj:
        /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj
    sz:
        /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz 
*/
-- 1. 创建对应的分区目录
dfs -mkdir -p /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj ;
dfs -mkdir -p /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz ;

-- 2. 上传对应城市的数据到对应分区目录中
[rdedu@bigdata-training01 datas]$ cp 04bj_2nd_house_price.csv  04nj_2nd_house_price.csv  
[rdedu@bigdata-training01 datas]$ cp 04bj_2nd_house_price.csv  04sz_2nd_house_price.csv
dfs -put /opt/datas/04nj_2nd_house_price.csv /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj ;
dfs -put /opt/datas/04sz_2nd_house_price.csv /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz ;

验证:

-- 3. 添加分区
-- load可以直接添加进去,但是put方法进去的数据需要进行添加分区
/*
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
 
partition_spec:
  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)
*/
ALTER TABLE second_house.tb_second_price ADD IF NOT EXISTS PARTITION (month_str = '04', city = 'nj');
ALTER TABLE second_house.tb_second_price ADD IF NOT EXISTS PARTITION (month_str = '04', city = 'sz');

-- 查看 分区表中分区信息
show partitions tb_second_price ;

SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'nj';
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sz';


/*
    结合 分区表 和 外部表 使用
*/
-- 表的数据存储目录
dfs -mkdir -p /user/rdedu/lianjia_2nd_house ;

-- 外部表 + 分区表
#LOCATION '/user/rdedu/lianjia_2nd_house';  外部表一般要指定地址
CREATE EXTERNAL TABLE second_house.lianjia_2nd_house(
village_name string,
house_type string,
house_area double,
region string,
house_floor string,
direction string,
total_price double,
single_price double,
build_date string
)
PARTITIONED BY (year_str STRING, month_str STRING, city STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/rdedu/lianjia_2nd_house';


-- 采用put方式加载数据
-- a. 创建分区目录
dfs -mkdir -p /user/rdedu/lianjia_2nd_house/year_str=2018/month_str=04/city=sh ;

-- b. 上传数据
dfs -put /opt/datas/04sh_2nd_house_price.csv /user/rdedu/lianjia_2nd_house/year_str=2018/month_str=04/city=sh ;

-- c. 添加分区 
ALTER TABLE second_house.lianjia_2nd_house ADD IF NOT EXISTS PARTITION (year_str = '2018', month_str = '04', city = 'sh') ;


 

上一篇:代理模式


下一篇:(2)MySQL进阶篇SQL优化(show status、explain分析)