房屋建筑时间分布
year_group
5年以内 10年以内 15年以内 20年以内 20年以上
分析:
build_date 获取 房龄
1987年建 截取字符串
use second_house;
SELECT
t.year_group, COUNT(1) AS total
FROM(
SELECT
build_date,
CASE
WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 0 AND 5 THEN '5年以内'
WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 5 AND 10 THEN '10年以内'
WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 10 AND 15 THEN '15年以内'
WHEN (2018 - substring(build_date, 0, 4)) BETWEEN 15 AND 20 THEN '20年以内'
ELSE '20年以上'
END AS year_group
FROM
2nd_sh_house
) t
GROUP BY
t.year_group
ORDER BY
total DESC ;
查看函数
show functions;
desc function substring;
/*
分区表:
以链家二手房信息为主
分区字段:
城市、月份
*/
#按照月份和城市进行分区
#PARTITIONED BY (month_str STRING COMMENT 'First Partition Month', city STRING COMMENT 'Second Partition city')
#标的注释
#COMMENT 'LianJia ErShouFang Price'
CREATE TABLE tb_second_price(
village_name string,
house_type string,
house_area double,
region string,
house_floor string,
direction string,
total_price double,
single_price double,
build_date string
)
COMMENT 'LianJia ErShouFang Price'
PARTITIONED BY (month_str STRING COMMENT 'First Partition Month', city STRING COMMENT 'Second Partition city')
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- 向分区表中 加载数据的时候,一定要指定分区字段的值
-- 语法:
-- LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
#加载BJ,4月份二手房数据 SH,04月份二手房数据
[rdedu@bigdata-training01 datas]$ cp 2nd_house_price.csv 04bj_2nd_house_price.csv
[rdedu@bigdata-training01 datas]$ cp 2nd_house_price.csv 04sh_2nd_house_price.csv
LOAD DATA LOCAL INPATH '/opt/datas/04bj_2nd_house_price.csv' INTO TABLE tb_second_price PARTITION (month_str='04', city='bj') ;
LOAD DATA LOCAL INPATH '/opt/datas/04sh_2nd_house_price.csv' INTO TABLE tb_second_price PARTITION (month_str='04', city='sh');
》验证
/user/hive/warehouse/second_house.db/tb_second_price/month_str=04
drwxr-xr-x rdedu supergroup 0 B 2019/9/7 下午10:42:25 0 0 B city=bj
drwxr-xr-x rdedu supergroup 0 B 2019/9/7 下午10:42:37 0 0 B city=sh
-- 对分区表的数据进行查询
SELECT COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'bj' ;
SELECT COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sh' ;
-- 需求1:分析上海 4 和 3 月份数据结果对比
SELECT '03' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '03' AND city = 'sh'
UNION ALL
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sh' ;
-- 需求2:分析所有的4月份的数据
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' ;
-- 需求3:分析所有的sh的数据
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE city = 'sh' ;
/*
直接使用put命令 将 nj 和 sz 的4月份数据加载到 分区表中
nj:
/user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj
sz:
/user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz
*/
-- 1. 创建对应的分区目录
dfs -mkdir -p /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj ;
dfs -mkdir -p /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz ;
-- 2. 上传对应城市的数据到对应分区目录中
[rdedu@bigdata-training01 datas]$ cp 04bj_2nd_house_price.csv 04nj_2nd_house_price.csv
[rdedu@bigdata-training01 datas]$ cp 04bj_2nd_house_price.csv 04sz_2nd_house_price.csv
dfs -put /opt/datas/04nj_2nd_house_price.csv /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=nj ;
dfs -put /opt/datas/04sz_2nd_house_price.csv /user/hive/warehouse/second_house.db/tb_second_price/month_str=04/city=sz ;
验证:
-- 3. 添加分区
-- load可以直接添加进去,但是put方法进去的数据需要进行添加分区
/*
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
partition_spec:
: (partition_column = partition_col_value, partition_column = partition_col_value, ...)
*/
ALTER TABLE second_house.tb_second_price ADD IF NOT EXISTS PARTITION (month_str = '04', city = 'nj');
ALTER TABLE second_house.tb_second_price ADD IF NOT EXISTS PARTITION (month_str = '04', city = 'sz');
-- 查看 分区表中分区信息
show partitions tb_second_price ;
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'nj';
SELECT '04' AS mon, COUNT(1) AS cnt FROM tb_second_price WHERE month_str = '04' AND city = 'sz';
/*
结合 分区表 和 外部表 使用
*/
-- 表的数据存储目录
dfs -mkdir -p /user/rdedu/lianjia_2nd_house ;
-- 外部表 + 分区表
#LOCATION '/user/rdedu/lianjia_2nd_house'; 外部表一般要指定地址
CREATE EXTERNAL TABLE second_house.lianjia_2nd_house(
village_name string,
house_type string,
house_area double,
region string,
house_floor string,
direction string,
total_price double,
single_price double,
build_date string
)
PARTITIONED BY (year_str STRING, month_str STRING, city STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/rdedu/lianjia_2nd_house';
-- 采用put方式加载数据
-- a. 创建分区目录
dfs -mkdir -p /user/rdedu/lianjia_2nd_house/year_str=2018/month_str=04/city=sh ;
-- b. 上传数据
dfs -put /opt/datas/04sh_2nd_house_price.csv /user/rdedu/lianjia_2nd_house/year_str=2018/month_str=04/city=sh ;
-- c. 添加分区
ALTER TABLE second_house.lianjia_2nd_house ADD IF NOT EXISTS PARTITION (year_str = '2018', month_str = '04', city = 'sh') ;