一:建表语句差别
create table if not exists text(
a bigint
) partitioned by (dt string)
row format delimited fields terminated by '\001'
location '/hdfs/text/';
create table if not exists orc(
a bigint)
partitioned by (dt string)
row format delimited fields terminated by '\001'
stored as orc
location '/hdfs/orc/';
create table if not exists parquet(
a bigint)
partitioned by (dt string)
row format delimited fields terminated by '\001'
stored as parquet
location '/hdfs/parquet/';
其实就是stored as 后面跟的不一样
二:HDFS存储对比
parquet | orc | text |
709M | 275M | 1G |
687M | 249M | 1G |
647M | 265M | 1G |
三:查询时间对比
parquet | orc | text |
36.451 | 26.133 | 42.574 |
38.425 | 29.353 | 41.673 |
36.647 | 27.825 | 43.938 |
四:文件如何生成
val sparkSession = SparkSession.builder().master("local").appName("pushFunnelV3").getOrCreate()
val javasc = new JavaSparkContext(sparkSession.sparkContext)
val nameRDD = javasc.parallelize(util.Arrays.asList("{'name':'zhangsan','age':'18'}", "{'name':'lisi','age':'19'}")).rdd;
sparkSession.read.json(nameRDD).write.mode(SaveMode.Overwrite).csv("/data/aa")
sparkSession.read.json(nameRDD).write.mode(SaveMode.Overwrite).orc("/data/bb")
sparkSession.read.json(nameRDD).write.mode(SaveMode.Overwrite).parquet("/data/cc")