json 数据源
{"name":"Michael","sex":"female"}
{"name":"Andy", "age":30,"sex":"male"}
{"name":"Justin", "age":19,"sex":"male"}
{"name":"Justin", "age":74,"sex":"male","birthday":"20120506"}
{"name":"Justin", "age":50,"sex":"male"}
{"name":"Justin", "age":44,"sex":"female"}
{"name":"Justin", "age":33,"sex":"male","birthday":"20130406"}
{"name":"Justin", "age":25,"sex":"female"}
{"name":"Justin", "age":4,"sex":"male","address":"beij"}
{"name":"Justin", "age":90,"sex":"female","address":"sh"}
{"name":"Justin", "age":28,"sex":"male","address":"nj"}
{"name":"Justin", "age":17,"sex":"male","address":"nj"}
{"name":"Justin", "age":16,"sex":"female","birthday":"20190206"}
{"name":"Justin", "age":15,"sex":"male","birthday":"20040306"}
{"name":"Justin", "age":37,"sex":"male","birthday":"20070506"}
val bb = spark.read.json("2.json")
查询指定字段 条件过滤
bb.select("name","sex","birthday").where("birthday>20051010").show() => select name,sex,birthday from bb where birthday>20051010
分组
bb.groupBy("name","sex").agg(count("name") as "number").show() => select name,sex,count(name) as number from bb group by name,sex