数据集如下:有两个json文件,table1.json和table2.json
table1.json
{"A":"A1", "B":30, "C":1}
{"A":"A2", "B":31, "C":2}
{"A":"A3", "B":32, "C":3}
{"A":"A4", "B":33, "C":4}
{"A":"A5", "B":34, "C":5}
{"A":"A6", "B":35, "C":6}
{"A":"A7", "B":36, "C":7}
{"A":"A8", "B":37, "C":8}
{"A":"A9", "B":38, "C":9}
table2.json
{"C":1, "D":1, "E":1}
{"C":2, "D":2, "E":2}
{"C":3, "D":3, "E":3}
{"C":4, "D":4, "E":4}
{"C":5, "D":5, "E":5}
{"C":6, "D":6, "E":6}
{"C":7, "D":7, "E":7}
{"C":8, "D":8, "E":8}
{"C":9, "D":9, "E":9}
要求:有两个表,表可以是文本或JSON数据,结构化后分别是Table1(A,B,C)和Table2(C,D,E)。使用SparkSQL将两个表通过C列关联,求出D与E之和,并以(A,B,D+E)三列返回。
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Joins the rows of `table1.json` with the rows of `table2.json` on column `C`
 * and prints columns `A`, `B` and `D + E` of the joined rows through Spark SQL.
 *
 * An explicit `main` is used instead of `extends App` so that every value is a
 * plain local initialized in program order.
 */
object Homework {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val session: SparkSession =
      SparkSession.builder().appName("test").master("local").getOrCreate()

    try {
      val df1: DataFrame = session.read.json("file:///D:\\data\\table1.json")
      val df2: DataFrame = session.read.json("file:///D:\\data\\table2.json")
      df2.show()

      // Join by column name rather than by an equality expression: the result
      // then carries a single `C` column instead of two columns both named `C`,
      // which would make any later reference to `C` ambiguous.
      val frame: DataFrame = df1.join(df2, Seq("C"))
      frame.show()

      // `createTempView` throws when the view already exists (for example when
      // `getOrCreate()` hands back a reused session); replacing is safe here.
      frame.createOrReplaceTempView("tmp")

      val sql =
        """
          |select
          |A,B,D+E
          |from tmp
          |""".stripMargin
      session.sql(sql).show()
    } finally {
      // Release the local Spark resources even when reading or querying fails.
      session.stop()
    }
  }
}
执行结果
+---+---+-------+
|  A|  B|(D + E)|
+---+---+-------+
| A1| 30|      2|
| A2| 31|      4|
| A3| 32|      6|
| A4| 33|      8|
| A5| 34|     10|
| A6| 35|     12|
| A7| 36|     14|
| A8| 37|     16|
| A9| 38|     18|
+---+---+-------+