Spark SQL 练习之 join 操作

数据集如下:有两个json文件,table1.json和table2.json

table1.json

{"A":"A1", "B":30, "C":1}
{"A":"A2", "B":31, "C":2}
{"A":"A3", "B":32, "C":3}
{"A":"A4", "B":33, "C":4}
{"A":"A5", "B":34, "C":5}
{"A":"A6", "B":35, "C":6}
{"A":"A7", "B":36, "C":7}
{"A":"A8", "B":37, "C":8}
{"A":"A9", "B":38, "C":9}

table2.json

{"C":1, "D":1, "E":1}
{"C":2, "D":2, "E":2}
{"C":3, "D":3, "E":3}
{"C":4, "D":4, "E":4}
{"C":5, "D":5, "E":5}
{"C":6, "D":6, "E":6}
{"C":7, "D":7, "E":7}
{"C":8, "D":8, "E":8}
{"C":9, "D":9, "E":9}

要求:有两个表,表可以是文本或 JSON 数据,结构化后分别是 Table1(A, B, C) 和 Table2(C, D, E)。使用 Spark SQL 将两个表通过 C 列关联,求出 D 与 E 之和,并以 (A, B, D+E) 三列返回。

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Spark SQL join exercise.
 *
 * Reads two JSON files, `table1.json` (columns A, B, C) and `table2.json`
 * (columns C, D, E), joins them on column `C`, and prints the three columns
 * `A`, `B` and `D + E`.
 *
 * The entry point is an explicit `main` method rather than `scala.App`:
 * the Spark documentation notes that subclasses of `scala.App` may not work
 * correctly, and asks applications to define `main()` instead.
 */
object Homework {

  /**
   * Runs the exercise on a local Spark session and prints the intermediate
   * and final DataFrames to stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val session: SparkSession =
      SparkSession.builder().appName("test").master("local").getOrCreate()

    try {
      val df1: DataFrame = session.read.json("file:///D:\\data\\table1.json")
      val df2: DataFrame = session.read.json("file:///D:\\data\\table2.json")

      df2.show()

      // Join on the shared column name so the result carries a single `C`
      // column; joining on `df2("C") === df1("C")` keeps both copies, which
      // makes any later reference to `C` in the view ambiguous.
      val frame: DataFrame = df1.join(df2, Seq("C"))
      frame.show()

      // createOrReplaceTempView does not fail when a view named `tmp`
      // already exists in a session reused through getOrCreate().
      frame.createOrReplaceTempView("tmp")
      val sql =
        """
          |select
          |A,B,D+E
          |from tmp
          |""".stripMargin
      session.sql(sql).show()
    } finally {
      // Stop the session even when reading or querying fails.
      session.stop()
    }
  }
}

执行结果

+---+---+-------+
|  A|  B|(D + E)|
+---+---+-------+
| A1| 30|      2|
| A2| 31|      4|
| A3| 32|      6|
| A4| 33|      8|
| A5| 34|     10|
| A6| 35|     12|
| A7| 36|     14|
| A8| 37|     16|
| A9| 38|     18|
+---+---+-------+
上一篇:Redis 基础知识


下一篇:面试题