Advanced RDDs

The example below builds a word-count pipeline with two reduceByKey shuffles and uses toDebugString to print the lineage of the resulting RDD.

import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object Demo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    val rdd: RDD[(String, Int)] = sc.textFile("data/cc.txt")
      .flatMap(_.split("\\s+"))  // split each line on whitespace
      .map((_, 1))               // pair every word with a count of 1
      .reduceByKey(_ + _)        // first shuffle: sum the counts per word
      .map(_._1.toUpperCase)     // keep only the word, upper-cased
      .map((_, 1))               // pair again for a second aggregation
      .reduceByKey(_ + _)        // second shuffle: sum counts per upper-cased word
    println(rdd.toDebugString)   // print the lineage of the final RDD
    sc.stop()
  }
}
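
SparkUtils.getSparkContext() is a project-local helper rather than part of Spark itself. A minimal sketch of such a helper, assuming it only needs to build a local SparkContext for this demo (the app name and the local[2] master below are assumptions, not taken from the original code), could look like:

import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {
  // Hypothetical helper for this demo: builds a local SparkContext.
  // The app name and the local[2] master are assumed values.
  def getSparkContext(): SparkContext = {
    val conf = new SparkConf()
      .setAppName("rdd-lineage-demo")
      .setMaster("local[2]")
    new SparkContext(conf)
  }
}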

Running Demo prints the lineage of the final RDD. Each +- step marks a shuffle (stage) boundary, and the number in parentheses is the partition count:

(2) ShuffledRDD[7] at reduceByKey at Demo.scala:16 []
 +-(2) MapPartitionsRDD[6] at map at Demo.scala:15 []
    |  MapPartitionsRDD[5] at map at Demo.scala:14 []
    |  ShuffledRDD[4] at reduceByKey at Demo.scala:13 []
    +-(2) MapPartitionsRDD[3] at map at Demo.scala:12 []
       |  MapPartitionsRDD[2] at flatMap at Demo.scala:11 []
       |  data/cc.txt MapPartitionsRDD[1] at textFile at Demo.scala:10 []
       |  data/cc.txt HadoopRDD[0] at textFile at Demo.scala:10 []
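
To see where those stage boundaries come from without reading the indentation, you can also walk the dependency chain directly. The snippet below is an illustrative sketch, assumed to run inside Demo.main after rdd is built; the printDeps helper name is made up for this example:

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD

// Hypothetical helper: walks up the dependency chain of an RDD and reports
// which hops are shuffle dependencies (wide) and which are narrow.
def printDeps(r: RDD[_], depth: Int = 0): Unit =
  r.dependencies.foreach { dep =>
    val kind = if (dep.isInstanceOf[ShuffleDependency[_, _, _]]) "shuffle" else "narrow"
    println(("  " * depth) + s"${r.getClass.getSimpleName} <-[$kind]- ${dep.rdd.getClass.getSimpleName}")
    printDeps(dep.rdd, depth + 1)
  }

printDeps(rdd)   // the two shuffle hops correspond to the two reduceByKey calls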
