package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo15Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("cache")

    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    val studentsRDD: RDD[(String, String, Int, String, String)] = studentRDD.map(student => {
      // This println runs once per record every time the RDD is recomputed,
      // which makes the effect of caching easy to observe in the console.
      println("processing studentsRDD")
      val split: Array[String] = student.split(",")
      val id: String = split(0)
      val name: String = split(1)
      val age: Int = split(2).toInt
      val gender: String = split(3)
      val clazz: String = split(4)
      (id, name, age, gender, clazz)
    })

    /**
     * By default an RDD does not keep its data. If the same RDD is used by
     * multiple actions, it is recomputed once per action.
     *
     * Choosing a persistence level:
     * 1. If the data is small and memory is plentiful ---> MEMORY_ONLY
     * 2. If the data exceeds the memory limit ---> MEMORY_AND_DISK_SER
     *    (serialized or not, reading from memory is still faster than reading from disk)
     *
     * Serialization/compression ---> smaller footprint, but serializing and
     * deserializing cost extra CPU time.
     */

    // cache() is shorthand for persist(StorageLevel.MEMORY_ONLY)
    // studentsRDD.cache()
    studentsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // number of students per class
    val clazzNum: RDD[(String, Int)] = studentsRDD.map(stu => (stu._5, 1)).reduceByKey(_ + _)
    clazzNum.foreach(println)

    // number of students per gender
    val genderNum: RDD[(String, Int)] = studentsRDD.map(stu => (stu._4, 1)).reduceByKey(_ + _)
    genderNum.foreach(println)

    // number of students per age
    val ageNumRDD: RDD[(Int, Int)] = studentsRDD.map(stu => (stu._3, 1)).reduceByKey(_ + _)
    ageNumRDD.foreach(println)
  }
}
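
// A minimal companion sketch, not part of the original demo: it shows how a
// cached RDD can be released explicitly with unpersist (a standard RDD method)
// once the actions that reuse it have finished, instead of waiting for LRU
// eviction. The object name and the second count() are illustrative; the data
// path is assumed to be the same file used above.
object Demo15CacheCleanup {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("cacheCleanup"))

    val rdd: RDD[String] = sc.textFile("data/students.txt")
    rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    println(rdd.count()) // first action: computes the RDD and fills the cache
    println(rdd.count()) // second action: served from the cached partitions

    rdd.unpersist(blocking = true) // block until the cached partitions are removed
    sc.stop()
  }
}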