import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

/**
 * Created by Administrator on 2017/1/7.
 */
object TestMain {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Hangzhou_Test")
//.setMaster("local[1]").setMaster("spark://").setJars(List("xxx.jar")).set("spark.executor.memory", "10g")
val sc = new SparkContext(conf)
val hiveContext = new HiveContext(sc)
// use rc_hive_db;
hiveContext.sql("use rc_hive_db") import hiveContext.implicits._ hiveContext.setConf("mapred.max.split.size", "")
hiveContext.setConf("mapred.min.split.size.per.node", "")
hiveContext.setConf("mapred.min.split.size.per.rack", "")
hiveContext.setConf("hive.input.format", "")
hiveContext.setConf("hive.merge.mapfiles", "true")
hiveContext.setConf("hive.merge.mapredfiles", "true")
hiveContext.setConf("hive.merge.size.per.task", "")
hiveContext.setConf("hive.merge.smallfiles.avgsize", "")
hiveContext.setConf("hive.groupby.skewindata", "true") hiveContext.sql("create table if not exists tb_id_vs_name(id int,name string)")
hiveContext.sql("create table if not exists tb_id_vs_name2(id int,name string)") println("-------------------------word count:------------------------------------")
var words = "When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."
val textFile = sc.parallelize(words.split(" "), )
textFile.flatMap(line => line.split(" "))
.map(word => (word, ))
.reduceByKey(_ + _)
.foreach(println) println("-------------------------map(func):------------------------------------")
val rdd = sc.parallelize( to ) //创建RDD
val map = * ) //对RDD中的每个元素都乘于2
map.foreach(x => print(x + " ")) println("-------------------------flatMap(func):------------------------------------")
// 2.flatMap(func)
val fm = rdd.flatMap(x => ( to x)).collect()
fm.foreach(x => print(x + " ")) println("-------------------------mapPartitions(func) 1:------------------------------------")
// 3.mapPartitions(func)
val mp = sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), ).mapPartitions(x => {
var woman = List[String]()
while (x.hasNext) {
val next =
next match {
case (_, "female") => woman = next._1 :: woman
case _ =>
/*val mp = rdd.mapPartitionsWithIndex(partitionsFun)*/
mp.collect.foreach(x => (print(x + " "))) //将分区中的元素转换成Aarray再输出 println("-------------------------mapPartitions(func) 2:------------------------------------")
sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), )
.mapPartitions(x => x.filter(_._2 == "female"))
.map(x => x._1)
.foreach(x => (print(x + " "))) println("-------------------------mapPartitionsWithIndex(func) :------------------------------------")
// 4.mapPartitionsWithIndex(func)
sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), )
.mapPartitionsWithIndex((index: Int, iter: Iterator[(String, String)]) => {
var woman = List[String]()
while (iter.hasNext) {
val next =
next match {
case (_, "female") => woman = "[" + index + "]" + next._1 :: woman
case _ =>
.collect.foreach(x => (print(x + " "))) //将分区中的元素转换成Aarray再输出 println("-------------------------simple(withReplacement,fraction,seed) :------------------------------------")
// 5.sample(withReplacement,fraction,seed)
val sample1 = rdd.sample(true, 0.5, )
sample1.collect.foreach(x => print(x + " ")) println("-------------------------union(ortherDataset) :将两个RDD中的数据集进行合并,最终返回两个RDD的并集,若RDD中存在相同的元素也不会去重------------------------------------")
// 6.union(otherDataset)
val rdd1 = sc.parallelize( to )
val rdd2 = sc.parallelize( to )
rdd1.union(rdd2).collect.foreach(x => print(x + " ")) println("-------------------------union(ortherDataset) :返回两个RDD的交集------------------------------------")
// 7.intersection(otherDataset)
rdd1.intersection(rdd2).collect.foreach(x => print(x + " ")) println("-------------------------distinct([numTasks]) :对RDD中的元素进行去重------------------------------------")
// 8.distinct([numTasks])
sc.parallelize(List(, , , , , , , )).distinct().collect.foreach(x => print(x + " ")) println("-------------------------cartesian(otherDataset):对两个RDD中的所有元素进行笛卡尔积操作------------------------------------")
// 9.cartesian(otherDataset)
sc.parallelize( to ).cartesian(sc.parallelize( to )).foreach(x => println(x + " ")) println("-------------------------coalesce(numPartitions,shuffle):对RDD的分区进行重新分区,shuffle默认值为false,当shuffle=false时,不能增加分区数------------------------------------")
// 10.coalesce(numPartitions,shuffle)
val coalesceRDD = sc.parallelize( to , ).coalesce() //当suffle的值为false时,不能增加分区数(即分区数不能从5->7)
println("重新分区后的分区个数:" + coalesceRDD.partitions.size) val coalesceRDD2 = sc.parallelize( to , ).coalesce(, true)
println("重新分区后的分区个数:" + coalesceRDD2.partitions.size)
println("RDD依赖关系:" + coalesceRDD2.toDebugString) println("-------------------------repartition(numPartition):是函数coalesce(numPartition,true)的实现,效果和例9.1的coalesce(numPartition,true)的一样------------------------------------")
// 11.repartition(numPartition) // 12.glom()glom():将RDD的每个分区中的类型为T的元素转换换数组Array[T]
// 13.randomSplit(weight:Array[Double],seed):根据weight权重值将一个RDD划分成多个RDD,权重越高划分得到的元素较多的几率就越大 println("-------------------------repartition(numPartition)-----------------------------") sc.parallelize(List((, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),
(, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, "")
)).map(s => (s._1, s._2)).toDF().registerTempTable("temp_tb_id_vs_name") hiveContext.sql("insert into tb_id_vs_name select * from temp_tb_id_vs_name") sc.parallelize(List((, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, "")
)).map(s => (s._1, s._2)).toDF().registerTempTable("temp_tb_id_vs_name2") hiveContext.sql("insert into tb_id_vs_name2 select * from temp_tb_id_vs_name2") var result = hiveContext.sql("select as t10_id, as t10_name from tb_id_vs_name t10 inner join tb_id_vs_name2 t11 on") => (s.getAs[Int]("t10_id"), s.getAs[String]("t10_name"))).foreach(s => {
println(s._1 + ":" + s._2)
}) sc.stop()
-------------------------word count:------------------------------------
-------------------------mapPartitions(func) 1:------------------------------------
kpop lucy
-------------------------mapPartitions(func) 2:------------------------------------
-------------------------mapPartitionsWithIndex(func) :------------------------------------
[]kpop []lucy -------------------------simple(withReplacement,fraction,seed) :------------------------------------
RDD依赖关系:() MapPartitionsRDD[] at coalesce at TestMain.scala: []
| CoalescedRDD[] at coalesce at TestMain.scala: []
| ShuffledRDD[] at coalesce at TestMain.scala: []
+-() MapPartitionsRDD[] at coalesce at TestMain.scala: []
| ParallelCollectionRDD[] at parallelize at TestMain.scala: []