Spark operators can be divided roughly into three broad categories:
1. Value-type Transformation operators: these do not trigger job submission; they operate on Value-type data items.
2. Key-Value-type Transformation operators: these do not trigger job submission; they operate on Key-Value data items.
3. Action operators: these trigger SparkContext to submit a job.
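A quick sketch of the split (the identifiers here are illustrative, not from the examples below): transformations only build up the RDD lineage, and nothing executes until an action is called.
// map is a transformation: this line records lineage but runs no job.
val doubled = sc.parallelize(1 to 5).map(_ * 2)
// collect is an action: only now does SparkContext submit a job.
doubled.collect   // Array[Int] = Array(2, 4, 6, 8, 10)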
I. Value-Type Transformation Operators
1) map
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.map(_.length)
val c = a.zip(b)
c.collect
res0: Array[(String, Int)] = Array((dog,3), (salmon,6), (salmon,6), (rat,3), (elephant,8))
2) flatMap
val a = sc.parallelize(1 to 10, 5)
a.flatMap(1 to _).collect
res47: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
sc.parallelize(List(1, 2, 3), 2).flatMap(x => List(x, x, x)).collect
res85: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)
3) mapPartitions
(The snippet below is actually another flatMap: it emits a random number of copies, up to 10, of each element, so its output differs from run to run. A genuine mapPartitions sketch follows it.)
val x = sc.parallelize(1 to 10, 3)
x.flatMap(List.fill(scala.util.Random.nextInt(10))(_)).collect
res1: Array[Int] = Array(...)   // random; the exact contents vary per run
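Since the snippet above never calls mapPartitions itself, here is a minimal sketch of the operator, with illustrative pairing logic: mapPartitions hands the function an entire partition as a single iterator, so consecutive elements can be paired without crossing partition boundaries.
val a = sc.parallelize(1 to 9, 3)
// Pair each element with its successor inside the same partition.
a.mapPartitions(iter => iter.sliding(2).collect { case Seq(p, c) => (p, c) }).collect
// Array((1,2), (2,3), (4,5), (5,6), (7,8), (8,9))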
4) glom (gathers each partition into an Array)
val a = sc.parallelize(1 to 100, 3)
a.glom.collect
res8: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33), Array(34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66), Array(67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100))
5) union
val a = sc.parallelize(1 to 3, 1)
val b = sc.parallelize(5 to 7, 1)
(a ++ b).collect
res0: Array[Int] = Array(1, 2, 3, 5, 6, 7)
6) cartesian (Cartesian product)
val x = sc.parallelize(List(1,2,3,4,5))
val y = sc.parallelize(List(6,7,8,9,10))
x.cartesian(y).collect
res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8), (1,9), (1,10), (2,6), (2,7), (2,8), (2,9), (2,10), (3,6), (3,7), (3,8), (3,9), (3,10), (4,6), (4,7), (4,8), (4,9), (4,10), (5,6), (5,7), (5,8), (5,9), (5,10))
7) groupBy (groups elements that map to the same key)
val a = sc.parallelize(1 to 9, 3)
a.groupBy(x => { if (x % 2 == 0) "even" else "odd" }).collect
res42: Array[(String, Seq[Int])] = Array((even,ArrayBuffer(2, 4, 6, 8)), (odd,ArrayBuffer(1, 3, 5, 7, 9)))
8) filter
val a = sc.parallelize(1 to 10, 3)
val b = a.filter(_ % 2 == 0)
b.collect
res3: Array[Int] = Array(2, 4, 6, 8, 10)
9) distinct (removes duplicates)
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
c.distinct.collect
res6: Array[String] = Array(Dog, Gnu, Cat, Rat)
10) subtract (removes the elements of one RDD that also appear in another)
val a = sc.parallelize(1 to 9, 3)
val b = sc.parallelize(1 to 3, 3)
val c = a.subtract(b)
c.collect
res3: Array[Int] = Array(6, 9, 4, 7, 5, 8)
11) sample
val a = sc.parallelize(1 to 10000, 3)
a.sample(false, 0.1, 0).count
res24: Long = 960
12) takeSample
val x = sc.parallelize(1 to 1000, 3)
x.takeSample(true, 100, 1)
res3: Array[Int] = Array(...)   // 100 values sampled with replacement; the exact numbers depend on the seed
13) cache, persist
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
c.getStorageLevel
res0: org.apache.spark.storage.StorageLevel = StorageLevel(false, false, false, false, 1)
c.cache
c.getStorageLevel
res2: org.apache.spark.storage.StorageLevel = StorageLevel(false, true, false, true, 1)
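cache is shorthand for persist with the default MEMORY_ONLY level; any other level has to be requested through persist. A minimal sketch (the MEMORY_AND_DISK choice is just an example):
import org.apache.spark.storage.StorageLevel
val d = sc.parallelize(1 to 100)
// Partitions that do not fit in memory are spilled to disk instead of being recomputed.
d.persist(StorageLevel.MEMORY_AND_DISK)
d.getStorageLevel   // StorageLevel(true, true, false, true, 1)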
II. Key-Value Transformation Operators
1) mapValues
val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
val b = a.map(x => (x.length, x))
b.mapValues("x" + _ + "x").collect
res5: Array[(Int, String)] = Array((3,xdogx), (5,xtigerx), (4,xlionx), (3,xcatx), (7,xpantherx), (5,xeaglex))
2) combineByKey
val a = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
val c = b.zip(a)
// createCombiner wraps the first value in a List; mergeValue prepends within a partition; mergeCombiners concatenates the per-partition lists
val d = c.combineByKey(List(_), (x:List[String], y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
d.collect
res16: Array[(Int, List[String])] = Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
3) reduceByKey
val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
val b = a.map(x => (x.length, x))
b.reduceByKey(_ + _).collect
res86: Array[(Int, String)] = Array((3,dogcatowlgnuant))
val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
val b = a.map(x => (x.length, x))
b.reduceByKey(_ + _).collect
res87: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
4) partitionBy
(Repartitions an RDD according to a given Partitioner; see the sketch below.)
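Since the original gives no code for partitionBy, here is a minimal sketch assuming a HashPartitioner over 2 partitions; the pairs are illustrative.
import org.apache.spark.HashPartitioner
val pairs = sc.parallelize(List((1, "a"), (2, "b"), (3, "c"), (4, "d")))
// Redistribute the tuples across 2 partitions by the hash of each key.
val p = pairs.partitionBy(new HashPartitioner(2))
p.glom.collect
// Array(Array((2,b), (4,d)), Array((1,a), (3,c))) -- even keys hash to partition 0, odd keys to partition 1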
5) cogroup
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.map((_, "b"))
val c = a.map((_, "c"))
b.cogroup(c).collect
res7: Array[(Int, (Iterable[String], Iterable[String]))] = Array(
(2,(ArrayBuffer(b),ArrayBuffer(c))),
(3,(ArrayBuffer(b),ArrayBuffer(c))),
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
)
6) join
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.keyBy(_.length)
val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
val d = c.keyBy(_.length)
b.join(d).collect
res0: Array[(Int, (String, String))] = Array((6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)), (6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)), (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)), (3,(dog,bee)), (3,(rat,dog)), (3,(rat,cat)), (3,(rat,gnu)), (3,(rat,bee)))
7) leftOuterJoin
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.keyBy(_.length)
val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
val d = c.keyBy(_.length)
b.leftOuterJoin(d).collect
res1: Array[(Int, (String, Option[String]))] = Array((6,(salmon,Some(salmon))), (6,(salmon,Some(rabbit))), (6,(salmon,Some(turkey))), (6,(salmon,Some(salmon))), (6,(salmon,Some(rabbit))), (6,(salmon,Some(turkey))), (3,(dog,Some(dog))), (3,(dog,Some(cat))), (3,(dog,Some(gnu))), (3,(dog,Some(bee))), (3,(rat,Some(dog))), (3,(rat,Some(cat))), (3,(rat,Some(gnu))), (3,(rat,Some(bee))), (8,(elephant,None)))
8) rightOuterJoin
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.keyBy(_.length)
val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
val d = c.keyBy(_.length)
b.rightOuterJoin(d).collect
res2: Array[(Int, (Option[String], String))] = Array((6,(Some(salmon),salmon)), (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (6,(Some(salmon),salmon)), (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (3,(Some(dog),dog)), (3,(Some(dog),cat)), (3,(Some(dog),gnu)), (3,(Some(dog),bee)), (3,(Some(rat),dog)), (3,(Some(rat),cat)), (3,(Some(rat),gnu)), (3,(Some(rat),bee)), (4,(None,wolf)), (4,(None,bear)))
III. Action Operators
1) foreach
val c = sc.parallelize(List("cat", "dog", "tiger", "lion", "gnu", "crocodile", "ant", "whale", "dolphin", "spider"), 3)
c.foreach(x => println(x + "s are yummy"))
lions are yummy
gnus are yummy
crocodiles are yummy
ants are yummy
whales are yummy
dolphins are yummy
spiders are yummy
2) saveAsTextFile
val a = sc.parallelize(1 to 10000, 3)
a.saveAsTextFile("mydata_a")
INFO FileOutputCommitter: Saved output of task 'attempt_201404032111_0000_m_000002_71' to file:/home/cloudera/Documents/spark-0.9.0-incubating-bin-cdh4/bin/mydata_a
3) saveAsObjectFile
val x = sc.parallelize(1 to 100, 3)
x.saveAsObjectFile("objFile")
val y = sc.objectFile[Int]("objFile")
y.collect
res52: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
4) collect
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
c.collect
res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu, Rat)
5) collectAsMap
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.zip(a)
b.collectAsMap
res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1 -> 1, 3 -> 3)
(Note that duplicate keys collapse: the pair (1,1) appears twice in b but only once in the Map.)
6) reduceByKeyLocally
val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
val b = a.map(x => (x.length, x))
// reduceByKeyLocally returns the reduced result to the driver as a Map, so no collect is needed
b.reduceByKeyLocally(_ + _)
res86: scala.collection.Map[Int,String] = Map(3 -> dogcatowlgnuant)
7) lookup
val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
val b = a.map(x => (x.length, x))
b.lookup(5)
res0: Seq[String] = WrappedArray(tiger, eagle)
8) count
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
c.count
res2: Long = 4
9) top
val c = sc.parallelize(Array(6, 9, 4, 7, 5, 8), 2)
c.top(2)
res28: Array[Int] = Array(9, 8)
10) reduce
val a = sc.parallelize(1 to 100, 3)
a.reduce(_ + _)
res41: Int = 5050
11) fold
val a = sc.parallelize(List(1,2,3), 3)
a.fold(0)(_ + _)
res59: Int = 6
12) aggregate
val z = sc.parallelize(List(1,2,3,4,5,6), 2)
// let's first print out the contents of the RDD with partition labels
def myfunc(index: Int, iter: Iterator[(Int)]) : Iterator[String] = {
  iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}
z.mapPartitionsWithIndex(myfunc).collect
res28: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])
z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
(The seqOp takes the max within each partition, 3 and 6, and the combOp sums them with the zero value: 0 + 3 + 6 = 9.)
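Note that the zero value enters every partition's seqOp and the final combOp as well, which becomes visible once it is nonzero. A hand-computed sketch using the same z:
// partition 0: max(5, 1, 2, 3) = 5; partition 1: max(5, 4, 5, 6) = 6
// combine: 5 (zero value) + 5 + 6 = 16
z.aggregate(5)(math.max(_, _), _ + _)
// Int = 16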
Reference: http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html