Overview
Table of Contents
- Preface
- 11 Ways to Implement WordCount
- Method 1: groupBy
- Method 2: groupByKey
- Method 3: reduceByKey
- Method 4: aggregateByKey
- Method 5: foldByKey
- Method 6: combineByKey
- Method 7: countByKey
- Method 8: countByValue
- Method 9: reduce
- Method 10: aggregate
- Method 11: fold
- Conclusion
Preface
Learning any programming language starts with Hello World; for big data frameworks, it starts with WordCount, and Spark is no exception. Once you have studied Spark systematically, you will find that WordCount can be implemented in 11 different ways. How many of them do you know? If not all eleven, read on!
11 Ways to Implement WordCount
Method 1: groupBy
// Shared imports for all the examples that follow
import org.apache.spark.SparkContext
import scala.collection.mutable

def wordcount(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))   // split each line into words
  val mRDD = mapRDD.map((_, 1))            // word => (word, 1)
  val group = mRDD.groupBy(t => t._1)      // group the whole tuples by word
  val wordcount = group.mapValues(
    iter => iter.size                      // count the tuples in each group
  )
}
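Everything in this method is a lazy transformation, so no job actually runs until an action is called. A minimal way to materialize and print the result inside the method (the same line works for methods 2 through 6):

wordcount.collect().foreach(println)
// (Hello,2), (Scala,1), (Spark,1)  -- ordering may vary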
Method 2: groupByKey
def wordcount2(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val group = mRDD.groupByKey()            // group only the values (the 1s) by key
  val wordcount = group.mapValues(
    iter => iter.size
  )
}
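The only difference from method 1 is what ends up in each group: groupBy keeps the whole tuple, while groupByKey, which works on pair RDDs, keeps only the values. Both still shuffle every single record:

// groupBy(t => t._1): RDD[(String, Iterable[(String, Int)])]
// groupByKey():       RDD[(String, Iterable[Int])]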
Method 3: reduceByKey
def wordcount3(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.reduceByKey(_ + _)  // merge the counts for each key pairwise
}
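reduceByKey merges the values for each key pairwise, and, unlike groupByKey, it pre-aggregates on the map side before the shuffle, so far less data crosses the network. The underscore version is just shorthand for the explicit form:

mRDD.reduceByKey((x, y) => x + y)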
Method 4: aggregateByKey
def wordcount4(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // zero value 0; the first _+_ merges within a partition, the second across partitions
  val wordcount = mRDD.aggregateByKey(0)(_ + _, _ + _)
}
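aggregateByKey(zeroValue)(seqOp, combOp) splits the aggregation into a within-partition function and an across-partition function; for word count both happen to be _ + _. The split only matters when the two differ. A small sketch (an illustration, not part of the word count) that takes the maximum within each partition but sums across partitions:

mRDD.aggregateByKey(0)((x, y) => math.max(x, y), _ + _)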
Method 5: foldByKey
def wordcount5(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.foldByKey(0)(_ + _) // zero value 0, one merge function
}
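foldByKey is just aggregateByKey with a single function applied both within and across partitions, so these two lines are equivalent:

mRDD.foldByKey(0)(_ + _)
mRDD.aggregateByKey(0)(_ + _, _ + _)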
Method 6: combineByKey
def wordcount6(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.combineByKey(
    v => v,                     // create the accumulator from a key's first value
    (x: Int, y) => x + y,       // fold further values in within a partition
    (x: Int, y: Int) => x + y   // merge accumulators across partitions
  )
}
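combineByKey is the most general of the byKey aggregations: the first function builds the initial accumulator from a key's first value (instead of from a zero value), the second folds further values in within a partition, and the third merges accumulators across partitions. The type annotations are needed because Scala cannot infer the accumulator type here. Its generality pays off when the accumulator type differs from the value type; a sketch (an illustration, not part of the word count) computing a per-key average via (sum, count) pairs:

val avg = mRDD.combineByKey(
  v => (v, 1),
  (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),
  (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)
).mapValues { case (sum, n) => sum.toDouble / n }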
Method 7: countByKey
def wordcount7(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.countByKey() // action: Map(word -> count) on the driver
}
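Unlike methods 1 through 6, countByKey is an action: it counts how many times each key occurs (the values, the 1s, exist only to make this a pair RDD) and returns an ordinary Scala Map to the driver, so you can print it directly:

println(wordcount) // Map(Hello -> 2, Scala -> 1, Spark -> 1)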
Method 8: countByValue
def wordcount8(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val wordcount = mapRDD.countByValue() // action: counts each distinct word directly
}
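countByValue counts each distinct element of the RDD itself, so the map to (word, 1) is no longer needed, making this the shortest variant. The whole word count fits on one line:

sc.makeRDD(List("Hello Scala", "Hello Spark")).flatMap(_.split(" ")).countByValue()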
Method 9: reduce
def wordcount9(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is convenient to update in place
  val mapRDD = words.map(
    word => mutable.Map[String, Long]((word, 1))
  )
  // merge the maps pairwise
  val wordcount = mapRDD.reduce(
    (map1, map2) => {
      map2.foreach {
        case (word, count) =>
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
      }
      map1
    }
  )
  println(wordcount)
}
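Methods 9 through 11 deliberately avoid the byKey operators: each word becomes a one-entry map, and reduce merges those maps pairwise into a single result on the driver, here Map(Hello -> 2, Scala -> 1, Spark -> 1). One caveat worth flagging: mutating and returning map1 saves allocations, but Spark only documents that the functions passed to fold and aggregate may modify their first argument; relying on the same behavior in reduce is a common shortcut rather than a documented guarantee.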
Method 10: aggregate
def wordcount10(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is convenient to update in place
  val mapRDD = words.map(
    word => mutable.Map[String, Long]((word, 1))
  )
  // merge the maps pairwise; the same function serves within and across partitions
  val mergeMaps = (map1: mutable.Map[String, Long], map2: mutable.Map[String, Long]) => {
    map2.foreach {
      case (word, count) =>
        val newCount = map1.getOrElse(word, 0L) + count
        map1.update(word, newCount)
    }
    map1
  }
  // zero value: an empty map (the identity element for the merge)
  val wordcount = mapRDD.aggregate(mutable.Map[String, Long]())(mergeMaps, mergeMaps)
  println(wordcount)
}
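aggregate generalizes reduce with an explicit zero value plus separate within-partition and across-partition functions; because the element type and the accumulator type are both mutable.Map[String, Long] here, one merge function can serve as both. Its shape, slightly simplified (the real signature also asks for a ClassTag):

// def aggregate[U](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U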
Method 11: fold
def wordcount11(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is convenient to update in place
  val mapRDD = words.map(
    word => mutable.Map[String, Long]((word, 1))
  )
  // merge the maps pairwise, starting from an empty map as the zero value
  val wordcount = mapRDD.fold(mutable.Map[String, Long]())(
    (map1, map2) => {
      map2.foreach {
        case (word, count) =>
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
      }
      map1
    }
  )
  println(wordcount)
}
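fold is the special case of aggregate where the zero value, the elements, and the result all share one type and a single merge function is used, which is exactly why every word is pre-wrapped into a map before folding:

// def fold(zeroValue: T)(op: (T, T) => T): T
// mapRDD.fold(zero)(merge) behaves like mapRDD.aggregate(zero)(merge, merge)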
Conclusion
Those are the eleven ways to implement WordCount in Spark! That's it for today. If this article helped you, likes, bookmarks, and shares are always welcome; they are what keep me sharing and writing quality articles. Thanks!
–END–