【发布时间】:2018-04-28 22:56:29
【问题描述】:
我正在学习 Scala，想弄清楚如何用 MapReduce 的方式编写一个程序，为文件中的每个单词找出最常跟随在它后面的单词。下面是我目前的代码：它可以运行，但我希望真正利用 map/reduce 操作，并尽量减少显式循环。
// Build the list of adjacent word pairs: (words(i), words(i+1)).
// NOTE(review): `words.collect()` is called again on every access — each call
// pulls the entire RDD back to the driver. Collect once into a local array
// (val w = words.collect()) and index that, or better, build the pairs with
// a pure RDD transformation (e.g. sliding(2) / zip with a shifted RDD) so the
// data never leaves the cluster. MutableList is also deprecated in 2.13.
//initialize the list with first two words
val list = scala.collection.mutable.MutableList((words.collect()(0),
words.collect()(1)));
for (x <- 1 to (words.collect().length - 2)) {
// add element into the list
list += ((words.collect()(x), words.collect()(x + 1)))
}
// Ship the driver-side pair list back to the cluster as an RDD.
val rdd1 = spark.parallelize(list)
// Classic word-count shape over bigrams: key = (word, follower), value = 1.
val rdd2 = rdd1.map(word => (word, 1)) // ex: key is (basketball,is) value is 1
// Sum counts per pair, then sort by count descending so the most frequent
// follower for any word appears first among that word's entries.
val counter = rdd2.reduceByKey((x, y) => x + y).sortBy(_._2, false) // sort in dec
// Materialize the sorted (pair, count) table on the driver for lookups below.
val result2 = counter.collect();
print("the most frequent follower for basketball, the, and competitive \n")
println(" ")
// calls the function
findFreq("basketball", result2)
findFreq("the", result2)
findFreq("competitive", result2)
}
/**
 * Prints the most frequent follower(s) of `str`.
 *
 * Fixes the original version, where the `for` loop body was empty and the
 * `if` referenced the loop variable `x` outside its scope (a compile error),
 * while `max` stayed at -1 so the equality test could never succeed.
 *
 * @param str the word whose most common successor we want
 * @param RDD collected (pair, count) entries, where pair = (word, follower),
 *            as produced by the reduceByKey step in the driver code above
 */
def findFreq(str: String, RDD: Array[((String, String), (Int))]): Unit =
{
  // Keep only the entries whose leading word is the word of interest.
  val followers = RDD.filter { case ((first, _), _) => first == str }
  // Print nothing if the word never appears as a predecessor (empty-input safe).
  if (followers.nonEmpty) {
    // Highest observed count among this word's followers.
    val max = followers.map(_._2).max
    // Report every follower that attains the maximum (ties included).
    for (((w1, w2), count) <- followers if count == max) {
      println("\"" + w1 + "\"" + " is followed by " + "\"" + w2 + "\"" + " " + count + " times.\n")
    }
  }
}
}
}
【问题讨论】:
标签: scala apache-spark mapreduce