【发布时间】:2016-03-10 21:44:28
【问题描述】:
以下类包含尝试从 Elasticsearch 读取并打印返回的文档的主函数:
object TopicApp extends Serializable {
def run() {
val start = System.currentTimeMillis()
val sparkConf = new Configuration()
sparkConf.set("spark.executor.memory","1g")
sparkConf.set("spark.kryoserializer.buffer","256")
val es = new EsContext(sparkConf)
val esConf = new Configuration()
esConf.set("es.nodes","localhost")
esConf.set("es.port","9200")
esConf.set("es.resource", "temp_index/some_doc")
esConf.set("es.query", "?q=*:*")
esConf.set("es.fields", "_score,_id")
val documents = es.documents(esConf)
documents.foreach(println)
val end = System.currentTimeMillis()
println("Total time: " + (end-start) + " ms")
es.shutdown()
}
def main(args: Array[String]) {
run()
}
}
以下类使用org.json4s将返回的文档转换为JSON
class EsContext(sparkConf:HadoopConfig) extends SparkBase {
private val sc = createSCLocal("ElasticContext", sparkConf)
def documentsAsJson(esConf:HadoopConfig):RDD[String] = {
implicit val formats = DefaultFormats
val source = sc.newAPIHadoopRDD(
esConf,
classOf[EsInputFormat[Text, MapWritable]],
classOf[Text],
classOf[MapWritable]
)
val docs = source.map(
hit => {
val doc = Map("ident" -> hit._1.toString) ++ mwToMap(hit._2)
write(doc)
}
)
docs
}
def shutdown() = sc.stop()
// mwToMap() converts MapWritable to Map
}
以下类为应用程序创建本地SparkContext:
trait SparkBase extends Serializable {
protected def createSCLocal(name:String, config:HadoopConfig):SparkContext = {
val iterator = config.iterator()
for (prop <- iterator) {
val k = prop.getKey
val v = prop.getValue
if (k.startsWith("spark."))
System.setProperty(k, v)
}
val runtime = Runtime.getRuntime
runtime.gc()
val conf = new SparkConf()
conf.setMaster("local[2]")
conf.setAppName(name)
conf.set("spark.serializer", classOf[KryoSerializer].getName)
conf.set("spark.ui.port", "0")
new SparkContext(conf)
}
}
当我运行 TopicApp 时,我收到以下错误:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:324)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:323)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.map(RDD.scala:323)
at TopicApp.EsContext.documents(EsContext.scala:51)
at TopicApp.TopicApp$.run(TopicApp.scala:28)
at TopicApp.TopicApp$.main(TopicApp.scala:39)
at TopicApp.TopicApp.main(TopicApp.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
- object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@14f70e7d)
- field (class: TopicApp.EsContext, name: sc, type: class org.apache.spark.SparkContext)
- object (class TopicApp.EsContext, TopicApp.EsContext@2cf77cdc)
- field (class: TopicApp.EsContext$$anonfun$documents$1, name: $outer, type: class TopicApp.EsContext)
- object (class TopicApp.EsContext$$anonfun$documents$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 13 more
浏览其他涵盖类似问题的帖子,大多建议创建类Serializable 或尝试将不可序列化的对象与类分开。
根据我得到的错误,我推断SparkContext 即sc 是不可序列化的,因为 SparkContext 不是可序列化的类。
我应该如何解耦 SparkContext,让应用程序正常运行?
【问题讨论】:
标签: scala serialization apache-spark