【Posted】: 2020-10-04 17:03:17
【Problem Description】:
I am trying to process records from a readStream and print each row, but I don't see any of the printed statements in my driver or executor logs. What could be wrong?
- For each record, or ideally each batch, I want to print a message.
- For each batch, I want to run a process.
import org.apache.spark.sql.{ForeachWriter, Row}
import org.apache.spark.sql.streaming.Trigger

val kafka = spark.readStream
  .format("kafka")
  .option("maxOffsetsPerTrigger", MAX_OFFSETS_PER_TRIGGER)
  .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
  .option("subscribe", topic) // comma-separated list of topics
  .option("startingOffsets", "earliest")
  .option("checkpointLocation", CHECKPOINT_LOCATION)
  .option("failOnDataLoss", "false")
  .option("minPartitions", sys.env.getOrElse("MIN_PARTITIONS", "64").toInt)
  .load()

import spark.implicits._

println("JSON output to write into sink")

val consoleOutput = kafka
  .selectExpr("CAST(key AS STRING) as key", "CAST(value AS STRING) as value")
  //.select(from_json($"json", schema) as "data")
  //.select("data.*")
  //.select(get_json_object(($"value").cast("string"), "$").alias("body"))
  .writeStream
  .foreach(new ForeachWriter[Row] {
    override def open(partitionId: Long, epochId: Long): Boolean = true
    override def process(row: Row): Unit = {
      logger.info(s"Record received in data frame is -> " + row.mkString)
      runProcess() // want to run some process every microbatch
    }
    override def close(errorOrNull: Throwable): Unit = {}
  })
  .outputMode("append")
  .format("console")
  .trigger(Trigger.ProcessingTime("30 seconds"))
  .start()

consoleOutput.awaitTermination()
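For reference, two details of the Spark API matter here: .foreach(...) and .format(...) both select the streaming sink, and the later call wins, so the .format("console") above replaces the ForeachWriter; and ForeachWriter.process runs on the executors, so its output only appears in executor logs. Below is a minimal sketch, not a tested fix, of the per-microbatch variant using foreachBatch. It reuses the kafka DataFrame, CHECKPOINT_LOCATION, and the runProcess() hook from the question; everything else is standard Structured Streaming API.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.Trigger

val query = kafka
  .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")
  .writeStream
  .foreachBatch { (batchDf: DataFrame, batchId: Long) =>
    // This closure runs on the driver once per microbatch, so println
    // output shows up in the driver log.
    val rows = batchDf.collect() // fine for small batches only
    println(s"Batch $batchId contains ${rows.length} records")
    rows.foreach(row => println(row.mkString(", ")))
    runProcess() // per-microbatch hook from the question (assumed defined)
  }
  .option("checkpointLocation", CHECKPOINT_LOCATION) // checkpoint belongs on the writer, not the reader
  .trigger(Trigger.ProcessingTime("30 seconds"))
  .start()

query.awaitTermination()

foreachBatch hands each microbatch to a single driver-side function, which matches the "run a process per batch" requirement more directly than the per-record ForeachWriter.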
【Discussion】:
Tags: apache-spark databricks spark-structured-streaming