【发布时间】:2018-10-09 19:27:59
【问题描述】:
我有一个包含两列的数据框 (DF1)
+--------+------+ |单词 |价值 | +--------+------+ |ABC |1.0 | |XYZ |2.0 | |DEF |3.0 | |GHI |4.0 | +--------+------+和另一个像这样的数据帧(DF2)
+------------------------------+ |字符串 | +------------------------------+ |ABC DEF GHI | |XYZ ABC 定义 | +------------------------------+我必须将 DF2 中的单个字符串值替换为它们在 DF1 中的对应值。例如,在操作之后,我应该取回这个数据帧。
+------------------------------+ |stringToDouble | +------------------------------+ |1.0 3.0 4.0 | |2.0 1.0 3.0 | +------------------------------+我尝试了多种方法,但似乎无法找到解决方案。
def createCorpus(conversationCorpus: Dataset[Row], dataDictionary: Dataset[Row]): Unit = {
import spark.implicits._
def getIndex(word: String): Double = {
val idxRow = dataDictionary.selectExpr("index").where('words.like(word))
val idx = idxRow.toString
if (!idx.isEmpty) idx.trim.toDouble else 1.0
}
conversationCorpus.map { //eclipse doesnt like this map here.. throws an error..
r =>
def row = {
val arr = r.getString(0).toLowerCase.split(" ")
val arrList = ArrayBuffer[Double]()
arr.map {
str =>
val index = getIndex(str)
}
Row.fromSeq(arrList.toSeq)
}
row
}
}
【问题讨论】:
标签: scala apache-spark dataframe spark-dataframe