One approach is to use the window function row_number to rank the rows of each partition by date in descending order, and then keep only the first row of each partition:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import spark.implicits._  // provides toDF and the $"..." syntax; already in scope in spark-shell

val df = Seq(
  (1, 80025709, "2010-07-19 00:00:00", "JUAN"),
  (1, 35468731, "2010-07-28 00:00:00", "PEDRO"),
  (1, 51714038, "2010-08-02 00:00:00", "ALEX"),
  (1, 35468731, "2011-09-28 00:00:00", "KAREN"),
  (1, 35468731, "2012-08-25 00:00:00", "MARIA")
).toDF("c1", "c2", "date", "name")

// Number the rows within each (c1, c2) partition, newest date first,
// then keep only the top-ranked row of each partition
df.withColumn(
  "rownum",
  row_number.over(Window.partitionBy($"c1", $"c2").orderBy($"date".desc))
).
  where($"rownum" === 1).
  select($"c1", $"c2", $"date", $"name").
  show
// +---+--------+-------------------+-----+
// | c1| c2| date| name|
// +---+--------+-------------------+-----+
// | 1|51714038|2010-08-02 00:00:00| ALEX|
// | 1|80025709|2010-07-19 00:00:00| JUAN|
// | 1|35468731|2012-08-25 00:00:00|MARIA|
// +---+--------+-------------------+-----+
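Note that row_number assigns distinct ranks even when two rows share the same date, so exactly one row per (c1, c2) group survives; if you want to keep all rows tied on the latest date, use rank instead. For reference, here is a sketch of the same logic in Spark SQL, assuming a SparkSession named spark and an arbitrary temp view name "people":

// Sketch: equivalent Spark SQL; the view name "people" is just for illustration
df.createOrReplaceTempView("people")
spark.sql("""
  SELECT c1, c2, date, name
  FROM (
    SELECT *,
           row_number() OVER (PARTITION BY c1, c2 ORDER BY date DESC) AS rownum
    FROM people
  ) ranked
  WHERE rownum = 1
""").show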