First, melt the DataFrame (How to melt Spark DataFrame?):
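melt is not a built-in DataFrame method before Spark 3.4 (which added DataFrame.unpivot and its melt alias); the linked question covers implementations in detail. A minimal sketch, assuming the melt(df, id_vars, value_vars) signature used below:

from pyspark.sql.functions import array, col, explode, lit, struct

def melt(df, id_vars, value_vars, var_name="variable", value_name="value"):
    # Build one struct per value column: (column name, column value)
    vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars
    ))
    # Explode so each input row yields one output row per value column
    exploded = df.withColumn("_vars_and_vals", explode(vars_and_vals))
    return exploded.select(
        *id_vars,
        col("_vars_and_vals")[var_name].alias(var_name),
        col("_vars_and_vals")[value_name].alias(value_name),
    )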
df = spark.createDataFrame(
    [(1, "a001", 0, 32, 14, 108), (2, "a02", 80, 0, 0, 92)],
    ("id", "name", "01-Jan-10", "01-Feb-10", "01-Jan-11", "01-Feb-11")
)
df_long = melt(df, df.columns[:2], df.columns[2:])
# +---+----+---------+-----+
# | id|name| variable|value|
# +---+----+---------+-----+
# |  1|a001|01-Jan-10|    0|
# |  1|a001|01-Feb-10|   32|
# |  1|a001|01-Jan-11|   14|
# |  1|a001|01-Feb-11|  108|
# |  2| a02|01-Jan-10|   80|
# |  2| a02|01-Feb-10|    0|
# |  2| a02|01-Jan-11|    0|
# |  2| a02|01-Feb-11|   92|
# +---+----+---------+-----+
Next, parse the dates and extract the year and month:
from pyspark.sql.functions import to_date, date_format, year
date = to_date("variable", "dd-MMM-yy")
parsed = df_long.select(
    "id", "name", "value",
    year(date).alias("year"), date_format(date, "MMM").alias("month")
)
# +---+----+-----+----+-----+
# | id|name|value|year|month|
# +---+----+-----+----+-----+
# |  1|a001|    0|2010|  Jan|
# |  1|a001|   32|2010|  Feb|
# |  1|a001|   14|2011|  Jan|
# |  1|a001|  108|2011|  Feb|
# |  2| a02|   80|2010|  Jan|
# |  2| a02|    0|2010|  Feb|
# |  2| a02|    0|2011|  Jan|
# |  2| a02|   92|2011|  Feb|
# +---+----+-----+----+-----+
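To double-check the date pattern before aggregating, a throwaway select (not part of the pipeline, just a sanity check) shows the raw labels next to their parsed dates:

# Verify that "dd-MMM-yy" parses the column labels as expected
df_long.select("variable", to_date("variable", "dd-MMM-yy").alias("parsed_date")).show(2)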
Finally, pivot (How to pivot Spark DataFrame?):
# Providing a list of levels is not required but will make the process faster
# months = [
# "Jan", "Feb", "Mar", "Apr", "May", "Jun",
# "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
# ]
months = ["Jan", "Feb"]
parsed.groupBy("id", "name", "year").pivot("month", months).sum("value")
# +---+----+----+---+---+
# | id|name|year|Jan|Feb|
# +---+----+----+---+---+
# |  2| a02|2011|  0| 92|
# |  1|a001|2010|  0| 32|
# |  1|a001|2011| 14|108|
# |  2| a02|2010| 80|  0|
# +---+----+----+---+---+
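If hard-coding the levels feels brittle, they can also be collected from the data first; this costs an extra Spark job, which is exactly what passing the list up front avoids. A sketch (the order of the collected list determines the column order):

# Collect the distinct month labels instead of hard-coding them (extra job)
months = [row["month"] for row in parsed.select("month").distinct().collect()]
parsed.groupBy("id", "name", "year").pivot("month", months).sum("value")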