from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Sample tasks: (TaskID, start date, end date) with dates as dd/MM/yyyy strings.
data = [
    ("t01", "05/03/2021", "16/03/2021"),
    ("t02", "07/03/2021", "13/04/2021"),
    ("t03", "23/03/2021", "04/04/2021"),
    ("t04", "07/03/2021", "13/05/2021"),
    ("t05", "23/03/2021", "04/05/2021"),
]
schema = ["TaskID", "TaskstartDate", "TaskEndDate"]

# NOTE(review): `spark` (a SparkSession) is assumed to be defined elsewhere
# (e.g. notebook/driver scope) — confirm before running standalone.
df = spark.createDataFrame(data, schema)

# Parse both string columns into DateType in place.
for date_column in ("TaskstartDate", "TaskEndDate"):
    df = df.withColumn(date_column, F.to_date(F.col(date_column), "dd/MM/yyyy"))

# Tag each row with the month-end of its start and end dates. A row whose two
# month-end markers differ crosses a month boundary; additionally require the
# start to fall strictly before the end. Count the surviving rows ("backlog")
# per start-month's last day.
df_grouped = (
    df
    .withColumn("EndDay_fromStartDate", F.last_day(F.col("TaskstartDate")))
    .withColumn("EndDay_fromEndDate", F.last_day(F.col("TaskEndDate")))
    .filter(
        (F.col("EndDay_fromStartDate") != F.col("EndDay_fromEndDate"))
        & (F.col("TaskstartDate") < F.col("TaskEndDate"))
    )
    .groupBy("EndDay_fromStartDate")
    .agg(F.count(F.lit(1)).alias("backlog"))
)

df_grouped.show()