【发布时间】:2017-07-26 09:47:24
【问题描述】:
我正在使用 pyspark 2.1 。以下是我的数据框内容
expecteddays,date
139,30.JUl.2017
134,01.NOV.2018
我的输出应该如下
138,30.JUL.2017,<30/SEP/2018,4/FEB/2019>
我下面的模块dateRangeBetween 和get_date 处理了最后一列的填充
下面是我的代码
from datetime import datetime
from datetime import timedelta
import pandas as pd
from datetime import timedelta
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import concat,explode
from datetime import datetime
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from datetime import timedelta
import pandas as pd
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType
from pyspark.sql import types maintenance_final_join=spark.read.csv('/user/NaveenSri/adh_dev_engg/test.csv',header=True)
def get_date(dateFormat="%d-%m-%Y", addDays=0 ,timeNow=0 ):
#print('inside get date',timesNow)
if (addDays!=0):
anotherTime = timeNow + timedelta(days=addDays)
else:
anotherTime = timeNow
return anotherTime.strftime(dateFormat)
def dateRangebetween(expectedDate , estimatedDays):
output_format = '%d-%m-%Y'
dateRangeList =[]
j=2
#print('inside Date range',expectedDate)
rangeEnddate= datetime.strptime(get_date(output_format, 730,expectedDate), '%d-%m-%Y').date()
#print('rangeEnddate---',rangeEnddate)
calculatedDate = datetime.strptime(get_date(output_format,estimatedDays ,expectedDate), '%d-%m-%Y').date()
#print('calculatedDate----',calculatedDate)
while(calculatedDate<=rangeEnddate):
# print(calculatedDate)
#print (estimatedDays)
dateRangeList.append(calculatedDate)
calculatedDate = datetime.strptime(get_date(output_format,estimatedDays ,calculatedDate), '%d-%m-%Y').date()
#print('-----', datetime.strptime(get_date(output_format,estimatedDays ,calculatedDate), '%d-%m-%Y').date())
return dateRangeList
dateRange = udf(dateRangebetween, types.ArrayType(types.StringType()))
addDays=182
result = maintenance_final_join.withColumn('Part_Dates',dateRange(maintenance_final_join.Expected,maintenance_final_join.estimateddays)).show()
执行后出现此错误:
TypeError: coercing to Unicode: need string or buffer, datetime.timedelta found
【问题讨论】:
标签: python hadoop apache-spark cloudera