两种方法解决:
方法一:数据框 (DataFrame)
import pyspark.sql.functions as f

# Read each line of every matching file as a single string column named `value`.
df = spark.read.text('/your_data_path/*.txt')

# Drop the leading and trailing '|' delimiter characters, then split the
# remainder on '|' to get one array of cell values per line.
inner = f.expr('substring(`value`, 2, length(`value`) - 2)')
cells = f.split(inner, '\|')

output_df = (
    df.withColumn('values', cells)
    # Walk the array in steps of 3 and pack each triple into a struct
    # (SEQUENCE is 1-based, array indexing is 0-based, hence i - 1).
    .selectExpr(
        'TRANSFORM(SEQUENCE(1, SIZE(values), 3), i -> '
        'STRUCT(values[i - 1] AS c0, values[i] AS c1, values[i + 1] AS c2)) AS values'
    )
    # Explode the array of structs into one row per struct.
    .selectExpr('inline(values)')
)
output_df.show()
方法二:RDD
# Load every matching text file as an RDD with one element per line.
rdd = sc.textFile('/your_data_path/*.txt')
def split(text):
    """Yield the fields of one pipe-delimited line in groups of three.

    The line is expected to be wrapped in '|' delimiters; they are
    stripped first so splitting does not produce empty edge fields.
    """
    fields = text.strip('|').split('|')
    # Emit consecutive 3-field slices; a final short group is yielded as-is.
    for start in range(0, len(fields), 3):
        yield fields[start:start + 3]
# Flatten every line into 3-field records, then name the columns
# with a DDL-style schema string.
records = rdd.flatMap(split)
df = records.toDF('c0 string, c1 string, c2 string')
df.show()
输出
+---+---------+---+
| c0| c1| c2|
+---+---------+---+
| 1| NewYork| 96|
| 2| Chennai| 84|
| 3|Amsterdam| 96|
+---+---------+---+