我把这个方法作为一个新答案单独贴出来,因为我不太确定 cube() 相对于 collect() 的性能如何。不过我觉得它比我之前的回答要好,可以试试。
import pyspark.sql.functions as F
from pyspark.sql.window import Window
# Goal: keep only the rows of `tst` in which no cell value occurs more than
# once anywhere in its own column ("disjoint" rows).
# Relies on `F` (pyspark.sql.functions) imported at the top of the file and on
# a `sqlContext` provided by the surrounding session.
#Test data
tst = sqlContext.createDataFrame(
    [
        ('a1', 'b1', 'c1', 'd1'),
        ('a1', 'b2', 'c2', 'd2'),
        ('a3', 'b2', 'c3', 'd6'),
        ('a4', 'b4', 'c3', 'd7'),
        ('a5', 'b5', 'c5', 'd7'),
        ('a6', 'b6', 'c6', 'd27'),
        ('a9', 'b88', 'c54', 'd71'),
    ],
    schema=['a', 'b', 'c', 'd'],
)
#%% aggregate and cube the columns and count
# cube() produces one row per combination of grouping columns; columns that
# are rolled up in a given combination come back as null.
tst_res1 = tst.cube(*tst.columns).count()
# We need the count of individual values per column, so count how many of the
# data columns are null in each cube row. Only the data columns are inspected;
# the aggregated `count` column is never null.
# NOTE(review): a null that is present in the input data would be counted the
# same way as a rolled-up column -- confirm the input has no nulls.
tst_nc = tst_res1.withColumn(
    "null_count",
    sum(F.when(F.col(x).isNull(), 1).otherwise(0) for x in tst.columns),
)
# Keep rows where all but one column is null (i.e. single-column groups) and
# whose value occurs more than once.
tst_flt = tst_nc.filter(
    (F.col('null_count') == len(tst.columns) - 1) & (F.col('count') > 1)
)
# coalesce picks the single non-null column, i.e. the repeated value itself
tst_coala = tst_flt.withColumn("elements", F.coalesce(*tst.columns))
# collect all repeated values into one list on the driver
tst_array = (
    tst_coala.groupby(F.lit(1))
    .agg(F.collect_list('elements').alias('elements'))
    .collect()
)
#%% convert elements to string, can be skipped for numericals
# When nothing occurs more than once the aggregate yields no rows, so guard
# the [0] lookup instead of raising IndexError.
elements = [str(x) for x in tst_array[0]['elements']] if tst_array else []
#%% introduce the values that occur more than once as an array in main df
# F.array needs the literal columns unpacked (a lazy `map` object is not
# accepted on Python 3). The cast pins the element type so the column is still
# array<string> when `elements` is empty.
tst_cmp = tst.withColumn(
    "elements_array",
    F.array(*[F.lit(x) for x in elements]).cast('array<string>'),
)
# convert the row's own values into an array
tst_cmp = tst_cmp.withColumn("main_array", F.array(*tst.columns))
#%% find if any of the elements in the row occur more than once in the entire data
# flag = number of row values that are also in the repeated-values array
tst_result = tst_cmp.withColumn(
    "flag",
    F.size(F.array_intersect(F.col('main_array'), F.col('elements_array'))),
)
#%% select the disjoint values
tst_final = tst_result.where('flag=0')
结果:
+---+---+---+---+----------------+-------------------+----+
| a| b| c| d| elements_array| main_array|flag|
+---+---+---+---+----------------+-------------------+----+
| a6| b6| c6|d27|[b2, c3, a1, d7]| [a6, b6, c6, d27]| 0|
| a9|b88|c54|d71|[b2, c3, a1, d7]|[a9, b88, c54, d71]| 0|
+---+---+---+---+----------------+-------------------+----+