【发布时间】:2018-09-05 02:34:32
【问题描述】:
我有一个具有如下架构的数据框:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
我有一个新的数据框,其架构如下:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: integer (containsNull = true)
我想加入这些数据框并具有如下结构:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: long (containsNull = true)
但反过来,我在加入后得到数据框,如下所示: 根
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Country: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: long (containsNull = true)
应该怎么做? 我已经完成了 docId 字段的外部连接,上面的数据框是我得到的。
【问题讨论】:
-
你尝试过分解数据帧然后加入吗?
-
爆炸将等同于展平结构然后加入这肯定会起作用,但我正在寻找一些更好的方法,如果它存在的话。
标签: apache-spark pyspark apache-spark-sql