【问题标题】:How to convert JSON Schema from Camel case to lower case(如何将 JSON Schema 的字段名从驼峰式命名转换为小写)
【发布时间】:2022-01-25 17:50:02
【问题描述】:

我有一个 JSON 模式,其键(字段名)为驼峰式命名,我正在尝试将所有层级(包括嵌套在各种数据类型中)的字段名转换为小写。我在处理 ArrayType 时遇到了问题。

 import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
 import org.apache.spark.sql.types.{DataType, StructType}
 import spark.implicits._

 // Session used for every read below; Hive support is enabled on the builder.
 val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
 // Schema inferred by Spark from the JSON input. It is never reassigned, so it is a val.
 val sample_schema = spark.read.json("path").schema

 /**
  * Returns a copy of `schema` in which every field name is lower-cased,
  * including the fields of nested structs and of structs stored inside arrays
  * (at any nesting depth).
  *
  * Nullability, `containsNull` and field metadata are carried over unchanged.
  *
  * @param schema the schema whose field names should be lower-cased
  * @return a new [[StructType]] with the same shape and lower-cased field names
  */
 def columnsToLowercase(schema: StructType): StructType = {
    // Rebuilds a data type, lower-casing the field names of any struct found in it.
    def lowerType(dtype: DataType): DataType = dtype match {
       case struct: StructType =>
          StructType(recurRename(struct))
       // Descend into the element type so array<struct<...>> (and nested arrays) are
       // renamed too; the array's own containsNull flag is preserved.
       case ArrayType(elementType, containsNull) =>
          ArrayType(lowerType(elementType), containsNull)
       // Every other type is returned unchanged.
       case other =>
          other
    }

    // Lower-cases the name of each field and recurses into its data type.
    def recurRename(schema: StructType): Seq[StructField] =
       schema.fields.map { field =>
          field.copy(name = field.name.toLowerCase, dataType = lowerType(field.dataType))
       }

    StructType(recurRename(schema))
 }

 // Read the JSON input again, this time with the lower-cased schema supplied explicitly.
 val jsonDFrame: DataFrame = {
    val lowerCasedSchema = columnsToLowercase(sample_schema)
    spark.read.schema(lowerCasedSchema).json("path")
 }

示例架构:

root
 |-- id: string (nullable = true)
 |-- master: struct (nullable = true)
 |    |-- code: string (nullable = true)
 |    |-- provInfo: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- claimInfo: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- claimId: string (nullable = true)
 |    |    |    |-- demoInfo: struct (nullable = true)
 |    |    |    |    |-- family: struct (nullable = true)
 |    |    |    |    |    |-- outOrder: struct (nullable = true)
 |    |    |    |    |    |    |-- LocOut: boolean (nullable = true)
 |    |    |    |    |    |    |-- found: boolean (nullable = true)
 |    |-- claimAddr: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- address: string (nullable = true)
 |-- system: string (nullable = true)

【问题讨论】:

  • 您在处理数组(ArrayType)时遇到了什么问题?
  • 感谢您的回复 - 我在解析 ArrayType 时遇到了问题:上面的代码可以转换所有 StructType 字段,但无法转换 ArrayType 中的字段;当我在 case 语句中加入 ArrayType 时,它会抛出错误:期望的是 StructType,而不是 ArrayType ...

标签: dataframe scala apache-spark apache-spark-sql


【解决方案1】:

您应该能够通过添加另一个 case 子句将嵌套在 ArrayType 中的字段小写。对于数组列,还需要检查其子元素类型:

def columnsToLowercase(schema: StructType): StructType = {
     // ....
          // Array column: only struct elements carry field names that need renaming.
          case StructField(name, dtype: ArrayType, nullable, meta) => dtype.elementType match {
            // Rebuild the array around the renamed struct, keeping the array's own containsNull flag.
            case s: StructType => StructField(name.toLowerCase, ArrayType(StructType(recurRename(s)), dtype.containsNull), nullable, meta)
            // Non-struct elements: keep the ArrayType itself (not its element type) and rename only the column.
            case _ => StructField(name.toLowerCase, dtype, nullable, meta)
          }
    //.... 
}

应用于您的架构:

// Print the schema of df as it is, before any renaming.
df.printSchema()
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// |    |-- provInfo: struct (nullable = true)
// |    |    |-- claimInfo: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- claimId: string (nullable = true)
// |    |    |-- demoInfo: struct (nullable = true)
// |    |    |    |-- family: struct (nullable = true)
// |    |    |    |    |-- outOrder: struct (nullable = true)
// |    |    |    |    |    |-- LocOut: boolean (nullable = false)
// |    |    |    |    |    |-- found: boolean (nullable = false)
// |    |-- claimAddr: array (nullable = true)
// |    |    |-- element: struct (containsNull = true)
// |    |    |    |-- address: string (nullable = true)
// |-- system: string (nullable = true)


// Lower-case the field names of df's schema, then print the resulting tree.
val loweredSchema = columnsToLowercase(df.schema)
loweredSchema.printTreeString()
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// |    |-- provinfo: struct (nullable = true)
// |    |    |-- claiminfo: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- claimid: string (nullable = true)
// |    |    |-- demoinfo: struct (nullable = true)
// |    |    |    |-- family: struct (nullable = true)
// |    |    |    |    |-- outorder: struct (nullable = true)
// |    |    |    |    |    |-- locout: boolean (nullable = false)
// |    |    |    |    |    |-- found: boolean (nullable = false)
// |    |-- claimaddr: array (nullable = true)
// |    |    |-- element: struct (containsNull = true)
// |    |    |    |-- address: string (nullable = true)
// |-- system: string (nullable = true)

【讨论】:

    猜你喜欢
    • 2011-07-21
    • 2019-02-26
    • 1970-01-01
    • 1970-01-01
    • 2015-11-28
    • 2022-01-24
    • 2020-04-28
    • 1970-01-01
    • 2022-12-05
    相关资源
    最近更新 更多