Pyspark Convert Nested Struct field to Json String


I’m trying to ingest some MongoDB collections into BigQuery using PySpark. The schema looks like this:

 |-- groups: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- my_field: struct (nullable = true)
 |    |    |    |-- **{ mongo id }**: struct (nullable = true)
 |    |    |    |    |-- A: timestamp (nullable = true)
 |    |    |    |    |-- B: string (nullable = true)
 |    |    |    |    |-- C: struct (nullable = true)
 |    |    |    |    |    |-- abc: boolean (nullable = true)
 |    |    |    |    |    |-- def: boolean (nullable = true)
 |    |    |    |    |    |-- ghi: boolean (nullable = true)
 |    |    |    |    |    |-- xyz: boolean (nullable = true)

The issue is that inside my_field we store the id as a field name: each group has its own id, so when I import everything to BigQuery I end up with a new column for each id. I want to convert my_field to a string and store all the nested fields as JSON or something similar. But when I try to convert it, I get this error:

temp_df = temp_df.withColumn("groups.my_field", col("groups.my_field").cast('string'))

TypeError: Column is not iterable

What am I missing?

Asked By: Alex Fragotsis



So it turns out that in order to append/remove/rename a nested field you need to change the schema. I didn’t know that. So here’s my answer. I copied and modified the code from here in order to make it work with my schema

here’s the modified version of "exclude_nested_field"

def change_nested_field_type(schema, fields_to_change, parent=""):
    """Return a copy of ``schema`` with the listed fields typed as strings.

    The schema is walked recursively: struct fields are descended into, and
    array fields are descended into through their element type. Field paths
    are built with dot notation (``parent.child``); array elements do not add
    a path segment of their own.

    Rebuilt fields are created from the field name and data type only, so
    they carry ``StructField``'s default nullability and metadata.

    Args:
        schema: The ``StructType`` to rewrite. Any other data type (for
            example the element type of an array of strings) is returned
            unchanged.
        fields_to_change: Collection of dot-separated field paths (including
            the ``parent`` prefix, if any) whose type becomes ``StringType``.
        parent: Dot-separated path of the enclosing field; empty at the root.

    Returns:
        A new ``StructType``, or ``schema`` itself when it is not a struct.
    """
    # Only structs have named fields to iterate over; leaf types reached via
    # an array's elementType (strings, numbers, ...) are passed through as-is.
    if not isinstance(schema, StructType):
        return schema

    new_schema = []

    for field in schema:
        full_field_name = field.name

        if parent:
            full_field_name = parent + "." + full_field_name

        if full_field_name not in fields_to_change:
            if isinstance(field.dataType, StructType):
                inner_schema = change_nested_field_type(
                    field.dataType, fields_to_change, full_field_name
                )
                new_schema.append(StructField(field.name, inner_schema))
            elif isinstance(field.dataType, ArrayType):
                inner_schema = change_nested_field_type(
                    field.dataType.elementType, fields_to_change, full_field_name
                )
                new_schema.append(StructField(field.name, ArrayType(inner_schema)))
            else:
                # Leaf field that is not targeted: keep its type untouched.
                new_schema.append(StructField(field.name, field.dataType))
        else:
            # Here we change the field type to String
            new_schema.append(StructField(field.name, StringType()))

    return StructType(new_schema)

and here’s how I call the function

# Rebuild the element schema of "groups" with my_field typed as a string.
groups_element_type = df.schema["groups"].dataType.elementType
new_schema = ArrayType(change_nested_field_type(groups_element_type, ["my_field"]))

# Serialize "groups" to JSON, then parse it back using the new schema.
df = df.withColumn("json", to_json("groups"))
df = df.drop("groups")
df = df.withColumn("groups", from_json("json", new_schema))
df = df.drop("json")
Answered By: Alex Fragotsis

I needed a generic solution that can cast nested columns at arbitrary levels of nesting. By extending the accepted answer, I came up with the following functions:

from typing import Dict

from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql.functions import col, from_json, to_json
from pyspark.sql.types import StructType, ArrayType, StringType, StructField, _all_atomic_types

def apply_nested_column_casts(
    schema: StructType, column_cast: Dict[str, str], parent: str
) -> StructType:
    """Return a copy of ``schema`` with the targeted nested fields retyped.

    The schema is walked recursively: struct fields are descended into, and
    array fields are descended into through their element type. Field paths
    are built with dot notation (``parent.child``); array elements do not add
    a path segment of their own.

    Rebuilt fields are created from the field name and data type only, so
    they carry ``StructField``'s default nullability and metadata.

    Args:
        schema: The ``StructType`` to rewrite. Any other data type (for
            example the element type of an array of strings) is returned
            unchanged.
        column_cast: Maps a full dot-separated field path (including the
            ``parent`` prefix) to a type name. The type name must be a key
            of ``_all_atomic_types``.
        parent: Dot-separated path of the enclosing field; may be empty.

    Returns:
        A new ``StructType``, or ``schema`` itself when it is not a struct.

    Raises:
        KeyError: If a requested type name is not in ``_all_atomic_types``.
    """
    # Only structs have named fields to iterate over; leaf types reached via
    # an array's elementType (strings, numbers, ...) are passed through as-is.
    if not isinstance(schema, StructType):
        return schema

    new_schema = []

    for field in schema:
        full_field_name = field.name

        if parent:
            full_field_name = parent + "." + full_field_name

        if full_field_name not in column_cast:
            if isinstance(field.dataType, StructType):
                inner_schema = apply_nested_column_casts(
                    field.dataType, column_cast, full_field_name
                )
                new_schema.append(StructField(field.name, inner_schema))
            elif isinstance(field.dataType, ArrayType):
                inner_schema = apply_nested_column_casts(
                    field.dataType.elementType, column_cast, full_field_name
                )
                new_schema.append(StructField(field.name, ArrayType(inner_schema)))
            else:
                # Leaf field that is not targeted: keep its type untouched.
                new_schema.append(StructField(field.name, field.dataType))
        else:
            # Here we change the field type to the intended type
            cast_type = _all_atomic_types[column_cast[full_field_name]]
            new_schema.append(StructField(field.name, cast_type()))

    return StructType(new_schema)

def apply_column_casts(
    df: SparkDataFrame, column_casts: Dict[str, str]
) -> SparkDataFrame:
    """Cast top-level and nested columns of ``df`` to the requested types.

    Top-level columns are cast directly with ``Column.cast``. Nested columns
    (names containing a dot) are handled by serializing their top-level
    parent column to JSON and parsing it back with a schema in which the
    targeted field has the new type.

    Args:
        df: The DataFrame to transform.
        column_casts: Maps a column name, using dot notation for nested
            fields, to the target type name. For nested fields the type name
            is resolved by ``apply_nested_column_casts`` through
            ``_all_atomic_types``.

    Returns:
        The DataFrame with every requested cast applied.

    Raises:
        TypeError: If the parent of a nested column is neither a struct nor
            an array, so no nested schema can be built for it.
    """
    for col_name, cast_to in column_casts.items():
        splitted_col_name = col_name.split(".")

        if len(splitted_col_name) == 1:
            df = df.withColumn(col_name, col(col_name).cast(cast_to))
        else:
            nested_field_parent_field = splitted_col_name[0]
            nested_field_parent_type = df.schema[nested_field_parent_field].dataType
            column_cast = {col_name: cast_to}
            if isinstance(nested_field_parent_type, StructType):
                new_schema = apply_nested_column_casts(
                    nested_field_parent_type, column_cast, nested_field_parent_field
                )
            elif isinstance(nested_field_parent_type, ArrayType):
                new_schema = ArrayType(
                    apply_nested_column_casts(
                        nested_field_parent_type.elementType,
                        column_cast,
                        nested_field_parent_field,
                    )
                )
            else:
                # Without this guard new_schema would be unbound, or worse,
                # silently reused from a previous loop iteration.
                raise TypeError(
                    f"Cannot cast nested column {col_name!r}: parent column "
                    f"{nested_field_parent_field!r} is neither a struct nor an array"
                )

            tmp_json = f"{nested_field_parent_field}_json"

            # Serialize the parent column to JSON, then parse it back using
            # the rewritten schema.
            df = df.withColumn(tmp_json, to_json(nested_field_parent_field)).drop(
                nested_field_parent_field
            )
            df = df.withColumn(
                nested_field_parent_field, from_json(tmp_json, new_schema)
            ).drop(tmp_json)
    return df

And you can call the functions as shown below using dot notation for nested column casts

# Keys are column names; dot notation addresses fields nested inside a
# top-level column. Values are the target type names.
column_casts = {
    "col_a": "string",
    "col_b.nested_col": "double",
    "col_b.nested_struct_col.some_col": "long",
}

df = apply_column_casts(df, column_casts)
Answered By: gamberooni
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.