How can I convert a DataFrame to a nested dictionary containing specific arrays and lists?
Question:
How can I use a dataframe to create a nested dictionary, with interleaved lists and columns, as in the example below?
Create dictionary:
# Column names for the flat source table; they map in groups of three to the
# "issuer", "recipient" and "additional_fields" sections of the target dict.
columns = ["name","reason","cgc","limit","email","address","message","type","value"]
# Three sample rows; every value is a string, so each derived map column is map<string,string>.
data = [("Paulo", "La Fava","123456","0","[email protected]","avenue A","msg txt 1","string","low"), ("Pedro", "Petrus","123457","20.00","[email protected]","avenue A","msg txt 2","string", "average"), ("Saulo", "Salix","123458","150.00","[email protected]","avenue B","msg txt 3","string","high")]
# Build the DataFrame from the tuples, then attach the column names.
# NOTE(review): assumes an active SparkSession bound to `spark` — confirm in caller context.
df = spark.createDataFrame(data).toDF(*columns)
df.show()
Expected outcome:
{
"accepted": [
{
"issuer": {
"name": "Paulo",
"reason": "La Fava",
"cgc": "123456"
},
"Recipient": {
"limit": "0",
"email": "[email protected]",
"address": "avenue A"
},
"additional_fields": [
{
"message": "msg txt 1",
"type": "string",
"value": "low"
}
]
}
]
}
Answers:
Arrays in Spark are homogeneous, i.e., all elements must have the same data type. In your sample expected output, the array type of "additional_fields" does not match the other two map fields, "issuer" & "recipient".
You have two ways to resolve this:
If you can relax "additional_fields" to be just a map (not an array), like "issuer" & "recipient", then you can use the following transformation:
# Build each nested section as a map<string,string> column, then wrap the three
# maps in a single-element "accepted" array and drop every other column.
# The whole chain is wrapped in parentheses so the leading-dot method calls are
# valid Python line continuations (as written originally, the bare `.withColumn`
# lines were a syntax error).
df = (
    df.withColumn(
        "issuer",
        F.create_map(
            F.lit("name"), F.col("name"),
            F.lit("reason"), F.col("reason"),
            F.lit("cgc"), F.col("cgc"),
        ),
    )
    .withColumn(
        "recipient",
        F.create_map(
            F.lit("limit"), F.col("limit"),
            F.lit("email"), F.col("email"),
            F.lit("address"), F.col("address"),
        ),
    )
    .withColumn(
        "additional_fields",
        F.create_map(
            F.lit("message"), F.col("message"),
            F.lit("type"), F.col("type"),
            F.lit("value"), F.col("value"),
        ),
    )
    .withColumn(
        "accepted",
        F.array(
            F.create_map(
                F.lit("issuer"), F.col("issuer"),
                F.lit("recipient"), F.col("recipient"),
                F.lit("additional_fields"), F.col("additional_fields"),
            )
        ),
    )
    # `df.columns` here still refers to the ORIGINAL binding (the name is not
    # rebound until the whole expression finishes), so this drops the nine
    # source columns plus the three intermediate map columns.
    .drop(*[c for c in df.columns if c != "accepted"] + ["issuer", "recipient", "additional_fields"])
)
Or, if you want to make the "issuer" & "recipient" field types similar to "additional_fields", then use:
# Variant where every section — including "issuer" and "recipient" — becomes an
# array of single-entry maps, matching the shape of "additional_fields", so all
# array elements share one type. Parentheses make the leading-dot chain
# syntactically valid (the original bare `.withColumn` lines would not parse).
df = (
    df.withColumn("issuer", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["name", "reason", "cgc"]]))
    .withColumn("recipient", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["limit", "email", "address"]]))
    .withColumn("additional_fields", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["message", "type", "value"]]))
    .withColumn("accepted", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["issuer", "recipient", "additional_fields"]]))
    # `df.columns` is still the original column list at this point, so this
    # removes the nine source columns plus the three intermediate arrays.
    .drop(*[c for c in df.columns if c != "accepted"] + ["issuer", "recipient", "additional_fields"])
)
I used a for loop to merge the row fields into another dict, loaded each dict into a list, and finally nested the list inside the main dictionary.
# Collect the rows, merge each row's "accepted" and "additional_fields2" maps
# into one dict, gather those dicts into a list, and nest the list under the
# top-level "main" key.
# NOTE(review): the original snippet listed these statements in a scrambled
# order (`lst`, `dict1`, `dicta`, `dictb` were used before being defined);
# reordered here into the runnable form the surrounding text describes.
coll = df.collect()
lst = []
for row in coll:
    print(row['accepted'])
    dicta = row['accepted']
    print(row['additional_fields2'])
    dictb = row['additional_fields2']
    # Merge the two maps into a fresh dict per row; entries from dictb
    # override duplicate keys from dicta.
    dict1 = dict(dicta, **dictb)
    lst.append(dict1)
main = {}
main["main"] = lst
How can I use a dataframe to create a nested dictionary, with interleaved lists and columns, as in the example below?
Create dictionary:
# Column names for the flat source table; they map in groups of three to the
# "issuer", "recipient" and "additional_fields" sections of the target dict.
columns = ["name","reason","cgc","limit","email","address","message","type","value"]
# Three sample rows; every value is a string, so each derived map column is map<string,string>.
data = [("Paulo", "La Fava","123456","0","[email protected]","avenue A","msg txt 1","string","low"), ("Pedro", "Petrus","123457","20.00","[email protected]","avenue A","msg txt 2","string", "average"), ("Saulo", "Salix","123458","150.00","[email protected]","avenue B","msg txt 3","string","high")]
# Build the DataFrame from the tuples, then attach the column names.
# NOTE(review): assumes an active SparkSession bound to `spark` — confirm in caller context.
df = spark.createDataFrame(data).toDF(*columns)
df.show()
Expected outcome:
{
"accepted": [
{
"issuer": {
"name": "Paulo",
"reason": "La Fava",
"cgc": "123456"
},
"Recipient": {
"limit": "0",
"email": "[email protected]",
"address": "avenue A"
},
"additional_fields": [
{
"message": "msg txt 1",
"type": "string",
"value": "low"
}
]
}
]
}
Arrays in Spark are homogeneous, i.e., all elements must have the same data type. In your sample expected output, the array type of "additional_fields" does not match the other two map fields, "issuer" & "recipient".
You have two ways to resolve this:
If you can relax "additional_fields" to be just a map (not an array), like "issuer" & "recipient", then you can use the following transformation:
# Build each nested section as a map<string,string> column, then wrap the three
# maps in a single-element "accepted" array and drop every other column.
# The whole chain is wrapped in parentheses so the leading-dot method calls are
# valid Python line continuations (as written originally, the bare `.withColumn`
# lines were a syntax error).
df = (
    df.withColumn(
        "issuer",
        F.create_map(
            F.lit("name"), F.col("name"),
            F.lit("reason"), F.col("reason"),
            F.lit("cgc"), F.col("cgc"),
        ),
    )
    .withColumn(
        "recipient",
        F.create_map(
            F.lit("limit"), F.col("limit"),
            F.lit("email"), F.col("email"),
            F.lit("address"), F.col("address"),
        ),
    )
    .withColumn(
        "additional_fields",
        F.create_map(
            F.lit("message"), F.col("message"),
            F.lit("type"), F.col("type"),
            F.lit("value"), F.col("value"),
        ),
    )
    .withColumn(
        "accepted",
        F.array(
            F.create_map(
                F.lit("issuer"), F.col("issuer"),
                F.lit("recipient"), F.col("recipient"),
                F.lit("additional_fields"), F.col("additional_fields"),
            )
        ),
    )
    # `df.columns` here still refers to the ORIGINAL binding (the name is not
    # rebound until the whole expression finishes), so this drops the nine
    # source columns plus the three intermediate map columns.
    .drop(*[c for c in df.columns if c != "accepted"] + ["issuer", "recipient", "additional_fields"])
)
Or, if you want to make the "issuer" & "recipient" field types similar to "additional_fields", then use:
# Variant where every section — including "issuer" and "recipient" — becomes an
# array of single-entry maps, matching the shape of "additional_fields", so all
# array elements share one type. Parentheses make the leading-dot chain
# syntactically valid (the original bare `.withColumn` lines would not parse).
df = (
    df.withColumn("issuer", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["name", "reason", "cgc"]]))
    .withColumn("recipient", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["limit", "email", "address"]]))
    .withColumn("additional_fields", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["message", "type", "value"]]))
    .withColumn("accepted", F.array([F.create_map(F.lit(c), F.col(c)) for c in ["issuer", "recipient", "additional_fields"]]))
    # `df.columns` is still the original column list at this point, so this
    # removes the nine source columns plus the three intermediate arrays.
    .drop(*[c for c in df.columns if c != "accepted"] + ["issuer", "recipient", "additional_fields"])
)
I used a for loop to merge the row fields into another dict, loaded each dict into a list, and finally nested the list inside the main dictionary.
# Collect the rows, merge each row's "accepted" and "additional_fields2" maps
# into one dict, gather those dicts into a list, and nest the list under the
# top-level "main" key.
# NOTE(review): the original snippet listed these statements in a scrambled
# order (`lst`, `dict1`, `dicta`, `dictb` were used before being defined);
# reordered here into the runnable form the surrounding text describes.
coll = df.collect()
lst = []
for row in coll:
    print(row['accepted'])
    dicta = row['accepted']
    print(row['additional_fields2'])
    dictb = row['additional_fields2']
    # Merge the two maps into a fresh dict per row; entries from dictb
    # override duplicate keys from dicta.
    dict1 = dict(dicta, **dictb)
    lst.append(dict1)
main = {}
main["main"] = lst