PySpark: JSON to PySpark DataFrame

Question:

I want to transform this JSON into a PySpark DataFrame. I have added my current code below.

json = {
    "key1": 0.75,
    "values": [
        {
            "id": 2313,
            "val1": 350,
            "val2": 6000
        },
        {
            "id": 2477,
            "val1": 340,
            "val2": 6500
        }
    ]
}

My code:

It produces the output below, but not the one I expect. I hope someone can improve it.

import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

json_string = json.dumps({
    "key1": 0.75,
    "values":[
        {
            "id": 2313,
            "val1": 350,
            "val2": 6000
        },
        {
            "id": 2477,
            "val1": 340,
            "val2": 6500
        }
    ]
})
df = spark.read.json(spark.sparkContext.parallelize([json_string]))

df = df.select("key1", "values.id", "values.val1", "values.val2")
df.show()

Output:

+----+-------------+-------------+-------------+
|key1|           id|         val1|         val2|
+----+-------------+-------------+-------------+
|0.75| [2313, 2477]|   [350, 340]| [6000, 6500]|
+----+-------------+-------------+-------------+

Any help to get the expected output is appreciated.

Expected output:

+----+----+----+----+
|key1|  id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+

Asked By: Leonard


Answers:

You can try the Spark inline function, which explodes an array of structs into one row per element, with one column per struct field.

df = df.selectExpr("key1", "inline(values)")
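
For reference, an equivalent with the DataFrame API uses explode; a minimal sketch, assuming df is the frame read by spark.read.json in the question (before its final select):

from pyspark.sql import functions as F

# Explode the array of structs into one row per element,
# then promote the struct fields to top-level columns.
df = (
    df.select("key1", F.explode("values").alias("v"))
      .select("key1", "v.id", "v.val1", "v.val2")
)
df.show()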
Answered By: 过过招

If you don’t want to use explode to do this, you can use pandas as an intermediary:

import json
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

d = {
    "key1": 0.75,
    "values":[
        {
            "id": 2313,
            "val1": 350,
            "val2": 6000
        },
        {
            "id": 2477,
            "val1": 340,
            "val2": 6500
        }
    ]
}
# We need to put this data into columnar format for pandas
df_dict = {
    'key1': [d['key1'] for _ in range(len(d['values']))],
    'id': [x['id'] for x in d['values']],
    'val1': [x['val1'] for x in d['values']],
    'val2': [x['val2'] for x in d['values']],
}

pdf = pd.DataFrame.from_dict(df_dict)

df = spark.createDataFrame(pdf)
df.show()
+----+----+----+----+
|key1|  id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+
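
As a side note, the columnar dict above can also be built with pandas’ json_normalize; a minimal sketch using the same d:

# Flatten the list of record dicts into columns id, val1, val2,
# then attach the scalar key1 to every row.
pdf = pd.json_normalize(d['values'])
pdf.insert(0, 'key1', d['key1'])

df = spark.createDataFrame(pdf)
df.show()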

A pandas- and explode-free alternative:

d_list = [
    {
        'key1': d['key1'],
        'id': d['values'][k]['id'],
        'val1': d['values'][k]['val1'],
        'val2': d['values'][k]['val2']
    } for k in range(len(d['values']))
]
json_string = json.dumps(d_list)
df = spark.read.json(spark.sparkContext.parallelize([json_string]))
df.show()
+----+----+----+----+
|  id|key1|val1|val2|
+----+----+----+----+
|2313|0.75| 350|6000|
|2477|0.75| 340|6500|
+----+----+----+----+
Answered By: optical_anathema