Pyspark: JSON to Pyspark dataframe
Question:
I want to transform this JSON into a PySpark DataFrame. I have added my current code below.
json = {
"key1": 0.75,
"values":[
{
"id": 2313,
"val1": 350,
"val2": 6000
},
{
"id": 2477,
"val1": 340,
"val2": 6500
}
]
}
my code:
I cannot get the expected output using my code. I hope someone can improve this.
import json
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session.
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()
# Serialize the sample payload so spark.read.json can parse it.
json_string = json.dumps({
"key1": 0.75,
"values":[
{
"id": 2313,
"val1": 350,
"val2": 6000
},
{
"id": 2477,
"val1": 340,
"val2": 6500
}
]
})
# Parse the single JSON document: this yields ONE row, with "values"
# as an array-of-structs column.
df = spark.read.json(spark.sparkContext.parallelize([json_string]))
# NOTE: selecting a field through an array column ("values.id") returns an
# array of that field, not one row per element -- hence the list-valued
# output shown below. The array must be exploded/inlined to get rows.
df = df.select("key1", "values.id", "values.val1", "values.val2")
df.show()
output
+----+-------------+-------------+-------------+
|key1| id| val1| val2|
+----+-------------+-------------+-------------+
|0.75| [2313, 2477]| [350, 340]| [6000, 6500]|
+----+-------------+-------------+-------------+
Help appreciated in getting the expected output.
Expecting output:
+----+----+----+----+
|key1| id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+
Answers:
You can try the spark inline function.
df = df.selectExpr("key1", "inline(values)")
If you don't want to use explode to do this, you can use pandas as an intermediary.
import json
import pandas as pd
from pyspark.sql import SparkSession

# Spin up (or reuse) a local Spark session.
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

# Sample payload: one scalar plus a list of records to be flattened.
d = {
    "key1": 0.75,
    "values": [
        {"id": 2313, "val1": 350, "val2": 6000},
        {"id": 2477, "val1": 340, "val2": 6500},
    ],
}

# Flatten to columnar form for pandas: repeat the scalar once per record
# and pull each field out of the record list.
records = d["values"]
df_dict = {
    "key1": [d["key1"]] * len(records),
    "id": [r["id"] for r in records],
    "val1": [r["val1"] for r in records],
    "val2": [r["val2"] for r in records],
}

# pandas -> Spark keeps the columnar layout: one row per record.
pdf = pd.DataFrame.from_dict(df_dict)
df = spark.createDataFrame(pdf)
df.show()
+----+----+----+----+
|key1| id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+
A pandas-free and explode-free alternative:
# Flatten in plain Python first: one top-level JSON object per record,
# each carrying a copy of the scalar "key1".
d_list = [
    {
        "key1": d["key1"],
        "id": rec["id"],
        "val1": rec["val1"],
        "val2": rec["val2"],
    }
    for rec in d["values"]
]
json_string = json.dumps(d_list)
# Each element of the JSON array becomes its own row when Spark parses it.
df = spark.read.json(spark.sparkContext.parallelize([json_string]))
df.show()
+----+----+----+----+
| id|key1|val1|val2|
+----+----+----+----+
|2313|0.75| 350|6000|
|2477|0.75| 340|6500|
+----+----+----+----+
I want to transform this JSON into a PySpark DataFrame. I have added my current code below.
json = {
"key1": 0.75,
"values":[
{
"id": 2313,
"val1": 350,
"val2": 6000
},
{
"id": 2477,
"val1": 340,
"val2": 6500
}
]
}
my code:
I cannot get the expected output using my code. I hope someone can improve this.
import json
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session.
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()
# Serialize the sample payload so spark.read.json can parse it.
json_string = json.dumps({
"key1": 0.75,
"values":[
{
"id": 2313,
"val1": 350,
"val2": 6000
},
{
"id": 2477,
"val1": 340,
"val2": 6500
}
]
})
# Parse the single JSON document: this yields ONE row, with "values"
# as an array-of-structs column.
df = spark.read.json(spark.sparkContext.parallelize([json_string]))
# NOTE: selecting a field through an array column ("values.id") returns an
# array of that field, not one row per element -- hence the list-valued
# output shown below. The array must be exploded/inlined to get rows.
df = df.select("key1", "values.id", "values.val1", "values.val2")
df.show()
output
+----+-------------+-------------+-------------+
|key1| id| val1| val2|
+----+-------------+-------------+-------------+
|0.75| [2313, 2477]| [350, 340]| [6000, 6500]|
+----+-------------+-------------+-------------+
Help appreciated in getting the expected output.
Expecting output:
+----+----+----+----+
|key1| id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+
You can try the spark inline function.
df = df.selectExpr("key1", "inline(values)")
If you don't want to use explode to do this, you can use pandas as an intermediary.
import json
import pandas as pd
from pyspark.sql import SparkSession

# Spin up (or reuse) a local Spark session.
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

# Sample payload: one scalar plus a list of records to be flattened.
d = {
    "key1": 0.75,
    "values": [
        {"id": 2313, "val1": 350, "val2": 6000},
        {"id": 2477, "val1": 340, "val2": 6500},
    ],
}

# Flatten to columnar form for pandas: repeat the scalar once per record
# and pull each field out of the record list.
records = d["values"]
df_dict = {
    "key1": [d["key1"]] * len(records),
    "id": [r["id"] for r in records],
    "val1": [r["val1"] for r in records],
    "val2": [r["val2"] for r in records],
}

# pandas -> Spark keeps the columnar layout: one row per record.
pdf = pd.DataFrame.from_dict(df_dict)
df = spark.createDataFrame(pdf)
df.show()
+----+----+----+----+
|key1| id|val1|val2|
+----+----+----+----+
|0.75|2313| 350|6000|
|0.75|2477| 340|6500|
+----+----+----+----+
A pandas-free and explode-free alternative:
# Flatten in plain Python first: one top-level JSON object per record,
# each carrying a copy of the scalar "key1".
d_list = [
    {
        "key1": d["key1"],
        "id": rec["id"],
        "val1": rec["val1"],
        "val2": rec["val2"],
    }
    for rec in d["values"]
]
json_string = json.dumps(d_list)
# Each element of the JSON array becomes its own row when Spark parses it.
df = spark.read.json(spark.sparkContext.parallelize([json_string]))
df.show()
+----+----+----+----+
| id|key1|val1|val2|
+----+----+----+----+
|2313|0.75| 350|6000|
|2477|0.75| 340|6500|
+----+----+----+----+