How to parse nested JSON and load the data into a table / dataframe
Question:
I am confused how to use the json library methods and df methods.
I have a json and I am trying to read it as follows:
with open(json_path) as f:
json_dict = json.load(f)
dfs.append(pd.DataFrame([json_dict]))
df = pd.concat(dfs, ignore_index=True, sort=False)
display(df)
requestId processingTimeInMilliseconds items
0 123 43 [{'address': {'countryCode': 'de', 'id': 0}, 'results': [{'values': {'height': 1}, 'requestedId': '1'}, {'values': {'distance': -...
df2 = pd.json_normalize(json_dict, record_path="items", meta=["requestId","processingTimeInMilliseconds"])
display(df2)
results
0 [{'values': {'height': 1}, 'requestedId': '1'}, {'values': {'distance': -1}, 'requestedId': '2'}, ...
id address.countryCode address.id requestId processingTimeInMilliseconds
1 de 0 123 43
df3 = df2.explode('results')
display(df3)
results id address.countryCode address.id requestId processingTimeInMilliseconds
0 {'values': {'height': 1}, 'requestedId': '1'} 1 de 0 123 43
0 {'values': {'distance': -1}, 'requestedId': '2'} 1 de 0 123 43
How should I continue with "results" so that I see all the keys as fields and their values?
Is there a better option/way to do everything in one step so that I unpack/explode all columns.
My aim is that at the end I see all keys from the json as fields and the values in a table.
Answers:
You could do the following. I provide a function that can be used on any nested dataframe, so you could use it directly on the original dataframe:
def flatten_nested_json_df(df):
    """Recursively flatten a DataFrame whose cells contain dicts or lists.

    Dict-valued columns are expanded into dotted sub-columns
    (``col.key``) via :func:`pandas.json_normalize`; list-valued columns
    are exploded into one row per element.  The process repeats until no
    dict or list cells remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame possibly holding nested JSON-like objects in its cells.

    Returns
    -------
    pandas.DataFrame
        Fully flattened frame.  ``reset_index`` moves the original index
        into an ``index`` column so row identity survives the explodes.
    """
    df = df.reset_index()
    # Detect columns holding ANY dict/list cell.  Using .all() here (as
    # the original did) misses columns that also contain NaN — a common
    # by-product of exploding lists of unequal length — and silently
    # leaves them unflattened.
    s = (df.applymap(type) == list).any()
    list_columns = s[s].index.tolist()
    s = (df.applymap(type) == dict).any()
    dict_columns = s[s].index.tolist()
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []
        for col in dict_columns:
            # json_normalize raises on non-dict cells (NaN included), so
            # normalize only the dict-valued rows and let concat re-align
            # the partial result (missing rows become NaN).
            mask = df[col].apply(lambda v: isinstance(v, dict))
            exploded = pd.json_normalize(df.loc[mask, col]).add_prefix(f'{col}.')
            exploded.index = df.index[mask]
            df = pd.concat([df, exploded], axis=1).drop(columns=[col])
            new_columns.extend(exploded.columns)
        for col in list_columns:
            # One row per list element; scalar cells pass through and
            # NaN stays NaN.  The other columns are repeated by the join.
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)
        # Only the freshly created columns can still hold nested values,
        # so restrict the next scan to them.
        s = (df[new_columns].applymap(type) == list).any()
        list_columns = s[s].index.tolist()
        s = (df[new_columns].applymap(type) == dict).any()
        dict_columns = s[s].index.tolist()
    return df
With your data
import numpy as np
import pandas as pd
# Sample records reproducing the questioner's exploded rows: every key is
# already flat except 'results', which still holds a nested dict.
data = [
{
'results': {'values': {'height': 1}, 'requestedId': '1'},
'id': 1,
'address.countryCode': 'de',
'address.id': 0,
'requestId': 123,
'processingTimeInMilliseconds': 43
},
{
'results': {'values': {'distance': -1}, 'requestedId': '2'},
'id': 1,
'address.countryCode': 'de',
'address.id': 0,
'requestId': 123,
'processingTimeInMilliseconds': 43
}
]
# Flatten the top level into a DataFrame; the nested 'results' dict
# survives as an object cell for flatten_nested_json_df to unpack.
df = pd.json_normalize(data)
applying the function
flatten_nested_json_df(df)
returns
index id address.countryCode address.id requestId
0 0 1 de 0 123
1 1 1 de 0 123
processingTimeInMilliseconds results.values.height results.requestedId
0 43 1.0 1
1 43 NaN 2
results.values.distance
0 NaN
1 -1.0
I am confused how to use the json library methods and df methods.
I have a json and I am trying to read it as follows:
with open(json_path) as f:
json_dict = json.load(f)
dfs.append(pd.DataFrame([json_dict]))
df = pd.concat(dfs, ignore_index=True, sort=False)
display(df)
requestId processingTimeInMilliseconds items
0 123 43 [{'address': {'countryCode': 'de', 'id': 0}, 'results': [{'values': {'height': 1}, 'requestedId': '1'}, {'values': {'distance': -...
df2 = pd.json_normalize(json_dict, record_path="items", meta=["requestId","processingTimeInMilliseconds"])
display(df2)
results
0 [{'values': {'height': 1}, 'requestedId': '1'}, {'values': {'distance': -1}, 'requestedId': '2'}, ...
id address.countryCode address.id requestId processingTimeInMilliseconds
1 de 0 123 43
df3 = df2.explode('results')
display(df3)
results id address.countryCode address.id requestId processingTimeInMilliseconds
0 {'values': {'height': 1}, 'requestedId': '1'} 1 de 0 123 43
0 {'values': {'distance': -1}, 'requestedId': '2'} 1 de 0 123 43
How should I continue with "results" so that I see all the keys as fields and their values?
Is there a better option/way to do everything in one step so that I unpack/explode all columns.
My aim is that at the end I see all keys from the json as fields and the values in a table.
You could do the following. I provide a function that can be used on any nested dataframe, so you could use it directly on the original dataframe:
def flatten_nested_json_df(df):
    """Flatten every nested cell of *df* into plain columns and rows.

    Columns whose every cell is a dict are widened into dotted
    sub-columns; columns whose every cell is a list are exploded into
    one row per element.  Repeats until nothing nested is left.
    """
    df = df.reset_index()

    def _cols_of_type(frame, cell_type):
        # Names of columns where every single cell is `cell_type`.
        flags = (frame.applymap(type) == cell_type).all()
        return flags[flags].index.tolist()

    list_cols = _cols_of_type(df, list)
    dict_cols = _cols_of_type(df, dict)

    while list_cols or dict_cols:
        fresh = []

        for name in dict_cols:
            # Widen: one dotted column per (nested) key.
            widened = pd.json_normalize(df[name]).add_prefix(f'{name}.')
            widened.index = df.index
            df = pd.concat([df, widened], axis=1).drop(columns=[name])
            fresh.extend(widened.columns)

        for name in list_cols:
            # Lengthen: one row per list element, other columns repeated.
            df = df.drop(columns=[name]).join(df[name].explode().to_frame())
            fresh.append(name)

        # Only the newly produced columns can still contain nesting.
        list_cols = _cols_of_type(df[fresh], list)
        dict_cols = _cols_of_type(df[fresh], dict)

    return df
With your data
import numpy as np
import pandas as pd
# Sample records reproducing the questioner's exploded rows: every key is
# already flat except 'results', which still holds a nested dict.
data = [
{
'results': {'values': {'height': 1}, 'requestedId': '1'},
'id': 1,
'address.countryCode': 'de',
'address.id': 0,
'requestId': 123,
'processingTimeInMilliseconds': 43
},
{
'results': {'values': {'distance': -1}, 'requestedId': '2'},
'id': 1,
'address.countryCode': 'de',
'address.id': 0,
'requestId': 123,
'processingTimeInMilliseconds': 43
}
]
# Flatten the top level into a DataFrame; the nested 'results' dict
# survives as an object cell for flatten_nested_json_df to unpack.
df = pd.json_normalize(data)
applying the function
flatten_nested_json_df(df)
returns
index id address.countryCode address.id requestId
0 0 1 de 0 123
1 1 1 de 0 123
processingTimeInMilliseconds results.values.height results.requestedId
0 43 1.0 1
1 43 NaN 2
results.values.distance
0 NaN
1 -1.0