Parse deeply nested JSON file
Question:
I’m struggling to get the information I need with json_normalize. I’ve looked at the documentation and probably 10 examples of deeply nested JSON files, but I can’t quite grasp the context of the function well enough to extract the right info. I’m trying to build a data frame that would contain the timestamped values (values key) for each sensor. 1534023900 is the timestamp in UTC Seconds.
A short sample of the JSON is below.
Any thoughts?
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
}
}
}
}
}
}
Answers:
Revised JSON
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
}
jmespath can help with nested data; the docs are quite robust, but the basics for accessing data are: if it is a key, you access it with a dot (.);
if it is an array/list, you access it with brackets ([]).
Summary of your data's position: device -> timeseries (array of dicts) -> [] (each entry) -> configuration (dict) -> values (key) -> [] (array of reading rows) -> [0] (take the first value of each row)
Actual code:
import jmespath

# Pre-compile the JMESPath query, then evaluate it against the parsed JSON
# document: device -> each timeseries entry -> configuration.values ->
# element 0 of each reading row (the epoch timestamp).
timestamp_query = jmespath.compile('device.timeseries[].configuration.values[][0]')
timestamp_query.search(data)
[1534023900]
I have been working on a similar problem where json_normalize didn’t help me much. However, I have written the following piece of code that is working fine for most of the nested JSON usecases. I am still in a developing state and testing various json files.
Anyone, please feel free to comment or provide any suggestions to improve this part.
My main goal is to preserve the structure of JSON and provide it in a data frame.
import json
import pandas as pd
import numpy as np
def flatten_outer (data):
    """Flatten a list of nested JSON records (dicts) into a list of flat
    dicts whose keys are dot-joined paths (e.g. 'device.device_info.device_fw').

    Scalars inside a list of dicts are spread across positional slots so
    sibling values line up by element index; lists of plain scalars are
    stored whole.
    """
    full_list = []  # one flattened dict per input record

    def flatten_inner(sub_data,first_level_key='',index=0,tot_len=0):
        # index/tot_len describe sub_data's position when it is an element
        # of a list of dicts: its slot and the total number of elements.
        # NOTE: mutates the enclosing scope's value_list via closure.
        for k,v in sub_data.items():
            # Extend the dotted key path with the current key.
            full_key = first_level_key+'.'+k if first_level_key !='' else k
            if isinstance(v, dict):
                # Nested dict: recurse with the longer path.
                flatten_inner(v, full_key)
            elif isinstance(v, list):
                for i in range(0, len(v)):
                    if (isinstance(v[i], dict)):
                        # List of dicts: recurse per element, recording its slot.
                        flatten_inner(v[i], full_key,index=i, tot_len=len(v))
                    else:
                        # List of plain scalars: append the whole list once
                        # and stop scanning its elements.
                        val_ls = value_list[full_key] if full_key in value_list.keys() else []
                        val_ls.append(v)
                        value_list[full_key] = val_ls
                        break
            else:
                # Scalar leaf value.
                if full_key in value_list.keys():
                    # Key seen for an earlier list element: fill this slot.
                    placeholder_list = value_list[full_key]
                    placeholder_list[index] = v
                    value_list[full_key] = placeholder_list
                else:
                    if index == 0:
                        if tot_len == 0:
                            # Plain scalar not inside any list of dicts.
                            value_list[full_key] = v
                        else:
                            # First element of a list of dicts: pre-size the
                            # slot list so later elements can fill by index.
                            placeholder_list = [None]*tot_len
                            placeholder_list[0] = v
                            value_list[full_key] = placeholder_list
                    else:
                        # Key first appears at a later element: pad with None
                        # before and after this slot.
                        dif = tot_len - index - 1
                        placeholder_list = [None] * index
                        placeholder_list.append(v)
                        placeholder_list = placeholder_list + [None] * dif
                        value_list[full_key] = placeholder_list
        return value_list

    for row in data:
        value_list = dict() #creating a value_list to store key value pairs(column values) for each record
        cv = flatten_inner(row)
        full_list.append(cv)
    return full_list
def df_create_clean(full_list):
    """Build a DataFrame from the flattened records and tidy its cells.

    NaN cells become None, empty lists become None, and single-element
    lists are unwrapped to their sole value.
    """
    frame = pd.DataFrame(full_list)
    frame = frame.where(pd.notnull(frame), None)
    for column in list(frame.columns):
        # Drop empty-list cells, then unwrap one-element lists.
        without_empties = frame[column].apply(
            lambda cell: None if (isinstance(cell, list) and len(cell) == 0) else cell)
        frame[column] = without_empties.apply(
            lambda cell: cell[0] if (isinstance(cell, list) and len(cell) == 1) else cell)
    return frame
def flatten_json(data):
    """Flatten a list of nested JSON records into a cleaned DataFrame.

    Parameters
    ----------
    data : list of dict
        Parsed JSON records to flatten.

    Returns
    -------
    pandas.DataFrame
        One row per record, dot-joined path columns, tidied cells.
    """
    # df_create_clean already builds a DataFrame from the flattened records,
    # so pass them straight through instead of constructing a DataFrame here
    # and then rebuilding it inside df_create_clean.
    cleaned_df = df_create_clean(flatten_outer(data))
    return cleaned_df
Save this above code into a file flatten_json.py. Run the following code.
# Example usage: flatten one deeply nested sensor record and inspect the
# resulting single-row DataFrame.
import flatten_json as fj

# Sample payload: one device record with nested device_info and a
# timeseries whose configuration holds sensor metadata and raw readings.
data = [{
    "created": "2020-05-12T15:10:37Z",
    "device": {
        "device_info": {
            "device_fw": 204,
            "device_sn": "06-02133",
            "device_trait": 2,
            "device_type": 190
        },
        "timeseries": [
            {
                "configuration": {
                    "sensors": [
                        {
                            "measurements": [
                                "BATTERY",
                                "BATTERY_MV"
                            ],
                            "port": 7,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 133,
                            "sensor_sn": "Unavailable"
                        },
                        {
                            "measurements": [
                                "REFERENCE_KPA",
                                "TEMPC_LOGGER"
                            ],
                            "port": 8,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 134,
                            "sensor_sn": "Unavailable"
                        }
                    ],
                    "valid_since": "2018-08-11T21:45:00Z",
                    "values": [
                        [
                            1534023900,
                            0,
                            19,
                            [
                                {
                                    "description": "Battery Percent",
                                    "error": False,
                                    "units": "%",
                                    "value": 100
                                },
                                {
                                    "description": "Battery Voltage",
                                    "error": False,
                                    "units": " mV",
                                    "value": 7864
                                }
                            ],
                            [
                                {
                                    "description": "Reference Pressure",
                                    "error": False,
                                    "units": " kPa",
                                    "value": 100.62
                                },
                                {
                                    "description": "Logger Temperature",
                                    "error": False,
                                    "units": " u00b0C",
                                    "value": 28.34
                                }
                            ]
                        ]
                    ]
                }
            }]
    }
} ]

df = fj.flatten_json(data)
# Print the flattened columns for the first (only) record.
print(df.loc[0])
Output
created 2020-05-12T15:10:37Z
device.device_info.device_fw 204
device.device_info.device_sn 06-02133
device.device_info.device_trait 2
device.device_info.device_type 190
device.timeseries.configuration.sensors.measurements [[BATTERY, BATTERY_MV], [REFERENCE_KPA, TEMPC_LOG...
device.timeseries.configuration.sensors.port [7, 8]
device.timeseries.configuration.sensors.sensor_bonus_value [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_firmware_ver [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_number [133, 134]
device.timeseries.configuration.sensors.sensor_sn [Unavailable, Unavailable]
device.timeseries.configuration.valid_since 2018-08-11T21:45:00Z
device.timeseries.configuration.values [1534023900, 0, 19, [{'description': 'Battery ...
Now from this df you can take device.timeseries.configuration.values column to do further analysis on each sensor data.
I’m struggling to get the information I need with json_normalize. I’ve looked at the documentation and probably 10 examples of deeply nested JSON files, but I can’t quite grasp the context of the function well enough to extract the right info. I’m trying to build a data frame that would contain the timestamped values (values key) for each sensor. 1534023900 is the timestamp in UTC Seconds.
A short sample of the JSON is below.
Any thoughts?
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
}
}
}
}
}
}
Revised JSON
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
}
jmespath can help with nested data; the docs are quite robust, but the basics for accessing data are: if it is a key, you access it with a dot (.);
if it is an array/list, you access it with brackets ([]).
Summary of your data's position: device -> timeseries (array of dicts) -> [] (each entry) -> configuration (dict) -> values (key) -> [] (array of reading rows) -> [0] (take the first value of each row)
Actual code:
import jmespath

# Pre-compile the JMESPath query, then evaluate it against the parsed JSON
# document: device -> each timeseries entry -> configuration.values ->
# element 0 of each reading row (the epoch timestamp).
timestamp_query = jmespath.compile('device.timeseries[].configuration.values[][0]')
timestamp_query.search(data)
[1534023900]
I have been working on a similar problem where json_normalize didn’t help me much. However, I have written the following piece of code that is working fine for most of the nested JSON usecases. I am still in a developing state and testing various json files.
Anyone, please feel free to comment or provide any suggestions to improve this part.
My main goal is to preserve the structure of JSON and provide it in a data frame.
import json
import pandas as pd
import numpy as np
def flatten_outer (data):
    """Flatten a list of nested JSON records (dicts) into a list of flat
    dicts whose keys are dot-joined paths (e.g. 'device.device_info.device_fw').

    Scalars inside a list of dicts are spread across positional slots so
    sibling values line up by element index; lists of plain scalars are
    stored whole.
    """
    full_list = []  # one flattened dict per input record

    def flatten_inner(sub_data,first_level_key='',index=0,tot_len=0):
        # index/tot_len describe sub_data's position when it is an element
        # of a list of dicts: its slot and the total number of elements.
        # NOTE: mutates the enclosing scope's value_list via closure.
        for k,v in sub_data.items():
            # Extend the dotted key path with the current key.
            full_key = first_level_key+'.'+k if first_level_key !='' else k
            if isinstance(v, dict):
                # Nested dict: recurse with the longer path.
                flatten_inner(v, full_key)
            elif isinstance(v, list):
                for i in range(0, len(v)):
                    if (isinstance(v[i], dict)):
                        # List of dicts: recurse per element, recording its slot.
                        flatten_inner(v[i], full_key,index=i, tot_len=len(v))
                    else:
                        # List of plain scalars: append the whole list once
                        # and stop scanning its elements.
                        val_ls = value_list[full_key] if full_key in value_list.keys() else []
                        val_ls.append(v)
                        value_list[full_key] = val_ls
                        break
            else:
                # Scalar leaf value.
                if full_key in value_list.keys():
                    # Key seen for an earlier list element: fill this slot.
                    placeholder_list = value_list[full_key]
                    placeholder_list[index] = v
                    value_list[full_key] = placeholder_list
                else:
                    if index == 0:
                        if tot_len == 0:
                            # Plain scalar not inside any list of dicts.
                            value_list[full_key] = v
                        else:
                            # First element of a list of dicts: pre-size the
                            # slot list so later elements can fill by index.
                            placeholder_list = [None]*tot_len
                            placeholder_list[0] = v
                            value_list[full_key] = placeholder_list
                    else:
                        # Key first appears at a later element: pad with None
                        # before and after this slot.
                        dif = tot_len - index - 1
                        placeholder_list = [None] * index
                        placeholder_list.append(v)
                        placeholder_list = placeholder_list + [None] * dif
                        value_list[full_key] = placeholder_list
        return value_list

    for row in data:
        value_list = dict() #creating a value_list to store key value pairs(column values) for each record
        cv = flatten_inner(row)
        full_list.append(cv)
    return full_list
def df_create_clean(full_list):
    """Build a DataFrame from the flattened records and tidy its cells.

    NaN cells become None, empty lists become None, and single-element
    lists are unwrapped to their sole value.
    """
    frame = pd.DataFrame(full_list)
    frame = frame.where(pd.notnull(frame), None)
    for column in list(frame.columns):
        # Drop empty-list cells, then unwrap one-element lists.
        without_empties = frame[column].apply(
            lambda cell: None if (isinstance(cell, list) and len(cell) == 0) else cell)
        frame[column] = without_empties.apply(
            lambda cell: cell[0] if (isinstance(cell, list) and len(cell) == 1) else cell)
    return frame
def flatten_json(data):
    """Flatten a list of nested JSON records into a cleaned DataFrame.

    Parameters
    ----------
    data : list of dict
        Parsed JSON records to flatten.

    Returns
    -------
    pandas.DataFrame
        One row per record, dot-joined path columns, tidied cells.
    """
    # df_create_clean already builds a DataFrame from the flattened records,
    # so pass them straight through instead of constructing a DataFrame here
    # and then rebuilding it inside df_create_clean.
    cleaned_df = df_create_clean(flatten_outer(data))
    return cleaned_df
Save this above code into a file flatten_json.py. Run the following code.
# Example usage: flatten one deeply nested sensor record and inspect the
# resulting single-row DataFrame.
import flatten_json as fj

# Sample payload: one device record with nested device_info and a
# timeseries whose configuration holds sensor metadata and raw readings.
data = [{
    "created": "2020-05-12T15:10:37Z",
    "device": {
        "device_info": {
            "device_fw": 204,
            "device_sn": "06-02133",
            "device_trait": 2,
            "device_type": 190
        },
        "timeseries": [
            {
                "configuration": {
                    "sensors": [
                        {
                            "measurements": [
                                "BATTERY",
                                "BATTERY_MV"
                            ],
                            "port": 7,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 133,
                            "sensor_sn": "Unavailable"
                        },
                        {
                            "measurements": [
                                "REFERENCE_KPA",
                                "TEMPC_LOGGER"
                            ],
                            "port": 8,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 134,
                            "sensor_sn": "Unavailable"
                        }
                    ],
                    "valid_since": "2018-08-11T21:45:00Z",
                    "values": [
                        [
                            1534023900,
                            0,
                            19,
                            [
                                {
                                    "description": "Battery Percent",
                                    "error": False,
                                    "units": "%",
                                    "value": 100
                                },
                                {
                                    "description": "Battery Voltage",
                                    "error": False,
                                    "units": " mV",
                                    "value": 7864
                                }
                            ],
                            [
                                {
                                    "description": "Reference Pressure",
                                    "error": False,
                                    "units": " kPa",
                                    "value": 100.62
                                },
                                {
                                    "description": "Logger Temperature",
                                    "error": False,
                                    "units": " u00b0C",
                                    "value": 28.34
                                }
                            ]
                        ]
                    ]
                }
            }]
    }
} ]

df = fj.flatten_json(data)
# Print the flattened columns for the first (only) record.
print(df.loc[0])
Output
created 2020-05-12T15:10:37Z
device.device_info.device_fw 204
device.device_info.device_sn 06-02133
device.device_info.device_trait 2
device.device_info.device_type 190
device.timeseries.configuration.sensors.measurements [[BATTERY, BATTERY_MV], [REFERENCE_KPA, TEMPC_LOG...
device.timeseries.configuration.sensors.port [7, 8]
device.timeseries.configuration.sensors.sensor_bonus_value [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_firmware_ver [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_number [133, 134]
device.timeseries.configuration.sensors.sensor_sn [Unavailable, Unavailable]
device.timeseries.configuration.valid_since 2018-08-11T21:45:00Z
device.timeseries.configuration.values [1534023900, 0, 19, [{'description': 'Battery ...
Now from this df you can take device.timeseries.configuration.values column to do further analysis on each sensor data.