Traversing the multi-index dictionaries that are present in a DataFrame and storing the values in a list
Question:
The above link contains the data frame. It has 2 columns [CREATEDAT, RESPONSE]. The objective is to traverse the RESPONSE column: each response contains an estimates key, and from the entries under that key I have to fetch the provider and storeExternalId values.
I am adding a sample row in a snippet; the location is ['store-boundary-dsp']['estimates'].
I have written a function, which might require some modification; I am adding it in the snippet.
def traverse_dsp(data_frame, column):
    """Collect the provider and storeExternalId values of every estimate.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields one response
            dict per row (e.g. a pandas DataFrame).
        column: Name of the column holding the response dicts.

    Returns:
        A ``(provider, store_id)`` tuple of two flat lists, in row order and
        then estimate order. An estimate that lacks one of the two keys
        contributes nothing to that list, so the lists are only parallel
        when every estimate carries both keys.

    Raises:
        KeyError: If a response has no ['store-boundary-dsp']['estimates'].
    """
    provider = []
    store_id = []
    for response in data_frame[column]:
        # Walk the estimates list itself; the length of the response dict
        # (its number of top-level keys) says nothing about how many
        # estimates there are.
        for estimate in response['store-boundary-dsp']['estimates']:
            if 'storeExternalId' in estimate:
                store_id.append(estimate['storeExternalId'])
            if 'provider' in estimate:
                provider.append(estimate['provider'])
    return provider, store_id
Answers:
This should create a list of tuples. Each tuple will be a (store_id, provider) pair. The apply function is used to iterate over the RESPONSE column and pass each cell’s value to the extract_details function, which handles the main data extraction.
def extract_details(response):
    """Return the (storeExternalId, provider) pairs found in one response.

    Args:
        response: Mapping expected to hold a list of estimate dicts under
            ['store-boundary-dsp']['estimates'].

    Returns:
        One ``(store_id, provider)`` tuple per estimate, in order. A value
        missing from an estimate is reported as ''. The result is an empty
        list when either of the two outer keys is absent.
    """
    outer_key, inner_key = 'store-boundary-dsp', 'estimates'
    # Nothing to extract unless both nesting levels exist.
    if outer_key not in response or inner_key not in response[outer_key]:
        return []
    return [
        (estimate.get('storeExternalId', ''), estimate.get('provider', ''))
        for estimate in response[outer_key][inner_key]
    ]
# Run extract_details on every cell of the 'RESPONSE' column; each cell
# yields its own list of (store_id, provider) pairs
response_values = data_frame['RESPONSE'].apply(extract_details).tolist()
response_values
# Flatten the per-row lists into one list of pairs
pair_values = [pair for row_pairs in response_values for pair in row_pairs]
pair_values
If you use nested list comprehension instead of nested for-loops:
# import pandas as pd
# import ast ## I just needed this to parse the RESPONSE column from csv
# df = pd.read_csv('https://raw.githubusercontent.com/ajayvd/dataframe/main/data_sub.csv')
# df['RESPONSE'] = df['RESPONSE'].apply(ast.literal_eval) # maybe only after read_csv
# Keys to pull out of every entry, in the order their lists are returned
k3List = ('provider', 'storeExternalId')


def get_e(resp_v):
    """Return the list stored under ['store-boundary-dsp']['estimates']."""
    return resp_v['store-boundary-dsp']['estimates']


def get_separate_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Collect one flat list of values per requested key.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields the responses.
        column: Name of the column holding the responses.
        k3List: A single key, or a list/tuple/set of keys, to look up in
            each entry.
        get_l: Function that maps one response to its list of entries.

    Returns:
        For a list/tuple/set of keys: a list with one value list per key, in
        iteration order of ``k3List``. For a single key: just that key's
        value list. Entries lacking a key are skipped for that key only.
    """
    def get_k3(k3):
        values = []
        for resp in data_frame[column]:
            for e in get_l(resp):
                if k3 in e:
                    values.append(e[k3])
        return values

    if isinstance(k3List, (list, tuple, set)):
        return [get_k3(k) for k in k3List]
    return get_k3(k3List)
# The lists come back in k3List order: providers first, then store ids
provider, store_id = get_separate_lists(df)
# Single-key alternatives (each call returns one flat list):
# provider = get_separate_lists(df, k3List='provider')
# store_id = get_separate_lists(df, 'RESPONSE', 'storeExternalId')
[k3List can be a single key or a list (or tuple or set) of keys, and get_l should be a function.]
If you want parallel lists, you can start with a list of tuples and then unpack and zip to basically "unzip" them into separate lists:
# k3List, get_e = ... ## as before
def get_tuple_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Build one tuple of values per entry, with one slot per requested key.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields the responses.
        column: Name of the column holding the responses.
        k3List: Iterable of keys; each tuple follows its order.
        get_l: Function that maps one response to its list of entries.

    Returns:
        A flat list of tuples, one per entry across all responses. A key
        missing from an entry yields ``None`` in that slot, so every tuple
        has the same length.
    """
    rows = []
    for resp in data_frame[column]:
        for entry in get_l(resp):
            rows.append(tuple(entry.get(key) for key in k3List))
    return rows
# One (provider, storeExternalId) tuple per estimate
provider_stores = get_tuple_lists(df)
# zip(*...) regroups the tuples by position; map(list, ...) turns each group into a list
provider, store_id = map(list, zip(*provider_stores))
# provider, store_id = list(zip(*provider_stores)) ## 2 tuples instead of 2 lists
With either function, print(f'{store_id=}\n{provider=}') should print
store_id=['1504', '1504', '9346', '9346', '1035', '4883', '3791', '5464', '5464', '3869', '3869', '7510', '6221', '5708', '5708', '3465']
provider=['Instacart', 'DoorDash', 'DoorDash', 'Uber', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'Skipcart', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'DoorDash']
But the direct output of get_tuple_lists would look like
provider_stores=[('Instacart', '1504'), ('DoorDash', '1504'), ('DoorDash', '9346'), ('Uber', '9346'), ('DoorDash', '1035'), ('DoorDash', '4883'), ('DoorDash', '3791'), ('Postmates', '5464'), ('DoorDash', '5464'), ('Skipcart', '3869'), ('DoorDash', '3869'), ('DoorDash', '7510'), ('DoorDash', '6221'), ('Postmates', '5708'), ('DoorDash', '5708'), ('DoorDash', '3465')]
If you’re not sure that the outer keys used (like store-boundary-dsp and estimates above) exist on every row, you can use try...except in get_l:
def get_b(resp_v):
    """Return the list under ['store-boundary-dsp']['boundaries'], or [].

    Args:
        resp_v: One response value, normally a nested dict.

    Returns:
        The stored boundaries list, or an empty list when a key is missing
        or a level cannot be subscripted.
    """
    try:
        return resp_v['store-boundary-dsp']['boundaries']
    except (LookupError, TypeError):
        # LookupError: a key is missing. TypeError: a level is not
        # subscriptable by key (None, NaN, a list, ...). Anything else,
        # e.g. KeyboardInterrupt, is deliberately left to propagate.
        return []
# A single key makes get_separate_lists return one flat list; set() drops duplicate names
boundary_names = set(get_separate_lists(df, k3List='name', get_l=get_b))
# --> # boundary_names={'9346 - Area - 1', '1504 - Primary', '1504-Primary'}
Just some notes about the snippet in your question:
for index, row in data_frame.iterrows():
for i in range(0,len(row[column])):
for k,v in row[column]['store-boundary-dsp']['estimates'][i].items():
# if k==....
- You don’t really need to use .iterrows() or range or .items() here – you can just use
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        # Append each value only when its key is present in this estimate
        if 'storeExternalId' in est:
            store_id.append(est['storeExternalId'])
        if 'provider' in est:
            provider.append(est['provider'])
- Even in the for k,v loop, defining store_val or provider_val (as v) is redundant [unless you plan to use them outside of their respective if blocks or plan to modify v in some way] when you can just .append(v)
- You could also extract any number of lists [from store-boundary-dsp.estimates] by adding them to the all_lists dictionary below [instead of coding more ifs inside for est...]
# Create the target lists first, then register each under the key it collects
store_id = []
provider = []
all_lists = {
    'storeExternalId': store_id,
    'provider': provider,
}
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        # Every registered key is checked against every estimate
        for k in all_lists:
            if k in est:
                all_lists[k].append(est[k])
- You could also use nested list comprehension instead of nested for-loops:
def get_k3(k3, k2='estimates', k1='store-boundary-dsp'):
    """Return every value stored under key k3 in the entries at resp[k1][k2].

    Reads ``data_frame`` and ``column`` from the enclosing scope. Entries
    that do not contain ``k3`` are skipped.
    """
    values = []
    for resp in data_frame[column]:
        for e in resp[k1][k2]:
            if k3 in e:
                values.append(e[k3])
    return values
# The key order here fixes the order of the names unpacked below
k3List = ['provider', 'storeExternalId'] ## line them up EXACTLY
provider, store_id = map(get_k3, k3List)
Btw, you can also use .explode and json_normalize to completely flatten the DataFrame:
# Put CREATEDAT side by side with the RESPONSE dicts flattened by json_normalize
df1 = pd.concat([df[['CREATEDAT']], pd.json_normalize(df['RESPONSE'])], axis=1)
# Keep only the part of each column name after its first '.' (names without a dot are unchanged)
df1.columns = [c.split('.',1)[-1] for c in df1.columns]
# Columns that are only exploded (presumably lists of plain values - verify against the data)
lCols = ['value', 'errors', 'tags.1504']
# Columns that are exploded and then normalized again (presumably lists of dicts - verify against the data)
dlCols = ['boundaries', 'distances', 'estimates', 'fulfillments']
# Explode one column at a time so each list element gets its own row
for c in (lCols+dlCols): df1 = df1.explode(c)
# Replace each dlCols column with its json_normalize'd columns, renamed to '<column>.<field>'
df1 = pd.concat([df1.drop(dlCols, axis=1).reset_index(drop=True), *[
pd.json_normalize(df1[c]).rename(columns=lambda cn: f'{c}.{cn}')
for c in dlCols
]], axis=1)#.dropna(axis='columns', thresh=140)
## dropna(axis='columns',thresh=N)--> only keep columns with at least N non-empty cells
The above link contains the data frame. It has 2 columns [CREATEDAT, RESPONSE]. The objective is to traverse the RESPONSE column: each response contains an estimates key, and from the entries under that key I have to fetch the provider and storeExternalId values.
I am adding a sample row in a snippet; the location is ['store-boundary-dsp']['estimates'].
I have written a function, which might require some modification; I am adding it in the snippet.
def traverse_dsp(data_frame, column):
    """Collect the provider and storeExternalId values of every estimate.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields one response
            dict per row (e.g. a pandas DataFrame).
        column: Name of the column holding the response dicts.

    Returns:
        A ``(provider, store_id)`` tuple of two flat lists, in row order and
        then estimate order. An estimate that lacks one of the two keys
        contributes nothing to that list, so the lists are only parallel
        when every estimate carries both keys.

    Raises:
        KeyError: If a response has no ['store-boundary-dsp']['estimates'].
    """
    provider = []
    store_id = []
    for response in data_frame[column]:
        # Walk the estimates list itself; the length of the response dict
        # (its number of top-level keys) says nothing about how many
        # estimates there are.
        for estimate in response['store-boundary-dsp']['estimates']:
            if 'storeExternalId' in estimate:
                store_id.append(estimate['storeExternalId'])
            if 'provider' in estimate:
                provider.append(estimate['provider'])
    return provider, store_id
This should create a list of tuples. Each tuple will be a (store_id, provider) pair. The apply function is used to iterate over the RESPONSE column and pass each cell’s value to the extract_details function, which handles the main data extraction.
def extract_details(response):
    """Return the (storeExternalId, provider) pairs found in one response.

    Args:
        response: Mapping expected to hold a list of estimate dicts under
            ['store-boundary-dsp']['estimates'].

    Returns:
        One ``(store_id, provider)`` tuple per estimate, in order. A value
        missing from an estimate is reported as ''. The result is an empty
        list when either of the two outer keys is absent.
    """
    outer_key, inner_key = 'store-boundary-dsp', 'estimates'
    # Nothing to extract unless both nesting levels exist.
    if outer_key not in response or inner_key not in response[outer_key]:
        return []
    return [
        (estimate.get('storeExternalId', ''), estimate.get('provider', ''))
        for estimate in response[outer_key][inner_key]
    ]
# Run extract_details on every cell of the 'RESPONSE' column; each cell
# yields its own list of (store_id, provider) pairs
response_values = data_frame['RESPONSE'].apply(extract_details).tolist()
response_values
# Flatten the per-row lists into one list of pairs
pair_values = [pair for row_pairs in response_values for pair in row_pairs]
pair_values
If you use nested list comprehension instead of nested for-loops:
# import pandas as pd
# import ast ## I just needed this to parse the RESPONSE column from csv
# df = pd.read_csv('https://raw.githubusercontent.com/ajayvd/dataframe/main/data_sub.csv')
# df['RESPONSE'] = df['RESPONSE'].apply(ast.literal_eval) # maybe only after read_csv
# Keys to pull out of every entry, in the order their lists are returned
k3List = ('provider', 'storeExternalId')


def get_e(resp_v):
    """Return the list stored under ['store-boundary-dsp']['estimates']."""
    return resp_v['store-boundary-dsp']['estimates']


def get_separate_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Collect one flat list of values per requested key.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields the responses.
        column: Name of the column holding the responses.
        k3List: A single key, or a list/tuple/set of keys, to look up in
            each entry.
        get_l: Function that maps one response to its list of entries.

    Returns:
        For a list/tuple/set of keys: a list with one value list per key, in
        iteration order of ``k3List``. For a single key: just that key's
        value list. Entries lacking a key are skipped for that key only.
    """
    def get_k3(k3):
        values = []
        for resp in data_frame[column]:
            for e in get_l(resp):
                if k3 in e:
                    values.append(e[k3])
        return values

    if isinstance(k3List, (list, tuple, set)):
        return [get_k3(k) for k in k3List]
    return get_k3(k3List)
# The lists come back in k3List order: providers first, then store ids
provider, store_id = get_separate_lists(df)
# Single-key alternatives (each call returns one flat list):
# provider = get_separate_lists(df, k3List='provider')
# store_id = get_separate_lists(df, 'RESPONSE', 'storeExternalId')
[k3List can be a single key or a list (or tuple or set) of keys, and get_l should be a function.]
If you want parallel lists, you can start with a list of tuples and then unpack and zip to basically "unzip" them into separate lists:
# k3List, get_e = ... ## as before
def get_tuple_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Build one tuple of values per entry, with one slot per requested key.

    Args:
        data_frame: Object whose ``data_frame[column]`` yields the responses.
        column: Name of the column holding the responses.
        k3List: Iterable of keys; each tuple follows its order.
        get_l: Function that maps one response to its list of entries.

    Returns:
        A flat list of tuples, one per entry across all responses. A key
        missing from an entry yields ``None`` in that slot, so every tuple
        has the same length.
    """
    rows = []
    for resp in data_frame[column]:
        for entry in get_l(resp):
            rows.append(tuple(entry.get(key) for key in k3List))
    return rows
# One (provider, storeExternalId) tuple per estimate
provider_stores = get_tuple_lists(df)
# zip(*...) regroups the tuples by position; map(list, ...) turns each group into a list
provider, store_id = map(list, zip(*provider_stores))
# provider, store_id = list(zip(*provider_stores)) ## 2 tuples instead of 2 lists
With either function, print(f'{store_id=}\n{provider=}') should print
store_id=['1504', '1504', '9346', '9346', '1035', '4883', '3791', '5464', '5464', '3869', '3869', '7510', '6221', '5708', '5708', '3465']
provider=['Instacart', 'DoorDash', 'DoorDash', 'Uber', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'Skipcart', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'DoorDash']
But the direct output of get_tuple_lists would look like
provider_stores=[('Instacart', '1504'), ('DoorDash', '1504'), ('DoorDash', '9346'), ('Uber', '9346'), ('DoorDash', '1035'), ('DoorDash', '4883'), ('DoorDash', '3791'), ('Postmates', '5464'), ('DoorDash', '5464'), ('Skipcart', '3869'), ('DoorDash', '3869'), ('DoorDash', '7510'), ('DoorDash', '6221'), ('Postmates', '5708'), ('DoorDash', '5708'), ('DoorDash', '3465')]
If you’re not sure that the outer keys used (like store-boundary-dsp and estimates above) exist on every row, you can use try...except in get_l:
def get_b(resp_v):
    """Return the list under ['store-boundary-dsp']['boundaries'], or [].

    Args:
        resp_v: One response value, normally a nested dict.

    Returns:
        The stored boundaries list, or an empty list when a key is missing
        or a level cannot be subscripted.
    """
    try:
        return resp_v['store-boundary-dsp']['boundaries']
    except (LookupError, TypeError):
        # LookupError: a key is missing. TypeError: a level is not
        # subscriptable by key (None, NaN, a list, ...). Anything else,
        # e.g. KeyboardInterrupt, is deliberately left to propagate.
        return []
# A single key makes get_separate_lists return one flat list; set() drops duplicate names
boundary_names = set(get_separate_lists(df, k3List='name', get_l=get_b))
# --> # boundary_names={'9346 - Area - 1', '1504 - Primary', '1504-Primary'}
Just some notes about the snippet in your question:
for index, row in data_frame.iterrows():
    for i in range(0,len(row[column])):
        for k,v in row[column]['store-boundary-dsp']['estimates'][i].items():
            # if k==....
- You don’t really need to use .iterrows() or range or .items() here – you can just use
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        if 'storeExternalId' in est: store_id.append(est['storeExternalId'])
        if 'provider' in est: provider.append(est['provider'])
- Even in the for k,v loop, defining store_val or provider_val (as v) is redundant [unless you plan to use them outside of their respective if blocks or plan to modify v in some way] when you can just .append(v)
- You could also extract any number of lists [from store-boundary-dsp.estimates] by adding them to the all_lists dictionary below [instead of coding more ifs inside for est...]
all_lists = {
    'storeExternalId': (store_id := []),
    'provider': (provider := []),
}
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        for k in all_lists:
            if k in est: all_lists[k].append(est[k])
- You could also use nested list comprehension instead of nested for-loops:
def get_k3(k3, k2='estimates', k1='store-boundary-dsp'):
    """Return every value stored under key k3 in the entries at resp[k1][k2].

    Reads ``data_frame`` and ``column`` from the enclosing scope. Entries
    that do not contain ``k3`` are skipped.
    """
    values = []
    for resp in data_frame[column]:
        for e in resp[k1][k2]:
            if k3 in e:
                values.append(e[k3])
    return values


# The key order here fixes the order of the names unpacked below
k3List = ['provider', 'storeExternalId'] ## line them up EXACTLY
provider, store_id = map(get_k3, k3List)
Btw, you can also use .explode and json_normalize to completely flatten the DataFrame:
# Put CREATEDAT side by side with the RESPONSE dicts flattened by json_normalize
df1 = pd.concat([df[['CREATEDAT']], pd.json_normalize(df['RESPONSE'])], axis=1)
# Keep only the part of each column name after its first '.' (names without a dot are unchanged)
df1.columns = [c.split('.',1)[-1] for c in df1.columns]
# Columns that are only exploded (presumably lists of plain values - verify against the data)
lCols = ['value', 'errors', 'tags.1504']
# Columns that are exploded and then normalized again (presumably lists of dicts - verify against the data)
dlCols = ['boundaries', 'distances', 'estimates', 'fulfillments']
# Explode one column at a time so each list element gets its own row
for c in (lCols+dlCols): df1 = df1.explode(c)
# Replace each dlCols column with its json_normalize'd columns, renamed to '<column>.<field>'
df1 = pd.concat([df1.drop(dlCols, axis=1).reset_index(drop=True), *[
pd.json_normalize(df1[c]).rename(columns=lambda cn: f'{c}.{cn}')
for c in dlCols
]], axis=1)#.dropna(axis='columns', thresh=140)
## dropna(axis='columns',thresh=N)--> only keep columns with at least N non-empty cells