traversing the multi index dictionaries which is present in dataframe & store the values in a list

Question

The above link contains the data frame. It has two columns, [CREATEDAT, RESPONSE]. The objective is to traverse the RESPONSE column: each response contains an estimates key, and from every entry under that key I have to fetch the provider and storeExternalId values.

I am adding a sample row in a snippet; the location of the data is ['store-boundary-dsp']['estimates'].

I have written a function (added in the snippet below); it might require some modification.

def traverse_dsp(data_frame, column):
    """Collect provider and store ids from every estimate in a column.

    Each cell of ``data_frame[column]`` is expected to be a nested dict whose
    ``['store-boundary-dsp']['estimates']`` entry is a list of dicts.

    Args:
        data_frame: Object supporting ``data_frame[column]`` that yields the
            response dicts when iterated (e.g. a pandas DataFrame).
        column: Name of the column holding the response dicts.

    Returns:
        A ``(provider, store_id)`` tuple of lists. A key that is missing from
        an estimate is skipped, so the two lists are only index-aligned when
        every estimate carries both keys.

    Raises:
        KeyError: If a response lacks 'store-boundary-dsp' or 'estimates'.
    """
    provider = []
    store_id = []

    # Iterate the column directly: the row index is never needed, and the
    # estimates list (not the response dict) is what must be walked.
    for response in data_frame[column]:
        for estimate in response['store-boundary-dsp']['estimates']:
            if 'storeExternalId' in estimate:
                store_id.append(estimate['storeExternalId'])
            if 'provider' in estimate:
                provider.append(estimate['provider'])

    return provider, store_id

Asked By: lifo

||

Source

Answer 1

This should create a list of tuples. Each tuple will be a (store_id, provider) pair. The apply function is used to iterate over the RESPONSE column and pass each cell’s value to the extract_details function, which handles the main data extraction.

def extract_details(response):
    """Return ``(store_id, provider)`` pairs for every estimate in a response.

    An empty list is returned when the response has no 'store-boundary-dsp'
    key or when that entry has no 'estimates' key. A key missing from an
    individual estimate is reported as an empty string in its pair.
    """
    outer_key = 'store-boundary-dsp'
    inner_key = 'estimates'

    # Guard clauses: bail out early when either level of nesting is absent.
    if outer_key not in response:
        return []
    if inner_key not in response[outer_key]:
        return []

    return [
        (entry.get('storeExternalId', ''), entry.get('provider', ''))
        for entry in response[outer_key][inner_key]
    ]


# For each cell in the 'RESPONSE' column, call extract_details; the result is
# one list of (store_id, provider) tuples per row.
response_values = data_frame['RESPONSE'].apply(extract_details).tolist()
response_values

# Flatten the per-row lists into a single list of tuples.
pair_values = [val for sublist in response_values for val in sublist]
pair_values

Answered By: Jamie_B

Answer 2

If you use nested list comprehension instead of nested for-loops:

# import pandas as pd
# import ast ## I just needed this to parse the RESPONSE column from csv
# df = pd.read_csv('https://raw.githubusercontent.com/ajayvd/dataframe/main/data_sub.csv')
# df['RESPONSE'] = df['RESPONSE'].apply(ast.literal_eval) # maybe only after read_csv

k3List = ('provider', 'storeExternalId')


def get_e(resp_v):
    """Return the list stored at ['store-boundary-dsp']['estimates']."""
    return resp_v['store-boundary-dsp']['estimates']


def get_separate_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Gather the values of one or more keys from every nested entry.

    ``get_l`` is called on each cell of ``data_frame[column]`` and must return
    an iterable of dicts. For each requested key, the values of that key are
    collected (entries lacking the key are skipped).

    Returns a list of lists (one per key, in iteration order of ``k3List``)
    when ``k3List`` is a list, tuple or set; otherwise ``k3List`` is treated
    as a single key and one flat list is returned.
    """
    def values_for(key):
        collected = []
        for resp in data_frame[column]:
            for entry in get_l(resp):
                if key in entry:
                    collected.append(entry[key])
        return collected

    if isinstance(k3List, (list, tuple, set)):
        return [values_for(key) for key in k3List]
    return values_for(k3List)

# With the default k3List ('provider', 'storeExternalId'), two lists come back
# in that order, so they unpack directly into provider and store_id.
provider, store_id = get_separate_lists(df)
# Alternative calls passing a single key, which return one flat list each:
# provider = get_separate_lists(df, k3List='provider')
# store_id = get_separate_lists(df, 'RESPONSE', 'storeExternalId')

[k3List can be a single key or a list (or tuple or set) of keys, and get_l should be a function.]

If you want parallel lists, you can start with a list of tuples and then unpack and zip to basically "unzip" them into separate lists:

# k3List, get_e = ... ## as before
def get_tuple_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Return one tuple per nested entry, holding the values of ``k3List``.

    ``get_l`` is applied to each cell of ``data_frame[column]`` and must return
    an iterable of dicts. Each tuple follows the order of ``k3List``; a key
    missing from an entry yields ``None`` in that position.
    """
    tuples = []
    for resp in data_frame[column]:
        for entry in get_l(resp):
            tuples.append(tuple(entry.get(key) for key in k3List))
    return tuples

provider_stores = get_tuple_lists(df)
# zip(*...) transposes the list of tuples into one sequence per key.
# NOTE(review): if provider_stores is empty, zip yields nothing and the
# two-name unpacking below raises ValueError.
provider, store_id = [list(t) for t in zip(*provider_stores)]
# provider, store_id = list(zip(*provider_stores)) ## 2 tuples instead of 2 lists

With either function, print(f'{store_id=}\n{provider=}') should print

store_id=['1504', '1504', '9346', '9346', '1035', '4883', '3791', '5464', '5464', '3869', '3869', '7510', '6221', '5708', '5708', '3465']
provider=['Instacart', 'DoorDash', 'DoorDash', 'Uber', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'Skipcart', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'DoorDash']

But the direct output of get_tuple_lists would look like

provider_stores=[('Instacart', '1504'), ('DoorDash', '1504'), ('DoorDash', '9346'), ('Uber', '9346'), ('DoorDash', '1035'), ('DoorDash', '4883'), ('DoorDash', '3791'), ('Postmates', '5464'), ('DoorDash', '5464'), ('Skipcart', '3869'), ('DoorDash', '3869'), ('DoorDash', '7510'), ('DoorDash', '6221'), ('Postmates', '5708'), ('DoorDash', '5708'), ('DoorDash', '3465')]

If you’re not sure that the outer keys used (like store-boundary-dsp and estimates above) exist on every row, you can use try...except in get_l:

def get_b(resp_v):
    """Return the list at ['store-boundary-dsp']['boundaries'], or [] if absent.

    Args:
        resp_v: A response value, normally a nested dict.

    Returns:
        The stored boundaries value, or an empty list when either key is
        missing or when ``resp_v`` (or the intermediate value) cannot be
        subscripted, e.g. ``None``.
    """
    try:
        return resp_v['store-boundary-dsp']['boundaries']
    # Only lookup failures are treated as "no boundaries"; a bare except would
    # also swallow SystemExit and KeyboardInterrupt.
    except (KeyError, TypeError):
        return []
boundary_names = set(get_separate_lists(df, k3List='name', get_l=get_b))
# --> # boundary_names={'9346 - Area - 1', '1504 - Primary', '1504-Primary'}

Just some notes about the snippet in your question:

    for index, row in data_frame.iterrows():
        for i in range(0,len(row[column])):
            for k,v in row[column]['store-boundary-dsp']['estimates'][i].items():
               # if k==....

You don’t really need to use .iterrows() or range or .items() here – you can just use

    for rc in data_frame[column]:
        for est in rc['store-boundary-dsp']['estimates']:
            if 'storeExternalId' in est: store_id.append(est['storeExternalId'])
            if 'provider' in est: provider.append(est['provider'])

Even in the for k,v loop, defining store_val or provider_val (as v) is redundant [unless you plan to use them outside of their respective if blocks or plan to modify v in some way] when you can just .append(v)

You could also extract any number of lists [from store-boundary-dsp.estimates] by adding them to the all_lists dictionary below [instead of coding more ifs inside for est...]

    all_lists = {
        'storeExternalId': (store_id := []),
        'provider': (provider := []),
    }

    for rc in data_frame[column]:
          for est in rc['store-boundary-dsp']['estimates']:
              for k in all_lists:
                  if k in est: all_lists[k].append(est[k])

You could also use nested list comprehension instead of nested for-loops:

    def get_k3(k3, k2='estimates', k1='store-boundary-dsp'):
        return [e[k3] for resp in data_frame[column] for e in resp[k1][k2] if k3 in e]

    k3List = ['provider', 'storeExternalId'] ## line them up EXACTLY
    provider, store_id = [get_k3(k) for k in k3List]

Btw, you can also use .explode and json_normalize to completely flatten the DataFrame:

# Put CREATEDAT side by side with the RESPONSE dicts flattened into columns.
df1 = pd.concat([df[['CREATEDAT']], pd.json_normalize(df['RESPONSE'])], axis=1)
# Drop everything up to and including the first '.' in each column name
# (names without a '.' are left unchanged).
df1.columns = [c.split('.',1)[-1] for c in df1.columns]
# NOTE(review): lCols presumably hold plain lists and dlCols lists of dicts
# in the sample data — confirm against the actual RESPONSE schema.
lCols = ['value', 'errors', 'tags.1504']
dlCols = ['boundaries', 'distances', 'estimates', 'fulfillments']
# Explode each listed column in turn so every list element gets its own row.
for c in (lCols+dlCols): df1 = df1.explode(c)

# Replace each dlCols column with its json_normalize'd columns, renamed to
# '<source column>.<field>', and concatenate them next to the remaining columns.
df1 = pd.concat([df1.drop(dlCols, axis=1).reset_index(drop=True), *[
    pd.json_normalize(df1[c]).rename(columns=lambda cn: f'{c}.{cn}')
    for c in dlCols 
]],  axis=1)#.dropna(axis='columns', thresh=140) 
## dropna(axis='columns', thresh=N) --> only keep columns with at least N non-empty (non-NA) cells

Answered By: Driftr95

traversing the multi index dictionaries which is present in dataframe & store the values in a list

Question:

Answers: