multi-index df aggregation to nested dict

Question:

I’m trying to convert a multi-index DataFrame into a nested dictionary, aggregating (summing) the values at every index level.

DataFrame is in the format

                    jan  feb
first second third          
bar   bar1   bar3     0    5
             bar4     4    4
      bar2   bar5     1    9
foo   foo1   foo3     7    2
             foo4     7    7
      foo2   foo5     0    3

I tried aggregating at each index level separately and converting the result to a dict using .to_dict('records'), but I wasn’t able to build a nested dictionary that holds the aggregation for each index level.
The expected result is:

[
    {'first': 'bar',
     'jan': 5,
     'feb': 18,
     'sub_rows': [{'second': 'bar1',
                   'jan': 4,
                   'feb': 9,
                   'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5}, {'third': 'bar4', 'jan': 4, 'feb': 4}]
                   },
                  {'second': 'bar2',
                   'jan': 1,
                   'feb': 9,
                   'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}]
                   }]
     },
    {'first': 'foo',
     'jan': 14,
     'feb': 12,
     'sub_rows': [{'second': 'foo1',
                   'jan': 14,
                   'feb': 9,
                   'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2}, {'third': 'foo4', 'jan': 7, 'feb': 7}]
                   },
                  {'second': 'foo2',
                   'jan': 0,
                   'feb': 3,
                   'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}]
                   }]
     }
]

Could you please guide me on how to achieve this format? Thanks.

Asked By: Dhaaz

||

Answers:

import pandas as pd


def multiindex_df_to_list(df):
    """Convert an indexed DataFrame into a nested list of per-level sums.

    The frame is grouped on its outermost index level. Each group becomes
    one dict holding the level name mapped to the group key, followed by
    the column-wise sum of the group's rows. When more index levels
    remain, the dict also gets a ``'sub_rows'`` entry built by recursing
    on the group with the outermost level dropped.

    Args:
        df: DataFrame with a named ``Index`` or ``MultiIndex``.

    Returns:
        A list with one dict per distinct value of the outermost level.
        An empty frame yields an empty list.
    """
    nlevels = df.index.nlevels
    # ``names[0]`` is valid for both a plain Index and a MultiIndex, so no
    # special-casing of the single-level case is needed.
    level_name = df.index.names[0]

    out_list = []
    for level_value, sub_df in df.groupby(level=0):
        entry = {level_name: level_value}
        # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
        # supported spelling and yields the same (column, sum) pairs.
        entry.update(sub_df.sum().items())
        if nlevels > 1:
            entry['sub_rows'] = multiindex_df_to_list(sub_df.droplevel(level=0))
        out_list.append(entry)
    return out_list


def main():
    """Build the question's three-level sample frame and return its nested form."""
    index_arrays = [
        ['bar', 'bar', 'bar', 'foo', 'foo', 'foo'],
        ['bar1', 'bar1', 'bar2', 'foo1', 'foo1', 'foo2'],
        ['bar3', 'bar4', 'bar5', 'foo3', 'foo4', 'foo5'],
    ]
    sample_index = pd.MultiIndex.from_arrays(
        index_arrays, names=('first', 'second', 'third')
    )

    # One [jan, feb] pair per index row, in the same order as the arrays above.
    rows = [[0, 5], [4, 4], [1, 9], [7, 2], [7, 7], [0, 3]]
    frame = pd.DataFrame(rows, index=sample_index, columns=['jan', 'feb'])

    return multiindex_df_to_list(frame)


# Run the demo only when this file is executed as a script. The list that
# main() returns is not printed or stored here.
if __name__ == "__main__":
    main()
Answered By: Daniel Tomer

Another recursive answer using a list as parameter:

def to_nested_dict(df, data=None):
    """Convert an indexed DataFrame into a nested list of per-level sums.

    The frame is grouped on its outermost index level. Each group becomes
    one dict of the form ``{level_name: key, **column_sums}``. When more
    index levels remain, the dict also gets a ``'sub_rows'`` list built by
    recursing on the group with the outermost level dropped.

    Args:
        df: DataFrame with a named ``Index`` or ``MultiIndex``.
        data: Optional list to append the resulting dicts to. A fresh list
            is created when omitted, so separate calls never share state.

    Returns:
        The list the dicts were appended to (``data`` itself when given).
    """
    # A literal ``[]`` default would be created once and reused by every
    # call, making later calls return earlier calls' rows as well.
    if data is None:
        data = []

    lvl_name = df.index.names[0]
    has_sub_levels = df.index.nlevels > 1

    for name, subdf in df.groupby(level=0):
        d = {lvl_name: name} | subdf.sum().to_dict()
        if has_sub_levels:
            # Pass an explicit new list so each parent owns its children.
            d['sub_rows'] = to_nested_dict(subdf.droplevel(level=0), [])
        data.append(d)
    return data

# NOTE: `df` is not defined in this snippet; presumably it is the question's
# three-level MultiIndex DataFrame.
data = to_nested_dict(df)

Output:

>>> data
[{'first': 'bar',
  'jan': 5,
  'feb': 18,
  'sub_rows': [{'second': 'bar1',
    'jan': 4,
    'feb': 9,
    'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5},
     {'third': 'bar4', 'jan': 4, 'feb': 4}]},
   {'second': 'bar2',
    'jan': 1,
    'feb': 9,
    'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}]}]},
 {'first': 'foo',
  'jan': 14,
  'feb': 12,
  'sub_rows': [{'second': 'foo1',
    'jan': 14,
    'feb': 9,
    'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2},
     {'third': 'foo4', 'jan': 7, 'feb': 7}]},
   {'second': 'foo2',
    'jan': 0,
    'feb': 3,
    'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}]}]}]
Answered By: Corralien
# NOTE: `df` is not defined in this snippet; presumably it is the question's
# three-level MultiIndex DataFrame with 'jan' and 'feb' columns.

# Innermost level: one dict per original row, holding 'third' and the value columns.
sub_rows3 = df.reset_index(level=2).apply(lambda row: row.to_dict(), axis=1).tolist()

# Middle level: sum the value columns per (first, second) pair, collect the
# innermost dicts into a list, then fold each resulting row (values, child
# list and 'second' label) into a single dict stored under 'sub_rows'.
# The chain is wrapped in parentheses so the leading-dot continuation lines parse.
df1 = (
    df.assign(sub_rows=sub_rows3)
    .groupby(level=[0, 1])
    # The string "sum" selects pandas' own reduction; passing the builtin
    # ``sum`` emits a FutureWarning on recent pandas.
    .agg({'jan': 'sum', 'feb': 'sum', 'sub_rows': lambda ss: ss.tolist()})
    .assign(second=lambda dd: dd.index.get_level_values(1))
    .assign(sub_rows=lambda dd: dd.apply(lambda row: row.to_dict(), axis=1))
)

# Outer level: same aggregation per 'first' value, emitted as a list of records.
# ``'records'`` is spelled out because the ``'r'`` shorthand was removed in pandas 2.0.
(
    df1.groupby(level=0)
    .agg({'jan': 'sum', 'feb': 'sum', 'sub_rows': lambda ss: ss.tolist()})
    .reset_index()
    .to_dict('records')
)

out:

[{'first': 'bar',
  'jan': 5,
  'feb': 18,
  'sub_rows': [{'jan': 4,
    'feb': 9,
    'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5},
     {'third': 'bar4', 'jan': 4, 'feb': 4}],
    'second': 'bar1'},
   {'jan': 1,
    'feb': 9,
    'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}],
    'second': 'bar2'}]},
 {'first': 'foo',
  'jan': 14,
  'feb': 12,
  'sub_rows': [{'jan': 14,
    'feb': 9,
    'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2},
     {'third': 'foo4', 'jan': 7, 'feb': 7}],
    'second': 'foo1'},
   {'jan': 0,
    'feb': 3,
    'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}],
    'second': 'foo2'}]}]
Answered By: G.G
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.