Flatten a list of dictionaries in python

Question:

I have a list of dictionaries:

data = [{"average": 2, "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"sum": 13,    "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"sum": 15,    "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"sum": 0,     "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
        {"sum":     9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"sum":     8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]

I want my output as:

output = [{"name": "joe", "employee_id": 1, "day": "2022-01-01", "average": 2, "sum": 13},
          {"name": "joe", "employee_id": 1, "day": "2022-01-02", "average": 3, "sum": 15},
          {"name": "joe", "employee_id": 1, "day": "2022-01-03", "average": 9, "sum": 0},
          {"name": "bob", "employee_id": 2, "day": "2022-01-01", "average": 1, "sum": 9},
          {"name": "bob", "employee_id": 2, "day": "2022-01-02", "average": 3, "sum": 8}]

The goal is that the output values are put together by day, name, and employee_id.

I’ve tried:

output = {}
for item in data:
    if item["day"] not in output:
        output[item["day"]] = item
    else:
        output[item["day"]].update(item)
print(list(output.values()))

This works in getting the "average" and "sum" and "date" together, but it ends up not including all of the employees and their IDs.

Any help is appreciated

Asked By: getunstuck

||

Answers:

Using collections.defaultdict with dict. Here, take the value of 'day' and 'name' of each dictionary as the key:

>>> from collections import defaultdict
>>> defdict = defaultdict(dict)
>>> for mp in data:
...     defdict[mp['day'], mp['name']].update(mp)
...
>>> keys = ('name', 'employee_id', 'day', 'average', 'sum')
>>> [{k: mp[k] for k in keys} for mp in defdict.values()]
[{'name': 'joe', 'employee_id': 1, 'day': '2022-01-01', 'average': 2, 'sum': 13},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-02', 'average': 3, 'sum': 15},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-03', 'average': 9, 'sum': 0},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-01', 'average': 1, 'sum': 9},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-02', 'average': 3, 'sum': 8}]

For 150w pieces of data, the performance of this solution is still better than that of pandas (at least when converting data into DataFrame, the for loop has completed the work):

In [451]: random.seed(0)
     ...: names = [''.join(random.choices(string.ascii_lowercase, k=random.randrange(3, 7))) for _ in range(10000)]
     ...: dates = [str(datetime.date(2022, i, j)) for i in range(7, 10) for j in range(1, 31)]
     ...: keys = ['sum', 'average']
     ...:
     ...: data = [{k: random.randrange(10), 'day': date, 'name': name, 'employee_id': i}
     ...:         for i, name in enumerate(names, 1)
     ...:         for date in sorted(random.sample(dates, random.randrange(60, 90)))
     ...:         for k in keys]
     ...:

In [452]: len(data)
Out[452]: 1492286

In [453]: %%timeit
     ...: defdict = defaultdict(dict)
     ...: for mp in data:
     ...:     defdict[mp['day'], mp['name']].update(mp)
     ...: keys = ('name', 'employee_id', 'day', 'average', 'sum')
     ...: [{k: mp[k] for k in keys} for mp in defdict.values()]
     ...:
     ...:
926 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [454]: %%timeit
     ...: df = pd.DataFrame(data)
     ...: pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
     ...:          df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
     ...:          how='outer'
     ...: ).to_dict(orient= 'records')
     ...:
     ...:
3.58 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [455]: %timeit pd.DataFrame(data)
1.26 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Answered By: Mechanic Pig

here is one way to do it

# filter using loc where average is not null and where sum is not null, as two separate frame
# merge the two DF
#finally, output as a dict of orient record

pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
         df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
         how='outer'
).to_dict(orient= 'records')

[{'name': 'joe',
  'day': '2022-01-01',
  'employee_id': 1,
  'average': 2.0,
  'sum': 13.0},
 {'name': 'joe',
  'day': '2022-01-02',
  'employee_id': 1,
  'average': 3.0,
  'sum': 15.0},
 {'name': 'joe',
  'day': '2022-01-03',
  'employee_id': 1,
  'average': 9.0,
  'sum': 0.0},
 {'name': 'bob',
  'day': '2022-01-01',
  'employee_id': 2,
  'average': 1.0,
  'sum': 9.0},
 {'name': 'bob',
  'day': '2022-01-02',
  'employee_id': 2,
  'average': 3.0,
  'sum': 8.0}]
Answered By: Naveed

From the description given the combination "day’, "name", "employee_id" acts like a unique combination to which the other two fields should be added. Each incoming dictionary has these and we can use them as a key into a new dictionary, but we need to convert them to something hashable like a json string which we need to produce with sorting to make them unique..

from json import dumps

data = [{"average": 2, "day": "2022-01-01",  "employee_id": 1, "name": "joe"},
        {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"sum": 13,    "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"sum": 15,    "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"sum": 0,     "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
        {"sum":     9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"sum":     8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]

flattend_employee_summaries = dict()

for employee_summary in data:
    
    
    key = employee_summary.copy()
        
    if "average" in key:
        del key["average"]
        if dumps(key, sort_keys=True) not in flattend_employee_summaries:
            flattend_employee_summaries[dumps(key, sort_keys=True)] = employee_summary.copy()
        else:
            flattend_employee_summaries[dumps(key, sort_keys=True)]["average"] = employee_summary["average"]
            
    if "sum" in key:
        del key["sum"]
        if dumps(key, sort_keys=True) not in flattend_employee_summaries:
            flattend_employee_summaries[dumps(key, sort_keys=True)] = employee_summary.copy()
        else:
            flattend_employee_summaries[dumps(key, sort_keys=True)]["sum"] = employee_summary["sum"]
        
flattend_employee_summaries = [ summary for summary  in flattend_employee_summaries.values()]
          
print(f'{flattend_employee_summaries=}')
Answered By: Ronald van Elburg

It has been answered, and I suspect this is the long way of repeating Mechanic Pig’s solution which I’d recommend. For all of the solutions, I believe we are assuming there is only one average record per employee/day.

employees = dict()
for data_row in data:
    if data_row['employee_id'] not in employees:
        
        employees[data_row['employee_id']] = {data_row['day']: {'name':data_row.get('name', 0),
                                                                'average': data_row.get('average', 0), 
                                                                'sum': data_row.get('sum',0) 
                                                              } 
                                            }
    else:
        data_row_day = data_row['day']
        
        if data_row['day'] not in employees[data_row['employee_id']]:
            employees[data_row['employee_id']][data_row_day] = {'name':data_row.get('name', 0),
                                                                'average': data_row.get('average', 0), 
                                                                'sum': data_row.get('sum', 0) 
                                                              }
        else:
            current_sum = employees[data_row['employee_id']][data_row_day].get('sum',0)
            employees[data_row['employee_id']][data_row_day].update({'sum': current_sum + data_row.get('sum', 0) })

employee_output = list()
for employee_id, employee_dates in employees.items():
    for employee_date, employee_details in employee_dates.items():
        employee_output.append({"name": employee_details['name'],
                                "employee_id": employee_id,
                                "day": employee_date,
                                "average": employee_details['average'],
                                "sum":  employee_details['sum'],
                               })

employee_output would then contain:

[{'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-01',
  'average': 2,
  'sum': 13},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-02',
  'average': 3,
  'sum': 15},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-03',
  'average': 9,
  'sum': 0},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-01',
  'average': 1,
  'sum': 9},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-02',
  'average': 3,
  'sum': 8}]
Answered By: AMG
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.