Group by multiple keys and summarize/average values of a list of dictionaries
Question:
What is the most pythonic way to group by multiple keys and summarize/average values of a list of dictionaries in Python please? Say I have a list of dictionaries as below:
input = [
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId1', 'qty': 100},
{'dept': '001', 'sku': 'bar', 'transId': 'uniqueId2', 'qty': 200},
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId3', 'qty': 300},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId4', 'qty': 400},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId5', 'qty': 500},
{'dept': '002', 'sku': 'qux', 'transId': 'uniqueId6', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'transId': 'uniqueId7', 'qty': 700}
]
Desired output for aggregation:
output=[
{'dept': '001', 'sku': 'foo', 'qty': 400},
{'dept': '001', 'sku': 'bar', 'qty': 200},
{'dept': '002', 'sku': 'baz', 'qty': 900},
{'dept': '002', 'sku': 'qux', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'qty': 700}
]
or average:
output=[
{'dept': '001', 'sku': 'foo', 'avg': 200},
{'dept': '001', 'sku': 'bar', 'avg': 200},
{'dept': '002', 'sku': 'baz', 'avg': 450},
{'dept': '002', 'sku': 'qux', 'avg': 600},
{'dept': '003', 'sku': 'foo', 'avg': 700}
]
I have found this: Group by and aggregate the values of a list of dictionaries in Python but it doesn’t seem to give me what I want.
Answers:
To get the aggregated results
from itertools import groupby
from operator import itemgetter
from pprint import pprint

# Key function for the (dept, sku) pair; itemgetter with two names returns a tuple.
grouper = itemgetter("dept", "sku")
result = []
# groupby only merges adjacent items, so the rows are sorted by the same key first.
# input_data is the list of row dicts (named `input` in the question).
for key, grp in groupby(sorted(input_data, key=grouper), grouper):
    temp_dict = dict(zip(["dept", "sku"], key))
    temp_dict["qty"] = sum(item["qty"] for item in grp)
    result.append(temp_dict)

pprint(result)
Output
[{'dept': '001', 'qty': 200, 'sku': 'bar'},
{'dept': '001', 'qty': 400, 'sku': 'foo'},
{'dept': '002', 'qty': 900, 'sku': 'baz'},
{'dept': '002', 'qty': 600, 'sku': 'qux'},
{'dept': '003', 'qty': 700, 'sku': 'foo'}]
And to get the averages, you can simply change the contents inside the for loop, like this
# Replacement loop body: gather the group's quantities into a list first
# (grp can only be iterated once), then divide their sum by their count.
temp_dict = dict(zip(["dept", "sku"], key))
temp_list = [item["qty"] for item in grp]
# NOTE: on Python 3 `/` is true division, so this gives 200.0 rather than the 200 shown in the output.
temp_dict["avg"] = sum(temp_list) / len(temp_list)
result.append(temp_dict)
Output
[{'avg': 200, 'dept': '001', 'sku': 'bar'},
{'avg': 200, 'dept': '001', 'sku': 'foo'},
{'avg': 450, 'dept': '002', 'sku': 'baz'},
{'avg': 600, 'dept': '002', 'sku': 'qux'},
{'avg': 700, 'dept': '003', 'sku': 'foo'}]
Suggestion: I would have added both the qty and the avg to the same dict, like this:
# Replacement loop body: gather the quantities once (grp can only be iterated
# a single time), then store both the total and the mean for the group.
temp_dict = dict(zip(["dept", "sku"], key))
temp_list = [item["qty"] for item in grp]
temp_dict["qty"] = sum(temp_list)
# Reuses the total stored on the previous line; on Python 3 `/` yields a float (e.g. 200.0).
temp_dict["avg"] = temp_dict["qty"] / len(temp_list)
result.append(temp_dict)
Output
[{'avg': 200, 'dept': '001', 'qty': 200, 'sku': 'bar'},
{'avg': 200, 'dept': '001', 'qty': 400, 'sku': 'foo'},
{'avg': 450, 'dept': '002', 'qty': 900, 'sku': 'baz'},
{'avg': 600, 'dept': '002', 'qty': 600, 'sku': 'qux'},
{'avg': 700, 'dept': '003', 'qty': 700, 'sku': 'foo'}]
Using the numpy EP you can find here, you could write:
# Transpose the list of row dicts into a dict of columns, e.g. {'dept': [...], 'sku': [...]}.
# The column names are taken from the first row, so every row must contain those keys.
inputs = {k: [row[k] for row in input] for k in input[0]}
# group_by comes from the numpy EP referenced in the text (not defined here).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(group_by((inputs['dept'], inputs['sku'])).mean(inputs['qty']))
However, you may want to consider using the pandas package if you are doing a lot of relational operations of this kind.
As always, there are lots of valid solutions. I like the defaultdict one, since I find it easier to understand.
from collections import defaultdict

# Three-level mapping: dept -> sku -> transId -> qty.
# The grouping keys must be the OUTER levels; with transId outermost every
# transaction would form its own group and nothing would be summed.
food = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for dct in input:
    food[dct['dept']][dct['sku']][dct['transId']] = dct['qty']

# One (dept, sku, total qty) tuple per distinct (dept, sku) pair.
output_tupl = [
    (dept, sku, sum(food[dept][sku].values()))
    for dept in food
    for sku in food[dept]
]
Inspired by Eelco Hoogendoorn’s answer, here is another way to solve this using the pandas package. The code is more readable.
import numpy as np
import pandas as pd
def sum_by_cusip_and_dept(data):
    """Sum ``qty`` for every distinct (sku, dept) pair.

    Despite the name, the grouping keys are ``'sku'`` and ``'dept'``.

    Args:
        data: Row dicts (or anything ``pd.DataFrame`` accepts) with at least
            the keys ``'sku'``, ``'dept'`` and ``'qty'``.

    Returns:
        A list of ``{'sku': ..., 'dept': ..., 'qty': total}`` dicts, one per
        group, in the sorted key order produced by ``groupby``. Empty input
        yields an empty list.
    """
    frame = pd.DataFrame(data)
    if frame.empty:
        # No rows means no 'sku'/'dept' columns to group on.
        return []
    # Select 'qty' before summing so unrelated columns (e.g. the string
    # 'transId') are never aggregated.
    totals = frame.groupby(['sku', 'dept'])['qty'].sum()
    return [
        {'sku': sku, 'dept': dept, 'qty': qty}
        for (sku, dept), qty in totals.items()
    ]
You can put the quantities and the number of their occurrences in one big default dict:
from collections import defaultdict

# (dept, sku) -> [running qty total, number of rows seen]
counts = defaultdict(lambda: [0, 0])
for row in input_data:
    tally = counts[row['dept'], row['sku']]
    tally[0] += row['qty']
    tally[1] += 1
Now the only question left is how to get the numbers into a list of dicts:
# Each counts entry is (dept, sku) -> [qty total, row count].
sums_dict = [
    {'dept': dept, 'sku': sku, 'qty': total}
    for (dept, sku), (total, _) in counts.items()
]
# float() keeps the division exact on Python 2 as well.
avg_dict = [
    {'dept': dept, 'sku': sku, 'avg': float(total) / n}
    for (dept, sku), (total, n) in counts.items()
]
The results for the sums:
sums_dict
[{'dept': '002', 'qty': 600, 'sku': 'qux'},
{'dept': '001', 'qty': 400, 'sku': 'foo'},
{'dept': '003', 'qty': 700, 'sku': 'foo'},
{'dept': '002', 'qty': 900, 'sku': 'baz'},
{'dept': '001', 'qty': 200, 'sku': 'bar'}]
and for the averages:
avg_dict
[{'avg': 600.0, 'dept': '002', 'sku': 'qux'},
{'avg': 200.0, 'dept': '001', 'sku': 'foo'},
{'avg': 700.0, 'dept': '003', 'sku': 'foo'},
{'avg': 450.0, 'dept': '002', 'sku': 'baz'},
{'avg': 200.0, 'dept': '001', 'sku': 'bar'}]
An alternative version without the default dict:
# Plain-dict tally: (dept, sku) -> [running qty total, number of rows seen].
counts = {}
for row in input_data:
    group = (row['dept'], row['sku'])
    # setdefault inserts a fresh [0, 0] the first time a group is seen.
    tally = counts.setdefault(group, [0, 0])
    tally[0] += row['qty']
    tally[1] += 1
The rest is the same:
# Each counts entry is (dept, sku) -> [qty total, row count].
sums_dict = [
    {'dept': dept, 'sku': sku, 'qty': total}
    for (dept, sku), (total, _) in counts.items()
]
# float() keeps the division exact on Python 2 as well.
avg_dict = [
    {'dept': dept, 'sku': sku, 'avg': float(total) / n}
    for (dept, sku), (total, n) in counts.items()
]
I had some extra requirements on top of the original question: I wanted to be able to pass the grouper around on its own, without also having to pass around the original order of the fields in order to reconstruct the grouping key as a dict.
namedtuple() works quite well in that it allows you to sort and use ._asdict()
from collections import namedtuple
def get_grouper(fields):
    """Build a key function that maps a row dict to a ``GroupingKey`` namedtuple.

    Args:
        fields: Names of the fields to group on, in key order. Any form
            ``namedtuple`` accepts works: a sequence, a one-shot iterable,
            or a space/comma-separated string.

    Returns:
        A function ``get_key(row)`` that returns a ``GroupingKey`` holding
        ``row[field]`` for each field. The key sorts like a tuple and can be
        turned back into a dict with ``._asdict()``. ``get_key`` raises
        ``KeyError`` if ``row`` lacks one of the fields.
    """
    key = namedtuple('GroupingKey', fields)

    def get_key(row):
        # Iterate the namedtuple's own field list rather than `fields`:
        # `fields` may already be exhausted (generator) or be a single
        # delimited string, whereas `_fields` is always the parsed tuple.
        return key(*(row[field] for field in key._fields))

    return get_key
from itertools import groupby

rows = [
    {'a': 1, 'b': 1, 'c': 1},
    {'a': 1, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 2},
    {'a': 1, 'b': 0},  # 'c' is missing, which is fine: only 'a' and 'b' are read
    {'a': 1, 'b': 2, 'c': 4},
]

grouper = get_grouper(['a', 'b'])
# groupby only merges adjacent rows, so sort by the same key first.
rows = sorted(rows, key=grouper)
for k, g in groupby(rows, key=grouper):
    print(k, list(g))
@thefourtheye If we use groupby with only one key, we should check the type of the key after grouping and, if it is not a tuple, wrap it in a list:
# Fragment of the grouping loop: normalises the group key before it is used.
for key, grp in groupby(sorted(input_data, key = grouper), grouper):
# A grouper built from a single field yields a bare value rather than a tuple;
# wrap it in a one-element list so it can be treated like the multi-key case.
if not isinstance(key, tuple):
key = [key]
Using pandas and duckdb, you can easily query over datasets using SQL:
import pandas as pd
import duckdb
data = [
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId1', 'qty': 100},
{'dept': '001', 'sku': 'bar', 'transId': 'uniqueId2', 'qty': 200},
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId3', 'qty': 300},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId4', 'qty': 400},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId5', 'qty': 500},
{'dept': '002', 'sku': 'qux', 'transId': 'uniqueId6', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'transId': 'uniqueId7', 'qty': 700}
]
# Load the row dicts into a DataFrame.
df = pd.DataFrame(data)
# The SQL refers to the DataFrame as `df`.
# NOTE(review): this appears to rely on duckdb resolving the Python variable by
# name, so the variable name and the FROM clause must stay in sync — confirm
# against the duckdb documentation.
result = duckdb.query("""
SELECT dept, sku, SUM(qty), AVG(qty)
FROM df
GROUP BY dept, sku;
""").to_df()
print(result)
Output:
dept sku sum(qty) avg(qty)
0 001 foo 400.0 200.0
1 001 bar 200.0 200.0
2 002 baz 900.0 450.0
3 002 qux 600.0 600.0
4 003 foo 700.0 700.0
What is the most pythonic way to group by multiple keys and summarize/average values of a list of dictionaries in Python please? Say I have a list of dictionaries as below:
input = [
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId1', 'qty': 100},
{'dept': '001', 'sku': 'bar', 'transId': 'uniqueId2', 'qty': 200},
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId3', 'qty': 300},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId4', 'qty': 400},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId5', 'qty': 500},
{'dept': '002', 'sku': 'qux', 'transId': 'uniqueId6', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'transId': 'uniqueId7', 'qty': 700}
]
Desired output for aggregation:
output=[
{'dept': '001', 'sku': 'foo', 'qty': 400},
{'dept': '001', 'sku': 'bar', 'qty': 200},
{'dept': '002', 'sku': 'baz', 'qty': 900},
{'dept': '002', 'sku': 'qux', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'qty': 700}
]
or average:
output=[
{'dept': '001', 'sku': 'foo', 'avg': 200},
{'dept': '001', 'sku': 'bar', 'avg': 200},
{'dept': '002', 'sku': 'baz', 'avg': 450},
{'dept': '002', 'sku': 'qux', 'avg': 600},
{'dept': '003', 'sku': 'foo', 'avg': 700}
]
I have found this: Group by and aggregate the values of a list of dictionaries in Python but it doesn’t seem to give me what I want.
To get the aggregated results
from itertools import groupby
from operator import itemgetter
from pprint import pprint

# Key function for the (dept, sku) pair; itemgetter with two names returns a tuple.
grouper = itemgetter("dept", "sku")
result = []
# groupby only merges adjacent items, so the rows are sorted by the same key first.
# input_data is the list of row dicts (named `input` in the question).
for key, grp in groupby(sorted(input_data, key=grouper), grouper):
    temp_dict = dict(zip(["dept", "sku"], key))
    temp_dict["qty"] = sum(item["qty"] for item in grp)
    result.append(temp_dict)

pprint(result)
Output
[{'dept': '001', 'qty': 200, 'sku': 'bar'},
{'dept': '001', 'qty': 400, 'sku': 'foo'},
{'dept': '002', 'qty': 900, 'sku': 'baz'},
{'dept': '002', 'qty': 600, 'sku': 'qux'},
{'dept': '003', 'qty': 700, 'sku': 'foo'}]
And to get the averages, you can simply change the contents inside the for loop, like this
# Replacement loop body: gather the group's quantities into a list first
# (grp can only be iterated once), then divide their sum by their count.
temp_dict = dict(zip(["dept", "sku"], key))
temp_list = [item["qty"] for item in grp]
# NOTE: on Python 3 `/` is true division, so this gives 200.0 rather than the 200 shown in the output.
temp_dict["avg"] = sum(temp_list) / len(temp_list)
result.append(temp_dict)
Output
[{'avg': 200, 'dept': '001', 'sku': 'bar'},
{'avg': 200, 'dept': '001', 'sku': 'foo'},
{'avg': 450, 'dept': '002', 'sku': 'baz'},
{'avg': 600, 'dept': '002', 'sku': 'qux'},
{'avg': 700, 'dept': '003', 'sku': 'foo'}]
Suggestion: I would have added both the qty and the avg to the same dict, like this:
# Replacement loop body: gather the quantities once (grp can only be iterated
# a single time), then store both the total and the mean for the group.
temp_dict = dict(zip(["dept", "sku"], key))
temp_list = [item["qty"] for item in grp]
temp_dict["qty"] = sum(temp_list)
# Reuses the total stored on the previous line; on Python 3 `/` yields a float (e.g. 200.0).
temp_dict["avg"] = temp_dict["qty"] / len(temp_list)
result.append(temp_dict)
Output
[{'avg': 200, 'dept': '001', 'qty': 200, 'sku': 'bar'},
{'avg': 200, 'dept': '001', 'qty': 400, 'sku': 'foo'},
{'avg': 450, 'dept': '002', 'qty': 900, 'sku': 'baz'},
{'avg': 600, 'dept': '002', 'qty': 600, 'sku': 'qux'},
{'avg': 700, 'dept': '003', 'qty': 700, 'sku': 'foo'}]
Using the numpy EP you can find here, you could write:
# Transpose the list of row dicts into a dict of columns, e.g. {'dept': [...], 'sku': [...]}.
# The column names are taken from the first row, so every row must contain those keys.
inputs = {k: [row[k] for row in input] for k in input[0]}
# group_by comes from the numpy EP referenced in the text (not defined here).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(group_by((inputs['dept'], inputs['sku'])).mean(inputs['qty']))
However, you may want to consider using the pandas package if you are doing a lot of relational operations of this kind.
As always, there are lots of valid solutions. I like the defaultdict one, since I find it easier to understand.
from collections import defaultdict

# Three-level mapping: dept -> sku -> transId -> qty.
# The grouping keys must be the OUTER levels; with transId outermost every
# transaction would form its own group and nothing would be summed.
food = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for dct in input:
    food[dct['dept']][dct['sku']][dct['transId']] = dct['qty']

# One (dept, sku, total qty) tuple per distinct (dept, sku) pair.
output_tupl = [
    (dept, sku, sum(food[dept][sku].values()))
    for dept in food
    for sku in food[dept]
]
Inspired by Eelco Hoogendoorn’s answer, here is another way to solve this using the pandas package. The code is more readable.
import numpy as np
import pandas as pd
def sum_by_cusip_and_dept(data):
    """Sum ``qty`` for every distinct (sku, dept) pair.

    Despite the name, the grouping keys are ``'sku'`` and ``'dept'``.

    Args:
        data: Row dicts (or anything ``pd.DataFrame`` accepts) with at least
            the keys ``'sku'``, ``'dept'`` and ``'qty'``.

    Returns:
        A list of ``{'sku': ..., 'dept': ..., 'qty': total}`` dicts, one per
        group, in the sorted key order produced by ``groupby``. Empty input
        yields an empty list.
    """
    frame = pd.DataFrame(data)
    if frame.empty:
        # No rows means no 'sku'/'dept' columns to group on.
        return []
    # Select 'qty' before summing so unrelated columns (e.g. the string
    # 'transId') are never aggregated.
    totals = frame.groupby(['sku', 'dept'])['qty'].sum()
    return [
        {'sku': sku, 'dept': dept, 'qty': qty}
        for (sku, dept), qty in totals.items()
    ]
You can put the quantities and the number of their occurrences in one big default dict:
from collections import defaultdict

# (dept, sku) -> [running qty total, number of rows seen]
counts = defaultdict(lambda: [0, 0])
for row in input_data:
    tally = counts[row['dept'], row['sku']]
    tally[0] += row['qty']
    tally[1] += 1
Now the only question left is how to get the numbers into a list of dicts:
# Each counts entry is (dept, sku) -> [qty total, row count].
sums_dict = [
    {'dept': dept, 'sku': sku, 'qty': total}
    for (dept, sku), (total, _) in counts.items()
]
# float() keeps the division exact on Python 2 as well.
avg_dict = [
    {'dept': dept, 'sku': sku, 'avg': float(total) / n}
    for (dept, sku), (total, n) in counts.items()
]
The results for the sums:
sums_dict
[{'dept': '002', 'qty': 600, 'sku': 'qux'},
{'dept': '001', 'qty': 400, 'sku': 'foo'},
{'dept': '003', 'qty': 700, 'sku': 'foo'},
{'dept': '002', 'qty': 900, 'sku': 'baz'},
{'dept': '001', 'qty': 200, 'sku': 'bar'}]
and for the averages:
avg_dict
[{'avg': 600.0, 'dept': '002', 'sku': 'qux'},
{'avg': 200.0, 'dept': '001', 'sku': 'foo'},
{'avg': 700.0, 'dept': '003', 'sku': 'foo'},
{'avg': 450.0, 'dept': '002', 'sku': 'baz'},
{'avg': 200.0, 'dept': '001', 'sku': 'bar'}]
An alternative version without the default dict:
# Plain-dict tally: (dept, sku) -> [running qty total, number of rows seen].
counts = {}
for row in input_data:
    group = (row['dept'], row['sku'])
    # setdefault inserts a fresh [0, 0] the first time a group is seen.
    tally = counts.setdefault(group, [0, 0])
    tally[0] += row['qty']
    tally[1] += 1
The rest is the same:
# Each counts entry is (dept, sku) -> [qty total, row count].
sums_dict = [
    {'dept': dept, 'sku': sku, 'qty': total}
    for (dept, sku), (total, _) in counts.items()
]
# float() keeps the division exact on Python 2 as well.
avg_dict = [
    {'dept': dept, 'sku': sku, 'avg': float(total) / n}
    for (dept, sku), (total, n) in counts.items()
]
I had some extra requirements on top of the original question: I wanted to be able to pass the grouper around on its own, without also having to pass around the original order of the fields in order to reconstruct the grouping key as a dict.
namedtuple() works quite well in that it allows you to sort and use ._asdict()
from collections import namedtuple
def get_grouper(fields):
    """Build a key function that maps a row dict to a ``GroupingKey`` namedtuple.

    Args:
        fields: Names of the fields to group on, in key order. Any form
            ``namedtuple`` accepts works: a sequence, a one-shot iterable,
            or a space/comma-separated string.

    Returns:
        A function ``get_key(row)`` that returns a ``GroupingKey`` holding
        ``row[field]`` for each field. The key sorts like a tuple and can be
        turned back into a dict with ``._asdict()``. ``get_key`` raises
        ``KeyError`` if ``row`` lacks one of the fields.
    """
    key = namedtuple('GroupingKey', fields)

    def get_key(row):
        # Iterate the namedtuple's own field list rather than `fields`:
        # `fields` may already be exhausted (generator) or be a single
        # delimited string, whereas `_fields` is always the parsed tuple.
        return key(*(row[field] for field in key._fields))

    return get_key
from itertools import groupby

rows = [
    {'a': 1, 'b': 1, 'c': 1},
    {'a': 1, 'b': 2, 'c': 3},
    {'a': 1, 'b': 1, 'c': 2},
    {'a': 1, 'b': 0},  # 'c' is missing, which is fine: only 'a' and 'b' are read
    {'a': 1, 'b': 2, 'c': 4},
]

grouper = get_grouper(['a', 'b'])
# groupby only merges adjacent rows, so sort by the same key first.
rows = sorted(rows, key=grouper)
for k, g in groupby(rows, key=grouper):
    print(k, list(g))
@thefourtheye If we use groupby with only one key, we should check the type of the key after grouping and, if it is not a tuple, wrap it in a list:
# Fragment of the grouping loop: normalises the group key before it is used.
for key, grp in groupby(sorted(input_data, key = grouper), grouper):
# A grouper built from a single field yields a bare value rather than a tuple;
# wrap it in a one-element list so it can be treated like the multi-key case.
if not isinstance(key, tuple):
key = [key]
Using pandas and duckdb, you can easily query over datasets using SQL:
import pandas as pd
import duckdb
data = [
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId1', 'qty': 100},
{'dept': '001', 'sku': 'bar', 'transId': 'uniqueId2', 'qty': 200},
{'dept': '001', 'sku': 'foo', 'transId': 'uniqueId3', 'qty': 300},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId4', 'qty': 400},
{'dept': '002', 'sku': 'baz', 'transId': 'uniqueId5', 'qty': 500},
{'dept': '002', 'sku': 'qux', 'transId': 'uniqueId6', 'qty': 600},
{'dept': '003', 'sku': 'foo', 'transId': 'uniqueId7', 'qty': 700}
]
# Load the row dicts into a DataFrame.
df = pd.DataFrame(data)
# The SQL refers to the DataFrame as `df`.
# NOTE(review): this appears to rely on duckdb resolving the Python variable by
# name, so the variable name and the FROM clause must stay in sync — confirm
# against the duckdb documentation.
result = duckdb.query("""
SELECT dept, sku, SUM(qty), AVG(qty)
FROM df
GROUP BY dept, sku;
""").to_df()
print(result)
Output:
dept sku sum(qty) avg(qty)
0 001 foo 400.0 200.0
1 001 bar 200.0 200.0
2 002 baz 900.0 450.0
3 002 qux 600.0 600.0
4 003 foo 700.0 700.0