Merge another field when there are duplicate IDs (Python)
Question:
I essentially have this dataset, and I’d like to add/combine the points of each item where the userID is a duplicate.
I know it's something along the lines of `for (a, b) in array`,
but I'm asking for assistance so I can learn.
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}, {'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}, {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 278}, {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}]
Result should be
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 786},
{'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}
]
Appreciate you guys and SO.
Answers:
Assuming l is
the input list, you can use a dictionary to group by ID while aggregating, then convert back to a list:
# Group the records by userID, summing points as duplicates are found.
out = {}
for record in l:
    uid = record['userID']
    existing = out.get(uid)
    if existing is None:
        out[uid] = record.copy()  # copy so the source dicts stay untouched
    else:
        existing['points'] += record['points']
out = list(out.values())
Alternative with setdefault
and a dictionary template:
# setdefault() inserts a zeroed template the first time a userID appears,
# then we accumulate points into whichever dict it returns.
out = {}
for entry in l:
    bucket = out.setdefault(entry['userID'],
                            {'userID': entry['userID'], 'points': 0})
    bucket['points'] += entry['points']
out = list(out.values())
Output:
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 786},
{'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}]
I would summarize into a single dictionary
d = [{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254},
     {'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268},
     {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 278},
     {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}]

# Collapse the records into a single {userID: total_points} mapping.
ans = {}
for record in d:
    user = record["userID"]
    ans[user] = ans.get(user, 0) + record["points"]
print(ans)
# {'QzzucRibfSahbUGwr2PGuhFSU242': 786, '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2': 268}
If you want a list at the end, you can do
[{"userID": user, "points": total} for user, total in ans.items()]
Pandas is useful for doing this kind of computation efficiently:
import pandas as pd

# Build a DataFrame from the records, then sum points per userID.
frame = pd.DataFrame(data)
points_by_id = frame.groupby('userID').sum()
points
userID
5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2 268
QzzucRibfSahbUGwr2PGuhFSU242 786
Couldn’t resist doing some timing on these methods… turns out the simplest method in Python (which was not proposed yet) is also the fastest. Even faster than Pandas when you already have the data as a dataframe!
Test data – I arbitrarily chose 10,000 records for 50 user IDs:
import random
import string
import pandas as pd
from collections import defaultdict
from functools import reduce


def _random_user_id():
    # 28 random lowercase letters, mimicking the IDs in the question.
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(28))


# Prepare fake data: 50 user IDs spread across 10,000 point records.
userIDs = [_random_user_id() for _ in range(50)]
data = [{'userID': random.choice(userIDs), 'points': random.randint(0, 1000)}
        for _ in range(10000)]
data_df = pd.DataFrame(data)
Algorithm implementations (based on answers to this question):
def method_db(data):
    """Pre-seed every userID with 0, then accumulate points in one pass."""
    totals = dict.fromkeys((row["userID"] for row in data), 0)
    for row in data:
        totals[row["userID"]] += row["points"]
    return [{"userID": uid, "points": pts} for uid, pts in totals.items()]
def method_mozway1(data):
    """Group records by userID, summing points.

    The first record seen for an ID is copied so the input dicts are
    never mutated.
    """
    grouped = {}
    for record in data:
        uid = record['userID']
        if uid in grouped:
            grouped[uid]['points'] += record['points']
        else:
            grouped[uid] = record.copy()  # to not modify original
    return list(grouped.values())
def method_mozway2(data):
    """Group records by userID via setdefault with a zeroed template dict."""
    grouped = {}
    for record in data:
        uid = record['userID']
        bucket = grouped.setdefault(uid, {'userID': uid, 'points': 0})
        bucket['points'] += record['points']
    return list(grouped.values())
def method_Bill1(data):
    """Sum points per userID with a defaultdict, then rebuild the records."""
    totals = defaultdict(int)
    for record in data:
        totals[record['userID']] += record['points']
    return [{'userID': uid, 'points': pts} for uid, pts in totals.items()]
def method_Bill2(data):
    """Same aggregation as method_Bill1, but folded via functools.reduce."""
    def accumulate(totals, record):
        # Fold step: add this record's points into the running totals.
        totals[record['userID']] += record['points']
        return totals

    folded = reduce(accumulate, data, defaultdict(int))
    return [{'userID': uid, 'points': pts} for uid, pts in folded.items()]
def method_Pandas(data_df):
    """Group the DataFrame by userID and sum the remaining columns."""
    grouped = data_df.groupby('userID')
    return grouped.sum()
# Validity checks: every pure-Python method must agree with method_db,
# and the Pandas result must match once indexed and sorted the same way.
out1 = method_db(data)
for candidate in (method_mozway1, method_mozway2, method_Bill1, method_Bill2):
    assert out1 == candidate(data)
pd.testing.assert_frame_equal(
    pd.DataFrame(out1).set_index('userID').sort_index(),
    method_Pandas(data_df).sort_index()
)
Test results
In [2]: %timeit method_db(data)
2.7 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [3]: %timeit method_mozway1(data)
2.39 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [4]: %timeit method_mozway2(data)
3.38 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [5]: %timeit method_Bill1(data)
1.59 ms ± 42.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [6]: %timeit method_Bill2(data)
2.32 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [7]: %timeit method_Pandas(data_df)
2.05 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
I essentially have this dataset, and I’d like to add/combine the points of each item where the userID is a duplicate.
I know it's something along the lines of `for (a, b) in array`,
but I'm asking for assistance so I can learn.
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}, {'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}, {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 278}, {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}]
Result should be
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 786},
{'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}
]
Appreciate you guys and SO.
Assuming l is
the input list, you can use a dictionary to group by ID while aggregating, then convert back to a list:
# Accumulate points per userID; the first record for an ID seeds the entry.
out = {}
for item in l:
    key = item['userID']
    if key in out:
        out[key]['points'] += item['points']
    else:
        out[key] = item.copy()  # to not modify original
out = list(out.values())
Alternative with setdefault
and a dictionary template:
# Same grouping, but setdefault() supplies a zeroed template dict
# the first time each userID is encountered.
out = {}
for item in l:
    key = item['userID']
    out.setdefault(key, {'userID': key, 'points': 0})['points'] += item['points']
out = list(out.values())
Output:
[{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 786},
{'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268}]
I would summarize into a single dictionary
d = [{'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254},
     {'userID': '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2', 'points': 268},
     {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 278},
     {'userID': 'QzzucRibfSahbUGwr2PGuhFSU242', 'points': 254}]

# Reduce the list to one {userID: total_points} dictionary.
ans = {}
for item in d:
    uid = item["userID"]
    ans[uid] = ans.get(uid, 0) + item["points"]
print(ans)
# {'QzzucRibfSahbUGwr2PGuhFSU242': 786, '5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2': 268}
If you want a list at the end, you can do
[{"userID": uid, "points": pts} for uid, pts in ans.items()]
Pandas is useful for doing this kind of computation efficiently:
import pandas as pd

# One-liner equivalent: DataFrame the records and sum points per userID.
points_by_id = (pd.DataFrame(data)
                .groupby('userID')
                .sum())
points
userID
5lyU0TCyqRcTD3y7Rs2FGV8h2Sd2 268
QzzucRibfSahbUGwr2PGuhFSU242 786
Couldn’t resist doing some timing on these methods… turns out the simplest method in Python (which was not proposed yet) is also the fastest. Even faster than Pandas when you already have the data as a dataframe!
Test data – I arbitrarily chose 10,000 records for 50 user IDs:
import random
import string
import pandas as pd
from collections import defaultdict
from functools import reduce

# Prepare fake data: 50 random 28-character user IDs and 10,000 records,
# each pairing a random ID with a random point value in [0, 1000].
userIDs = [''.join(random.choice(string.ascii_lowercase) for _ in range(28))
           for _ in range(50)]
data = [{'userID': random.choice(userIDs), 'points': random.randint(0, 1000)}
        for _ in range(10000)]
data_df = pd.DataFrame(data)
Algorithm implementations (based on answers to this question):
def method_db(data):
    """Seed a dict with 0 for each userID, then sum points in one pass."""
    sums = dict.fromkeys((entry["userID"] for entry in data), 0)
    for entry in data:
        sums[entry["userID"]] += entry["points"]
    return [{"userID": uid, "points": total} for uid, total in sums.items()]
def method_mozway1(data):
    """Group by userID, summing points; copies the first record per ID
    so the caller's dicts are never mutated."""
    seen = {}
    for entry in data:
        uid = entry['userID']
        if uid not in seen:
            seen[uid] = entry.copy()  # to not modify original
        else:
            seen[uid]['points'] += entry['points']
    return list(seen.values())
def method_mozway2(data):
    """Group by userID, inserting a zeroed template via setdefault."""
    seen = {}
    for entry in data:
        uid = entry['userID']
        target = seen.setdefault(uid, {'userID': uid, 'points': 0})
        target['points'] += entry['points']
    return list(seen.values())
def method_Bill1(data):
    """Aggregate points per userID via defaultdict, then rebuild records."""
    acc = defaultdict(int)
    for entry in data:
        acc[entry['userID']] += entry['points']
    return [{'userID': uid, 'points': total} for uid, total in acc.items()]
def method_Bill2(data):
    """Like method_Bill1, but expressed as a functools.reduce fold."""
    def step(acc, entry):
        # Each fold step adds one record's points to the running totals.
        acc[entry['userID']] += entry['points']
        return acc

    totals = reduce(step, data, defaultdict(int))
    return [{'userID': uid, 'points': total} for uid, total in totals.items()]
def method_Pandas(data_df):
    """Return the per-userID sums of the DataFrame's other columns."""
    by_user = data_df.groupby('userID')
    return by_user.sum()
# Validity checks — all pure-Python implementations must produce the same
# list; the Pandas frame must match after aligning index and sort order.
out1 = method_db(data)
assert out1 == method_mozway1(data)
assert out1 == method_mozway2(data)
assert out1 == method_Bill1(data)
assert out1 == method_Bill2(data)
expected_frame = pd.DataFrame(out1).set_index('userID').sort_index()
pd.testing.assert_frame_equal(expected_frame,
                              method_Pandas(data_df).sort_index())
Test results
In [2]: %timeit method_db(data)
2.7 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [3]: %timeit method_mozway1(data)
2.39 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [4]: %timeit method_mozway2(data)
3.38 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [5]: %timeit method_Bill1(data)
1.59 ms ± 42.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [6]: %timeit method_Bill2(data)
2.32 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [7]: %timeit method_Pandas(data_df)
2.05 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)