Compare objects in a list to identify those with certain identical key/value pairs and those without
Question:
Using Python, how can I find objects in a list that share certain key/value pairs, then create two separate lists – one for objects that share those certain key/value pairs, and one for objects that don’t?
For example, take the following simple list –
[
{
"id": "111",
"host": "aaa",
"path": "/b/c/d"
},
{
"id": "222",
"host": "bbb",
"path": "/x/y/z"
},
{
"id": "333",
"host": "aaa",
"path": "/b/c/d"
},
{
"id": "444",
"host": "aaa",
"path": "/b/c/d"
}
]
I’d like to end up with two lists –
-
Objects with duplicate host
and path
.
[
{
"host": "aaa",
"path": "/b/c/d"
"ids": [
"111",
"333",
"444",
}
]
-
Objects without duplicate host
and path
.
[
{
"id": "222",
"host": "bbb",
"path": "/x/y/z"
}
]
My best attempt so far has yielded two lists, but all of the objects in the original list are added to dups_list
, regardless of whether or not they are actually duplicates.
Please note that I have tried taking a deepcopy
of main_list
to use in the second for
statement, but that yielded the exact same results.
>>> import jsonpickle
>>> main_list = list((dict(Id="111",host="aaa",path="/b/c/d"),dict(Id="222",host="bbb",path="/x/y/z"),dict(Id="333",host="aaa",path="/b/c/d"),dict(Id="444",host="aaa",path="/b/c/d")))
>>> dups_list = list()
>>> non_dups_list = list()
>>> for o in main_list:
... is_duplicate = False
... for o2 in main_list:
... if o2['host'] == o['host'] and o2['path'] == o['path']:
... is_duplicate = True
... break
... if is_duplicate:
... dups_list.append(o)
... else:
... non_dups_list.append(o)
...
>>> print(jsonpickle.encode(non_dups_list, indent=4))
[]
>>> print(jsonpickle.encode(dups_list, indent=4))
[
{
"Id": "111",
"host": "aaa",
"path": "/b/c/d"
},
{
"Id": "222",
"host": "bbb",
"path": "/x/y/z"
},
{
"Id": "333",
"host": "aaa",
"path": "/b/c/d"
},
{
"Id": "444",
"host": "aaa",
"path": "/b/c/d"
}
]
Answers:
I suggest using itertools.
from itertools import groupby
def get_key(d):
    """Composite grouping key: records sharing (host, path) count as duplicates."""
    return tuple(d[field] for field in ('host', 'path'))
# Placeholder: substitute the real list of dicts here.
data = [...your data here]
grouped_data = []
# NOTE: itertools.groupby only merges *adjacent* equal keys, so the input
# must first be sorted by the same key function used for grouping.
for k, g in groupby(sorted(data, key=get_key), key=get_key):
    # k is the (host, path) tuple; g yields the dicts that share it.
    grouped_data.append({'host': k[0], 'path': k[1], 'ids': [i['id'] for i in list(g)]})
I would use defaultdict
with dict/list comprehensions:
from collections import defaultdict
# Bucket ids by their (host, path) pair; defaultdict(list) removes the
# need for explicit key-existence checks.
g = defaultdict(list)
for obj in list_objs:
    g[(obj["host"], obj["path"])].append(obj["id"])

# Compute the duplicated keys once as a set: the original rebuilt the
# `[k for k, v in g.items() if len(v) > 1]` list inside the `uniqs`
# comprehension for every element (accidental O(n^2)); a precomputed set
# gives O(1) membership tests.
dup_keys = {k for k, v in g.items() if len(v) > 1}
dups = [{"host": k[0], "path": k[1], "ids": v} for k, v in g.items() if k in dup_keys]
uniqs = [obj for obj in list_objs if (obj["host"], obj["path"]) not in dup_keys]
#12.2 µs ± 316 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Output :
>>> print(dups)
#[{'host': 'aaa', 'path': '/b/c/d', 'ids': ['111', '333', '444']}]
>>> print(uniqs)
#[{'id': '222', 'host': 'bbb', 'path': '/x/y/z'}]
Using Python, how can I find objects in a list that share certain key/value pairs, then create two separate lists – one for objects that share those certain key/value pairs, and one for objects that don’t?
For example, take the following simple list –
[
{
"id": "111",
"host": "aaa",
"path": "/b/c/d"
},
{
"id": "222",
"host": "bbb",
"path": "/x/y/z"
},
{
"id": "333",
"host": "aaa",
"path": "/b/c/d"
},
{
"id": "444",
"host": "aaa",
"path": "/b/c/d"
}
]
I’d like to end up with two lists –
-
Objects with duplicate host and path.
[ { "host": "aaa", "path": "/b/c/d", "ids": ["111", "333", "444"] } ]
-
Objects without duplicate host and path.
[ { "id": "222", "host": "bbb", "path": "/x/y/z" } ]
My best attempt so far has yielded two lists, but all of the objects in the original list are added to dups_list
, regardless of whether or not they are actually duplicates.
Please note that I have tried taking a deepcopy
of main_list
to use in the second for
statement, but that yielded the exact same results.
>>> import jsonpickle
>>> main_list = list((dict(Id="111",host="aaa",path="/b/c/d"),dict(Id="222",host="bbb",path="/x/y/z"),dict(Id="333",host="aaa",path="/b/c/d"),dict(Id="444",host="aaa",path="/b/c/d")))
>>> dups_list = list()
>>> non_dups_list = list()
>>> for o in main_list:
... is_duplicate = False
... for o2 in main_list:
... if o2['host'] == o['host'] and o2['path'] == o['path']:
... is_duplicate = True
... break
... if is_duplicate:
... dups_list.append(o)
... else:
... non_dups_list.append(o)
...
>>> print(jsonpickle.encode(non_dups_list, indent=4))
[]
>>> print(jsonpickle.encode(dups_list, indent=4))
[
{
"Id": "111",
"host": "aaa",
"path": "/b/c/d"
},
{
"Id": "222",
"host": "bbb",
"path": "/x/y/z"
},
{
"Id": "333",
"host": "aaa",
"path": "/b/c/d"
},
{
"Id": "444",
"host": "aaa",
"path": "/b/c/d"
}
]
I suggest using itertools.
from itertools import groupby
def get_key(d):
    """Composite grouping key: records sharing (host, path) count as duplicates."""
    return tuple(d[field] for field in ('host', 'path'))
# Placeholder: substitute the real list of dicts here.
data = [...your data here]
grouped_data = []
# NOTE: itertools.groupby only merges *adjacent* equal keys, so the input
# must first be sorted by the same key function used for grouping.
for k, g in groupby(sorted(data, key=get_key), key=get_key):
    # k is the (host, path) tuple; g yields the dicts that share it.
    grouped_data.append({'host': k[0], 'path': k[1], 'ids': [i['id'] for i in list(g)]})
I would use defaultdict
with dict/list comprehensions:
from collections import defaultdict
# Bucket ids by their (host, path) pair; defaultdict(list) removes the
# need for explicit key-existence checks.
g = defaultdict(list)
for obj in list_objs:
    g[(obj["host"], obj["path"])].append(obj["id"])

# Compute the duplicated keys once as a set: the original rebuilt the
# `[k for k, v in g.items() if len(v) > 1]` list inside the `uniqs`
# comprehension for every element (accidental O(n^2)); a precomputed set
# gives O(1) membership tests.
dup_keys = {k for k, v in g.items() if len(v) > 1}
dups = [{"host": k[0], "path": k[1], "ids": v} for k, v in g.items() if k in dup_keys]
uniqs = [obj for obj in list_objs if (obj["host"], obj["path"]) not in dup_keys]
#12.2 µs ± 316 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Output :
>>> print(dups)
#[{'host': 'aaa', 'path': '/b/c/d', 'ids': ['111', '333', '444']}]
>>> print(uniqs)
#[{'id': '222', 'host': 'bbb', 'path': '/x/y/z'}]