Intersection and difference of multiple sets
Question:
I have multiple lists of "associations" as input:
input1 = [
('id1', 'id2', 'id3', 'id4'),
('id5',),
('id6', 'id7', 'id8')
]
input2 = [
('id1', 'id2', 'id4'),
('id3',),
('id5',),
('id7', 'id6', 'id8')
]
input3 = [
('id1', 'id2'),
('id3', 'id4'),
('id5',),
('id8', 'id7', 'id6')
]
- Each input contains all the ids.
- Each id appears a single time in an input.
- The number of inputs can vary from 1 to 4.
I would like to process those inputs and generate something like this:
assocs = {
# never associated
0: [
('id5',),
],
# associated a single time
1: [
('id1', 'id3'),
('id2', 'id3'),
],
# associated twice
2: [ #
('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
],
# associated everytime
3: [
('id1', 'id2'),
('id6', 'id7', 'id8'),
]
}
I can somehow count the number of times that each association is encountered with:
# Count how often every non-empty combination of ids occurs across all the
# association tuples. Each tuple is sorted first so the resulting keys do not
# depend on the order in which the ids were listed.
temp = defaultdict(int)
for ids in map(sorted, itertools.chain(input1, input2, input3)):
    # i starts at 1, so single ids are counted as well as pairs, triples, ...
    for i in range(1, len(ids)+1):
        for comb in itertools.combinations(ids, i):
            temp[comb] += 1
But now I’m stuck on the next step, which would be cleaning up the temp dict; not to mention that I’m not sure I chose the right strategy in the first place.
Answers:
The logic is not fully clear to me, but it looks like you could generate the powersets of each tuple and count them, handling the loners specifically:
from itertools import chain, combinations
from collections import Counter
inputs = [input1, input2, input3]
def powerset(iterable, min=2):
    """Return an iterator over the combinations of *iterable* of size >= *min*.

    Modified powerset recipe: with the default ``min=2`` the empty set and the
    singletons are skipped, so only pairs and larger groups are produced.
    Combinations are emitted grouped by size, smallest first. If *min* is
    larger than the number of items, nothing is produced.
    """
    # NOTE: ``min`` shadows the builtin inside this function; the name is kept
    # so that keyword calls such as ``powerset(t, min=1)`` keep working.
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(min, len(s)+1))
# Count every sorted sub-association of every tuple in every input. A lone id
# (1-tuple) is counted as itself, since powerset() yields nothing below pairs.
cnt = Counter(tuple(sorted(s)) for lst in inputs for t in lst
              for s in (powerset(t) if len(t)>1 else (t,)))
# Group the associations by how many times they were seen; 1-tuples are
# always filed under key 0, whatever their count.
out = {}
for k, c in cnt.items():
    out.setdefault(c if len(k)>1 else 0, []).append(k)
Output:
{0: [('id5',),
('id3',)],
1: [('id1', 'id3'),
('id2', 'id3'),
('id1', 'id2', 'id3'),
('id1', 'id3', 'id4'),
('id2', 'id3', 'id4'),
('id1', 'id2', 'id3', 'id4')],
2: [('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
('id1', 'id2', 'id4')],
3: [('id1', 'id2'),
('id6', 'id7'),
('id6', 'id8'),
('id7', 'id8'),
('id6', 'id7', 'id8')]}
Different way of handling the loners:
# Count only real associations (pairs and larger); lone ids never enter cnt.
cnt = Counter(tuple(sorted(s)) for lst in inputs for t in lst
              for s in powerset(t))
# Key 0 holds the ids that appear in no counted association. input1 alone is
# enough to list all ids because each input contains all of them.
out = {0: [(x,) for x in set().union(*input1) - set().union(*cnt.keys())]}
# Group the remaining associations by how many times they were seen.
for k, c in cnt.items():
    out.setdefault(c if len(k)>1 else 0, []).append(k)
output:
{0: [('id5',)],
1: [('id1', 'id3'),
('id2', 'id3'),
('id1', 'id2', 'id3'),
('id1', 'id3', 'id4'),
('id2', 'id3', 'id4'),
('id1', 'id2', 'id3', 'id4')],
2: [('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
('id1', 'id2', 'id4')],
3: [('id1', 'id2'),
('id6', 'id7'),
('id6', 'id8'),
('id7', 'id8'),
('id6', 'id7', 'id8')]}
I have multiple lists of "associations" as input:
input1 = [
('id1', 'id2', 'id3', 'id4'),
('id5',),
('id6', 'id7', 'id8')
]
input2 = [
('id1', 'id2', 'id4'),
('id3',),
('id5',),
('id7', 'id6', 'id8')
]
input3 = [
('id1', 'id2'),
('id3', 'id4'),
('id5',),
('id8', 'id7', 'id6')
]
- Each input contains all the ids.
- Each id appears a single time in an input.
- The number of inputs can vary from 1 to 4.
I would like to process those inputs and generate something like this:
assocs = {
# never associated
0: [
('id5',),
],
# associated a single time
1: [
('id1', 'id3'),
('id2', 'id3'),
],
# associated twice
2: [ #
('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
],
# associated everytime
3: [
('id1', 'id2'),
('id6', 'id7', 'id8'),
]
}
I can somehow count the number of times that each association is encountered with:
# Count how often every non-empty combination of ids occurs across all the
# association tuples. Each tuple is sorted first so the resulting keys do not
# depend on the order in which the ids were listed.
temp = defaultdict(int)
for ids in map(sorted, itertools.chain(input1, input2, input3)):
    # i starts at 1, so single ids are counted as well as pairs, triples, ...
    for i in range(1, len(ids)+1):
        for comb in itertools.combinations(ids, i):
            temp[comb] += 1
But now I’m stuck on the next step, which would be cleaning up the temp dict; not to mention that I’m not sure I chose the right strategy in the first place.
The logic is not fully clear to me, but it looks like you could generate the powersets of each tuple and count them, handling the loners specifically:
from itertools import chain, combinations
from collections import Counter
inputs = [input1, input2, input3]
def powerset(iterable, min=2):
    """Return an iterator over the combinations of *iterable* of size >= *min*.

    Modified powerset recipe: with the default ``min=2`` the empty set and the
    singletons are skipped, so only pairs and larger groups are produced.
    Combinations are emitted grouped by size, smallest first. If *min* is
    larger than the number of items, nothing is produced.
    """
    # NOTE: ``min`` shadows the builtin inside this function; the name is kept
    # so that keyword calls such as ``powerset(t, min=1)`` keep working.
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(min, len(s)+1))
# Count every sorted sub-association of every tuple in every input. A lone id
# (1-tuple) is counted as itself, since powerset() yields nothing below pairs.
cnt = Counter(tuple(sorted(s)) for lst in inputs for t in lst
              for s in (powerset(t) if len(t)>1 else (t,)))
# Group the associations by how many times they were seen; 1-tuples are
# always filed under key 0, whatever their count.
out = {}
for k, c in cnt.items():
    out.setdefault(c if len(k)>1 else 0, []).append(k)
Output:
{0: [('id5',),
('id3',)],
1: [('id1', 'id3'),
('id2', 'id3'),
('id1', 'id2', 'id3'),
('id1', 'id3', 'id4'),
('id2', 'id3', 'id4'),
('id1', 'id2', 'id3', 'id4')],
2: [('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
('id1', 'id2', 'id4')],
3: [('id1', 'id2'),
('id6', 'id7'),
('id6', 'id8'),
('id7', 'id8'),
('id6', 'id7', 'id8')]}
Different way of handling the loners:
# Count only real associations (pairs and larger); lone ids never enter cnt.
cnt = Counter(tuple(sorted(s)) for lst in inputs for t in lst
              for s in powerset(t))
# Key 0 holds the ids that appear in no counted association. input1 alone is
# enough to list all ids because each input contains all of them.
out = {0: [(x,) for x in set().union(*input1) - set().union(*cnt.keys())]}
# Group the remaining associations by how many times they were seen.
for k, c in cnt.items():
    out.setdefault(c if len(k)>1 else 0, []).append(k)
output:
{0: [('id5',)],
1: [('id1', 'id3'),
('id2', 'id3'),
('id1', 'id2', 'id3'),
('id1', 'id3', 'id4'),
('id2', 'id3', 'id4'),
('id1', 'id2', 'id3', 'id4')],
2: [('id1', 'id4'),
('id2', 'id4'),
('id3', 'id4'),
('id1', 'id2', 'id4')],
3: [('id1', 'id2'),
('id6', 'id7'),
('id6', 'id8'),
('id7', 'id8'),
('id6', 'id7', 'id8')]}