Generate a set of non repeating pairs of users in multiple groups
Question:
I have a dataset that looks like this.
To give some context, there can be multiple user groups (each possibly with an odd number of people in it). Each of the groups can contain multiple users. So, within each and every group, I needed to select pairs of users in such a fashion that,
A person must not be repeated in any of the pairs, until the entire user list is exhausted. The partial solution below starts pairing users that do not belong to the same group as well. Not sure how to tackle this grouping constraint.
group_id
user_id
1
a1
1
b1
1
c1
1
d1
2
x1
import pandas as pd
import numpy as np

# Sample data: group 1 has four members, group 2 has three.
records = [
    (1, 'a1'), (1, 'b1'), (1, 'c1'), (1, 'd1'),
    (2, 'x1'), (2, 'y1'), (2, 'z1'),
]
df = pd.DataFrame(records, columns=['group_id', 'user_id'])
df.head()
I have a partial solution after going through numerous questions and answers.
This solution starts pairing users that do not belong to the same group as well.
Which is not what I want:
from itertools import combinations
# Even number of users required.
# NOTE: this works on the *whole* user column at once, so it happily pairs
# users across group boundaries -- exactly the limitation described above.
users = df['user_id'].to_list()
half = len(users) // 2
stages = []
for round_no in range(len(users) - 1):
    if round_no:
        rotated = users[:1] + users[-round_no:] + users[1:-round_no]
    else:
        rotated = users
    stages.append(list(zip(rotated[:half], reversed(rotated[half:]))))
print(stages)
Not sure how to store the pairs back into a pandas data frame.
Expected Output (which was updated later on):
For group 1 and group 2 — note that there can be n number of groups:
group_id
combinations
1
a1-d1
1
b1-c1
1
a1-c1
1
d1-b1
1
a1-b1
2
x1-x2
2
x2-x3
2
x1-x3
Error while running @mozway’s code:
This error happens for all inputs:
AssertionError Traceback (most recent call last)
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _finalize_columns_and_data(content, columns, dtype)
981 try:
--> 982 columns = _validate_or_indexify_columns(contents, columns)
983 except AssertionError as err:
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _validate_or_indexify_columns(content, columns)
1029 # caller's responsibility to check for this...
-> 1030 raise AssertionError(
1031 f"{len(columns)} columns passed, passed data had "
AssertionError: 1 columns passed, passed data had 6 columns
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~AppDataLocalTempipykernel_14172369883545.py in <module>
24 return stages
25
---> 26 out = (df.groupby(['group_id'], as_index=False)['user_id'].apply(combine).explode('user_id'))
27 print(out.head())
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygroupby.py in apply(self, func, *args, **kwargs)
1421 with option_context("mode.chained_assignment", None):
1422 try:
-> 1423 result = self._python_apply_general(f, self._selected_obj)
1424 except TypeError:
1425 # gh-20949
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygroupby.py in _python_apply_general(self, f, data, not_indexed_same)
1467 not_indexed_same = mutated or self.mutated
1468
-> 1469 return self._wrap_applied_output(
1470 data, values, not_indexed_same=not_indexed_same
1471 )
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygeneric.py in _wrap_applied_output(self, data, values, not_indexed_same)
1025 return self.obj._constructor_sliced(values, index=key_index)
1026 else:
-> 1027 result = self.obj._constructor(values, columns=[self._selection])
1028 self._insert_inaxis_grouper_inplace(result)
1029 return result
C:ProgramDataAnaconda3libsite-packagespandascoreframe.py in __init__(self, data, index, columns, dtype, copy)
719 # ndarray], Index, Series], Sequence[Any]]"
720 columns = ensure_index(columns) # type: ignore[arg-type]
--> 721 arrays, columns, index = nested_data_to_arrays(
722 # error: Argument 3 to "nested_data_to_arrays" has incompatible
723 # type "Optional[Collection[Any]]"; expected "Optional[Index]"
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in nested_data_to_arrays(data, columns, index, dtype)
517 columns = ensure_index(data[0]._fields)
518
--> 519 arrays, columns = to_arrays(data, columns, dtype=dtype)
520 columns = ensure_index(columns)
521
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in to_arrays(data, columns, dtype)
881 arr = _list_to_arrays(data)
882
--> 883 content, columns = _finalize_columns_and_data(arr, columns, dtype)
884 return content, columns
885
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _finalize_columns_and_data(content, columns, dtype)
983 except AssertionError as err:
984 # GH#26429 do not raise user-facing AssertionError
--> 985 raise ValueError(err) from err
986
987 if len(contents) and contents[0].dtype == np.object_:
ValueError: 1 columns passed, passed data had 6 columns
Answers:
updated answer
Using your code here, but applying it per group:
def combine(s):
    """Return round-robin pairings 'a-b' for the users in Series *s*.

    Uses the circle method: the first user stays fixed while the rest
    rotate, so no user appears twice in the same round and every pair is
    produced exactly once.

    Parameters:
        s: pandas Series of user ids belonging to one group.

    Returns:
        list[str] of 'user-user' pair strings.

    The original version silently dropped pairs in odd-sized groups (the
    user left out of each rotation round never got all of their pairings).
    A dummy "bye" entry is added here so all C(k, 2) pairs are emitted;
    pairs involving the bye are skipped. Even-sized groups produce exactly
    the same output, in the same order, as before.
    """
    users = s.tolist()
    bye = object()  # sentinel marking the player who sits out a round
    if len(users) % 2:
        users = users + [bye]
    n = len(users) // 2
    stages = []
    for i in range(len(users) - 1):
        # Rotate everyone except the first element by i positions.
        t = users[:1] + users[-i:] + users[1:-i] if i else users
        stages.extend(f'{a}-{b}'
                      for a, b in zip(t[:n], reversed(t[n:]))
                      if a is not bye and b is not bye)
    return stages
# Run the rotation per group, then flatten each group's list of
# pair strings into one row per pair.
grouped = df.groupby('group_id', as_index=False)['user_id']
out = grouped.apply(combine).explode('user_id')
Output:
group_id user_id
0 1 a1-d1
0 1 b1-c1
0 1 a1-c1
0 1 d1-b1
0 1 a1-b1
0 1 c1-d1
1 2 x1-z1
1 2 x1-y1
original answer before question clarification (incorrect)
You can use:
from itertools import combinations

# All unordered user pairs, computed independently within each group.
out = []
for _, members in df.groupby('group_id')['user_id']:
    out.extend(combinations(members, 2))
Output:
[('a1', 'b1'),
('a1', 'c1'),
('a1', 'd1'),
('b1', 'c1'),
('b1', 'd1'),
('c1', 'd1'),
('x1', 'y1'),
('x1', 'z1'),
('y1', 'z1')]
I have a dataset that looks like this.
To give some context, there can be multiple user groups (each possibly with an odd number of people in it). Each of the groups can contain multiple users. So, within each and every group, I needed to select pairs of users in such a fashion that,
A person must not be repeated in any of the pairs, until the entire user list is exhausted. The partial solution below starts pairing users that do not belong to the same group as well. Not sure how to tackle this grouping constraint.
group_id | user_id |
---|---|
1 | a1 |
1 | b1 |
1 | c1 |
1 | d1 |
2 | x1 |
import pandas as pd
import numpy as np

# Sample data: group 1 has four members, group 2 has three.
records = [
    (1, 'a1'), (1, 'b1'), (1, 'c1'), (1, 'd1'),
    (2, 'x1'), (2, 'y1'), (2, 'z1'),
]
df = pd.DataFrame(records, columns=['group_id', 'user_id'])
df.head()
I have a partial solution after going through numerous questions and answers.
This solution starts pairing users that do not belong to the same group as well.
Which is not what I want:
from itertools import combinations
# Even number of users required.
# NOTE: this works on the *whole* user column at once, so it happily pairs
# users across group boundaries -- exactly the limitation described above.
users = df['user_id'].to_list()
half = len(users) // 2
stages = []
for round_no in range(len(users) - 1):
    if round_no:
        rotated = users[:1] + users[-round_no:] + users[1:-round_no]
    else:
        rotated = users
    stages.append(list(zip(rotated[:half], reversed(rotated[half:]))))
print(stages)
Not sure how to store the pairs back into a pandas data frame.
Expected Output (which was updated later on):
For group 1 and group 2 — note that there can be n number of groups:
group_id | combinations |
---|---|
1 | a1-d1 |
1 | b1-c1 |
1 | a1-c1 |
1 | d1-b1 |
1 | a1-b1 |
2 | x1-x2 |
2 | x2-x3 |
2 | x1-x3 |
Error while running @mozway’s code:
This error happens for all inputs:
AssertionError Traceback (most recent call last)
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _finalize_columns_and_data(content, columns, dtype)
981 try:
--> 982 columns = _validate_or_indexify_columns(contents, columns)
983 except AssertionError as err:
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _validate_or_indexify_columns(content, columns)
1029 # caller's responsibility to check for this...
-> 1030 raise AssertionError(
1031 f"{len(columns)} columns passed, passed data had "
AssertionError: 1 columns passed, passed data had 6 columns
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~AppDataLocalTempipykernel_14172369883545.py in <module>
24 return stages
25
---> 26 out = (df.groupby(['group_id'], as_index=False)['user_id'].apply(combine).explode('user_id'))
27 print(out.head())
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygroupby.py in apply(self, func, *args, **kwargs)
1421 with option_context("mode.chained_assignment", None):
1422 try:
-> 1423 result = self._python_apply_general(f, self._selected_obj)
1424 except TypeError:
1425 # gh-20949
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygroupby.py in _python_apply_general(self, f, data, not_indexed_same)
1467 not_indexed_same = mutated or self.mutated
1468
-> 1469 return self._wrap_applied_output(
1470 data, values, not_indexed_same=not_indexed_same
1471 )
C:ProgramDataAnaconda3libsite-packagespandascoregroupbygeneric.py in _wrap_applied_output(self, data, values, not_indexed_same)
1025 return self.obj._constructor_sliced(values, index=key_index)
1026 else:
-> 1027 result = self.obj._constructor(values, columns=[self._selection])
1028 self._insert_inaxis_grouper_inplace(result)
1029 return result
C:ProgramDataAnaconda3libsite-packagespandascoreframe.py in __init__(self, data, index, columns, dtype, copy)
719 # ndarray], Index, Series], Sequence[Any]]"
720 columns = ensure_index(columns) # type: ignore[arg-type]
--> 721 arrays, columns, index = nested_data_to_arrays(
722 # error: Argument 3 to "nested_data_to_arrays" has incompatible
723 # type "Optional[Collection[Any]]"; expected "Optional[Index]"
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in nested_data_to_arrays(data, columns, index, dtype)
517 columns = ensure_index(data[0]._fields)
518
--> 519 arrays, columns = to_arrays(data, columns, dtype=dtype)
520 columns = ensure_index(columns)
521
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in to_arrays(data, columns, dtype)
881 arr = _list_to_arrays(data)
882
--> 883 content, columns = _finalize_columns_and_data(arr, columns, dtype)
884 return content, columns
885
C:ProgramDataAnaconda3libsite-packagespandascoreinternalsconstruction.py in _finalize_columns_and_data(content, columns, dtype)
983 except AssertionError as err:
984 # GH#26429 do not raise user-facing AssertionError
--> 985 raise ValueError(err) from err
986
987 if len(contents) and contents[0].dtype == np.object_:
ValueError: 1 columns passed, passed data had 6 columns
updated answer
Using your code here, but applying it per group:
def combine(s):
    """Return round-robin pairings 'a-b' for the users in Series *s*.

    Uses the circle method: the first user stays fixed while the rest
    rotate, so no user appears twice in the same round and every pair is
    produced exactly once.

    Parameters:
        s: pandas Series of user ids belonging to one group.

    Returns:
        list[str] of 'user-user' pair strings.

    The original version silently dropped pairs in odd-sized groups (the
    user left out of each rotation round never got all of their pairings).
    A dummy "bye" entry is added here so all C(k, 2) pairs are emitted;
    pairs involving the bye are skipped. Even-sized groups produce exactly
    the same output, in the same order, as before.
    """
    users = s.tolist()
    bye = object()  # sentinel marking the player who sits out a round
    if len(users) % 2:
        users = users + [bye]
    n = len(users) // 2
    stages = []
    for i in range(len(users) - 1):
        # Rotate everyone except the first element by i positions.
        t = users[:1] + users[-i:] + users[1:-i] if i else users
        stages.extend(f'{a}-{b}'
                      for a, b in zip(t[:n], reversed(t[n:]))
                      if a is not bye and b is not bye)
    return stages
# Run the rotation per group, then flatten each group's list of
# pair strings into one row per pair.
grouped = df.groupby('group_id', as_index=False)['user_id']
out = grouped.apply(combine).explode('user_id')
Output:
group_id user_id
0 1 a1-d1
0 1 b1-c1
0 1 a1-c1
0 1 d1-b1
0 1 a1-b1
0 1 c1-d1
1 2 x1-z1
1 2 x1-y1
original answer before question clarification (incorrect)
You can use:
from itertools import combinations

# All unordered user pairs, computed independently within each group.
out = []
for _, members in df.groupby('group_id')['user_id']:
    out.extend(combinations(members, 2))
Output:
[('a1', 'b1'),
('a1', 'c1'),
('a1', 'd1'),
('b1', 'c1'),
('b1', 'd1'),
('c1', 'd1'),
('x1', 'y1'),
('x1', 'z1'),
('y1', 'z1')]