Match column names to column values in another dataframe and get corresponding column value
Question:
If the names in df.columns match values in map["query"], I want to replace those column names with the corresponding map["symbol"] values.
import pandas as pd
df = df.T.loc[df.columns.isin(map["query"])].T
df.columns = map["symbol"]
df= df[df.columns.dropna()]
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [60], in <cell line: 1>()
----> 1 df.columns = map["symbol"]
2 df = df[df.columns.dropna()]
3 df.head()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)
5586 try:
5587 object.__getattribute__(self, name)
-> 5588 return object.__setattr__(self, name, value)
5589 except AttributeError:
5590 pass
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)
767 def _set_axis(self, axis: int, labels: Index) -> None:
768 labels = ensure_index(labels)
--> 769 self._mgr.set_axis(axis, labels)
770 self._clear_item_cache()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)
212 def set_axis(self, axis: int, new_labels: Index) -> None:
213 # Caller is responsible for ensuring we have an Index object.
--> 214 self._validate_set_axis(axis, new_labels)
215 self.axes[axis] = new_labels
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)
66 pass
68 elif new_len != old_len:
---> 69 raise ValueError(
70 f"Length mismatch: Expected axis has {old_len} elements, new "
71 f"values have {new_len} elements"
72 )
ValueError: Length mismatch: Expected axis has 44598 elements, new values have 44603 elements
Data:
df
pd.DataFrame({'ENSG00000279928': {2: 0, 3: 0},
'ENSG00000228037': {2: 0, 3: 0},
'ENSG00000142611': {2: 0, 3: 13},
'ENSG00000284616': {2: 0, 3: 0},
'ENSG00000157911': {2: 0, 3: 8},
'ENSG00000269896': {2: 0, 3: 0},
'ENSG00000228463': {2: 0, 3: 0},
'ENSG00000260972': {2: 0, 3: 0},
'ENSG00000224340': {2: 0, 3: 0},
'ENSG00000226374': {2: 0, 3: 0},
'ENSG00000229280': {2: 0, 3: 0},
'ENSG00000142655': {2: 0, 3: 2},
'ENSG00000232596': {2: 0, 3: 0},
'ENSG00000235054': {2: 0, 3: 0},
'ENSG00000231510': {2: 0, 3: 0},
'ENSG00000149527': {2: 0, 3: 0},
'ENSG00000284739': {2: 0, 3: 0},
'ENSG00000171621': {2: 0, 3: 0},
'ENSG00000272235': {2: 0, 3: 0}})
map
pd.DataFrame({'query': {0: 'ENSG00000279928',
1: 'ENSG00000228037',
2: 'ENSG00000142611',
4: 'ENSG00000157911',
5: 'ENSG00000269896',
6: 'ENSG00000228463',
8: 'ENSG00000224340',
9: 'ENSG00000226374',
10: 'ENSG00000229280',
11: 'ENSG00000142655',
12: 'ENSG00000232596',
13: 'ENSG00000235054',
14: 'ENSG00000231510',
15: 'ENSG00000149527',
17: 'ENSG00000171621'},
'_id': {0: 'ENSG00000279928',
1: '100996583',
2: '63976',
4: '5192',
5: '100129534',
6: '728481',
8: '100270877',
9: '105376672',
10: '644357',
11: '5195',
12: '105376679',
13: '284661',
14: 'ENSG00000231510',
15: '9651',
17: '80176'},
'_score': {0: 8.327029,
1: 25.81547,
2: 24.07959,
4: 24.19017,
5: 8.320914,
6: 8.06594,
8: 8.327571,
9: 25.815289,
10: 8.327029,
11: 24.080423,
12: 25.932892,
13: 25.794834,
14: 25.811064,
15: 24.476448,
17: 25.008629},
'symbol': {0: 'DDX11L17',
1: 'LOC100996583',
2: 'PRDM16',
4: 'PEX10',
5: 'LOC100129534',
6: 'RPL23AP21',
8: 'RPL21P21',
9: 'LINC01345',
10: 'EEF1DP6',
11: 'PEX14',
12: 'LINC01646',
13: 'LINC01777',
14: 'LINC02782',
15: 'PLCH2',
17: 'SPSB1'}})
Answers:
If I understand correctly, you want to replace the current column names with the matching symbol and drop the columns that have no match:
# dmap is the mapping dataframe (map is a builtin name, so don't use it as a variable name)
# Build a query -> symbol lookup. Repeated queries are dropped first (first
# occurrence wins) because Index.map cannot look values up in a Series whose
# index is not unique.
lookup = dmap.drop_duplicates('query').set_index('query')['symbol']
# Columns without a match map to NaN; keep only the columns that got a symbol.
out = (df.set_axis(df.columns.map(lookup), axis=1)
         .loc[:, lambda x: x.columns.notna()])
print(out)
# Output
DDX11L17 LOC100996583 PRDM16 PEX10 LOC100129534 RPL23AP21 RPL21P21 LINC01345 EEF1DP6 PEX14 LINC01646 LINC01777 LINC02782 PLCH2 SPSB1
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 13 8 0 0 0 0 0 2 0 0 0 0 0
Another solution:
# Left-join the column names against the query -> symbol table so every
# column keeps its position; columns without a match get NaN as symbol.
queries = df.columns.to_series(name='query')
matched = pd.merge(queries, dmap[['query', 'symbol']], how='left')
# A query listed several times in dmap yields several rows; keep the first.
cols = matched.drop_duplicates('query', ignore_index=True)['symbol']
# Rename the columns, then drop the ones whose new name is missing.
renamed = df.set_axis(cols, axis=1)
df = renamed.loc[:, lambda x: x.columns.notna()]
If the names in df.columns match values in map["query"], I want to replace those column names with the corresponding map["symbol"] values.
import pandas as pd
df = df.T.loc[df.columns.isin(map["query"])].T
df.columns = map["symbol"]
df= df[df.columns.dropna()]
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [60], in <cell line: 1>()
----> 1 df.columns = map["symbol"]
2 df = df[df.columns.dropna()]
3 df.head()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)
5586 try:
5587 object.__getattribute__(self, name)
-> 5588 return object.__setattr__(self, name, value)
5589 except AttributeError:
5590 pass
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)
767 def _set_axis(self, axis: int, labels: Index) -> None:
768 labels = ensure_index(labels)
--> 769 self._mgr.set_axis(axis, labels)
770 self._clear_item_cache()
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)
212 def set_axis(self, axis: int, new_labels: Index) -> None:
213 # Caller is responsible for ensuring we have an Index object.
--> 214 self._validate_set_axis(axis, new_labels)
215 self.axes[axis] = new_labels
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)
66 pass
68 elif new_len != old_len:
---> 69 raise ValueError(
70 f"Length mismatch: Expected axis has {old_len} elements, new "
71 f"values have {new_len} elements"
72 )
ValueError: Length mismatch: Expected axis has 44598 elements, new values have 44603 elements
Data:
df
pd.DataFrame({'ENSG00000279928': {2: 0, 3: 0},
'ENSG00000228037': {2: 0, 3: 0},
'ENSG00000142611': {2: 0, 3: 13},
'ENSG00000284616': {2: 0, 3: 0},
'ENSG00000157911': {2: 0, 3: 8},
'ENSG00000269896': {2: 0, 3: 0},
'ENSG00000228463': {2: 0, 3: 0},
'ENSG00000260972': {2: 0, 3: 0},
'ENSG00000224340': {2: 0, 3: 0},
'ENSG00000226374': {2: 0, 3: 0},
'ENSG00000229280': {2: 0, 3: 0},
'ENSG00000142655': {2: 0, 3: 2},
'ENSG00000232596': {2: 0, 3: 0},
'ENSG00000235054': {2: 0, 3: 0},
'ENSG00000231510': {2: 0, 3: 0},
'ENSG00000149527': {2: 0, 3: 0},
'ENSG00000284739': {2: 0, 3: 0},
'ENSG00000171621': {2: 0, 3: 0},
'ENSG00000272235': {2: 0, 3: 0}})
map
pd.DataFrame({'query': {0: 'ENSG00000279928',
1: 'ENSG00000228037',
2: 'ENSG00000142611',
4: 'ENSG00000157911',
5: 'ENSG00000269896',
6: 'ENSG00000228463',
8: 'ENSG00000224340',
9: 'ENSG00000226374',
10: 'ENSG00000229280',
11: 'ENSG00000142655',
12: 'ENSG00000232596',
13: 'ENSG00000235054',
14: 'ENSG00000231510',
15: 'ENSG00000149527',
17: 'ENSG00000171621'},
'_id': {0: 'ENSG00000279928',
1: '100996583',
2: '63976',
4: '5192',
5: '100129534',
6: '728481',
8: '100270877',
9: '105376672',
10: '644357',
11: '5195',
12: '105376679',
13: '284661',
14: 'ENSG00000231510',
15: '9651',
17: '80176'},
'_score': {0: 8.327029,
1: 25.81547,
2: 24.07959,
4: 24.19017,
5: 8.320914,
6: 8.06594,
8: 8.327571,
9: 25.815289,
10: 8.327029,
11: 24.080423,
12: 25.932892,
13: 25.794834,
14: 25.811064,
15: 24.476448,
17: 25.008629},
'symbol': {0: 'DDX11L17',
1: 'LOC100996583',
2: 'PRDM16',
4: 'PEX10',
5: 'LOC100129534',
6: 'RPL23AP21',
8: 'RPL21P21',
9: 'LINC01345',
10: 'EEF1DP6',
11: 'PEX14',
12: 'LINC01646',
13: 'LINC01777',
14: 'LINC02782',
15: 'PLCH2',
17: 'SPSB1'}})
If I understand correctly, you want to replace the current column names with the matching symbol and drop the columns that have no match:
# dmap is the mapping dataframe (map is a builtin name, so don't use it as a variable name)
# Build a query -> symbol lookup. Repeated queries are dropped first (first
# occurrence wins) because Index.map cannot look values up in a Series whose
# index is not unique.
lookup = dmap.drop_duplicates('query').set_index('query')['symbol']
# Columns without a match map to NaN; keep only the columns that got a symbol.
out = (df.set_axis(df.columns.map(lookup), axis=1)
         .loc[:, lambda x: x.columns.notna()])
print(out)
# Output
DDX11L17 LOC100996583 PRDM16 PEX10 LOC100129534 RPL23AP21 RPL21P21 LINC01345 EEF1DP6 PEX14 LINC01646 LINC01777 LINC02782 PLCH2 SPSB1
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 13 8 0 0 0 0 0 2 0 0 0 0 0
Another solution:
# Left-join the column names against the query -> symbol table so every
# column keeps its position; columns without a match get NaN as symbol.
queries = df.columns.to_series(name='query')
matched = pd.merge(queries, dmap[['query', 'symbol']], how='left')
# A query listed several times in dmap yields several rows; keep the first.
cols = matched.drop_duplicates('query', ignore_index=True)['symbol']
# Rename the columns, then drop the ones whose new name is missing.
renamed = df.set_axis(cols, axis=1)
df = renamed.loc[:, lambda x: x.columns.notna()]