Match column names to column values in another dataframe and get corresponding column value

Question:

For every column name in df that appears in map["query"], I want to replace that column name with the corresponding value from map["symbol"].

import pandas as pd

# Keep only the columns of df whose name appears in map["query"]
# (transpose, filter the rows with a boolean mask, transpose back).
df = df.T.loc[df.columns.isin(map["query"])].T
# Assign the whole map["symbol"] column as the new column labels, by position.
# This is the line that raises the ValueError in the traceback below: the
# number of labels must equal the number of remaining columns.
df.columns = map["symbol"]
# Intended to drop the columns whose new label is missing (NaN).
df= df[df.columns.dropna()]

Traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [60], in <cell line: 1>()
----> 1 df.columns = map["symbol"]
      2 df = df[df.columns.dropna()]
      3 df.head()

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)
   5586 try:
   5587     object.__getattribute__(self, name)
-> 5588     return object.__setattr__(self, name, value)
   5589 except AttributeError:
   5590     pass

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)
    767 def _set_axis(self, axis: int, labels: Index) -> None:
    768     labels = ensure_index(labels)
--> 769     self._mgr.set_axis(axis, labels)
    770     self._clear_item_cache()

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)
    212 def set_axis(self, axis: int, new_labels: Index) -> None:
    213     # Caller is responsible for ensuring we have an Index object.
--> 214     self._validate_set_axis(axis, new_labels)
    215     self.axes[axis] = new_labels

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)
     66     pass
     68 elif new_len != old_len:
---> 69     raise ValueError(
     70         f"Length mismatch: Expected axis has {old_len} elements, new "
     71         f"values have {new_len} elements"
     72     )

ValueError: Length mismatch: Expected axis has 44598 elements, new values have 44603 elements

Data:

df

pd.DataFrame({'ENSG00000279928': {2: 0, 3: 0},
 'ENSG00000228037': {2: 0, 3: 0},
 'ENSG00000142611': {2: 0, 3: 13},
 'ENSG00000284616': {2: 0, 3: 0},
 'ENSG00000157911': {2: 0, 3: 8},
 'ENSG00000269896': {2: 0, 3: 0},
 'ENSG00000228463': {2: 0, 3: 0},
 'ENSG00000260972': {2: 0, 3: 0},
 'ENSG00000224340': {2: 0, 3: 0},
 'ENSG00000226374': {2: 0, 3: 0},
 'ENSG00000229280': {2: 0, 3: 0},
 'ENSG00000142655': {2: 0, 3: 2},
 'ENSG00000232596': {2: 0, 3: 0},
 'ENSG00000235054': {2: 0, 3: 0},
 'ENSG00000231510': {2: 0, 3: 0},
 'ENSG00000149527': {2: 0, 3: 0},
 'ENSG00000284739': {2: 0, 3: 0},
 'ENSG00000171621': {2: 0, 3: 0},
 'ENSG00000272235': {2: 0, 3: 0}})

map

pd.DataFrame({'query': {0: 'ENSG00000279928',
  1: 'ENSG00000228037',
  2: 'ENSG00000142611',
  4: 'ENSG00000157911',
  5: 'ENSG00000269896',
  6: 'ENSG00000228463',
  8: 'ENSG00000224340',
  9: 'ENSG00000226374',
  10: 'ENSG00000229280',
  11: 'ENSG00000142655',
  12: 'ENSG00000232596',
  13: 'ENSG00000235054',
  14: 'ENSG00000231510',
  15: 'ENSG00000149527',
  17: 'ENSG00000171621'},
 '_id': {0: 'ENSG00000279928',
  1: '100996583',
  2: '63976',
  4: '5192',
  5: '100129534',
  6: '728481',
  8: '100270877',
  9: '105376672',
  10: '644357',
  11: '5195',
  12: '105376679',
  13: '284661',
  14: 'ENSG00000231510',
  15: '9651',
  17: '80176'},
 '_score': {0: 8.327029,
  1: 25.81547,
  2: 24.07959,
  4: 24.19017,
  5: 8.320914,
  6: 8.06594,
  8: 8.327571,
  9: 25.815289,
  10: 8.327029,
  11: 24.080423,
  12: 25.932892,
  13: 25.794834,
  14: 25.811064,
  15: 24.476448,
  17: 25.008629},
 'symbol': {0: 'DDX11L17',
  1: 'LOC100996583',
  2: 'PRDM16',
  4: 'PEX10',
  5: 'LOC100129534',
  6: 'RPL23AP21',
  8: 'RPL21P21',
  9: 'LINC01345',
  10: 'EEF1DP6',
  11: 'PEX14',
  12: 'LINC01646',
  13: 'LINC01777',
  14: 'LINC02782',
  15: 'PLCH2',
  17: 'SPSB1'}})
Asked By: melolili

||

Answers:

If I understand correctly, you want to replace the current column names with the matching symbol and drop the columns that have no match:

# dmap is the map dataframe (map is a builtin name, don't use it as a variable name).
# Build a query -> symbol lookup. Index.map needs a uniquely indexed Series,
# so keep only the first row for each query in case dmap repeats one.
symbols = dmap.drop_duplicates('query').set_index('query')['symbol']

# Column names without a match map to NaN; rename first, then drop those columns.
out = (df.set_axis(df.columns.map(symbols), axis=1)
         .loc[:, lambda x: x.columns.notna()])
print(out)

# Output
   DDX11L17  LOC100996583  PRDM16  PEX10  LOC100129534  RPL23AP21  RPL21P21  LINC01345  EEF1DP6  PEX14  LINC01646  LINC01777  LINC02782  PLCH2  SPSB1
2         0             0       0      0             0          0         0          0        0      0          0          0          0      0      0
3         0             0      13      8             0          0         0          0        0      2          0          0          0      0      0

Another solution:

# Left-join the column names of df onto the lookup table; a left join keeps
# the column names in their original order.
cols = pd.merge(
    df.columns.to_series(name='query'),
    dmap[['query', 'symbol']],
    how='left',
)
# If a query occurs more than once in dmap, keep only its first symbol.
cols = cols.drop_duplicates('query', ignore_index=True)['symbol']

# Rename the columns, then keep only those that received a symbol.
df = df.set_axis(cols, axis=1)
df = df.loc[:, df.columns.notna()]
Answered By: Corralien
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked
at the top-right corner.