How to match the index of a dataframe with a column of another dataframe and replace it with the values of an adjacent column?

Question

Wherever a value in the "Composite_Element_REF" column of the sym dataframe matches an index label of df_normal_symbol, I want to replace that index label with the value from the adjacent column of sym, i.e., the Gene_Symbol column. This is what I tried:

df_normal_symbol.index = df_normal.loc[sym["Composite_Element_REF"], df_normal.index].values

Traceback:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [38], in <cell line: 1>()
----> 1 df_normal_symbol.index = df_normal.loc[sym["Composite_Element_REF"], df_normal.index].values

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexing.py:961, in _LocationIndexer.__getitem__(self, key)
    959     if self._is_scalar_access(key):
    960         return self.obj._get_value(*key, takeable=self._takeable)
--> 961     return self._getitem_tuple(key)
    962 else:
    963     # we by definition only have the 0th axis
    964     axis = self.axis or 0

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexing.py:1147, in _LocIndexer._getitem_tuple(self, tup)
   1145 # ugly hack for GH #836
   1146 if self._multi_take_opportunity(tup):
-> 1147     return self._multi_take(tup)
   1149 return self._getitem_tuple_same_dim(tup)

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexing.py:1098, in _LocIndexer._multi_take(self, tup)
   1082 """
   1083 Create the indexers for the passed tuple of keys, and
   1084 executes the take operation. This allows the take operation to be
   (...)
   1095 values: same type as the object being indexed
   1096 """
   1097 # GH 836
-> 1098 d = {
   1099     axis: self._get_listlike_indexer(key, axis)
   1100     for (key, axis) in zip(tup, self.obj._AXIS_ORDERS)
   1101 }
   1102 return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True)

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexing.py:1099, in <dictcomp>(.0)
   1082 """
   1083 Create the indexers for the passed tuple of keys, and
   1084 executes the take operation. This allows the take operation to be
   (...)
   1095 values: same type as the object being indexed
   1096 """
   1097 # GH 836
   1098 d = {
-> 1099     axis: self._get_listlike_indexer(key, axis)
   1100     for (key, axis) in zip(tup, self.obj._AXIS_ORDERS)
   1101 }
   1102 return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True)

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexing.py:1327, in _LocIndexer._get_listlike_indexer(self, key, axis)
   1324 ax = self.obj._get_axis(axis)
   1325 axis_name = self.obj._get_axis_name(axis)
-> 1327 keyarr, indexer = ax._get_indexer_strict(key, axis_name)
   1329 return keyarr, indexer

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexes/base.py:5782, in Index._get_indexer_strict(self, key, axis_name)
   5779 else:
   5780     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5782 self._raise_if_missing(keyarr, indexer, axis_name)
   5784 keyarr = self.take(indexer)
   5785 if isinstance(key, Index):
   5786     # GH 42790 - Preserve name from an Index

File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/pandas/core/indexes/base.py:5845, in Index._raise_if_missing(self, key, indexer, axis_name)
   5842     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   5844 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5845 raise KeyError(f"{not_found} not in index")

Expected output:

pd.DataFrame({'TCGA-CZ-5457-11A': {nan: 0.102035759907132,
  'VDAC3': 0.893345348116849,
  'ACTN1': 0.847131904106541,
  'ATP2A1': 0.580488869725658,
  'SFRP1': 0.470767306311169,
  nan: 0.147416341092933,
  'NIPA2': 0.0120942766037886},
 'TCGA-BQ-5888-11A': {nan: 0.147149659097321,
  'VDAC3': 0.910195291355705,
  'ACTN1': 0.816669300689161,
  'ATP2A1': 0.514358122653833,
  'SFRP1': 0.441313292788889,
  nan: 0.245573257728479,
  'NIPA2': 0.0147939578910346},
 'TCGA-B0-4846-11A': {nan: 0.113480434528015,
  'VDAC3': 0.886088576813537,
  'ACTN1': 0.664793188247786,
  'ATP2A1': 0.516081593815069,
  'SFRP1': 0.400027063258341,
  nan: 0.190871544331105,
  'NIPA2': 0.0099210543418163},
 'TCGA-CJ-4920-11A': {nan: 0.111657157534977,
  'VDAC3': 0.918934002233238,
  'ACTN1': 0.773517265412361,
  'ATP2A1': 0.571990354691741,
  'SFRP1': 0.489779654823996,
  nan: 0.24188997202946,
  'NIPA2': 0.0097521875052793,
  'cg00000658': 0.919644862137697,
  'cg00000721': 0.94229345837988},
 'TCGA-B0-4849-11A': {nan: 0.13898299158527,
  'VDAC3': 0.892691778501429,
  'ACTN1': 0.697181652158477,
  'ATP2A1': 0.47288614791789,
  'SFRP1': 0.377593967259526,
  nan: 0.149294919317939,
  'NIPA2': 0.0107697567839102},
 'TCGA-BQ-5891-11A': {nan: 0.0943910860490585,
  'VDAC3': 0.798899904372697,
  'ACTN1': 0.689450514637892,
  'ATP2A1': 0.568046821756013,
  'SFRP1': 0.464626018317553,
  nan: 0.231639837864006,
  'NIPA2': 0.0487962187571897},
 'TCGA-BP-5186-11A': {'cg00000165': 0.110112361205661,
  'VDAC3': 0.827523582109836,
  'ACTN1': 0.757610109046985,
  'ATP2A1': 0.484209696051666,
  'SFRP1': 0.412811564854099,
  nan: 0.167420794630144,
  'NIPA2': 0.0104916507529456},
 'TCGA-A3-3373-11A': {nan: 0.117830727124756,
  'VDAC3': 0.90581935721054,
  'ACTN1': 0.761457792189881,
  'ATP2A1': 0.507633250448944,
  'SFRP1': 0.51611998698701,
  nan: 0.1737386620934,
  'NIPA2': 0.0108894792403789},
 'TCGA-BP-5180-11A': {nan: 0.119205137521098,
  'VDAC3': 0.891261719087507,
  'ACTN1': 0.746767379239554,
  'ATP2A1': 0.463089282194905,
  'SFRP1': 0.464692516947339,
  nan: 0.228609755811405,
  'NIPA2': 0.0095536851256427}})

Data:

df_normal_symbol

pd.DataFrame({'TCGA-CZ-5457-11A': {'cg00000165': 0.102035759907132,
  'cg00000236': 0.893345348116849,
  'cg00000289': 0.847131904106541,
  'cg00000292': 0.580488869725658,
  'cg00000321': 0.470767306311169,
  'cg00000363': 0.147416341092933,
  'cg00000622': 0.0120942766037886,
  'cg00000658': 0.93695494977688,
  'cg00000721': 0.975854444522775},
 'TCGA-BQ-5888-11A': {'cg00000165': 0.147149659097321,
  'cg00000236': 0.910195291355705,
  'cg00000289': 0.816669300689161,
  'cg00000292': 0.514358122653833,
  'cg00000321': 0.441313292788889,
  'cg00000363': 0.245573257728479,
  'cg00000622': 0.0147939578910346,
  'cg00000658': 0.933589698841974,
  'cg00000721': 0.93311604425552},
 'TCGA-B0-4846-11A': {'cg00000165': 0.113480434528015,
  'cg00000236': 0.886088576813537,
  'cg00000289': 0.664793188247786,
  'cg00000292': 0.516081593815069,
  'cg00000321': 0.400027063258341,
  'cg00000363': 0.190871544331105,
  'cg00000622': 0.0099210543418163,
  'cg00000658': 0.863861413753196,
  'cg00000721': 0.935039379256587},
 'TCGA-CJ-4920-11A': {'cg00000165': 0.111657157534977,
  'cg00000236': 0.918934002233238,
  'cg00000289': 0.773517265412361,
  'cg00000292': 0.571990354691741,
  'cg00000321': 0.489779654823996,
  'cg00000363': 0.24188997202946,
  'cg00000622': 0.0097521875052793,
  'cg00000658': 0.919644862137697,
  'cg00000721': 0.94229345837988},
 'TCGA-B0-4849-11A': {'cg00000165': 0.13898299158527,
  'cg00000236': 0.892691778501429,
  'cg00000289': 0.697181652158477,
  'cg00000292': 0.47288614791789,
  'cg00000321': 0.377593967259526,
  'cg00000363': 0.149294919317939,
  'cg00000622': 0.0107697567839102,
  'cg00000658': 0.855919013625267,
  'cg00000721': 0.927295110742551},
 'TCGA-BQ-5891-11A': {'cg00000165': 0.0943910860490585,
  'cg00000236': 0.798899904372697,
  'cg00000289': 0.689450514637892,
  'cg00000292': 0.568046821756013,
  'cg00000321': 0.464626018317553,
  'cg00000363': 0.231639837864006,
  'cg00000622': 0.0487962187571897,
  'cg00000658': 0.879745629519866,
  'cg00000721': 0.575514399845868},
 'TCGA-BP-5186-11A': {'cg00000165': 0.110112361205661,
  'cg00000236': 0.827523582109836,
  'cg00000289': 0.757610109046985,
  'cg00000292': 0.484209696051666,
  'cg00000321': 0.412811564854099,
  'cg00000363': 0.167420794630144,
  'cg00000622': 0.0104916507529456,
  'cg00000658': 0.889507665618008,
  'cg00000721': 0.956223420054809},
 'TCGA-A3-3373-11A': {'cg00000165': 0.117830727124756,
  'cg00000236': 0.90581935721054,
  'cg00000289': 0.761457792189881,
  'cg00000292': 0.507633250448944,
  'cg00000321': 0.51611998698701,
  'cg00000363': 0.1737386620934,
  'cg00000622': 0.0108894792403789,
  'cg00000658': 0.831762722499429,
  'cg00000721': 0.950671976784028},
 'TCGA-BP-5180-11A': {'cg00000165': 0.119205137521098,
  'cg00000236': 0.891261719087507,
  'cg00000289': 0.746767379239554,
  'cg00000292': 0.463089282194905,
  'cg00000321': 0.464692516947339,
  'cg00000363': 0.228609755811405,
  'cg00000622': 0.0095536851256427,
  'cg00000658': 0.922630855301534,
  'cg00000721': 0.958168591617036}})

sym

pd.DataFrame({'Composite_Element_REF': {1: 'cg00000108',
  2: 'cg00000109',
  3: 'cg00000165',
  4: 'cg00000236',
  5: 'cg00000289',
  6: 'cg00000292',
  7: 'cg00000321',
  8: 'cg00000363',
  9: 'cg00000622'},
 'Gene_Symbol': {1: 'C3orf35',
  2: 'FNDC3B',
  3: nan,
  4: 'VDAC3',
  5: 'ACTN1',
  6: 'ATP2A1',
  7: 'SFRP1',
  8: nan,
  9: 'NIPA2'}})

Asked By: melolili

||

Source

Answer 1

Your approach has two issues.

The first one is that .loc selects by label: its arguments must be labels taken from the dataframe's own index and columns. To use it for this lookup, you would therefore first have to make the Composite_Element_REF column the index of sym:

sym = sym.set_index('Composite_Element_REF')
df_normal_symbol.index = sym.loc[df_normal_symbol.index,'Gene_Symbol'].values

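Note, however, that this still raises a KeyError, because .loc cannot look up labels that are missing from the index: the last two labels of df_normal_symbol (cg00000658 and cg00000721) do not appear in sym. For example, sym.loc[df_normal_symbol.index[:-2],'Gene_Symbol'].values works, because it leaves those two labels out. This second problem could be solved by brute force: split the dataframe, rewrite the part of the index you can look up, and concatenate the pieces again. However, if I may make a suggestion: create a dict and use it in a list comprehension. Note that the snippet below needs Composite_Element_REF to still be a regular column of sym, i.e. run it without the set_index call above.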

ce_ref_dict = dict(zip(sym.Composite_Element_REF,sym.Gene_Symbol))
df_normal_symbol.index = [ce_ref_dict[i] if i in ce_ref_dict 
                          else i 
                          for i in df_normal_symbol.index]

Answered By: Arnau

How to match the index of a dataframe with a column of another dataframe and replace it with the values of an adjacent column?

Question:

Answers: