Building dictionary of unique IDs for pairs of matching strings
Question:
I have a dataframe like this
#Test dataframe
import pandas as pd
import numpy as np
#Build df
titles = {'Title': ['title1', 'cat', 'dog']}
references = {'References': [['donkey','chicken'],['title1','dog'],['bird','snake']]}
df = pd.DataFrame({'Title': ['title1', 'cat', 'dog'], 'References': [['donkey','chicken'],['title1','dog'],['bird','snake']]})
#Insert IDs for UNIQUE titles
title_ids = {'IDs':list(np.arange(0,len(df)) + 1)}
df['IDs'] = list(np.arange(0,len(df)) + 1)
df = df[['Title','IDs','References']]
and I want to generate IDs for the references column that looks like the data frame below. If there is a matching between the strings, assign the same ID as in the IDs column and if not, assign a new unique ID.
My first attempt is using the function
#Matching function
def string_match(string1,string2):
if string1 == string2:
a = 1
else:
a = 0
return a
and to loop over each string/title combination but this gets tricky with multiple for loops and if statements. Is there a better way I can do this that is more pythonic?
Answers:
# Explode to one reference per row
references = df["References"].explode()
# Combine existing titles with new title from References
titles = pd.concat([df["Title"], references]).unique()
# Assign each title an index number
mappings = {t: i + 1 for i, t in enumerate(titles)}
# Map the reference to the index number and convert to list
df["RefIDs"] = references.map(mappings).groupby(level=0).apply(list)
Let us try with factorize
s = df['References'].explode()
s[:] = pd.concat([df['Title'],s]).factorize()[0][len(df['Title']):]
df['new'] = (s+1).groupby(level=0).agg(list)
Out[237]:
0 [4, 5]
1 [1, 3]
2 [6, 7]
Name: References, dtype: object
- An alternative to using
pd.concat
could be to use Series.map
, and then to fill NaN
values with Series.cumsum
applied to those NaN
values (starting from refs.max()
, i.e. 3
plus 1
). So, this generates 4, 5, 6, 7
as new IDs
for the unmatched values.
- Next, we apply
df.groupby
in the same way as suggested by the other 2 answers.
refs = df["References"].explode().map(df.set_index('Title').IDs)
refs = refs.fillna(refs.isnull().cumsum().add(refs.max())).astype(int)
df['RefIDs'] = refs.groupby(level=0).apply(list)
print(df)
Title IDs References RefIDs
0 title1 1 [donkey, chicken] [4, 5]
1 cat 2 [title1, dog] [1, 3]
2 dog 3 [bird, snake] [6, 7]
In addition to the answers this can also be done with the help of a function, apply and lambda:
id_info=dict(df[['Title','IDs']].values)
def check(title,ref):
new_id_ = max(id_info.values()) #get latest id
ids=[]
for i in ref:
if i in id_info: #if Reference value is defined before, get its id
new_id=id_info[i]
else:
new_id=new_id_ + 1 #define a new id if not defined before and update dictionary to get latest id in next steps
new_id_+=1
id_info.update({i:new_id})
ids.append(new_id)
return ids
df['new_id']=df.apply(lambda x: check(x['Title'],x['References']),axis=1)
print(df)
'''
Title IDs References RefIDs
0 title1 1 ['donkey', 'chicken'] [4, 5]
1 cat 2 ['title1', 'dog'] [1, 3]
2 dog 3 ['bird', 'snake'] [6, 7]
'''
I have a dataframe like this
#Test dataframe
import pandas as pd
import numpy as np
#Build df
titles = {'Title': ['title1', 'cat', 'dog']}
references = {'References': [['donkey','chicken'],['title1','dog'],['bird','snake']]}
df = pd.DataFrame({'Title': ['title1', 'cat', 'dog'], 'References': [['donkey','chicken'],['title1','dog'],['bird','snake']]})
#Insert IDs for UNIQUE titles
title_ids = {'IDs':list(np.arange(0,len(df)) + 1)}
df['IDs'] = list(np.arange(0,len(df)) + 1)
df = df[['Title','IDs','References']]
and I want to generate IDs for the references column that looks like the data frame below. If there is a matching between the strings, assign the same ID as in the IDs column and if not, assign a new unique ID.
My first attempt is using the function
#Matching function
def string_match(string1,string2):
if string1 == string2:
a = 1
else:
a = 0
return a
and to loop over each string/title combination but this gets tricky with multiple for loops and if statements. Is there a better way I can do this that is more pythonic?
# Explode to one reference per row
references = df["References"].explode()
# Combine existing titles with new title from References
titles = pd.concat([df["Title"], references]).unique()
# Assign each title an index number
mappings = {t: i + 1 for i, t in enumerate(titles)}
# Map the reference to the index number and convert to list
df["RefIDs"] = references.map(mappings).groupby(level=0).apply(list)
Let us try with factorize
s = df['References'].explode()
s[:] = pd.concat([df['Title'],s]).factorize()[0][len(df['Title']):]
df['new'] = (s+1).groupby(level=0).agg(list)
Out[237]:
0 [4, 5]
1 [1, 3]
2 [6, 7]
Name: References, dtype: object
- An alternative to using
pd.concat
could be to useSeries.map
, and then to fillNaN
values withSeries.cumsum
applied to thoseNaN
values (starting fromrefs.max()
, i.e.3
plus1
). So, this generates4, 5, 6, 7
as newIDs
for the unmatched values. - Next, we apply
df.groupby
in the same way as suggested by the other 2 answers.
refs = df["References"].explode().map(df.set_index('Title').IDs)
refs = refs.fillna(refs.isnull().cumsum().add(refs.max())).astype(int)
df['RefIDs'] = refs.groupby(level=0).apply(list)
print(df)
Title IDs References RefIDs
0 title1 1 [donkey, chicken] [4, 5]
1 cat 2 [title1, dog] [1, 3]
2 dog 3 [bird, snake] [6, 7]
In addition to the answers this can also be done with the help of a function, apply and lambda:
id_info=dict(df[['Title','IDs']].values)
def check(title,ref):
new_id_ = max(id_info.values()) #get latest id
ids=[]
for i in ref:
if i in id_info: #if Reference value is defined before, get its id
new_id=id_info[i]
else:
new_id=new_id_ + 1 #define a new id if not defined before and update dictionary to get latest id in next steps
new_id_+=1
id_info.update({i:new_id})
ids.append(new_id)
return ids
df['new_id']=df.apply(lambda x: check(x['Title'],x['References']),axis=1)
print(df)
'''
Title IDs References RefIDs
0 title1 1 ['donkey', 'chicken'] [4, 5]
1 cat 2 ['title1', 'dog'] [1, 3]
2 dog 3 ['bird', 'snake'] [6, 7]
'''