How to produce three new columns from one column that has unequal length in Python (pandas or Numpy)
Question:
I have a column containing a list of names; some have titles and some don’t. How do I split each row into the right columns when the rows have different numbers of parts?
Names
Simon Cool
Mrs. Sarah Smart
Mr Harry Adams
Rupert Clever
Miss Jane Super
But I would like to have
| Title | First Name | Last Name |
|-------|------------|-----------|
|       | Simon      | Cool      |
| Mrs.  | Sarah      | Smart     |
| Mr    | Harry      | Adams     |
|       | Rupert     | Clever    |
| Miss  | Jane       | Super     |
My code that does not work is:
import pandas as pd
# Asker's first attempt, quoted as posted (reported as not working).
# NOTE(review): block indentation was lost in extraction; the statements
# following `if` / `else` were presumably indented in the original post.
# `df` is not defined in this snippet; it is assumed to already exist.

# Accumulators for the three target columns.
title = []
firstname = []
lastname = []
# Known titles (the name is spelled `saluations` throughout the question).
saluations = ['Mr', 'Mr.', 'Mrs', 'Mrs.', 'Miss', 'Ms', 'Ms.', 'Dr.', 'Lord', 'Lady']
# Each name is split on a single space; x, y and z take the first, second
# and last piece respectively, for the whole column at once.
x = df.names.str.split(' ').str[0]
y = df.names.str.split(' ').str[1]
z = df.names.str.split(' ').str[-1]
# NOTE(review): x is built from the entire column rather than from one
# name, so this single membership test presumably cannot choose a branch
# row by row -- likely the reason the attempt fails; confirm.
if x in saluations:
title.append(x)
firstname.append(y)
else:
firstname.append(x)
lastname.append(z)
I have also tried:
import pandas as pd
# Asker's second attempt, quoted as posted.
# NOTE(review): block indentation was lost in extraction, so the exact
# nesting of the loops below cannot be recovered with certainty.
# Relies on `df`, `saluations`, `title`, `firstname` and `lastname`
# from the first attempt.

# a: first space-separated piece of each name.
for a in df.names.str.split(' ').str[0]:
# b: second space-separated piece of each name.
# NOTE(review): if this loop was nested inside the one above, every `a`
# would be paired with every `b` instead of only the `b` from its own
# row -- confirm against the original post.
for b in df.names.str.split(' ').str[1]:
if a in saluations:
title.append(a)
firstname.append(b)
else:
firstname.append(a)
# c: last space-separated piece of each name, collected as the surname.
for c in df.names.str.split(' ').str[-1]:
lastname.append(c)
I would be very grateful for any help, and also for any pointers that would make my code neater.
Answers:
Split names into Title, First name, Last name
Assumptions:
- Names consist of exactly two parts – first and last name in this order.
- Names may be preceded by a respectful title.
- At least one name is preceded by a title.
import pandas as pd
from io import StringIO
# Prepare data
data = '''
Names
Simon Cool
Mrs. Sarah Smart
Mr Harry Adams
Rupert Clever
Miss Jane Super
Lordina Mahama
Ladya Cheryl
'''
df = pd.read_csv(StringIO(data))

# Process names
# Recognised titles, with and without the trailing dot.  Matching is done
# on the whole first word, so 'Lordina' / 'Ladya' in the sample data are
# not confused with 'Lord' / 'Lady'.
salutations = [
    'Mr', 'Mr.', 'Mrs', 'Mrs.',
    'Miss', 'Ms', 'Ms.', 'Dr', 'Dr.',
    'Lord', 'Lady',
    'Sir', 'Master',
]


def split_title_first_last(full_names, titles):
    """Split "[Title] First Last" strings into three columns.

    Args:
        full_names: pandas Series of name strings.
        titles: collection of words accepted as a title.

    Returns:
        DataFrame indexed like ``full_names`` with the columns
        'Title', 'First Name' and 'Last Name'; absent parts are ''.
    """
    # At most three whitespace-separated words per name.  reindex() pads
    # the frame to exactly three columns, so the code also works when no
    # name in the input carries a title.
    tokens = (
        full_names.str.split(n=2, expand=True)
        .reindex(columns=range(3))
        .astype(object)
    )
    has_title = tokens[0].isin(titles)
    # Rows without a title have their words one position to the left of
    # where they belong, so take each column from its left neighbour.
    return pd.DataFrame({
        'Title': tokens[0].where(has_title),
        'First Name': tokens[1].where(has_title, tokens[0]),
        'Last Name': tokens[2].where(has_title, tokens[1]),
    }).fillna('')


names = split_title_first_last(df['Names'], salutations)

try:
    display(names)  # rich table when run inside IPython / Jupyter
except NameError:
    print(names)  # plain-text fallback for a regular interpreter
Split names into a title and an arbitrary number of parts
Assumptions:
- Names consist of at least the last name in the last position.
- First Name comes first if exists.
- Names may be preceded by a title.
- It is possible that none of the names has a title.
import pandas as pd
# Data preparation
data = {'Names': [
    'Simon Cool',
    'Mrs. Sarah Smart',
    'Mr Harry Adams',
    'Miss Jane Super',
    'Lordina Mahama',
    'Ladya Cheryl',
    'Dr. Jonson',
    'Sir Elton Hercules John',
    'Jon Bon Jovi',
    'Sir Arthur Ignatius Conan Doyle',
]}
df = pd.DataFrame(data)

# The main code
names = df['Names']
NumberOfParts = 4  # 2: First & Last names; 3: First, Middle, Last names; 4:...
NameID = {1: 'First name', 2: 'Middle name', 3: 'Third name', 'last': 'Surname'}
# Every leading position 1..NumberOfParts-1 and the surname need a label.
# A real exception is raised because asserts are stripped under `python -O`.
missing_ids = [key for key in (*range(1, NumberOfParts), 'last') if key not in NameID]
if missing_ids:
    raise ValueError(f'NameID has no label for part(s): {missing_ids}')
salutations = [
    'Mr', 'Mr.', 'Mrs', 'Mrs.', 'Miss',
    'Ms', 'Ms.', 'Dr', 'Dr.', 'Lord', 'Lady',
    'Sir', 'Master',
]

head = names.str.split(n=1)  # [first word, rest of the name]
has_salut = head.str.get(0).isin(salutations)
title = head.str.get(0).where(has_salut).rename('Title')
# Clear titles from names.  mask() returns a new Series, so the 'Names'
# column of df is left untouched.
names = names.mask(has_salut, head.str.get(1))
parts = [title]  # save title as the very first part of names

# With n=NumberOfParts-1 any surplus words stay attached to the last part.
# If you would rather drop surplus words, use n=-1 below: you then keep
# the first (NumberOfParts-1) words plus the final word, which is the
# surname by default.
names = names.str.split(n=NumberOfParts - 1)  # split names into NumberOfParts
name_len = names.str.len()  # count obtained number of parts for each name
for i in range(1, NumberOfParts):
    # Word i is a leading part only when something follows it; the final
    # word of a name is always reserved for the surname column.
    parts.append(
        names.str.get(i - 1).where(name_len > i).rename(NameID[i])
    )
parts.append(
    names.str.get(-1).rename(NameID['last'])
)

names = pd.concat(parts, axis=1).fillna('')

try:
    display(names)  # rich table when run inside IPython / Jupyter
except NameError:
    print(names)  # plain-text fallback for a regular interpreter
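Output on test data with NumberOfParts = 4:
| Title | First name | Middle name | Third name | Surname |
|-------|------------|-------------|------------|---------|
|       | Simon      |             |            | Cool    |
| Mrs.  | Sarah      |             |            | Smart   |
| Mr    | Harry      |             |            | Adams   |
| Miss  | Jane       |             |            | Super   |
|       | Lordina    |             |            | Mahama  |
|       | Ladya      |             |            | Cheryl  |
| Dr.   |            |             |            | Jonson  |
| Sir   | Elton      | Hercules    |            | John    |
|       | Jon        | Bon         |            | Jovi    |
| Sir   | Arthur     | Ignatius    | Conan      | Doyle   |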
Just convert the data row by row. You don’t need to search fancy pandas APIs.
Here is an example in pseudocode.
import pandas
# Pseudocode: `...` marks placeholders the reader has to fill in.
# NOTE(review): block indentation was lost in extraction; the statements
# after `for` / `if` were presumably indented in the original post.
df = ...
honorifics = ['Mr', ...]
# One token list per name; each list becomes one row of the result.
new_data = []
for name in df.names:
# Split the name on whitespace.
tokens = name.split()
# No leading honorific: insert an empty one at the front.
if not tokens[0] in honorifics:
tokens.insert(0, '')
new_data.append(tokens)
# NOTE(review): three column labels presumably require every token list
# to hold exactly three entries (honorific, first, last) -- confirm that
# names with more or fewer words are handled before this point.
new_frame = pandas.DataFrame(new_data, columns=['honorific', 'first', 'last'])
I have a column containing a list of names; some have titles and some don’t. How do I split each row into the right columns when the rows have different numbers of parts?
Names
Simon Cool
Mrs. Sarah Smart
Mr Harry Adams
Rupert Clever
Miss Jane Super
But I would like to have
| Title | First Name | Last Name |
|-------|------------|-----------|
|       | Simon      | Cool      |
| Mrs.  | Sarah      | Smart     |
| Mr    | Harry      | Adams     |
|       | Rupert     | Clever    |
| Miss  | Jane       | Super     |
My code that does not work is:
import pandas as pd
# Asker's first attempt, quoted as posted (reported as not working).
# NOTE(review): block indentation was lost in extraction; the statements
# following `if` / `else` were presumably indented in the original post.
# `df` is not defined in this snippet; it is assumed to already exist.

# Accumulators for the three target columns.
title = []
firstname = []
lastname = []
# Known titles (the name is spelled `saluations` throughout the question).
saluations = ['Mr', 'Mr.', 'Mrs', 'Mrs.', 'Miss', 'Ms', 'Ms.', 'Dr.', 'Lord', 'Lady']
# Each name is split on a single space; x, y and z take the first, second
# and last piece respectively, for the whole column at once.
x = df.names.str.split(' ').str[0]
y = df.names.str.split(' ').str[1]
z = df.names.str.split(' ').str[-1]
# NOTE(review): x is built from the entire column rather than from one
# name, so this single membership test presumably cannot choose a branch
# row by row -- likely the reason the attempt fails; confirm.
if x in saluations:
title.append(x)
firstname.append(y)
else:
firstname.append(x)
lastname.append(z)
I have also tried:
import pandas as pd
# Asker's second attempt, quoted as posted.
# NOTE(review): block indentation was lost in extraction, so the exact
# nesting of the loops below cannot be recovered with certainty.
# Relies on `df`, `saluations`, `title`, `firstname` and `lastname`
# from the first attempt.

# a: first space-separated piece of each name.
for a in df.names.str.split(' ').str[0]:
# b: second space-separated piece of each name.
# NOTE(review): if this loop was nested inside the one above, every `a`
# would be paired with every `b` instead of only the `b` from its own
# row -- confirm against the original post.
for b in df.names.str.split(' ').str[1]:
if a in saluations:
title.append(a)
firstname.append(b)
else:
firstname.append(a)
# c: last space-separated piece of each name, collected as the surname.
for c in df.names.str.split(' ').str[-1]:
lastname.append(c)
I would be very grateful for any help, and also for any pointers that would make my code neater.
Split names into Title, First name, Last name
Assumptions:
- Names consist of exactly two parts – first and last name in this order.
- Names may be preceded by a respectful title.
- At least one name is preceded by a title.
import pandas as pd
from io import StringIO
# Prepare data
data = '''
Names
Simon Cool
Mrs. Sarah Smart
Mr Harry Adams
Rupert Clever
Miss Jane Super
Lordina Mahama
Ladya Cheryl
'''
df = pd.read_csv(StringIO(data))

# Process names
# Recognised titles, with and without the trailing dot.  Matching is done
# on the whole first word, so 'Lordina' / 'Ladya' in the sample data are
# not confused with 'Lord' / 'Lady'.
salutations = [
    'Mr', 'Mr.', 'Mrs', 'Mrs.',
    'Miss', 'Ms', 'Ms.', 'Dr', 'Dr.',
    'Lord', 'Lady',
    'Sir', 'Master',
]


def split_title_first_last(full_names, titles):
    """Split "[Title] First Last" strings into three columns.

    Args:
        full_names: pandas Series of name strings.
        titles: collection of words accepted as a title.

    Returns:
        DataFrame indexed like ``full_names`` with the columns
        'Title', 'First Name' and 'Last Name'; absent parts are ''.
    """
    # At most three whitespace-separated words per name.  reindex() pads
    # the frame to exactly three columns, so the code also works when no
    # name in the input carries a title.
    tokens = (
        full_names.str.split(n=2, expand=True)
        .reindex(columns=range(3))
        .astype(object)
    )
    has_title = tokens[0].isin(titles)
    # Rows without a title have their words one position to the left of
    # where they belong, so take each column from its left neighbour.
    return pd.DataFrame({
        'Title': tokens[0].where(has_title),
        'First Name': tokens[1].where(has_title, tokens[0]),
        'Last Name': tokens[2].where(has_title, tokens[1]),
    }).fillna('')


names = split_title_first_last(df['Names'], salutations)

try:
    display(names)  # rich table when run inside IPython / Jupyter
except NameError:
    print(names)  # plain-text fallback for a regular interpreter
Split names into a title and an arbitrary number of parts
Assumptions:
- Names consist of at least the last name in the last position.
- First Name comes first if exists.
- Names may be preceded by a title.
- It is possible that none of the names has a title.
import pandas as pd
# Data preparation
data = {'Names': [
    'Simon Cool',
    'Mrs. Sarah Smart',
    'Mr Harry Adams',
    'Miss Jane Super',
    'Lordina Mahama',
    'Ladya Cheryl',
    'Dr. Jonson',
    'Sir Elton Hercules John',
    'Jon Bon Jovi',
    'Sir Arthur Ignatius Conan Doyle',
]}
df = pd.DataFrame(data)

# The main code
names = df['Names']
NumberOfParts = 4  # 2: First & Last names; 3: First, Middle, Last names; 4:...
NameID = {1: 'First name', 2: 'Middle name', 3: 'Third name', 'last': 'Surname'}
# Every leading position 1..NumberOfParts-1 and the surname need a label.
# A real exception is raised because asserts are stripped under `python -O`.
missing_ids = [key for key in (*range(1, NumberOfParts), 'last') if key not in NameID]
if missing_ids:
    raise ValueError(f'NameID has no label for part(s): {missing_ids}')
salutations = [
    'Mr', 'Mr.', 'Mrs', 'Mrs.', 'Miss',
    'Ms', 'Ms.', 'Dr', 'Dr.', 'Lord', 'Lady',
    'Sir', 'Master',
]

head = names.str.split(n=1)  # [first word, rest of the name]
has_salut = head.str.get(0).isin(salutations)
title = head.str.get(0).where(has_salut).rename('Title')
# Clear titles from names.  mask() returns a new Series, so the 'Names'
# column of df is left untouched.
names = names.mask(has_salut, head.str.get(1))
parts = [title]  # save title as the very first part of names

# With n=NumberOfParts-1 any surplus words stay attached to the last part.
# If you would rather drop surplus words, use n=-1 below: you then keep
# the first (NumberOfParts-1) words plus the final word, which is the
# surname by default.
names = names.str.split(n=NumberOfParts - 1)  # split names into NumberOfParts
name_len = names.str.len()  # count obtained number of parts for each name
for i in range(1, NumberOfParts):
    # Word i is a leading part only when something follows it; the final
    # word of a name is always reserved for the surname column.
    parts.append(
        names.str.get(i - 1).where(name_len > i).rename(NameID[i])
    )
parts.append(
    names.str.get(-1).rename(NameID['last'])
)

names = pd.concat(parts, axis=1).fillna('')

try:
    display(names)  # rich table when run inside IPython / Jupyter
except NameError:
    print(names)  # plain-text fallback for a regular interpreter
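Output on test data with NumberOfParts = 4:
| Title | First name | Middle name | Third name | Surname |
|-------|------------|-------------|------------|---------|
|       | Simon      |             |            | Cool    |
| Mrs.  | Sarah      |             |            | Smart   |
| Mr    | Harry      |             |            | Adams   |
| Miss  | Jane       |             |            | Super   |
|       | Lordina    |             |            | Mahama  |
|       | Ladya      |             |            | Cheryl  |
| Dr.   |            |             |            | Jonson  |
| Sir   | Elton      | Hercules    |            | John    |
|       | Jon        | Bon         |            | Jovi    |
| Sir   | Arthur     | Ignatius    | Conan      | Doyle   |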
Just convert the data row by row. You don’t need to search fancy pandas APIs.
Here is an example in pseudocode.
import pandas
# Pseudocode: `...` marks placeholders the reader has to fill in.
# NOTE(review): block indentation was lost in extraction; the statements
# after `for` / `if` were presumably indented in the original post.
df = ...
honorifics = ['Mr', ...]
# One token list per name; each list becomes one row of the result.
new_data = []
for name in df.names:
# Split the name on whitespace.
tokens = name.split()
# No leading honorific: insert an empty one at the front.
if not tokens[0] in honorifics:
tokens.insert(0, '')
new_data.append(tokens)
# NOTE(review): three column labels presumably require every token list
# to hold exactly three entries (honorific, first, last) -- confirm that
# names with more or fewer words are handled before this point.
new_frame = pandas.DataFrame(new_data, columns=['honorific', 'first', 'last'])