How to extract a certain portion from a DataFrame column and populate that value in another column?
Question:
I have a DataFrame that looks like this:
df1:
section_id section_name
1.Test Summary9
1.1.Synopsis9
1.2.Schema12
1.3.1.Test Period I - Screening13
1.3.2.Period II - obes-Treatment 15
Synopsis
Test Period I - Screening
As you can see, it has blank rows as well. What is the best approach to populate the section_id column like this:
section_id section_name
1 1.Test Summary9
1.1 1.1.Synopsis9
1.2 1.2.Schema12
1.3.1 1.3.1.Test Period I - Screening13
1.3.2 1.3.2.Period II - obes-Treatment 15
1.1 Synopsis
1.3.1 Test Period I - Screening
Basically, if the section name starts with a section ID, extract it directly and populate the column. If a row has a similar section name without an ID (e.g. `1.1.Synopsis9` and `Synopsis`), populate it with the same section ID. For blank values, do nothing.
I tried the following, but it does not work in some cases. Please suggest the best way to do this:
import pandas as pd
# Sample input: numbered section headings followed by two bare titles
# that repeat earlier headings without their numeric prefix.
data = dict(
    section_name=[
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
    ],
)
df = pd.DataFrame(data)
def extract_section_id(section_name, current_section_id):
    """Return a section id for ``section_name`` (the question's attempt).

    Args:
        section_name: Stripped section name string.
        current_section_id: Section id returned for the previous row.

    Returns:
        ``current_section_id`` when ``section_name`` starts with it,
        otherwise the text before the first '.' in ``section_name``.

    NOTE: this is the non-working attempt from the question, kept as
    written. ``str.startswith('')`` is always true, so an empty
    ``current_section_id`` is returned unchanged, and the fallback only
    ever yields the first dot-separated component (or the whole name
    when it contains no dot).
    """
    if section_name.startswith(current_section_id):
        return current_section_id
    else:
        return section_name.split('.')[0]
# Walk the rows in order, carrying the last computed id forward so it can
# be passed to extract_section_id for the next row.
current_section_id = ''
section_ids = []
for index, row in df.iterrows():
    section_name = row['section_name'].strip()
    if section_name != '':
        section_id = extract_section_id(section_name, current_section_id)
        current_section_id = section_id
    else:
        # Blank names get a blank id and leave the carried id untouched.
        section_id = ''
    section_ids.append(section_id)
df['section_id'] = section_ids
print(df)
Answers:
You could do it this way:
import pandas as pd
import numpy as np
import re
# Sample input: seven named sections plus one None and one empty-string
# name; section_id starts as a placeholder column of NaN.
data = {
    'section_id': [np.nan] * 9,
    'section_name': [
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
        None,
        '',
    ],
}
df1 = pd.DataFrame(data=data)
def extract_section_id(row, prev_ids):
    """Return the section id for one row, or NaN when none can be determined.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            'section_name' entry.
        prev_ids: Dict mapping the text that followed each numeric prefix
            seen so far to its section id. It is mutated in place whenever
            a prefixed name is encountered, so rows must be processed in
            order.

    Returns:
        The leading dotted number (e.g. '1.3.1') when the name starts with
        one; otherwise the id of the first remembered heading whose text
        contains this name; otherwise NaN. Missing or blank names give NaN.
    """
    name = row['section_name']
    # None, NaN and other non-string cells have no section name to parse.
    if not isinstance(name, str) or not name.strip():
        return np.nan
    # Digits and dots must be escaped: a bare 'd+' matches the letter "d".
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id
    title = name.strip()
    # Substring test, so 'Synopsis' finds the remembered '.Synopsis9'.
    for known_name, known_id in prev_ids.items():
        if title in known_name:
            return known_id
    return np.nan
# Lookup of remembered headings, shared across all rows of the apply call.
prev_ids = {}
# Row-wise apply; prev_ids is forwarded as the extra positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))
print(df1)
which will give you
section_id section_name
0 1 1.Test Summary9
1 1.1 1.1.Synopsis9
2 1.2 1.2.Schema12
3 1.3.1 1.3.1.Test Period I - Screening13
4 1.3.2 1.3.2.Period II - obes-Treatment 15
5 1.1 Synopsis
6 1.3.1 Test Period I - Screening
7 NaN None
8 NaN
If you absolutely want a blank instead of NaN for the empty rows:
import pandas as pd
import numpy as np
import re
# Sample input: seven named sections plus one None and one empty-string
# name; section_id starts as a placeholder column of NaN.
data = {
    'section_id': [np.nan] * 9,
    'section_name': [
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
        None,
        '',
    ],
}
df1 = pd.DataFrame(data=data)
def extract_section_id(row, prev_ids):
    """Return the section id for one row, using '' for blank names.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            'section_name' entry.
        prev_ids: Dict mapping the text that followed each numeric prefix
            seen so far to its section id. It is mutated in place whenever
            a prefixed name is encountered, so rows must be processed in
            order.

    Returns:
        '' when the name is missing or blank; the leading dotted number
        (e.g. '1.3.1') when the name starts with one; otherwise the id of
        the first remembered heading whose text contains this name.
        A non-blank name that matches nothing still yields NaN.
    """
    name = row['section_name']
    # None, NaN and other non-string cells have no section name to parse.
    if not isinstance(name, str) or not name.strip():
        return ''
    # Digits and dots must be escaped: a bare 'd+' matches the letter "d".
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id
    title = name.strip()
    # Substring test, so 'Synopsis' finds the remembered '.Synopsis9'.
    for known_name, known_id in prev_ids.items():
        if title in known_name:
            return known_id
    return np.nan
# Lookup of remembered headings, shared across all rows of the apply call.
prev_ids = {}
# Row-wise apply; prev_ids is forwarded as the extra positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))
print(df1)
will give
section_id section_name
0 1 1.Test Summary9
1 1.1 1.1.Synopsis9
2 1.2 1.2.Schema12
3 1.3.1 1.3.1.Test Period I - Screening13
4 1.3.2 1.3.2.Period II - obes-Treatment 15
5 1.1 Synopsis
6 1.3.1 Test Period I - Screening
7 None
8
I have a DataFrame that looks like this:
df1:
section_id section_name
1.Test Summary9
1.1.Synopsis9
1.2.Schema12
1.3.1.Test Period I - Screening13
1.3.2.Period II - obes-Treatment 15
Synopsis
Test Period I - Screening
As you can see, it has blank rows as well. What is the best approach to populate the section_id column like this:
section_id section_name
1 1.Test Summary9
1.1 1.1.Synopsis9
1.2 1.2.Schema12
1.3.1 1.3.1.Test Period I - Screening13
1.3.2 1.3.2.Period II - obes-Treatment 15
1.1 Synopsis
1.3.1 Test Period I - Screening
Basically, if the section name starts with a section ID, extract it directly and populate the column. If a row has a similar section name without an ID (e.g. `1.1.Synopsis9` and `Synopsis`), populate it with the same section ID. For blank values, do nothing.
I tried the following, but it does not work in some cases. Please suggest the best way to do this:
import pandas as pd
# Sample input: numbered section headings followed by two bare titles
# that repeat earlier headings without their numeric prefix.
data = dict(
    section_name=[
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
    ],
)
df = pd.DataFrame(data)
def extract_section_id(section_name, current_section_id):
    """Return a section id for ``section_name`` (the question's attempt).

    Args:
        section_name: Stripped section name string.
        current_section_id: Section id returned for the previous row.

    Returns:
        ``current_section_id`` when ``section_name`` starts with it,
        otherwise the text before the first '.' in ``section_name``.

    NOTE: this is the non-working attempt from the question, kept as
    written. ``str.startswith('')`` is always true, so an empty
    ``current_section_id`` is returned unchanged, and the fallback only
    ever yields the first dot-separated component (or the whole name
    when it contains no dot).
    """
    if section_name.startswith(current_section_id):
        return current_section_id
    else:
        return section_name.split('.')[0]
# Walk the rows in order, carrying the last computed id forward so it can
# be passed to extract_section_id for the next row.
current_section_id = ''
section_ids = []
for index, row in df.iterrows():
    section_name = row['section_name'].strip()
    if section_name != '':
        section_id = extract_section_id(section_name, current_section_id)
        current_section_id = section_id
    else:
        # Blank names get a blank id and leave the carried id untouched.
        section_id = ''
    section_ids.append(section_id)
df['section_id'] = section_ids
print(df)
You could do it this way:
import pandas as pd
import numpy as np
import re
# Sample input: seven named sections plus one None and one empty-string
# name; section_id starts as a placeholder column of NaN.
data = {
    'section_id': [np.nan] * 9,
    'section_name': [
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
        None,
        '',
    ],
}
df1 = pd.DataFrame(data=data)
def extract_section_id(row, prev_ids):
    """Return the section id for one row, or NaN when none can be determined.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            'section_name' entry.
        prev_ids: Dict mapping the text that followed each numeric prefix
            seen so far to its section id. It is mutated in place whenever
            a prefixed name is encountered, so rows must be processed in
            order.

    Returns:
        The leading dotted number (e.g. '1.3.1') when the name starts with
        one; otherwise the id of the first remembered heading whose text
        contains this name; otherwise NaN. Missing or blank names give NaN.
    """
    name = row['section_name']
    # None, NaN and other non-string cells have no section name to parse.
    if not isinstance(name, str) or not name.strip():
        return np.nan
    # Digits and dots must be escaped: a bare 'd+' matches the letter "d".
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id
    title = name.strip()
    # Substring test, so 'Synopsis' finds the remembered '.Synopsis9'.
    for known_name, known_id in prev_ids.items():
        if title in known_name:
            return known_id
    return np.nan
# Lookup of remembered headings, shared across all rows of the apply call.
prev_ids = {}
# Row-wise apply; prev_ids is forwarded as the extra positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))
print(df1)
which will give you
section_id section_name
0 1 1.Test Summary9
1 1.1 1.1.Synopsis9
2 1.2 1.2.Schema12
3 1.3.1 1.3.1.Test Period I - Screening13
4 1.3.2 1.3.2.Period II - obes-Treatment 15
5 1.1 Synopsis
6 1.3.1 Test Period I - Screening
7 NaN None
8 NaN
If you absolutely want a blank instead of NaN for the empty rows:
import pandas as pd
import numpy as np
import re
# Sample input: seven named sections plus one None and one empty-string
# name; section_id starts as a placeholder column of NaN.
data = {
    'section_id': [np.nan] * 9,
    'section_name': [
        '1.Test Summary9',
        '1.1.Synopsis9',
        '1.2.Schema12',
        '1.3.1.Test Period I - Screening13',
        '1.3.2.Period II - obes-Treatment 15',
        'Synopsis',
        'Test Period I - Screening',
        None,
        '',
    ],
}
df1 = pd.DataFrame(data=data)
def extract_section_id(row, prev_ids):
    """Return the section id for one row, using '' for blank names.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            'section_name' entry.
        prev_ids: Dict mapping the text that followed each numeric prefix
            seen so far to its section id. It is mutated in place whenever
            a prefixed name is encountered, so rows must be processed in
            order.

    Returns:
        '' when the name is missing or blank; the leading dotted number
        (e.g. '1.3.1') when the name starts with one; otherwise the id of
        the first remembered heading whose text contains this name.
        A non-blank name that matches nothing still yields NaN.
    """
    name = row['section_name']
    # None, NaN and other non-string cells have no section name to parse.
    if not isinstance(name, str) or not name.strip():
        return ''
    # Digits and dots must be escaped: a bare 'd+' matches the letter "d".
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id
    title = name.strip()
    # Substring test, so 'Synopsis' finds the remembered '.Synopsis9'.
    for known_name, known_id in prev_ids.items():
        if title in known_name:
            return known_id
    return np.nan
# Lookup of remembered headings, shared across all rows of the apply call.
prev_ids = {}
# Row-wise apply; prev_ids is forwarded as the extra positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))
print(df1)
will give
section_id section_name
0 1 1.Test Summary9
1 1.1 1.1.Synopsis9
2 1.2 1.2.Schema12
3 1.3.1 1.3.1.Test Period I - Screening13
4 1.3.2 1.3.2.Period II - obes-Treatment 15
5 1.1 Synopsis
6 1.3.1 Test Period I - Screening
7 None
8