How to extract a certain portion from a dataframe column and populate that value in another column?

Question:

I have a dataframe that looks like this:
df1:

section_id  section_name
            1.Test Summary9
            1.1.Synopsis9
            1.2.Schema12
            1.3.1.Test Period  I - Screening13
            1.3.2.Period II - obes-Treatment 15
            Synopsis

            Test Period  I - Screening

As you can see, it has blank rows as well. What is the best approach to populate the section_id column like this:

section_id  section_name
1           1.Test Summary9
1.1         1.1.Synopsis9
1.2         1.2.Schema12
1.3.1       1.3.1.Test Period  I - Screening13
1.3.2       1.3.2.Period II - obes-Treatment 15
1.1         Synopsis
1.3.1       Test Period  I - Screening

Basically, if the section name starts with a section id, extract it and populate it directly. If a section name is similar to an earlier one (e.g. 1.1.Synopsis9 and Synopsis), populate the same section id. For blank values, do nothing.

I tried this, but in some cases it's not working. Please suggest the best way to do this:

import pandas as pd

# Sample section titles: five numbered entries followed by two un-numbered
# titles that repeat the wording of earlier numbered ones.
section_names = [
    '1.Test Summary9',
    '1.1.Synopsis9',
    '1.2.Schema12',
    '1.3.1.Test Period  I - Screening13',
    '1.3.2.Period II - obes-Treatment 15',
    'Synopsis',
    'Test Period  I - Screening',
]
data = {'section_name': section_names}

df = pd.DataFrame(data)

def extract_section_id(section_name, current_section_id):
    """Pick the section id to record for ``section_name``.

    Returns ``current_section_id`` unchanged when ``section_name`` begins
    with it; otherwise returns the part of ``section_name`` that precedes
    its first ``'.'`` (the whole string when it contains no dot).

    Note that every string starts with ``''``, so an empty
    ``current_section_id`` is always returned as-is.
    """
    keeps_current = section_name.startswith(current_section_id)
    return current_section_id if keeps_current else section_name.split('.')[0]

# Walk the rows in order, carrying the most recently computed id forward
# as the "current" id for the next non-blank row.
section_ids = []
current_section_id = ''

for _, record in df.iterrows():
    stripped_name = record['section_name'].strip()
    if stripped_name == '':
        # Blank names get a blank id and leave the running id untouched.
        section_ids.append('')
        continue
    current_section_id = extract_section_id(stripped_name, current_section_id)
    section_ids.append(current_section_id)

df['section_id'] = section_ids

print(df)
Asked By: ista120

||

Answers:

You could do it this way:

import pandas as pd
import numpy as np
import re

# Seven real section titles plus a None and an empty string, so the
# missing/blank handling is exercised as well.
section_names = [
    '1.Test Summary9',
    '1.1.Synopsis9',
    '1.2.Schema12',
    '1.3.1.Test Period  I - Screening13',
    '1.3.2.Period II - obes-Treatment 15',
    'Synopsis',
    'Test Period  I - Screening',
    None,
    '',
]
data = {
    # One NaN placeholder per section name; filled in later.
    'section_id': [np.nan] * len(section_names),
    'section_name': section_names,
}
df1 = pd.DataFrame(data)

def extract_section_id(row, prev_ids):
    """Return the section id for one row, or NaN when none can be found.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) with a
            ``'section_name'`` entry.
        prev_ids: dict mapping already-seen title text to its section id.
            It is updated in place whenever a numbered title is parsed.

    Returns:
        The leading dotted number (e.g. ``'1.3.1'``) when the name starts
        with one; otherwise the id of the first previously seen title that
        contains this name as a substring; otherwise ``np.nan``. Missing or
        blank names also give ``np.nan``.
    """
    raw_name = row['section_name']
    # Missing values can arrive as None or as a float NaN; neither has
    # .strip(), so anything that is not a string is treated as blank.
    if not isinstance(raw_name, str) or raw_name.strip() == '':
        return np.nan

    # Group 1: leading dotted number such as "1" or "1.3.1".
    # Group 2: the remainder of the title (including the separating dot).
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', raw_name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id

    # Un-numbered title: reuse the id of an earlier title that contains it.
    wanted = raw_name.strip()
    for name, id_ in prev_ids.items():
        if wanted in name:
            return id_
    return np.nan

# Shared lookup of title text -> section id, filled in as rows are processed.
prev_ids = {}

# Row-wise apply; prev_ids is forwarded as the second positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))

print(df1)

which will give you

  section_id                         section_name
0          1                      1.Test Summary9
1        1.1                        1.1.Synopsis9
2        1.2                         1.2.Schema12
3      1.3.1   1.3.1.Test Period  I - Screening13
4      1.3.2  1.3.2.Period II - obes-Treatment 15
5        1.1                             Synopsis
6      1.3.1           Test Period  I - Screening
7        NaN                                 None
8        NaN                                     

If you absolutely want a blank instead of NaN for missing or empty section names:

import pandas as pd
import numpy as np
import re

# Seven real section titles plus a None and an empty string, so the
# missing/blank handling is exercised as well.
section_names = [
    '1.Test Summary9',
    '1.1.Synopsis9',
    '1.2.Schema12',
    '1.3.1.Test Period  I - Screening13',
    '1.3.2.Period II - obes-Treatment 15',
    'Synopsis',
    'Test Period  I - Screening',
    None,
    '',
]
data = {
    # One NaN placeholder per section name; filled in later.
    'section_id': [np.nan] * len(section_names),
    'section_name': section_names,
}
df1 = pd.DataFrame(data)

def extract_section_id(row, prev_ids):
    """Return the section id for one row, using ``''`` for blank names.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) with a
            ``'section_name'`` entry.
        prev_ids: dict mapping already-seen title text to its section id.
            It is updated in place whenever a numbered title is parsed.

    Returns:
        ``''`` when the name is missing or blank. Otherwise the leading
        dotted number (e.g. ``'1.3.1'``) when the name starts with one, or
        the id of the first previously seen title that contains this name
        as a substring. A non-blank name that matches nothing gives
        ``np.nan``.
    """
    raw_name = row['section_name']
    # Missing values can arrive as None or as a float NaN; neither has
    # .strip(), so anything that is not a string is treated as blank.
    if not isinstance(raw_name, str) or raw_name.strip() == '':
        return ''

    # Group 1: leading dotted number such as "1" or "1.3.1".
    # Group 2: the remainder of the title (including the separating dot).
    match = re.match(r'(\d+(?:\.\d+)*)(.*)', raw_name)
    if match:
        sec_id, sec_name = match.groups()
        prev_ids[sec_name.strip()] = sec_id
        return sec_id

    # Un-numbered title: reuse the id of an earlier title that contains it.
    wanted = raw_name.strip()
    for name, id_ in prev_ids.items():
        if wanted in name:
            return id_
    return np.nan

# Shared lookup of title text -> section id, filled in as rows are processed.
prev_ids = {}

# Row-wise apply; prev_ids is forwarded as the second positional argument.
df1['section_id'] = df1.apply(extract_section_id, axis=1, args=(prev_ids,))

print(df1)

will give

  section_id                         section_name
0          1                      1.Test Summary9
1        1.1                        1.1.Synopsis9
2        1.2                         1.2.Schema12
3      1.3.1   1.3.1.Test Period  I - Screening13
4      1.3.2  1.3.2.Period II - obes-Treatment 15
5        1.1                             Synopsis
6      1.3.1           Test Period  I - Screening
7                                            None
8