How to replace values in columns with its mean based on the sequential information from another column?

Question:

I have time-series data. My task is to average some of the values in the feature columns based on the sequences in the target column, and then replace those values in the feature columns with the averaged value for each sequence. For example, in the table below there is a sequence of three 1s in target for ID abc: we take the average of feature1 over that sequence, then the average of feature2, and replace the original values in those rows with the respective means. This should be done for each sequence of 1s in target.

ID time(s) feature1 feature2 target
abc 500 2.56789 91.12834 0
abc 1000 2.45678 91.23452 1
abc 1500 2.36589 91.54398 1
abc 2000 2.56428 91.32348 1
abc 2500 2.25869 91.79322 0
cba 500 5.36589 93.54398 1
cba 1000 5.56428 93.32348 1
cba 1500 5.25869 94.79322 0

To do this, I made an algorithm based on indexing.

def averaging(input_df: pd.DataFrame):
    """Average ``feature1``/``feature2`` over each run of 1s in ``target``.

    Runs are detected separately inside every ``ID`` group. Each run is meant
    to receive a 1-based ``target_id`` (0 marks rows outside any run), and the
    feature values of a run are then overwritten with that run's mean.

    NOTE: the ``target_id`` assignment below uses chained indexing, so the
    written values do not reach ``output_df`` and the column stays all zeros.

    Returns a copy of ``input_df`` with an added ``target_id`` column.
    """
    output_df = input_df.copy()
    # Collected [start, stop) pairs, one per run of 1s, built from index labels.
    target_index = []
    
    for _, sub_df in input_df.groupby('ID'): 
        _index = sub_df.index
        _targets = sub_df['target'].tolist()

        # Seed the state with the first row so a run starting at the very
        # first row of the group is opened here rather than inside the loop.
        before = 1 if _targets[0] == 1 else 0

        tmp = []
        if before:
            tmp.append(_index[0])

        for i, flag in enumerate(_targets):
            if flag == 1 and before == 0:
                # 0 -> 1 transition: a run starts at this row.
                tmp.append(_index[i]) 
            elif flag == 0 and before == 1:
                # 1 -> 0 transition: this row is the exclusive end of the run.
                tmp.append(_index[i])
                target_index.append(tmp)
                tmp = []
            before = flag
            
        # A run still open at the end of the group ends one past the last label.
        if tmp:
            tmp.append(_index[-1]+1)
            target_index.append(tmp)

    output_df['target_id'] = 0
    for i, (indexi, indexj) in enumerate(target_index):
        # Index labels are used as positions here, which presumably relies on
        # a default 0..n-1 index -- TODO confirm against the real data.
        # Chained indexing (.iloc[...] followed by ['target_id']): the value is
        # assigned to an intermediate object, not to output_df itself.
        output_df.iloc[indexi:indexj]['target_id'] = i+1

    # Per-run feature means, computed only over rows that belong to a run.
    targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
    targetid2avg.columns = ['target_feature1', 'target_feature2']

    # Attach the run means to every row, then copy them over the features of
    # the rows that belong to a run.
    output_df = output_df.merge(targetid2avg, on='target_id', how='left')
    output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
    
    # The helper mean columns are no longer needed.
    output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
    
    return output_df

However, something goes wrong when I assign the run ids to the target_id column in output_df: instead of the ids, the column stays empty (all zeros). I tried to figure out what the problem is, but nothing helped. Does anyone know what is wrong here, or how to solve this problem in another way?

Will really appreciate any help.

Asked By: Ayo

||

Answers:

The problem with your code is contained in this bit:

    output_df['target_id'] = 0
    for i, (indexi, indexj) in enumerate(target_index):
        # Chained indexing: .iloc[...] produces an intermediate object and the
        # ['target_id'] assignment is applied to that object, not to output_df.
        output_df.iloc[indexi:indexj]['target_id'] = i+1

When you run this part, you’ll get a SettingWithCopyWarning:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df.iloc[indexi:indexj]['target_id'] = i+1

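Essentially, this means that you aren’t updating the actual DataFrame; you are updating a copy of a slice. The expression output_df.iloc[indexi:indexj]['target_id'] is chained indexing: the .iloc[...] step returns an intermediate object, and the assignment is applied to that object instead of output_df.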

Indeed, after running this line, you’ll find that the values in column target_id are still all zeros:

print(output_df.target_id.sum()) # 0

To fix this you could use the following line instead:

    output_df['target_id'] = 0
    # Look the column position up by name: this keeps the write in a single
    # .iloc indexing step (no chained assignment) and still hits the right
    # column if 'target_id' is not the last one.
    target_id_pos = output_df.columns.get_loc('target_id')
    for i, (indexi, indexj) in enumerate(target_index):
        # was: output_df.iloc[indexi:indexj]['target_id'] = i+1
        output_df.iloc[indexi:indexj, target_id_pos] = i+1

Once you have corrected this line, your function should generate the expected outcome. To be sure, the last bit could also simply be:

    for i, (indexi, indexj) in enumerate(target_index):

        means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
        output_df.loc[indexi:indexj-1,['feature1', 'feature2']] = 
            output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
    
    return output_df

Same result. Finally, you could do all of this a bit quicker actually. E.g. using something like this:

def averaging_new(input_df: pd.DataFrame):
    """Vectorised version: average the features over each run of non-zero targets.

    Consecutive rows with a non-zero ``target`` form a run. Runs are keyed by
    a change counter together with ``ID``, so a run never spans two IDs. The
    features of each run are replaced by the run mean, and ``target_id``
    holds the 1-based run number (0 for rows outside any run).

    Returns a modified copy; ``input_df`` itself is left untouched.
    """
    feature_cols = ['feature1', 'feature2']

    result = input_df.copy()
    result['target_id'] = 0

    # True for rows that belong to a run.
    in_run = result.target != 0
    # Counter that increases every time in_run flips, so each stretch of
    # equal values shares one key.
    run_key = (in_run != in_run.shift()).cumsum()

    # One groupby object serves both the per-row means and the run numbering.
    per_run = result[in_run].groupby([run_key, 'ID'])[feature_cols]
    run_values = per_run.transform('mean')
    run_values['target_id'] = per_run.ngroup() + 1

    # Write the results back only to the rows that are part of a run.
    result.loc[run_values.index, feature_cols] = run_values[feature_cols]
    result.loc[run_values.index, 'target_id'] = run_values['target_id']

    return result

output_df = averaging_new(input_df)
output_df

    ID  time(s)   feature1   feature2  target  target_id
0  abc      500   2.567890  91.128340       0          0
1  abc     1000   2.462317  91.367327       1          1
2  abc     1500   2.462317  91.367327       1          1
3  abc     2000   2.462317  91.367327       1          1
4  abc     2500   2.258690  91.793220       0          0
5  cba      500   5.465085  93.433730       1          2
6  cba     1000   5.465085  93.433730       1          2
7  cba     1500   5.258690  94.793220       0          0

# comparing that result with your function corrected as well as your function refactored:

def averaging_corrected(input_df: pd.DataFrame):
    """Average ``feature1``/``feature2`` over each run of 1s in ``target``.

    Runs are detected separately inside every ``ID`` group. Each run gets a
    1-based ``target_id`` (0 marks rows outside any run), and the feature
    values of a run are overwritten with that run's mean.

    The run boundaries are index labels that are later used as positions
    with ``.iloc``, so the frame is expected to carry a default 0..n-1 index.

    Returns a modified copy of ``input_df`` with a ``target_id`` column.
    """
    output_df = input_df.copy()
    # Collected [start, stop) pairs, one per run of 1s.
    target_index = []

    for _, sub_df in input_df.groupby('ID'):
        _index = sub_df.index
        _targets = sub_df['target'].tolist()

        # Seed the state with the first row so a run starting at the very
        # first row of the group is opened here rather than inside the loop.
        before = 1 if _targets[0] == 1 else 0

        tmp = []
        if before:
            tmp.append(_index[0])

        for i, flag in enumerate(_targets):
            if flag == 1 and before == 0:
                # 0 -> 1 transition: a run starts at this row.
                tmp.append(_index[i])
            elif flag == 0 and before == 1:
                # 1 -> 0 transition: this row is the exclusive end of the run.
                tmp.append(_index[i])
                target_index.append(tmp)
                tmp = []
            before = flag

        # A run still open at the end of the group ends one past the last label.
        if tmp:
            tmp.append(_index[-1]+1)
            target_index.append(tmp)

    output_df['target_id'] = 0
    # Resolve the column position by name instead of assuming it is the last
    # column: if the input already had a 'target_id' column elsewhere, writing
    # to the last position would overwrite a different column.
    target_id_pos = output_df.columns.get_loc('target_id')
    for i, (indexi, indexj) in enumerate(target_index):
        # Rows and column in a single .iloc call: no chained assignment.
        output_df.iloc[indexi:indexj, target_id_pos] = i+1

    # Per-run feature means, computed only over rows that belong to a run.
    targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
    targetid2avg.columns = ['target_feature1', 'target_feature2']

    # Attach the run means to every row, then copy them over the features of
    # the rows that belong to a run.
    output_df = output_df.merge(targetid2avg, on='target_id', how='left')
    output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values

    # The helper mean columns are no longer needed.
    output_df = output_df.drop(columns=['target_feature1', 'target_feature2'])

    return output_df

def averaging_refactored(input_df: pd.DataFrame):
    """Average ``feature1``/``feature2`` over each run of 1s in ``target``.

    Same run detection as the loop-based version, but each run's features are
    overwritten with the run mean directly, without a merge on ``target_id``.
    ``target_id`` is still filled (1-based per run, 0 elsewhere) so the result
    can be compared with the other implementations.

    The run boundaries are index labels that are also used as positions with
    ``.iloc``, so the frame is expected to carry a default 0..n-1 index.

    Returns a modified copy of ``input_df`` with a ``target_id`` column.
    """
    features = ['feature1', 'feature2']
    output_df = input_df.copy()
    # Collected [start, stop) pairs, one per run of 1s.
    target_index = []

    for _, sub_df in input_df.groupby('ID'):
        _index = sub_df.index
        _targets = sub_df['target'].tolist()

        # Seed the state with the first row so a run starting at the very
        # first row of the group is opened here rather than inside the loop.
        before = 1 if _targets[0] == 1 else 0

        tmp = []
        if before:
            tmp.append(_index[0])

        for i, flag in enumerate(_targets):
            if flag == 1 and before == 0:
                # 0 -> 1 transition: a run starts at this row.
                tmp.append(_index[i])
            elif flag == 0 and before == 1:
                # 1 -> 0 transition: this row is the exclusive end of the run.
                tmp.append(_index[i])
                target_index.append(tmp)
                tmp = []
            before = flag

        # A run still open at the end of the group ends one past the last label.
        if tmp:
            tmp.append(_index[-1]+1)
            target_index.append(tmp)

    output_df['target_id'] = 0
    # Resolve the column position by name instead of assuming it is the last
    # column of the frame.
    target_id_pos = output_df.columns.get_loc('target_id')
    for i, (indexi, indexj) in enumerate(target_index):
        output_df.iloc[indexi:indexj, target_id_pos] = i+1

        # .loc slices include the end label, so the half-open [indexi, indexj)
        # run is addressed as indexi:indexj-1.
        means = output_df.loc[indexi:indexj-1, features].mean(axis=0)
        # Parentheses keep the right-hand side on its own line; a bare line
        # break after "=" is a SyntaxError.
        output_df.loc[indexi:indexj-1, features] = (
            output_df.loc[indexi:indexj-1, features].assign(**means)
        )

    return output_df

output_df2 = averaging_corrected(input_df)
output_df3 = averaging_refactored(input_df)

dfs = [output_df, output_df2, output_df3]
all(x.equals(dfs[0]) for x in dfs)
# True

N.B. in averaging_new and averaging_refactored, I’m not actually using the column target_id to fill the feature columns. Just adding it for the comparison.

Answered By: ouroboros1