How to replace values in columns with its mean based on the sequential information from another column?
Question:
I have time-series data. My task is to average some of values in the feature columns based on the sequences in target column, and then replace those values in feature columns by averaged value for each sequence. So, for example, in the table below, we have a sequence of three 1s in target, we take average of values in this sequence for feature1
and then for feature2
and after that we replace those values with our mean values. And that should be done for each sequence in target.
ID
time(s)
feature1
feature2
target
abc
500
2.56789
91.12834
0
abc
1000
2.45678
91.23452
1
abc
1500
2.36589
91.54398
1
abc
2000
2.56428
91.32348
1
abc
2500
2.25869
91.79322
0
cba
500
5.36589
93.54398
1
cba
1000
5.56428
93.32348
1
cba
1500
5.25869
94.79322
0
To do this, I made an algorithm based on indexing.
def averaging(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i+1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
However, something goes wrong when I start indexing the target_id
column in output_df
and instead of indexes there, I keep getting empty column (zeros). Tried to figure out what the problem is but nothing helped. Does anyone know what is wrong here or how to solve this problem in another way?
Will really appreciate any help.
Answers:
The problem with your code is contained in this bit:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i+1
When you run this part, you’ll get a SettingWithCopyWarning
:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
output_df.iloc[indexi:indexj]['target_id'] = i+1
What this means essentially, is that you aren’t updating the actual DataFrame, you are updating a copy of a slice.
Indeed, after running this line, you’ll find that the values in column target_id
are still all zeros:
print(output_df.target_id.sum()) # 0
To fix this you could use the following line instead:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i+1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
Once you have corrected this line, your function should generate the expected outcome. To be sure, the last bit could also simply be:
for i, (indexi, indexj) in enumerate(target_index):
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] =
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
Same result. Finally, you could do all of this a bit quicker actually. E.g. using something like this:
def averaging_new(input_df: pd.DataFrame):
output_df = input_df.copy()
output_df['target_id'] = 0
zeros = output_df.target != 0
sequences = (zeros != zeros.shift()).cumsum()
features = ['feature1','feature2']
gb = output_df[zeros].groupby([sequences, 'ID'])[features].transform('mean')
gb['target_id'] = output_df[zeros].groupby([sequences, 'ID'])[features].ngroup() + 1
output_df.loc[gb.index, features] = gb[features]
output_df.loc[gb.index, 'target_id'] = gb['target_id']
return output_df
output_df = averaging_new(input_df)
output_df
ID time(s) feature1 feature2 target target_id
0 abc 500 2.567890 91.128340 0 0
1 abc 1000 2.462317 91.367327 1 1
2 abc 1500 2.462317 91.367327 1 1
3 abc 2000 2.462317 91.367327 1 1
4 abc 2500 2.258690 91.793220 0 0
5 cba 500 5.465085 93.433730 1 2
6 cba 1000 5.465085 93.433730 1 2
7 cba 1500 5.258690 94.793220 0 0
# comparing that result with your function corrected as well as your function refactored:
def averaging_corrected(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i+1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
def averaging_refactored(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] =
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
output_df2 = averaging_corrected(input_df)
output_df3 = averaging_refactored(input_df)
dfs = [output_df, output_df2, output_df3]
all(x.equals(dfs[0]) for x in dfs)
# True
N.B. in averaging_new
and averaging_refactored
, I’m not actually using the column target_id
to fill the feature columns. Just adding it for the comparison.
I have time-series data. My task is to average some of values in the feature columns based on the sequences in target column, and then replace those values in feature columns by averaged value for each sequence. So, for example, in the table below, we have a sequence of three 1s in target, we take average of values in this sequence for feature1
and then for feature2
and after that we replace those values with our mean values. And that should be done for each sequence in target.
ID | time(s) | feature1 | feature2 | target |
---|---|---|---|---|
abc | 500 | 2.56789 | 91.12834 | 0 |
abc | 1000 | 2.45678 | 91.23452 | 1 |
abc | 1500 | 2.36589 | 91.54398 | 1 |
abc | 2000 | 2.56428 | 91.32348 | 1 |
abc | 2500 | 2.25869 | 91.79322 | 0 |
cba | 500 | 5.36589 | 93.54398 | 1 |
cba | 1000 | 5.56428 | 93.32348 | 1 |
cba | 1500 | 5.25869 | 94.79322 | 0 |
To do this, I made an algorithm based on indexing.
def averaging(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i+1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
However, something goes wrong when I start indexing the target_id
column in output_df
and instead of indexes there, I keep getting empty column (zeros). Tried to figure out what the problem is but nothing helped. Does anyone know what is wrong here or how to solve this problem in another way?
Will really appreciate any help.
The problem with your code is contained in this bit:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i+1
When you run this part, you’ll get a SettingWithCopyWarning
:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
output_df.iloc[indexi:indexj]['target_id'] = i+1
What this means essentially, is that you aren’t updating the actual DataFrame, you are updating a copy of a slice.
Indeed, after running this line, you’ll find that the values in column target_id
are still all zeros:
print(output_df.target_id.sum()) # 0
To fix this you could use the following line instead:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i+1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
Once you have corrected this line, your function should generate the expected outcome. To be sure, the last bit could also simply be:
for i, (indexi, indexj) in enumerate(target_index):
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] =
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
Same result. Finally, you could do all of this a bit quicker actually. E.g. using something like this:
def averaging_new(input_df: pd.DataFrame):
output_df = input_df.copy()
output_df['target_id'] = 0
zeros = output_df.target != 0
sequences = (zeros != zeros.shift()).cumsum()
features = ['feature1','feature2']
gb = output_df[zeros].groupby([sequences, 'ID'])[features].transform('mean')
gb['target_id'] = output_df[zeros].groupby([sequences, 'ID'])[features].ngroup() + 1
output_df.loc[gb.index, features] = gb[features]
output_df.loc[gb.index, 'target_id'] = gb['target_id']
return output_df
output_df = averaging_new(input_df)
output_df
ID time(s) feature1 feature2 target target_id
0 abc 500 2.567890 91.128340 0 0
1 abc 1000 2.462317 91.367327 1 1
2 abc 1500 2.462317 91.367327 1 1
3 abc 2000 2.462317 91.367327 1 1
4 abc 2500 2.258690 91.793220 0 0
5 cba 500 5.465085 93.433730 1 2
6 cba 1000 5.465085 93.433730 1 2
7 cba 1500 5.258690 94.793220 0 0
# comparing that result with your function corrected as well as your function refactored:
def averaging_corrected(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i+1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
def averaging_refactored(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1]+1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i+1
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] =
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
output_df2 = averaging_corrected(input_df)
output_df3 = averaging_refactored(input_df)
dfs = [output_df, output_df2, output_df3]
all(x.equals(dfs[0]) for x in dfs)
# True
N.B. in averaging_new
and averaging_refactored
, I’m not actually using the column target_id
to fill the feature columns. Just adding it for the comparison.