calculating over partition in pandas dataframe
Question:
I have a table like so:
ID Timestamp Status
A 5/30/2022 2:29 Run Ended
A 5/30/2022 0:23 In Progress
A 5/30/2022 0:22 Prepared
B 5/30/2022 11:15 Run Ended
B 5/30/2022 9:18 In Progress
B 5/30/2022 0:55 Prepared
I want to compute the duration between each status grouped by the ID.
So the resulting output table would be:
ID Duration(min) Status change
A 0.40 In Progress-Prepared
A 125.82 Run Ended - In Progress
B 502.78 In Progress-Prepared
B 117.34 Run Ended - In Progress
How do I order it by descending timestamp (grouped by ID) and then subtract the last row from the previous row all the way upto the top for each ID group?
Answers:
You can use groupby('ID')[value].shift(1)
to access the previous value
in the same ID
group.
import pandas as pd
df = pd.DataFrame({
'ID': ['a','a','a','b','b','b'],
'time': [1,2,3,1,4,5],
'status': ['x','y','z','xx','yy','zz']
})
df['previous_time'] = df.groupby('ID')['time'].shift(1)
df['previous_status'] = df.groupby('ID')['status'].shift(1)
df = df.dropna()
df['duration'] = df['time'] - df['previous_time'] # change this line to calculate duration between time instead
df['status_change'] = df['previous_status'] + '-' + df['status']
print (df[['ID','duration','status_change']].to_markdown(index=False))
Output:
ID
duration
status_change
a
1
x-y
a
1
y-z
b
3
xx-yy
b
1
yy-zz
PS. you can subtract time
and previous_time
with answer in this thread
You can use a groupby.diff
and groupby.shift
:
out = (df
.assign(**{'Duration(min)': pd.to_datetime(df['Timestamp'], dayfirst=False)
.groupby(df['ID'])
.diff(-1).dt.total_seconds() # diff in seconds to next time in group
.div(60), # convert to minutes
'Status change': df.groupby('ID')['Status'].shift(-1)+'-'+df['Status']
})
.dropna(subset='Duration(min)') # get rid of empty rows
[['ID', 'Duration(min)', 'Status change']]
)
Output:
ID Duration(min) Status change
0 A 126.0 In Progress-Run Ended
1 A 1.0 Prepared-In Progress
3 B 117.0 In Progress-Run Ended
4 B 503.0 Prepared-In Progress
def function1(dd:pd.DataFrame):
val1=dd.query("Status=='In Progress'").Timestamp.squeeze()-dd.query("Status=='Prepared'").Timestamp.squeeze()
dd1=pd.DataFrame({'ID':dd.name,"Duration(min)":val1.total_seconds()/60,"Status change":"In Progress-Prepared"},[0])
val2=dd.query("Status=='Run Ended'").Timestamp.squeeze()-dd.query("Status=='In Progress'").Timestamp.squeeze()
dd2=pd.DataFrame({'ID':dd.name,"Duration(min)":val2.total_seconds()/60,"Status change":"Run Ended - In Progress"},[1])
return pd.concat([dd1,dd2])
df1=df1.assign(Timestamp=pd.to_datetime(df1.Timestamp))
df1.groupby('ID').apply(function1).reset_index(drop=True)
out
ID Duration(min) Status change
0 A 1.0 In Progress-Prepared
1 A 126.0 Run Ended - In Progress
2 B 503.0 In Progress-Prepared
3 B 117.0 Run Ended - In Progress
I have a table like so:
ID Timestamp Status
A 5/30/2022 2:29 Run Ended
A 5/30/2022 0:23 In Progress
A 5/30/2022 0:22 Prepared
B 5/30/2022 11:15 Run Ended
B 5/30/2022 9:18 In Progress
B 5/30/2022 0:55 Prepared
I want to compute the duration between each status grouped by the ID.
So the resulting output table would be:
ID Duration(min) Status change
A 0.40 In Progress-Prepared
A 125.82 Run Ended - In Progress
B 502.78 In Progress-Prepared
B 117.34 Run Ended - In Progress
How do I order it by descending timestamp (grouped by ID) and then subtract the last row from the previous row all the way upto the top for each ID group?
You can use groupby('ID')[value].shift(1)
to access the previous value
in the same ID
group.
import pandas as pd
df = pd.DataFrame({
'ID': ['a','a','a','b','b','b'],
'time': [1,2,3,1,4,5],
'status': ['x','y','z','xx','yy','zz']
})
df['previous_time'] = df.groupby('ID')['time'].shift(1)
df['previous_status'] = df.groupby('ID')['status'].shift(1)
df = df.dropna()
df['duration'] = df['time'] - df['previous_time'] # change this line to calculate duration between time instead
df['status_change'] = df['previous_status'] + '-' + df['status']
print (df[['ID','duration','status_change']].to_markdown(index=False))
Output:
ID | duration | status_change |
---|---|---|
a | 1 | x-y |
a | 1 | y-z |
b | 3 | xx-yy |
b | 1 | yy-zz |
PS. you can subtract time
and previous_time
with answer in this thread
You can use a groupby.diff
and groupby.shift
:
out = (df
.assign(**{'Duration(min)': pd.to_datetime(df['Timestamp'], dayfirst=False)
.groupby(df['ID'])
.diff(-1).dt.total_seconds() # diff in seconds to next time in group
.div(60), # convert to minutes
'Status change': df.groupby('ID')['Status'].shift(-1)+'-'+df['Status']
})
.dropna(subset='Duration(min)') # get rid of empty rows
[['ID', 'Duration(min)', 'Status change']]
)
Output:
ID Duration(min) Status change
0 A 126.0 In Progress-Run Ended
1 A 1.0 Prepared-In Progress
3 B 117.0 In Progress-Run Ended
4 B 503.0 Prepared-In Progress
def function1(dd:pd.DataFrame):
val1=dd.query("Status=='In Progress'").Timestamp.squeeze()-dd.query("Status=='Prepared'").Timestamp.squeeze()
dd1=pd.DataFrame({'ID':dd.name,"Duration(min)":val1.total_seconds()/60,"Status change":"In Progress-Prepared"},[0])
val2=dd.query("Status=='Run Ended'").Timestamp.squeeze()-dd.query("Status=='In Progress'").Timestamp.squeeze()
dd2=pd.DataFrame({'ID':dd.name,"Duration(min)":val2.total_seconds()/60,"Status change":"Run Ended - In Progress"},[1])
return pd.concat([dd1,dd2])
df1=df1.assign(Timestamp=pd.to_datetime(df1.Timestamp))
df1.groupby('ID').apply(function1).reset_index(drop=True)
out
ID Duration(min) Status change
0 A 1.0 In Progress-Prepared
1 A 126.0 Run Ended - In Progress
2 B 503.0 In Progress-Prepared
3 B 117.0 Run Ended - In Progress