How to show datetime object when using groupby and resample in Pandas?
Question:
I have the following data.
import numpy as np
import pandas as pd
import random
df = pd.DataFrame({'DATE_TIME': pd.date_range('2022-11-01', '2022-11-06 23:00:00', freq='20min'),
'ID': [random.randrange(1, 20) for n in range(430)]})
df['VALUE1'] = [random.randrange(110, 140) for n in range(430)]
df['VALUE2'] = [random.randrange(50, 60) for n in range(430)]
df['VALUE3'] = [random.randrange(80, 100) for n in range(430)]
df['VALUE4'] = [random.randrange(30, 50) for n in range(430)]
df['MODEL'] = [random.randrange(1, 3) for n in range(430)]
df['SOLD'] = [random.randrange(0, 2) for n in range(430)]
df['INSPECTION'] = df['DATE_TIME'].dt.day
df['MODE'] = np.select([df['INSPECTION'] == 1, df['INSPECTION'].isin([2, 3])], ['A', 'B'], 'C')
df['TIME'] = df['DATE_TIME'].dt.time
# df['TIME'] = pd.to_timedelta(df['TIME'])
df['TIME'] = df['TIME'].astype('str')
# Create DAY Night columns only-------------------------------------------------------------------------
def cycle_day_period(dataframe: pd.Series, midnight='00:00:00', start_of_morning='06:00:00',
                     start_of_afternoon='13:00:00',
                     start_of_evening='18:00:00', end_of_evening='23:00:00', start_of_night='24:00:00'):
    """Label each time of day as 'Night' or 'Morning'.

    Args:
        dataframe: Despite the name, a Series (or other list-like) of
            time-of-day values that ``pd.to_timedelta`` can parse, such as
            'HH:MM:SS' strings.
        midnight, start_of_morning, start_of_afternoon, start_of_evening,
        end_of_evening, start_of_night: The six bin edges, in increasing
            order, as strings accepted by ``pd.Timedelta``.

    Returns:
        The result of ``pd.cut``: an unordered categorical with the labels
        'Night' and 'Morning'. Values outside
        ``[midnight, start_of_night)`` become NaN.
    """
    bins = [midnight, start_of_morning, start_of_afternoon, start_of_evening, end_of_evening, start_of_night]
    # One label per interval between consecutive edges. The labels repeat,
    # which pd.cut only allows together with ordered=False.
    labels = ['Night', 'Morning', 'Morning', 'Night', 'Night']
    return pd.cut(
        pd.to_timedelta(dataframe),
        bins=list(map(pd.Timedelta, bins)),
        # right=False makes every interval closed on the left: [start, end).
        labels=labels, right=False, ordered=False
    )
df['CYCLE_PART'] = cycle_day_period(df['TIME'], '00:00:00', '06:00:00', '13:00:00', '18:00:00', '23:00:00', '24:00:00')
I resample my time series depending on a condition using the following script.
freq = {'Night': '30min', 'Morning': '15min'}
out = (df.groupby('CYCLE_PART')
.apply(lambda g: g.resample(freq[g.name], on='DATE_TIME').mean())
.reset_index()
)
But, when I print its output neither DATE_TIME nor TIME is visible. How can I solve this problem?
Answers:
You were on the right track. Here is an update of what you did:
# Group by the cycle part, a 1-minute bucket of DATE_TIME, and TIME. Because the
# grouping key is a list, g.name is a tuple and g.name[0] is the CYCLE_PART label,
# which selects the resampling frequency from `freq`.
# numeric_only=True restricts the mean to the numeric columns.
out = (df.groupby(['CYCLE_PART', pd.Grouper(key='DATE_TIME', freq='1min'), 'TIME'], as_index=False)
.apply(lambda g: g.resample(freq[g.name[0]], on='DATE_TIME').mean(numeric_only=True))
)
# Move the index levels (including DATE_TIME) back into ordinary columns.
out = out.reset_index()
# Rebuild TIME from the resampled DATE_TIME values.
out['TIME'] = out['DATE_TIME'].dt.time
which gives:
level_0 DATE_TIME ID VALUE1 VALUE2 VALUE3 VALUE4 MODEL
0 0 2022-11-01 06:00:00 6.0 110.0 56.0 92.0 40.0 2.0
1 1 2022-11-01 06:15:00 19.0 132.0 55.0 82.0 37.0 2.0
2 2 2022-11-01 06:30:00 9.0 130.0 50.0 87.0 31.0 2.0
3 3 2022-11-01 07:00:00 5.0 118.0 52.0 95.0 31.0 1.0
4 4 2022-11-01 07:15:00 17.0 125.0 56.0 81.0 48.0 1.0
.. ... ... ... ... ... ... ... ...
425 425 2022-11-06 21:30:00 7.0 132.0 58.0 94.0 39.0 1.0
426 426 2022-11-06 22:00:00 12.0 126.0 53.0 87.0 41.0 1.0
427 427 2022-11-06 22:00:00 8.0 129.0 52.0 96.0 34.0 2.0
428 428 2022-11-06 22:30:00 9.0 114.0 50.0 98.0 45.0 1.0
429 429 2022-11-06 23:00:00 7.0 110.0 50.0 96.0 35.0 2.0
SOLD INSPECTION TIME
0 1.0 1.0 06:00:00
1 0.0 1.0 06:15:00
2 0.0 1.0 06:30:00
3 1.0 1.0 07:00:00
4 0.0 1.0 07:15:00
.. ... ... ...
425 1.0 6.0 21:30:00
426 1.0 6.0 22:00:00
427 1.0 6.0 22:00:00
428 1.0 6.0 22:30:00
429 0.0 6.0 23:00:00
[430 rows x 11 columns]
One idea is to create a DatetimeIndex first, so that the `on` parameter can be omitted:
# With DATE_TIME as the index, resample() needs no `on` argument and the
# timestamps stay in the result's index, so reset_index() brings them back
# as a column.
# numeric_only=True skips the string columns (MODE, TIME); without it,
# mean() raises a TypeError on pandas >= 2.0.
out = (df.set_index('DATE_TIME')
       .groupby('CYCLE_PART')
       .apply(lambda g: g.resample(freq[g.name]).mean(numeric_only=True))
       .reset_index())
I have the following data.
import numpy as np
import pandas as pd
import random
df = pd.DataFrame({'DATE_TIME': pd.date_range('2022-11-01', '2022-11-06 23:00:00', freq='20min'),
'ID': [random.randrange(1, 20) for n in range(430)]})
df['VALUE1'] = [random.randrange(110, 140) for n in range(430)]
df['VALUE2'] = [random.randrange(50, 60) for n in range(430)]
df['VALUE3'] = [random.randrange(80, 100) for n in range(430)]
df['VALUE4'] = [random.randrange(30, 50) for n in range(430)]
df['MODEL'] = [random.randrange(1, 3) for n in range(430)]
df['SOLD'] = [random.randrange(0, 2) for n in range(430)]
df['INSPECTION'] = df['DATE_TIME'].dt.day
df['MODE'] = np.select([df['INSPECTION'] == 1, df['INSPECTION'].isin([2, 3])], ['A', 'B'], 'C')
df['TIME'] = df['DATE_TIME'].dt.time
# df['TIME'] = pd.to_timedelta(df['TIME'])
df['TIME'] = df['TIME'].astype('str')
# Create DAY Night columns only-------------------------------------------------------------------------
def cycle_day_period(dataframe: pd.Series, midnight='00:00:00', start_of_morning='06:00:00',
                     start_of_afternoon='13:00:00',
                     start_of_evening='18:00:00', end_of_evening='23:00:00', start_of_night='24:00:00'):
    """Label each time of day as 'Night' or 'Morning'.

    Args:
        dataframe: Despite the name, a Series (or other list-like) of
            time-of-day values that ``pd.to_timedelta`` can parse, such as
            'HH:MM:SS' strings.
        midnight, start_of_morning, start_of_afternoon, start_of_evening,
        end_of_evening, start_of_night: The six bin edges, in increasing
            order, as strings accepted by ``pd.Timedelta``.

    Returns:
        The result of ``pd.cut``: an unordered categorical with the labels
        'Night' and 'Morning'. Values outside
        ``[midnight, start_of_night)`` become NaN.
    """
    bins = [midnight, start_of_morning, start_of_afternoon, start_of_evening, end_of_evening, start_of_night]
    # One label per interval between consecutive edges. The labels repeat,
    # which pd.cut only allows together with ordered=False.
    labels = ['Night', 'Morning', 'Morning', 'Night', 'Night']
    return pd.cut(
        pd.to_timedelta(dataframe),
        bins=list(map(pd.Timedelta, bins)),
        # right=False makes every interval closed on the left: [start, end).
        labels=labels, right=False, ordered=False
    )
df['CYCLE_PART'] = cycle_day_period(df['TIME'], '00:00:00', '06:00:00', '13:00:00', '18:00:00', '23:00:00', '24:00:00')
I resample my time series depending on a condition using the following script.
freq = {'Night': '30min', 'Morning': '15min'}
out = (df.groupby('CYCLE_PART')
.apply(lambda g: g.resample(freq[g.name], on='DATE_TIME').mean())
.reset_index()
)
But, when I print its output neither DATE_TIME nor TIME is visible. How can I solve this problem?
You were on the right track. Here is an update of what you did:
# Group by the cycle part, a 1-minute bucket of DATE_TIME, and TIME. Because the
# grouping key is a list, g.name is a tuple and g.name[0] is the CYCLE_PART label,
# which selects the resampling frequency from `freq`.
# numeric_only=True restricts the mean to the numeric columns.
out = (df.groupby(['CYCLE_PART', pd.Grouper(key='DATE_TIME', freq='1min'), 'TIME'], as_index=False)
.apply(lambda g: g.resample(freq[g.name[0]], on='DATE_TIME').mean(numeric_only=True))
)
# Move the index levels (including DATE_TIME) back into ordinary columns.
out = out.reset_index()
# Rebuild TIME from the resampled DATE_TIME values.
out['TIME'] = out['DATE_TIME'].dt.time
which gives:
level_0 DATE_TIME ID VALUE1 VALUE2 VALUE3 VALUE4 MODEL
0 0 2022-11-01 06:00:00 6.0 110.0 56.0 92.0 40.0 2.0
1 1 2022-11-01 06:15:00 19.0 132.0 55.0 82.0 37.0 2.0
2 2 2022-11-01 06:30:00 9.0 130.0 50.0 87.0 31.0 2.0
3 3 2022-11-01 07:00:00 5.0 118.0 52.0 95.0 31.0 1.0
4 4 2022-11-01 07:15:00 17.0 125.0 56.0 81.0 48.0 1.0
.. ... ... ... ... ... ... ... ...
425 425 2022-11-06 21:30:00 7.0 132.0 58.0 94.0 39.0 1.0
426 426 2022-11-06 22:00:00 12.0 126.0 53.0 87.0 41.0 1.0
427 427 2022-11-06 22:00:00 8.0 129.0 52.0 96.0 34.0 2.0
428 428 2022-11-06 22:30:00 9.0 114.0 50.0 98.0 45.0 1.0
429 429 2022-11-06 23:00:00 7.0 110.0 50.0 96.0 35.0 2.0
SOLD INSPECTION TIME
0 1.0 1.0 06:00:00
1 0.0 1.0 06:15:00
2 0.0 1.0 06:30:00
3 1.0 1.0 07:00:00
4 0.0 1.0 07:15:00
.. ... ... ...
425 1.0 6.0 21:30:00
426 1.0 6.0 22:00:00
427 1.0 6.0 22:00:00
428 1.0 6.0 22:30:00
429 0.0 6.0 23:00:00
[430 rows x 11 columns]
One idea is to create a DatetimeIndex first, so that the `on` parameter can be omitted:
# With DATE_TIME as the index, resample() needs no `on` argument and the
# timestamps stay in the result's index, so reset_index() brings them back
# as a column.
# numeric_only=True skips the string columns (MODE, TIME); without it,
# mean() raises a TypeError on pandas >= 2.0.
out = (df.set_index('DATE_TIME')
       .groupby('CYCLE_PART')
       .apply(lambda g: g.resample(freq[g.name]).mean(numeric_only=True))
       .reset_index())