how to fill missing seconds in pandas dataframe
Question:
I have a DataFrame, and I want to fill in the missing seconds values in its Time column. How can I do that?
This is my data:
df = pd.DataFrame({
'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
'Time': ['12:47:30', '0', '0', '12:47:33', '12:47:34', '0', '12:47:36', '12:47:37', '0', '0'],
'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})
In Time column after 12:47:30 in place of 0 it has to be 12:47:31. In other words, my expected output is:
df = pd.DataFrame({
'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
'Time': ['12:47:30', '12:47:31', '12:47:32', '12:47:33', '12:47:34', '12:47:35', '12:47:36', '12:47:37', '12:47:38', '12:47:39'],
'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})
Answers:
Since your column always increments by one second, you can just "create" it with pd.date_range
The following line gives the desired output.
# date_range yields full timestamps (a date is filled in automatically), so
# format them back to 'HH:MM:SS' strings to match the expected 'Time' column.
df['Time'] = pd.date_range(start='12:47:30', end='12:47:39', freq='s').strftime('%H:%M:%S')
If you have a big dataset, instead of specifying the end, you can simply pass the number of values to create with the periods
parameter.
Create a DatetimeIndex first, then use DataFrame.resample, and finally set the column values:
# Build a DatetimeIndex from the Date and Time columns; rows holding the '0'
# placeholder do not match the format and are coerced to NaT.
df.index = pd.to_datetime(df['Date'] + df['Time'].astype(str),
                          format='%m/%d%H:%M:%S',
                          errors='coerce')
# Resample to one row per second. The lowercase 's' alias is used because the
# uppercase 'S' alias is deprecated in recent pandas releases.
out = df.resample('s').first()
# Rebuild the columns from the new per-second index.
out['Time'] = out.index.time
out['Date'] = out.index.strftime('%m/%d')
out['rpm'] = out['rpm'].fillna(0)
# Renumber the seconds 1, 2, 3, ... within each date.
out['sec'] = out.groupby('Date').cumcount().add(1)
print(out)
sec Date Time rpm
1900-07-07 12:47:30 1 07/07 12:47:30 0.0
1900-07-07 12:47:31 2 07/07 12:47:31 0.0
1900-07-07 12:47:32 3 07/07 12:47:32 0.0
1900-07-07 12:47:33 4 07/07 12:47:33 0.0
1900-07-07 12:47:34 5 07/07 12:47:34 0.0
1900-07-07 12:47:35 6 07/07 12:47:35 0.0
1900-07-07 12:47:36 7 07/07 12:47:36 0.0
1900-07-07 12:47:37 8 07/07 12:47:37 0.0
out = out.reset_index(drop=True)
print (out)
sec Date Time rpm
0 1 07/07 12:47:30 0.0
1 2 07/07 12:47:31 0.0
2 3 07/07 12:47:32 0.0
3 4 07/07 12:47:33 0.0
4 5 07/07 12:47:34 0.0
5 6 07/07 12:47:35 0.0
6 7 07/07 12:47:36 0.0
7 8 07/07 12:47:37 0.0
Another solution: forward-fill the dates with Series.ffill, then add the seconds for the rows that have no time, using a counter created by GroupBy.cumcount and converted with to_timedelta:
# Parse Date + Time into timestamps; rows holding the '0' placeholder do not
# match the format and are coerced to NaT.
combined = df['Date'] + df['Time'].astype(str)
dates = pd.to_datetime(combined, format='%m/%d%H:%M:%S', errors='coerce')
# Every real timestamp opens a new run of rows; cumcount numbers the rows
# inside a run 0, 1, 2, ... which is the number of seconds to add to the
# run's starting time.
run_id = dates.notna().cumsum()
offset_in_run = df.groupby(run_id).cumcount()
sec = pd.to_timedelta(offset_in_run, unit='s')
# Carry the last real timestamp forward and shift it by the in-run offset.
df['Time'] = (dates.ffill() + sec).dt.strftime('%H:%M:%S')
print(df)
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Here is the code that you want:
import pandas as pd

df = pd.DataFrame({
    'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
    'Time': ['12:47:30', '0', '0', '12:47:33', '12:47:34', '0', '12:47:36', '12:47:37', '0', '0'],
    'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})

# Create a mask for rows with '0' time values
mask = df['Time'] == '0'
# Find the index of the first non-zero time value
# (used as a position below, which is valid for the default 0..n-1 index)
first_nonzero_idx = df.loc[~mask, 'Time'].index[0]
# Convert the 'Time' column to a list for easier manipulation
times = df['Time'].tolist()
# Fill in the missing time values by incrementing from the previous time value;
# the previous entry is always already filled because we walk forward.
for i in range(first_nonzero_idx + 1, len(times)):
    if mask[i]:
        # Parse with an explicit format so the result does not depend on
        # pandas' format inference.
        prev_time = pd.to_datetime(times[i-1], format='%H:%M:%S')
        times[i] = (prev_time + pd.DateOffset(seconds=1)).strftime('%H:%M:%S')
# Update the 'Time' column in the dataframe
df['Time'] = times
print(df)
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Convert the Time column with pd.to_datetime and add one second to the previous time, as shown below.
Code:
# Convert the 'Time' column to datetime; the '0' placeholders cannot be parsed
# and are coerced to NaT (Not a Time).
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
# Walk the column in order and replace each NaT with the previous time plus
# one second. pd.Timedelta is used so no separate `datetime` import is needed,
# and index labels are iterated so df.at works for any unique index.
previous_time = None
for idx, current in df['Time'].items():
    if pd.isnull(current):
        if previous_time is None:
            # Leading gap: there is no earlier time to count from, so the
            # value is left as NaT.
            continue
        current = previous_time + pd.Timedelta(seconds=1)
        df.at[idx, 'Time'] = current
    previous_time = current
# Format back to 'HH:MM:SS' strings; the vectorised .dt.strftime handles any
# remaining NaT without raising.
df['Time'] = df['Time'].dt.strftime('%H:%M:%S')
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Another possible solution, which uses linear interpolation to fill the null times:
# Linear interpolation helper from SciPy.
from scipy.interpolate import interp1d
# Parse the times; the '0' placeholders cannot be parsed and are coerced to NaT.
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
# Keep only the rows that have a real time: these are the known (sec, Time) points.
df_nonan = df[['sec', 'Time']].dropna()
# Fit Time as a function of sec; fill_value='extrapolate' lets sec values
# outside the known range be evaluated as well.
# NOTE(review): presumably interp1d coerces the datetime values to numbers
# internally - confirm against the SciPy version in use.
f = interp1d(df_nonan.iloc[:, 0], df_nonan.iloc[:, 1],
fill_value='extrapolate')
# Evaluate at every sec value and convert the result back to datetimes.
df['Time'] = pd.to_datetime(f(df['sec']))
# Keep only the time-of-day part.
df['Time'] = df['Time'].dt.time
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
I have a DataFrame, and I want to fill in the missing seconds values in its Time column. How can I do that?
This is my data:
df = pd.DataFrame({
'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
'Time': ['12:47:30', '0', '0', '12:47:33', '12:47:34', '0', '12:47:36', '12:47:37', '0', '0'],
'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})
In Time column after 12:47:30 in place of 0 it has to be 12:47:31. In other words, my expected output is:
df = pd.DataFrame({
'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
'Time': ['12:47:30', '12:47:31', '12:47:32', '12:47:33', '12:47:34', '12:47:35', '12:47:36', '12:47:37', '12:47:38', '12:47:39'],
'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})
Since your column always increments by one second, you can just "create" it with pd.date_range
The following line gives the desired output.
# date_range yields full timestamps (a date is filled in automatically), so
# format them back to 'HH:MM:SS' strings to match the expected 'Time' column.
df['Time'] = pd.date_range(start='12:47:30', end='12:47:39', freq='s').strftime('%H:%M:%S')
If you have a big dataset, instead of specifying the end, you can simply pass the number of values to create with the periods
parameter.
Create a DatetimeIndex first, then use DataFrame.resample, and finally set the column values:
# Build a DatetimeIndex from the Date and Time columns; rows holding the '0'
# placeholder do not match the format and are coerced to NaT.
df.index = pd.to_datetime(df['Date'] + df['Time'].astype(str),
                          format='%m/%d%H:%M:%S',
                          errors='coerce')
# Resample to one row per second. The lowercase 's' alias is used because the
# uppercase 'S' alias is deprecated in recent pandas releases.
out = df.resample('s').first()
# Rebuild the columns from the new per-second index.
out['Time'] = out.index.time
out['Date'] = out.index.strftime('%m/%d')
out['rpm'] = out['rpm'].fillna(0)
# Renumber the seconds 1, 2, 3, ... within each date.
out['sec'] = out.groupby('Date').cumcount().add(1)
print(out)
sec Date Time rpm
1900-07-07 12:47:30 1 07/07 12:47:30 0.0
1900-07-07 12:47:31 2 07/07 12:47:31 0.0
1900-07-07 12:47:32 3 07/07 12:47:32 0.0
1900-07-07 12:47:33 4 07/07 12:47:33 0.0
1900-07-07 12:47:34 5 07/07 12:47:34 0.0
1900-07-07 12:47:35 6 07/07 12:47:35 0.0
1900-07-07 12:47:36 7 07/07 12:47:36 0.0
1900-07-07 12:47:37 8 07/07 12:47:37 0.0
out = out.reset_index(drop=True)
print (out)
sec Date Time rpm
0 1 07/07 12:47:30 0.0
1 2 07/07 12:47:31 0.0
2 3 07/07 12:47:32 0.0
3 4 07/07 12:47:33 0.0
4 5 07/07 12:47:34 0.0
5 6 07/07 12:47:35 0.0
6 7 07/07 12:47:36 0.0
7 8 07/07 12:47:37 0.0
Another solution: forward-fill the dates with Series.ffill, then add the seconds for the rows that have no time, using a counter created by GroupBy.cumcount and converted with to_timedelta:
# Parse Date + Time into timestamps; rows holding the '0' placeholder do not
# match the format and are coerced to NaT.
combined = df['Date'] + df['Time'].astype(str)
dates = pd.to_datetime(combined, format='%m/%d%H:%M:%S', errors='coerce')
# Every real timestamp opens a new run of rows; cumcount numbers the rows
# inside a run 0, 1, 2, ... which is the number of seconds to add to the
# run's starting time.
run_id = dates.notna().cumsum()
offset_in_run = df.groupby(run_id).cumcount()
sec = pd.to_timedelta(offset_in_run, unit='s')
# Carry the last real timestamp forward and shift it by the in-run offset.
df['Time'] = (dates.ffill() + sec).dt.strftime('%H:%M:%S')
print(df)
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Here is the code that you want:
import pandas as pd

df = pd.DataFrame({
    'sec': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Date': ['7/7', '0', '0', '7/7', '7/7', '0', '7/7', '7/7', '0', '0'],
    'Time': ['12:47:30', '0', '0', '12:47:33', '12:47:34', '0', '12:47:36', '12:47:37', '0', '0'],
    'rpm': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
})

# Create a mask for rows with '0' time values
mask = df['Time'] == '0'
# Find the index of the first non-zero time value
# (used as a position below, which is valid for the default 0..n-1 index)
first_nonzero_idx = df.loc[~mask, 'Time'].index[0]
# Convert the 'Time' column to a list for easier manipulation
times = df['Time'].tolist()
# Fill in the missing time values by incrementing from the previous time value;
# the previous entry is always already filled because we walk forward.
for i in range(first_nonzero_idx + 1, len(times)):
    if mask[i]:
        # Parse with an explicit format so the result does not depend on
        # pandas' format inference.
        prev_time = pd.to_datetime(times[i-1], format='%H:%M:%S')
        times[i] = (prev_time + pd.DateOffset(seconds=1)).strftime('%H:%M:%S')
# Update the 'Time' column in the dataframe
df['Time'] = times
print(df)
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Convert the Time column with pd.to_datetime and add one second to the previous time, as shown below.
Code:
# Convert the 'Time' column to datetime; the '0' placeholders cannot be parsed
# and are coerced to NaT (Not a Time).
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
# Walk the column in order and replace each NaT with the previous time plus
# one second. pd.Timedelta is used so no separate `datetime` import is needed,
# and index labels are iterated so df.at works for any unique index.
previous_time = None
for idx, current in df['Time'].items():
    if pd.isnull(current):
        if previous_time is None:
            # Leading gap: there is no earlier time to count from, so the
            # value is left as NaT.
            continue
        current = previous_time + pd.Timedelta(seconds=1)
        df.at[idx, 'Time'] = current
    previous_time = current
# Format back to 'HH:MM:SS' strings; the vectorised .dt.strftime handles any
# remaining NaT without raising.
df['Time'] = df['Time'].dt.strftime('%H:%M:%S')
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0
Another possible solution, which uses linear interpolation to fill the null times:
# Linear interpolation helper from SciPy.
from scipy.interpolate import interp1d
# Parse the times; the '0' placeholders cannot be parsed and are coerced to NaT.
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
# Keep only the rows that have a real time: these are the known (sec, Time) points.
df_nonan = df[['sec', 'Time']].dropna()
# Fit Time as a function of sec; fill_value='extrapolate' lets sec values
# outside the known range be evaluated as well.
# NOTE(review): presumably interp1d coerces the datetime values to numbers
# internally - confirm against the SciPy version in use.
f = interp1d(df_nonan.iloc[:, 0], df_nonan.iloc[:, 1],
fill_value='extrapolate')
# Evaluate at every sec value and convert the result back to datetimes.
df['Time'] = pd.to_datetime(f(df['sec']))
# Keep only the time-of-day part.
df['Time'] = df['Time'].dt.time
Output:
sec Date Time rpm
0 1 7/7 12:47:30 0.0
1 2 0 12:47:31 0.0
2 3 0 12:47:32 0.0
3 4 7/7 12:47:33 0.0
4 5 7/7 12:47:34 0.0
5 6 0 12:47:35 0.0
6 7 7/7 12:47:36 0.0
7 8 7/7 12:47:37 0.0
8 9 0 12:47:38 0.0
9 10 0 12:47:39 0.0