How can I filter pandas dataframe column that contains both date and time for specific times?

Question

Pandas dataframe date&time filtering

Hey team! The whole idea of the script was to access the link at specified times ["20:00", "20:30", "22:15", "22:45", "23:30", "00:00", "01:45", "02:30", "03:00", "03:30"] and to filter the column Expected Ship Date by the time the script ran + 1 hour.

Example: Script ran at 04/06/2023 23:30, it will filter the Expected Ship Date by 05/06/2023 00:30

At the moment, it does not seem to work properly, for whatever reason, no matter the time I run it, it returns the current day 21:00.

Any idea what am I missing?

__author__ = "studorie"
__version__ = "1.0.1"

from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
from apscheduler.schedulers.blocking import BlockingScheduler

from function.requests_retryer import requests_retry_session


def roster(fc):
    url = (
        f"https://link.com/{fc}"
        )

    with requests_retry_session() as request:
        response = request.get(url)

    if response.status_code != 200:
        print(response.raise_for_status())

    df = pd.read_csv(StringIO(response.text), sep=",")
    df.drop(['FN SKU', 'Scannable ID', 'Condition', 'Ship Method', 'Ship Option', 'Pick Priority'], axis=1,
            inplace=True)

    df["Expected Ship Date"] = pd.to_datetime(df["Expected Ship Date"])

    # Get today's date
    today = datetime.now().date()

    # Get the current time
    current_time = datetime.now().time()

    # Find the index of the next job time
    job_index = next(
        (i for i, job_time in enumerate(job_times) if datetime.strptime(job_time, "%H:%M").time() > current_time), 0)

    # Get the adjusted job time
    adjusted_job_time = job_times[job_index]

    # Calculate the adjusted expected ship date based on the scheduled job time + 1 hour
    if adjusted_job_time == "00:00":
        # If the adjusted job time is midnight, add 1 day instead of 1 hour
        adjusted_date = today + timedelta(days=1)
    else:
        adjusted_date = today

    adjusted_datetime = datetime.combine(adjusted_date,
                                         datetime.strptime(adjusted_job_time, "%H:%M").time()) + timedelta(hours=1)
    adjusted_expected_ship_date = adjusted_datetime.strftime("%d/%m/%Y %H:%M")

    # Filter the DataFrame based on the adjusted expected ship date
    filter_condition = df["Expected Ship Date"].dt.strftime("%d/%m/%Y %H:%M") == adjusted_expected_ship_date
    filtered_df = df[filter_condition]

    timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M")
    filename = f"Y:/Public/L&D/Reports for ops/Flow risk/Pick_SLA_{timestamp}.csv"

    # Save the filtered data to the specified filename
    filtered_df.to_csv(filename, index=False)
    print(f"Filtered data saved to {filename}")


if __name__ == "__main__":
    schedule = BlockingScheduler()
    job_times = ["20:00", "20:30", "22:15", "22:45", "23:30", "00:00", "01:45", "02:30", "03:00", "03:30"]
    for job_time in job_times:
        schedule.add_job(roster, 'cron', timezone="Europe/London", hour=int(job_time.split(':')[0]),
                         minute=int(job_time.split(':')[1]), args=["EMA2"])
    schedule.start()

Asked By: Alexandru Tudorie

||

Source

Answer 1

In the function roster(fc),

 # Find the index of the next job time
    job_index = next(
        (i for i, job_time in enumerate(job_times) if datetime.strptime(job_time, "%H:%M").time() > current_time), 0)

Each time roster runs creates a new iterator, then it gets the first datetime 20:00 and returns 21:00

(i for i, job_time in enumerate(job_times) if datetime.strptime(job_time, "%H:%M").time() > current_time)

You should create a iterator before roster runs,

if __name__ == "__main__":
    schedule = BlockingScheduler()
    job_times = ["20:00", "20:30", "22:15", "22:45", "23:30", "00:00", "01:45", "02:30", "03:00", "03:30"]
    
    date_iter = (i for i, job_time in enumerate(job_times)
                if datetime.strptime(job_time, "%H:%M").time() > current_time)
    
    for job_time in job_times:
        schedule.add_job(roster, 'cron', timezone="Europe/London", hour=int(job_time.split(':')[0]),
                         minute=int(job_time.split(':')[1]), args=["EMA2"])
    schedule.start()

roster gets element from date_iter

def roster(fc):
    ...
    # Find the index of the next job time
    job_index = next(date_iter, 0)
    ...

Answered By: mdkfm

Answer 2

You can use Triggers

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.combining import OrTrigger
from apscheduler.triggers.cron import CronTrigger



if __name__ == "__main__":
    schedule = BlockingScheduler()
    
    cron1 = CronTrigger(day_of_week="*", hour=22, minute=15, timezone="Europe/London")
    cron2 = CronTrigger(day_of_week="*", hour="20,0,3", minute=0, timezone="Europe/London")
    cron3 = CronTrigger(day_of_week="*", hour="22,1", minute=45, timezone="Europe/London")
    cron4 = CronTrigger(day_of_week="*", hour="20,2,3", minute=30, timezone="Europe/London")

    schedule.add_job(
        roster,
        OrTrigger([cron1, cron2, cron3, cron4]),
        misfire_grace_time=1000,
        max_instances=20,
        coalesce=True
    )
    schedule.start()

Answered By: Jason Baker

Answer 3

I have worked together with @Jason Baker and came up with this and does the job perfectly!

__author__ = "studorie"
_version_ = "1.0.1"

from io import StringIO

import pandas as pd
import pendulum
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.combining import OrTrigger
from apscheduler.triggers.cron import CronTrigger

from converters.change_case import to_snake_case
from function.pandas_formatter import pandas_format
from function.requests_retryer import requests_retry_session


def rodeo(fc: str) -> None:
    now = pendulum.now("Europe/London").add(hours=1).replace(minute=0, second=0).to_datetime_string()
    url = (
    f"https://link.com/{fc}"
    )

    with requests_retry_session() as request:
        response = request.get(url)
    if response.status_code != 200:
        print(response.raise_for_status())

    df = pd.read_csv(StringIO(response.text), sep=",")
    df.columns = df.columns.to_series().apply(to_snake_case)
    df = (df
          .drop(columns=["fn_sku", "scannable_id", "condition", "ship_method", "ship_option", "pick_priority"], axis=1)
          .assign(expected_ship_date=pd.to_datetime(df["expected_ship_date"]))
          .query(expr="expected_ship_date == @now")
          )

    if not df.empty:
        timestamp = pendulum.now().strftime("%d-%m-%Y-%H-%M")
        filename = f"Y:/Public/L&D/Reports for ops/Flow risk/Pick_SLA_{timestamp}.csv"
        df.to_csv(filename, index=False)
        print(f"Filtered data saved to {filename}")


def run_job():
    rodeo(fc="EMA2")


if _name_ == "_main_":
    pandas_format()
    schedule = BlockingScheduler()
    cron1 = CronTrigger(day_of_week="*", hour=22, minute=15, timezone="Europe/London")
    cron2 = CronTrigger(day_of_week="*", hour="10,12,13,15,16,20,0,3", minute=0, timezone="Europe/London")
    cron3 = CronTrigger(day_of_week="*", hour="22,1", minute=45, timezone="Europe/London")
    cron4 = CronTrigger(day_of_week="*", hour="10,12,14,19,20,2,3", minute=30, timezone="Europe/London")

    schedule.add_job(
        run_job,
        OrTrigger([cron1, cron2, cron3, cron4]),
        misfire_grace_time=1000,
        max_instances=20,
        coalesce=True
    )
    schedule.start()

Answered By: Alexandru Tudorie

How can I filter pandas dataframe column that contains both date and time for specific times?

Question:

Answers: