Sleep Decorator after N Function Calls Python/Selenium

Question:

Using Python/Selenium, I have defined a sleepy decorator that should sleep for 20 seconds after every 5 function calls (requests to the server) in a web-scraping project, so that I don’t overwhelm their server. Judging by the terminal output, it seems to be working as I intend, but when I watch the output file "Hitachi.csv" while it is being created, it doesn’t seem to pause at the fifth URL, only at the end, which leads me to believe that the sleepy decorator isn’t actually pausing at the 5th call. Please help :)

def sleepy(f):
    """Decorate *f* so that every fifth call sleeps for 20 seconds first.

    The returned wrapper keeps a running call count in its ``calls``
    attribute, prints that count on every call, and sleeps before invoking
    *f* whenever the count is a multiple of 5.

    Note: the counter update and the sleep are not synchronized, and
    ``sleep`` only blocks the thread that is making the call.
    """
    import functools  # local import keeps this snippet self-contained

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        print(f"{f.__name__} called {wrapped.calls} times")
        if wrapped.calls % 5 == 0:
            print("Sleeping...")
            sleep(20)
        return f(*args, **kwargs)

    # Assigned after wraps() so that a ``calls`` attribute copied over from
    # f's __dict__ is reset for this wrapper.
    wrapped.calls = 0

    return wrapped
# script_concurrent.py

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
import datetime
import os

from scrapers.scraper import connect_to_base, parse_html, write_to_file


def counted(f):
    """Decorate *f* so that the number of calls is tracked.

    The returned wrapper exposes the running total as its ``calls``
    attribute (starting at 0) and otherwise forwards all arguments and the
    return value unchanged.
    """
    import functools  # local import keeps the decorator self-contained

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    # Assigned after wraps() so that a ``calls`` attribute copied over from
    # f's __dict__ (e.g. when f is itself a counting wrapper) is reset.
    wrapped.calls = 0

    return wrapped


def sleepy(f):
    """Decorate *f* so that every fifth call sleeps for 20 seconds first.

    The returned wrapper keeps a running call count in its ``calls``
    attribute, prints that count on every call, and sleeps before invoking
    *f* whenever the count is a multiple of 5.

    Note: the counter update and the sleep are not synchronized, and
    ``sleep`` only blocks the thread that is making the call.
    """
    import functools  # local import keeps the decorator self-contained

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        print(f"{f.__name__} called {wrapped.calls} times")
        if wrapped.calls % 5 == 0:
            print("Sleeping...")
            sleep(20)
        return f(*args, **kwargs)

    # Assigned after wraps() so that a ``calls`` attribute copied over from
    # f's __dict__ is reset for this wrapper.
    wrapped.calls = 0

    return wrapped


@counted
@sleepy
def run_process(filename="Hitachi.csv"):
    """Scrape one page in a fresh Chrome session and append it to *filename*.

    The page to load is chosen by ``connect_to_base``; on success the page
    source is parsed with ``parse_html`` and handed to ``write_to_file``.
    On failure a message is printed and nothing is written.

    Args:
        filename: Name of the CSV file passed on to ``write_to_file``.
    """
    # init browser
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        if connect_to_base(browser):
            # presumably gives the page time to finish rendering -- TODO confirm
            sleep(2)
            html = browser.page_source
            output_list = parse_html(html)
            write_to_file(output_list, filename)
        else:
            print("Error connecting to AVS")
    finally:
        # exit: always close the browser, even if parsing or writing raised,
        # so that no Chrome/chromedriver process is left behind
        browser.quit()


if __name__ == "__main__":
    # Script entry point: submit ten run_process() tasks to a thread pool,
    # then report the wall-clock time and the decorator's call counter.

    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    with ThreadPoolExecutor() as executor:
        # range(2, 12) yields ten values; the loop value itself is unused.
        futures = [executor.submit(run_process) for _ in range(2, 12)]

    # The executor's ``with`` block has already waited for the workers on
    # exit, so this call returns without further blocking.
    wait(futures)
    elapsed_time = time() - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")

# scraper.py

import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    """Return an iterator over the sorted, de-duplicated values of one CSV column.

    Args:
        filename: Path of the CSV file, read with ``pandas.read_csv``.
        idx: Zero-based position of the column to extract.

    As a side effect the pandas display option ``display.max_rows`` is set
    to ``None``.
    """
    pd.set_option("display.max_rows", None)
    frame = pd.read_csv(filename)
    # Selecting with a one-element list keeps the result two-dimensional, so
    # every row comes back as a single-item list that is flattened below.
    rows = frame.iloc[:, [idx]].values.tolist()
    unique_values = set(itertools.chain.from_iterable(rows))
    return iter(sorted(unique_values))


# Module-level iterator over the values of the first CSV column, created once
# at import time; connect_to_base() takes the next entry on every call.
# NOTE(review): the path is hard-coded to one machine -- adjust before
# running this elsewhere.
my_iter = csv_to_iter(
    filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def connect_to_base(browser):
    """Load the next URL from ``my_iter`` in *browser*, trying up to three times.

    One URL is consumed from ``my_iter`` per call, whether or not loading
    succeeds. Each failed attempt prints the exception and the attempt
    number.

    Returns:
        True as soon as an element matching ``.container`` is present,
        False after three failed attempts.

    Raises:
        StopIteration: if ``my_iter`` has no URLs left.
    """
    target_url = next(my_iter)
    for attempt in range(1, 4):
        try:
            browser.get(target_url)
            # wait (timeout 5) for an element matching the ".container"
            # CSS selector before reporting success
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
        except Exception as e:
            print(e)
            print(f"Error connecting to {target_url}.")
            print(f"Attempt #{attempt}.")
        else:
            return True
    return False


def _cell_texts(soup, data_title):
    """Return the stripped text of every <td> whose data-title is *data_title*."""
    return [
        item.text.strip() for item in soup.findAll("td", {"data-title": data_title})
    ]


def parse_html(html):
    """Extract the parts-table columns and page metadata from *html*.

    Args:
        html: Page source as a string.

    Returns:
        A one-element list holding a dict that maps each output column name
        to the list of values found on the page. The "Pos." column is
        currently not extracted.

    Raises:
        IndexError: if a ``node-content`` article contains fewer than three
            whitespace-separated words.
    """
    # create soup object
    soup = BeautifulSoup(html, "html.parser")

    # The part-number column is looked up under two header spellings; the
    # second one wins when it matches. Falling back with ``or`` yields an
    # empty list when neither spelling is present, instead of leaving
    # ``part_number`` unbound.
    part_number_1 = _cell_texts(soup, "Part รขโ€žโ€“")
    part_number_2 = _cell_texts(soup, "Part โ„–")
    part_number = part_number_2 or part_number_1

    part_qty = _cell_texts(soup, "Qty")
    part_name = _cell_texts(soup, "Part name")
    part_comments = _cell_texts(soup, "Comments")

    # Split each node-content article once; the first word is used as the
    # machine and the third word as the alternative machines entry.
    node_words = [
        item.text.split() for item in soup.findAll("article", {"id": "node-content"})
    ]
    machine = [words[0] for words in node_words]
    alternative_machines = [words[2] for words in node_words]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]

    # NOTE(review): this stores the <h3> elements themselves rather than
    # their text -- confirm that is intended.
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    article_info = {
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }

    return [article_info]


def get_load_time(article_url):
    """Return how long a GET request to *article_url* took, in seconds.

    Args:
        article_url: URL to request.

    Returns:
        ``response.elapsed`` converted to seconds as a float, or the string
        "Loading Error" if the request raised (the exception is printed).
    """
    # set headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
    }
    try:
        # make get request to article_url; with stream=True the body is not
        # downloaded, so the response is used as a context manager to make
        # sure the connection is released afterwards
        with requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        ) as response:
            # get page load time
            load_time = response.elapsed.total_seconds()
    except Exception as e:
        # best effort: report the problem and return a marker value
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename="Hitachi.csv"):
    """Append every dict in *output_list* as one CSV row to BASE_DIR/*filename*.

    No header row is written. Keys missing from a row are left empty by
    ``csv.DictWriter``; keys outside the field list make it raise
    ``ValueError``.

    Args:
        output_list: Iterable of dicts keyed by the column names below.
        filename: File name, resolved relative to ``BASE_DIR``.
    """
    rows = list(output_list)
    if not rows:
        # Nothing to append; do not create an empty file.
        return

    fieldnames = [
        "Pos.",
        "Part No",
        "Qty",
        "Parts name",
        "Comments",
        "Machine",
        "Alternative_machines",
        "Title",
        "Parts_group",
    ]
    # Open the file once for all rows. newline="" lets the csv module control
    # line endings, which avoids blank lines between rows on Windows.
    with open(Path(BASE_DIR).joinpath(filename), "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(rows)

Output

run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,433 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,439 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,440 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,450 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
run_process called 6 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,461 INFO ====== WebDriver manager ======
run_process called 7 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,467 INFO ====== WebDriver manager ======
run_process called 8 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,477 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,690 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,690 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,720 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,720 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,733 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,789 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,790 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,793 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,793 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,798 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,798 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,807 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,807 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,868 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,909 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,946 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,974 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,007 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,016 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,038 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache

[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:19,459 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:19,552 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:19,552 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:19,647 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 9 times

[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:42,827 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:43,131 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:43,131 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:43,745 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 10 times
Sleeping...

Data

0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b45/
Asked By: Martin H

||

Answers:

The "run_process called" and "Sleeping..." messages do seem to be in order, but if you look more closely you will see that almost all of the printout

[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,690 INFO Current google-chrome version is 103.0.5060

is between run_process called 8 and run_process called 9.

This info is from

webdriver.Chrome(service=Service(ChromeDriverManager().install()))

where the actual work is.

The cause is the use of ThreadPoolExecutor: the 10 calls to run_process() are executed concurrently, even if the printout suggests otherwise (you can’t rely on the order of the output, as printing to the console isn’t synchronized).

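You can add a lock (threading.Lock) in sleepy around the counting and waiting block, so that the other threads have to wait while one of them is sleeping: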

def sleepy(f):
    """Decorate *f* so that every fifth call pauses all callers for 20 seconds.

    The returned wrapper keeps a running call count in its ``calls``
    attribute and prints it on every call. Counting and sleeping happen
    while holding a lock, so threads that call the wrapper during the pause
    wait at the lock until the sleep is over. *f* itself runs outside the
    lock.
    """
    import functools
    import threading  # imported here because the snippet has no other import of it

    lock = threading.Lock()

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 5 == 0:
                print("Sleeping...")
                sleep(20)
        return f(*args, **kwargs)

    # Assigned after wraps() so that a ``calls`` attribute copied over from
    # f's __dict__ is reset for this wrapper.
    wrapped.calls = 0

    return wrapped
Answered By: Guy