How to fetch information from web page chain wise(solving captcha)?

Question

I have to go to here

Here I have to choose applicant name = “ltd”

But before submitting the page, I have to solve a captcha. How can I fetch the next page’s information (application number, application title, date, application status, etc.) in an Excel format using web scraping?

—————- Running the following script, I get an error —–

import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def save_to_csv(data: list) -> None:
    """Append one record to 'ipindiaservices.csv' as a single CSV row.

    Arguments:
        - data: field values of the row, written in the given order
    """
    # newline='' leaves line endings to the csv module, and the terminator has
    # to be a real newline ("\n"); the letter "n" would glue all rows together.
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(data)

def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Run a script that stores page_number in the 'button.next' element and clicks it.

    Arguments:
        - page_number: value written into the button's ``value`` property
        - driver: WebDriver that executes the JavaScript
    """
    jump_script = f"""
    document.querySelector('button.next').value = {page_number}; 
    document.querySelector('button.next').click();
    """
    driver.execute_script(jump_script)

def titles_validation(driver: WebDriver) -> None:
    """Put '_' into every selected table cell whose text is empty or whitespace.

    The replacement happens inside the page (via JavaScript), so it changes
    what is read from those cells afterwards.
    """
    fill_blank_cells = """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    driver.execute_script(fill_blank_cells)

def get_network_data(log: dict, driver: WebDriver) -> dict:
    """Return the body of a 'CaptchaAudio' network response described by *log*.

    Arguments:
        - log: one entry of ``driver.get_log('performance')``; its "message"
          field holds a JSON document with the DevTools event under "message"
        - driver: WebDriver used to send the ``Network.getResponseBody`` command

    Returns the result of ``Network.getResponseBody``, or ``None`` when the
    entry is not a matching ``Network.responseReceived`` event.
    """
    event = json.loads(log["message"])["message"]
    # Chained ``and`` short-circuits; a list passed to all() is built eagerly,
    # so event["params"] would be evaluated even when the key is missing.
    if (
        "Network.responseReceived" in event["method"]
        and "params" in event
        and 'CaptchaAudio' in str(event["params"].values())
    ):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': event["params"]["requestId"]})
    return None

def get_captcha_text(driver: WebDriver, timeout: float, max_attempts: int = 10) -> str:
    """Return captcha text

    Arguments:
        - driver: WebDriver
        - timeout: pause before receiving data from the web driver log
        - max_attempts: how many times the audio button is clicked before giving up

    Raises:
        RuntimeError: no 'CaptchaAudio' response showed up in the log
    """
    # A bounded loop replaces the recursive retry, which was called without
    # ``timeout`` (TypeError) and whose result was never returned.
    for _ in range(max_attempts):
        driver.execute_script(
            """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha  Audio"]').click()
        """
        )
        sleep(timeout)
        for log in driver.get_log('performance'):
            # query each entry once; a match costs a DevTools round trip
            response = get_network_data(log, driver)
            if response:
                return json.loads(response['body'])['CaptchaImageText']
    raise RuntimeError(f"Captcha text was not received after {max_attempts} attempts")

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha

    Arguments:
        - captcha_text: text typed into the 'CaptchaText' field
        - btn_name: captcha send button name["submit" or "search"]

    Raises:
        ValueError: btn_name is neither "search" nor "submit"
    """
    button_locators = {
        'search': (By.CSS_SELECTOR, 'input[name="submit"]'),
        'submit': (By.ID, 'btnSubmit'),
    }
    # validate first: previously an unknown name failed with an unbound local
    # only after the captcha text had already been typed
    if btn_name not in button_locators:
        raise ValueError(f'btn_name must be "search" or "submit", got {btn_name!r}')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(button_locators[btn_name])).click()


options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# the captcha text is read from Chrome's performance log, so that log must be enabled
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
service = Service(executable_path="path/to/your/chromedriver.exe")
# the driver must exist before it is used below; with this line commented out
# the script stops with "NameError: name 'driver' is not defined"
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 15)

table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')

    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
    # on the start page and the page with the table, the names of the buttons are different
    captcha_text = get_captcha_text(driver, 1)
    submit_captcha(captcha_text, "search")
    # the page where the search starts
    start_from_page(1, driver)

    while True:
        start = time()
        # get current page number
        current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
        print(f"Current page: {current_page}")
        # get all application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))

        # iterate by index: the elements are looked up again after every
        # driver.back(), so only the position is carried between iterations
        for index in range(len(app_status_elements)):
            print(f"App number: {index}")
            # update application status WebElements
            app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
            # click on application status
            wait.until(EC.visibility_of(app_status_elements[index])).click()
            # wait 2 seconds for the captcha to change
            sleep(2)
            # get text and submit captcha
            captcha_text = get_captcha_text(driver, 1)
            submit_captcha(captcha_text, "submit")
            try:
                # get all table data values(without titles) WebElements
                table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
                # if there are empty rows in the table replace them with "_"
                titles_validation(driver)
                # save data to csv; line breaks inside a cell become spaces
                # (the pattern must be "\n", a bare 'n' would delete every letter n)
                save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
            except TimeoutException:
                print("Application Number does not exist")
            finally:
                driver.back()
        # print the time spent on the current page to the console
        print(f"Time per page: {round(time()-start, 3)}")
        # if the current page is equal to the specified one, then stop the search and close the driver
        if current_page == '3776':
            break
        # click next page
        wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
finally:
    # always shut the browser down, also when an error interrupts the run
    driver.quit()

Asked By: XYZ

||

Source

Answer 1

You can use any captcha-solving site. On these sites, the captchas are usually solved by human users, so they can be slow, but they do the job.

Sample website (I didn’t receive any ads)

You can use Selenium to pull the information. It is enough to locate the elements on the site by their "id" attribute.
Library for reading/writing Excel files in Python

Answered By: Ramazan Akbal

Answer 2

On this site, the captcha can be solved without resorting to third-party services. When you click on the "Captcha Audio" button, a GET request is sent to the endpoint https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio. The response is a dictionary {"CaptchaImageText":"hnnxd"}, which you can get from Selenium via the "Chrome DevTools Protocol" using the Network.getResponseBody method, or you can use the requests library.
To save data in csv, you can use, for example, the csv module included in the standard library.

Here is one possible solution:

import re
import csv
import json
from time import sleep
from selenium import webdriver
from typing import Generator, List, Tuple
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC


def get_captcha_text(log: dict, driver: WebDriver) -> dict:
    """Return the body of a 'CaptchaAudio' network response described by *log*.

    Arguments:
        - log: one entry of ``driver.get_log('performance')``; its "message"
          field holds a JSON document with the DevTools event under "message"
        - driver: WebDriver used to send the ``Network.getResponseBody`` command

    Returns the result of ``Network.getResponseBody``, or ``None`` when the
    entry is not a matching ``Network.responseReceived`` event.
    """
    event = json.loads(log["message"])["message"]
    # Chained ``and`` short-circuits; a list passed to all() is built eagerly,
    # so event["params"] would be evaluated even when the key is missing.
    if (
        "Network.responseReceived" in event["method"]
        and "params" in event
        and 'CaptchaAudio' in str(event["params"].values())
    ):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': event["params"]["requestId"]})
    return None

def save_to_csv(table_data: Generator[Tuple[str, str, str, str], None, None]) -> None:
    """Append table rows to 'ipindiaservices.csv'.

    Arguments:
        - table_data: iterable of 4-item rows in the order
          (application number, title, application date, status)

    Raises:
        ValueError: a row does not consist of exactly four items
    """
    rows = [
        [app_num, title, app_date, status]
        for app_num, title, app_date, status in table_data
    ]
    if not rows:
        # nothing to write, so the file is not touched
        return
    # Open the file once for all rows. newline='' leaves line endings to the
    # csv module, and the terminator has to be a real newline, not the letter "n".
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(rows)

def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Run a script that stores page_number in the 'button.next' element and clicks it.

    Arguments:
        - page_number: value written into the button's ``value`` property
        - driver: WebDriver that executes the JavaScript
    """
    jump_script = f"""
    document.querySelector('button.next').value = {page_number}; 
    document.querySelector('button.next').click();
    """
    driver.execute_script(jump_script)

def titles_validation(driver: WebDriver) -> None:
    """Put '_' into every 'td.title' cell of '#tableData' that is empty or whitespace.

    The replacement happens inside the page (via JavaScript), so it changes
    what is read from those cells afterwards.
    """
    fill_blank_titles = """
        let titles = document.querySelectorAll('#tableData>tbody>tr>td.title')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    driver.execute_script(fill_blank_titles)

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# the captcha text is read from Chrome's performance log, so that log must be enabled;
# setting it on the options avoids the removed ``desired_capabilities`` argument
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)

# regular expression to search for data in a table row:
# application number, title, application date, status.
# The hyphen inside the first character class and the "w" of the last group
# must be escaped ("\-", "\w"); without the backslashes the pattern does not compile.
pattern = r'^([0-9A-Z/\-,]+) (.+)? ([0-9/]+) (\w+)'

try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')

    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    driver.find_element(By.CSS_SELECTOR, 'img[title="Captcha  Audio"]').click()
    driver.find_element(By.ID, 'TextField6').send_keys('ltd')
    # short pause is needed here to write the log, otherwise we will get an empty list
    sleep(1)
    logs = driver.get_log('performance')
    # get request data that is generated when click on the button listen to the text of the captcha
    # (each log entry is inspected once)
    responses = [body for body in (get_captcha_text(log, driver) for log in logs) if body]
    if not responses:
        raise RuntimeError("Captcha response was not found in the performance log")
    # get captcha text
    captcha_text = json.loads(responses[0]['body'])['CaptchaImageText']
    # enter the captcha text and click "Search" button
    driver.find_element(By.ID, 'CaptchaText').send_keys(captcha_text)
    driver.find_element(By.CSS_SELECTOR, 'input[name="submit"]').click()

    # the page where the search starts
    start_from_page(1, driver)

    while True:
        # get current page number
        current_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.Selected'))).text
        # print the current page number to the console
        print(f"Current page: {current_page}")
        # get all fields of the table
        table_web_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#tableData>tbody>tr')))
        # check title name
        titles_validation(driver)
        # get all table data on current page
        table_data = (re.findall(pattern, data.text)[0] for data in table_web_elements)
        # save table data to csv
        save_to_csv(table_data)
        # if the current page is equal to the specified one, then stop the search and close the driver
        if current_page == '3768':
            break
        # click next page
        driver.find_element(By.CSS_SELECTOR, 'button.next').click()
finally:
    # always shut the browser down, also when an error interrupts the run
    driver.quit()

The performance of this solution is about 280sec per 100 pages.

It will take about 2.5-3 hours to collect all the data.

Therefore, the ability to stop data collection on a specific page has been added (by default, this is the last page):

# current_page is the text of the 'span.Selected' element, so it is compared with a string
if current_page == '3768':
     break

And start collection data from the specified page (by default, this is the first page):

# the first argument is the page number the collection starts from
start_from_page(1, driver)

Output is ipindiaservices.csv

202247057786,NON-AQUEOUS ELECTROLYTE SECONDARY BATTERY,10/10/2022,Published
202247057932,"COMMUNICATION METHOD, APPARATUS AND SYSTEM",10/10/2022,Published
202247057855,POLYOLEFIN RESIN FILM,10/10/2022,Published
202247057853,CEMENT COMPOSITION AND CURED PRODUCT THEREOF,10/10/2022,Published
...

To use this solution in Google Colab, follow these steps:

Install Selenium and ChromeDriver

# install the Selenium Python package
!pip install selenium
# refresh the apt package index, then install the chromium-chromedriver package
!apt-get update
!apt install chromium-chromedriver

Make the necessary imports

import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC

Set options

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# the captcha text is read from Chrome's performance log, so that log must be enabled;
# setting it on the options avoids mutating the shared DesiredCapabilities.CHROME dict
# and the deprecated ``chrome_options`` / ``desired_capabilities`` arguments
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
# without an explicit path the 'chromedriver' executable is looked up on PATH
driver = webdriver.Chrome(options=options)

Everything else remains unchanged.
And don’t put all the code in one cell.

Update:
This is one possible solution to collect all the information from the table of each application:

import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def save_to_csv(data: list) -> None:
    """Append one record to 'ipindiaservices.csv' as a single CSV row.

    Arguments:
        - data: field values of the row, written in the given order
    """
    # newline='' leaves line endings to the csv module, and the terminator has
    # to be a real newline ("\n"); the letter "n" would glue all rows together.
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(data)

def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Run a script that stores page_number in the 'button.next' element and clicks it.

    Arguments:
        - page_number: value written into the button's ``value`` property
        - driver: WebDriver that executes the JavaScript
    """
    jump_script = f"""
    document.querySelector('button.next').value = {page_number}; 
    document.querySelector('button.next').click();
    """
    driver.execute_script(jump_script)

def titles_validation(driver: WebDriver) -> None:
    """Put '_' into every selected table cell whose text is empty or whitespace.

    The replacement happens inside the page (via JavaScript), so it changes
    what is read from those cells afterwards.
    """
    fill_blank_cells = """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    driver.execute_script(fill_blank_cells)

def get_network_data(log: dict, driver: WebDriver) -> dict:
    """Return the body of a 'CaptchaAudio' network response described by *log*.

    Arguments:
        - log: one entry of ``driver.get_log('performance')``; its "message"
          field holds a JSON document with the DevTools event under "message"
        - driver: WebDriver used to send the ``Network.getResponseBody`` command

    Returns the result of ``Network.getResponseBody``, or ``None`` when the
    entry is not a matching ``Network.responseReceived`` event.
    """
    event = json.loads(log["message"])["message"]
    # Chained ``and`` short-circuits; a list passed to all() is built eagerly,
    # so event["params"] would be evaluated even when the key is missing.
    if (
        "Network.responseReceived" in event["method"]
        and "params" in event
        and 'CaptchaAudio' in str(event["params"].values())
    ):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': event["params"]["requestId"]})
    return None

def get_captcha_text(driver: WebDriver, timeout: float, max_attempts: int = 10) -> str:
    """Return captcha text

    Arguments:
        - driver: WebDriver
        - timeout: pause before receiving data from the web driver log
        - max_attempts: how many times the audio button is clicked before giving up

    Raises:
        RuntimeError: no 'CaptchaAudio' response showed up in the log
    """
    # A bounded loop replaces the recursive retry, whose result was never
    # returned (the caller received None after any retry).
    for _ in range(max_attempts):
        driver.execute_script(
            """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha  Audio"]').click()
        """
        )
        sleep(timeout)
        for log in driver.get_log('performance'):
            # query each entry once; a match costs a DevTools round trip
            response = get_network_data(log, driver)
            if response:
                return json.loads(response['body'])['CaptchaImageText']
    raise RuntimeError(f"Captcha text was not received after {max_attempts} attempts")

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha

    Arguments:
        - captcha_text: text typed into the 'CaptchaText' field
        - btn_name: captcha send button name["submit" or "search"]

    Raises:
        ValueError: btn_name is neither "search" nor "submit"
    """
    button_locators = {
        'search': (By.CSS_SELECTOR, 'input[name="submit"]'),
        'submit': (By.ID, 'btnSubmit'),
    }
    # validate first: previously an unknown name failed with an unbound local
    # only after the captcha text had already been typed
    if btn_name not in button_locators:
        raise ValueError(f'btn_name must be "search" or "submit", got {btn_name!r}')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(button_locators[btn_name])).click()


options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# the captcha text is read from Chrome's performance log, so that log must be enabled;
# setting it on the options avoids the removed ``desired_capabilities`` argument
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 15)

table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')

    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
    # on the start page and the page with the table, the names of the buttons are different
    captcha_text = get_captcha_text(driver, 1)
    submit_captcha(captcha_text, "search")
    # the page where the search starts
    start_from_page(1, driver)

    while True:
        start = time()
        # get current page number
        current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
        print(f"Current page: {current_page}")
        # get all application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))

        # iterate by index: the elements are looked up again after every
        # driver.back(), so only the position is carried between iterations
        for index in range(len(app_status_elements)):
            print(f"App number: {index}")
            # update application status WebElements
            app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
            # click on application status
            wait.until(EC.visibility_of(app_status_elements[index])).click()
            # wait 2 seconds for the captcha to change
            sleep(2)
            # get text and submit captcha
            captcha_text = get_captcha_text(driver, 1)
            submit_captcha(captcha_text, "submit")
            try:
                # get all table data values(without titles) WebElements
                table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
                # if there are empty rows in the table replace them with "_"
                titles_validation(driver)
                # save data to csv; line breaks inside a cell become spaces
                # (the pattern must be "\n", a bare 'n' would delete every letter n)
                save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
            except TimeoutException:
                print("Application Number does not exist")
            finally:
                driver.back()
        # print the time spent on the current page to the console
        print(f"Time per page: {round(time()-start, 3)}")
        # if the current page is equal to the specified one, then stop the search and close the driver
        if current_page == '3776':
            break
        # click next page
        wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
finally:
    # always shut the browser down, also when an error interrupts the run
    driver.quit()

Output is ipindiaservices.csv

202247059447,PCT NATIONAL PHASE APPLICATION,18/10/2022,"PANASONIC INTELLECTUAL PROPERTY MANAGEMENT CO., LTD.",SOLID ELECTROLYTE MATERIAL AND BATTERY USING SAME,ELECTRICAL,[email protected],_,_,PCT/JP2021/004427,05/02/2021,31/03/2020,--,21/10/2022
202247059470,PCT NATIONAL PHASE APPLICATION,18/10/2022,"SHENZHEN SKYWORTH-RGB ELECTRONIC CO., LTD.","ATTACHMENT FORCE ADJUSTMENT METHOD AND APPARATUS, DEVICE, AND STORAGE MEDIUM",COMPUTER SCIENCE,[email protected],_,_,PCT/CN2020/125555,30/10/2020,02/04/2020,18/10/2022,21/10/2022
202247058733,PCT NATIONAL PHASE APPLICATION,14/10/2022,"SUMITOMO ELECTRIC OPTIFRONTIER CO., LTD.","FUSION SPLICING SYSTEM, FUSION SPLICING DEVICE, AND DETERIORATION DETERMINATION METHOD",COMPUTER SCIENCE,[email protected],_,_,PCT/JP2021/017016,28/04/2021,30/04/2020,--,21/10/2022

The performance of this solution is about 230sec per page.

Sometimes there may be no data on the application status page (for example, for the number "00054/CAL/1998" we get "Application Number does not exist"). Therefore, the script simply ignores this application.

The 2-second pause before receiving the captcha text is needed because, after clicking on "Application Status", one captcha is shown and after ~1 second it changes to another one, which is the one we must enter.

Fix:
Since the captcha that appeared after clicking on Application Status was removed from the site, its solution was also removed from the script. Also, the developers of this resource gave Application Number and Application Status the same classes, so we needed to change the CSS selector for Application Status.
Solution for Google Colab:

import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def save_to_csv(data: list) -> None:
    """Append one record to 'ipindiaservices.csv' as a single CSV row.

    Arguments:
        - data: field values of the row, written in the given order
    """
    # newline='' leaves line endings to the csv module, and the terminator has
    # to be a real newline ("\n"); the letter "n" would glue all rows together.
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(data)

def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Run a script that stores page_number in the 'button.next' element and clicks it.

    Arguments:
        - page_number: value written into the button's ``value`` property
        - driver: WebDriver that executes the JavaScript
    """
    jump_script = f"""
    document.querySelector('button.next').value = {page_number}; 
    document.querySelector('button.next').click();
    """
    driver.execute_script(jump_script)

def titles_validation(driver: WebDriver) -> None:
    """Put '_' into every selected table cell whose text is empty or whitespace.

    The replacement happens inside the page (via JavaScript), so it changes
    what is read from those cells afterwards.
    """
    fill_blank_cells = """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    driver.execute_script(fill_blank_cells)

def get_network_data(log: dict, driver: WebDriver) -> dict:
    """Return the body of a 'CaptchaAudio' network response described by *log*.

    Arguments:
        - log: one entry of ``driver.get_log('performance')``; its "message"
          field holds a JSON document with the DevTools event under "message"
        - driver: WebDriver used to send the ``Network.getResponseBody`` command

    Returns the result of ``Network.getResponseBody``, or ``None`` when the
    entry is not a matching ``Network.responseReceived`` event.
    """
    event = json.loads(log["message"])["message"]
    # Chained ``and`` short-circuits; a list passed to all() is built eagerly,
    # so event["params"] would be evaluated even when the key is missing.
    if (
        "Network.responseReceived" in event["method"]
        and "params" in event
        and 'CaptchaAudio' in str(event["params"].values())
    ):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': event["params"]["requestId"]})
    return None

def get_captcha_text(driver: WebDriver, timeout: float, max_attempts: int = 10) -> str:
    """Return captcha text

    Arguments:
        - driver: WebDriver
        - timeout: pause before receiving data from the web driver log
        - max_attempts: how many times the audio button is clicked before giving up

    Raises:
        RuntimeError: no 'CaptchaAudio' response showed up in the log
    """
    # A bounded loop replaces the recursive retry, whose result was never
    # returned (the caller received None after any retry).
    for _ in range(max_attempts):
        driver.execute_script(
            """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha  Audio"]').click()
        """
        )
        sleep(timeout)
        for log in driver.get_log('performance'):
            # query each entry once; a match costs a DevTools round trip
            response = get_network_data(log, driver)
            if response:
                return json.loads(response['body'])['CaptchaImageText']
    raise RuntimeError(f"Captcha text was not received after {max_attempts} attempts")

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha

    Arguments:
        - captcha_text: text typed into the 'CaptchaText' field
        - btn_name: captcha send button name["submit" or "search"]

    Raises:
        ValueError: btn_name is neither "search" nor "submit"
    """
    button_locators = {
        'search': (By.CSS_SELECTOR, 'input[name="submit"]'),
        'submit': (By.ID, 'btnSubmit'),
    }
    # validate first: previously an unknown name failed with an unbound local
    # only after the captcha text had already been typed
    if btn_name not in button_locators:
        raise ValueError(f'btn_name must be "search" or "submit", got {btn_name!r}')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(button_locators[btn_name])).click()


options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# the captcha text is read from Chrome's performance log, so that log must be enabled;
# setting it on the options avoids the removed ``executable_path`` and
# ``desired_capabilities`` arguments ('chromedriver' is looked up on PATH)
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)

table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button[name="ApplicationSatus"]')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')

    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
    # on the start page and the page with the table, the names of the buttons are different
    captcha_text = get_captcha_text(driver, 1)
    submit_captcha(captcha_text, "search")
    # the page where the search starts
    start_from_page(1, driver)

    while True:
        start = time()
        # get current page number
        current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
        print(f"Current page: {current_page}")
        # get all application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))

        # iterate by index: the elements are looked up again on every pass,
        # so only the position is carried between iterations
        for index in range(len(app_status_elements)):
            print(f"App number: {index}")
            # update application status WebElements
            app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
            # click on application status
            wait.until(EC.visibility_of(app_status_elements[index])).click()
            try:
                # switch to new tab
                driver.switch_to.window(driver.window_handles[-1])
                # get all table data values(without titles) WebElements
                table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
                # if there are empty rows in the table replace them with "_"
                titles_validation(driver)
                # save data to csv; line breaks inside a cell become spaces
                # (the pattern must be "\n", a bare 'n' would delete every letter n)
                save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
            except TimeoutException:
                print("Application Number does not exist")
            finally:
                # close new tab, but never the main one if no new tab was opened
                if len(driver.window_handles) > 1:
                    driver.close()
                # switch to main tab
                driver.switch_to.window(driver.window_handles[0])
        # print the time spent on the current page to the console
        print(f"Time per page: {round(time()-start, 3)}")
        # if the current page is equal to the specified one, then stop the search and close the driver
        if current_page == '100':
            break
        # click next page
        wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
finally:
    # always shut the browser down, also when an error interrupts the run
    driver.quit()

These fixes have been tested on Google Colab (Python 3.7.15).
The performance of this solution is about 55sec per page.

Previous versions of script are left for visual comparison.

Answered By: Brze

How to fetch information from web page chain wise(solving captcha)?

Question:

Answers: