Handling pagination in python playwright when the url doesn't change
Question:
I am trying to scrape this site https://franchisedisclosure.gov.au/Register with playwright and the url doesn’t change after you click on the next button. How do I solve this pagination problem?
Here’s my code
`
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

# Register URL; pagination is client-side JavaScript, so the URL never
# changes when you click "next".
url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)

    # Accept the terms-of-use gate shown before the register table.
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    # is_visible() returns immediately; wait_for_selector() actually blocks
    # until the table body exists, so we don't grab an empty table.
    page.wait_for_selector('tbody')
    html = page.inner_html('table.table.table-hover')

    soup = bs(html, 'html.parser')
    table = soup.find('tbody')
    rows = table.find_all('tr')

    names = []
    industries = []
    locations = []
    for row in rows:
        info = row.find_all('td')
        # Append each row's cells; the original assigned the scalar strings
        # over the list variables, losing every row but the last.
        names.append(info[0].text.strip())
        industries.append(info[1].text.strip())
        locations.append(info[2].text.strip())

    browser.close()
`
I’ve checked online, and every solution I see involves the URL changing. And for some reason, I can’t make requests directly to the site’s API: Postman reported an error about the parameters not being sent.
Answers:
With some small adjustments you can get it working — let’s try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industries = []
    locations = []

    # When you click to the next page, an element with text "Loading..."
    # appears on screen; waiting for it to show then hide tells us the new
    # table is fully rendered.
    loading_icon = "//strong[text()='Loading...']"
    # The "next page" button in the pagination control.
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Show 50 results per page to reduce the number of page turns.
    page.select_option("#perPageCount", value="50")
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    while True:
        # Scrape the page that is currently rendered.
        page.wait_for_selector('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        for row in soup.find('tbody').find_all('tr'):
            info = row.find_all('td')
            # Append (the original overwrote the lists with scalar strings,
            # keeping only the last row scraped).
            names.append(info[0].text.strip())
            industries.append(info[1].text.strip())
            locations.append(info[2].text.strip())

        # Check the "next" button AFTER scraping: checking it first (as the
        # original did) exits the loop before the final page is scraped.
        if "disabled" in page.get_attribute(selector=next_page_locator, name="class"):
            break

        # Turn the page and wait for the loading indicator cycle.
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)

    browser.close()
I am trying to scrape this site https://franchisedisclosure.gov.au/Register with playwright and the url doesn’t change after you click on the next button. How do I solve this pagination problem?
Here’s my code
`
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

# Register URL; the site paginates client-side, so the URL never changes.
url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)

    # Click through the terms-of-use gate to reach the register table.
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    # Block until the table body is attached; is_visible() would return
    # immediately without waiting.
    page.wait_for_selector('tbody')
    html = page.inner_html('table.table.table-hover')

    soup = bs(html, 'html.parser')
    rows = soup.find('tbody').find_all('tr')

    names = []
    industries = []
    locations = []
    for row in rows:
        cells = row.find_all('td')
        # Append per row — assigning the scalars over the list variables
        # (as the original did) keeps only the final row.
        names.append(cells[0].text.strip())
        industries.append(cells[1].text.strip())
        locations.append(cells[2].text.strip())

    browser.close()
`
I’ve checked online, and every solution I see involves the URL changing. And for some reason, I can’t make requests directly to the site’s API: Postman reported an error about the parameters not being sent.
With some small adjustments you can get it working — let’s try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industries = []
    locations = []

    # A "Loading..." element appears while a page turn is in flight;
    # waiting for visible-then-hidden marks the new table as ready.
    loading_icon = "//strong[text()='Loading...']"
    # XPath of the "next page" pagination button.
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select 50 rows per page so fewer page turns are needed.
    page.select_option("#perPageCount", value="50")
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    while True:
        # Scrape whatever page is currently shown.
        page.wait_for_selector('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        for row in soup.find('tbody').find_all('tr'):
            cells = row.find_all('td')
            # Must append: overwriting the lists with scalar strings (the
            # original behavior) discards all but the last row.
            names.append(cells[0].text.strip())
            industries.append(cells[1].text.strip())
            locations.append(cells[2].text.strip())

        # Test the "next" button only after scraping; testing before the
        # scrape (as the original loop condition did) skips the last page.
        if "disabled" in page.get_attribute(selector=next_page_locator, name="class"):
            break

        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)

    browser.close()