Handling pagination in python playwright when the url doesn't change
Question:
I am trying to scrape this site https://franchisedisclosure.gov.au/Register with playwright and the url doesn’t change after you click on the next button. How do I solve this pagination problem?
Here’s my code
`
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

# Register URL; pagination is client-side JavaScript, so the URL never
# changes when you click "next".
url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)

    # Accept the terms-of-use gate shown before the register table.
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    # is_visible() returns immediately; wait_for_selector() actually blocks
    # until the table body exists, so we don't grab an empty table.
    page.wait_for_selector('tbody')
    html = page.inner_html('table.table.table-hover')

    soup = bs(html, 'html.parser')
    table = soup.find('tbody')
    rows = table.find_all('tr')

    names = []
    industries = []
    locations = []
    for row in rows:
        info = row.find_all('td')
        # Append each row's cells; the original assigned the scalar strings
        # over the list variables, losing every row but the last.
        names.append(info[0].text.strip())
        industries.append(info[1].text.strip())
        locations.append(info[2].text.strip())

    browser.close()
`
I’ve checked online, and every solution I see involves the URL changing. And for some reason, I can’t make requests directly to the site’s API: Postman reported an error about the parameters not being sent.
Answers:
With some small adjustments you can get it working — let’s try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industries = []
    locations = []

    # When you click to the next page, an element with text "Loading..."
    # appears on screen; waiting for it to show then hide tells us the new
    # table is fully rendered.
    loading_icon = "//strong[text()='Loading...']"
    # The "next page" button in the pagination control.
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Show 50 results per page to reduce the number of page turns.
    page.select_option("#perPageCount", value="50")
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    while True:
        # Scrape the page that is currently rendered.
        page.wait_for_selector('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        for row in soup.find('tbody').find_all('tr'):
            info = row.find_all('td')
            # Append (the original overwrote the lists with scalar strings,
            # keeping only the last row scraped).
            names.append(info[0].text.strip())
            industries.append(info[1].text.strip())
            locations.append(info[2].text.strip())

        # Check the "next" button AFTER scraping: checking it first (as the
        # original did) exits the loop before the final page is scraped.
        if "disabled" in page.get_attribute(selector=next_page_locator, name="class"):
            break

        # Turn the page and wait for the loading indicator cycle.
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)

    browser.close()
I am trying to scrape this site https://franchisedisclosure.gov.au/Register with playwright and the url doesn’t change after you click on the next button. How do I solve this pagination problem?
Here’s my code
`
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

# Register URL; the site paginates client-side, so the URL never changes.
url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)

    # Click through the terms-of-use gate to reach the register table.
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    # Block until the table body is attached; is_visible() would return
    # immediately without waiting.
    page.wait_for_selector('tbody')
    html = page.inner_html('table.table.table-hover')

    soup = bs(html, 'html.parser')
    rows = soup.find('tbody').find_all('tr')

    names = []
    industries = []
    locations = []
    for row in rows:
        cells = row.find_all('td')
        # Append per row — assigning the scalars over the list variables
        # (as the original did) keeps only the final row.
        names.append(cells[0].text.strip())
        industries.append(cells[1].text.strip())
        locations.append(cells[2].text.strip())

    browser.close()
`
I’ve checked online, and every solution I see involves the URL changing. And for some reason, I can’t make requests directly to the site’s API: Postman reported an error about the parameters not being sent.
With some small adjustments you can get it working — let’s try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industries = []
    locations = []

    # A "Loading..." element appears while a page turn is in flight;
    # waiting for visible-then-hidden marks the new table as ready.
    loading_icon = "//strong[text()='Loading...']"
    # XPath of the "next page" pagination button.
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select 50 rows per page so fewer page turns are needed.
    page.select_option("#perPageCount", value="50")
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    while True:
        # Scrape whatever page is currently shown.
        page.wait_for_selector('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        for row in soup.find('tbody').find_all('tr'):
            cells = row.find_all('td')
            # Must append: overwriting the lists with scalar strings (the
            # original behavior) discards all but the last row.
            names.append(cells[0].text.strip())
            industries.append(cells[1].text.strip())
            locations.append(cells[2].text.strip())

        # Test the "next" button only after scraping; testing before the
        # scrape (as the original loop condition did) skips the last page.
        if "disabled" in page.get_attribute(selector=next_page_locator, name="class"):
            break

        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)

    browser.close()