Using Selenium to scrape paginated table data (Python)

Question:

I have this table: https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page=1. It's paginated, and I want to scrape all of the content from the table, starting from page 1 through to the very end. I am trying to use an XPath but can't seem to get it to work.

Here is my code; any help is welcome!

from selenium import webdriver
from selenium.webdriver.common.by import By

co = webdriver.ChromeOptions()
co.add_argument('--headless')
# co.add_argument('--ignore-certificate-errors')
# co.add_argument('--no-proxy-server')
# co.add_argument("--proxy-server='direct://'")
# co.add_argument("--proxy-bypass-list=*")
driver = webdriver.Chrome(executable_path="C:/Users/user/Desktop/IG Trading/chromedriver.exe", chrome_options=co)
driver.get('https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page=1')
stock_names = driver.find_elements(By.XPATH, '/html/body/app-root/app-handshake/div/app-page-content/app-filter-toggle/app-ftse-index-table/section/table')
print(stock_names)

# for stock_name in stock_names:
#     print(stock_name)
#     text = stock_name.text
#     print(text)
Asked By: STEIN


Answers:

This is one way you can obtain that information:

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
from tqdm import tqdm

firefox_options = Firefox_Options()

# firefox_options.add_argument("--width=1500")
# firefox_options.add_argument("--height=500")
# firefox_options.headless = True

driverService = Service('chromedriver/geckodriver')  # path to your geckodriver binary
browser = webdriver.Firefox(service=driverService, options=firefox_options)

big_df = pd.DataFrame()

browser.get('https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table')     
try:
    # dismiss the cookie banner if it appears
    WebDriverWait(browser, 3).until(EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))).click()
    print('accepted cookies')
except Exception:
    print('no cookie button!')
t.sleep(2)

for i in tqdm(range(1, 40)):  # hardcoded: pages 1-39
    browser.get(f'https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page={i}')
    t.sleep(1)
    table = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "table[class='full-width ftse-index-table-table']"))
    )
    df = pd.read_html(table.get_attribute('outerHTML'))[0]
    big_df = pd.concat([big_df, df], axis=0, ignore_index=True)

print(big_df)
big_df.to_csv('lse_companies.csv')
print('all done')
browser.quit()

This will display the big dataframe in the terminal once all pages are scraped, and also save it as a CSV file on disk (in the same folder you run the script from). The setup here is Firefox/geckodriver on Linux, but you can adapt it to your own; just observe the imports and the logic after defining the browser/driver.
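
For the Chrome setup from the question, the equivalent wiring would look roughly like this (a sketch reusing the chromedriver path from the question; note that Selenium 4 replaced the executable_path/chrome_options keyword arguments with service=/options=):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as Chrome_Options

chrome_options = Chrome_Options()
chrome_options.add_argument('--headless=new')  # use plain '--headless' on older Chrome

# chromedriver path taken from the question; point this at your own binary
driver_service = Service("C:/Users/user/Desktop/IG Trading/chromedriver.exe")
browser = webdriver.Chrome(service=driver_service, options=chrome_options)
# the rest of the script (cookie banner, page loop, CSV export) is unchanged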
Selenium docs: https://www.selenium.dev/documentation/

TQDM: https://pypi.org/project/tqdm/
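
A note on the hardcoded range(1, 40): it assumes the index spans 39 pages at the time of writing. If you'd rather not hardcode the page count, one possible variation is a stop-when-it-repeats loop, sketched below; it assumes that requesting a page number past the end re-serves the last table (or an empty one), which you should verify against the site before relying on it:

from io import StringIO  # newer pandas versions deprecate passing literal HTML to read_html
import itertools

big_df = pd.DataFrame()
prev_first_cell = None
for i in itertools.count(1):  # count upward until the table stops changing
    browser.get(f'https://www.londonstockexchange.com/indices/ftse-aim-all-share/constituents/table?page={i}')
    t.sleep(1)
    table_html = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table[class='full-width ftse-index-table-table']"))
    ).get_attribute('outerHTML')
    df = pd.read_html(StringIO(table_html))[0]
    first_cell = None if df.empty else df.iloc[0, 0]
    if df.empty or first_cell == prev_first_cell:
        break  # same (or no) table came back: we ran past the last page
    prev_first_cell = first_cell
    big_df = pd.concat([big_df, df], ignore_index=True)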

Answered By: platipus_on_fire