Could not extract data from website using selenium

Question:

I am trying to extract data from https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==, which contains multiple sub-pages, but each sub-page doesn’t have a separate link to extract from.

So I use Selenium to load the website dynamically and navigate to each page. But when I try to extract data from the second page, it returns only the first page’s content.

This is the code that I used to run the programme.

from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import time
from pathlib import Path
disable_warnings(InsecureRequestWarning)
agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
         Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",}
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url='https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw=='
path='C:/Users/dell/Desktop/Data/DataScraping/chrome_driver/chromedriver'


service = Service(path)  
driver = webdriver.Chrome(service=service)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

def get_data():
    start = time.process_time()
    url=main_url
    product_name=[]
    product_price=[]
    count=0
    all_pages=10 #this number is only for testing purpose
    print('Get Data Processing .....')

    for i in range(all_pages):
        if(count==0):
            add_boxs_v1=soup.find_all(class_='veg')
            for product in add_boxs_v1:
                product_name.append(product.find('p').text)
            add_boxs_v2=soup.find_all(class_='strike1')
            for price in add_boxs_v2:
                product_price.append(price.find('h4').text)
            count+=1
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
            time.sleep(5)
    print('done')
    df=pd.DataFrame({'Product_name':product_name,'Price':product_price})
    return df

df=get_data()
df.head()

Could someone please guide me on what step I did wrong in this process?

Asked By: Chanaka Eranga


Answers:

Below is my simple code to move to page 2 from your URL. The other pages follow the same CSS pattern, so you don’t need to worry about them. I tested it and it works. You just need to integrate the data-extraction part from your own code.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time


# create a new Chrome browser instance
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# navigate to the website
browser.get("https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw==")
time.sleep(10)

# Get the css_path of the button (page 2).
# For each page you will increase the number inside li:nth-child(4)
# For example li:nth-child(5), li:nth-child(6)
css_path = """
#divProducts > div.divPagingProd > ul > li:nth-child(4) > a
"""

# Find the button and scroll down to that button, then click
button = browser.find_element(By.CSS_SELECTOR, css_path)
browser.execute_script("arguments[0].scrollIntoView();", button)
browser.execute_script("arguments[0].click();", button)
time.sleep(10)


browser.quit()
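
If you want to visit every page rather than just page 2, the same pattern can be generated in a loop. Here is a minimal sketch based on the snippet above (num_pages is a placeholder, and the offset assumes the pager keeps the layout shown above, where page 2 sits at li:nth-child(4)):

# Visit pages 2..num_pages by bumping the nth-child index of the pager link.
# Page 2 lives at li:nth-child(4), so page p lives at li:nth-child(p + 2).
num_pages = 10  # placeholder value, as in the question

for page in range(2, num_pages + 1):
    css_path = f"#divProducts > div.divPagingProd > ul > li:nth-child({page + 2}) > a"
    button = browser.find_element(By.CSS_SELECTOR, css_path)
    browser.execute_script("arguments[0].scrollIntoView();", button)
    browser.execute_script("arguments[0].click();", button)
    time.sleep(10)
    # ... parse browser.page_source for this page here ...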

You are getting only the first page because your page_source is captured just once, before any clicks. After every click operation you need to capture the current page_source.

You need to move the page_source call inside the for loop to get the latest page_source every time.

# (imports are the same as in the question)
url = 'https://cargillsonline.com/Web/Product?IC=Mg==&NC=QmFieSBQcm9kdWN0cw=='
path = 'C:/Users/dell/Desktop/Data/DataScraping/chrome_driver/chromedriver'


service = Service(path)
driver = webdriver.Chrome(service=service)
driver.get(url)

def get_data():
    start = time.process_time()
    product_name = []
    product_price = []
    all_pages = 10  # this number is only for testing purposes
    print('Get Data Processing .....')

    for i in range(all_pages):
        # capture the page source of the *current* page on every iteration
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        add_boxs_v1 = soup.find_all(class_='veg')
        for product in add_boxs_v1:
            product_name.append(product.find('p').text)
        add_boxs_v2 = soup.find_all(class_='strike1')
        for price in add_boxs_v2:
            product_price.append(price.find('h4').text)
        # click the "next page" link, then give the new page time to render
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
        time.sleep(5)
    print('done')
    df = pd.DataFrame({'Product_name': product_name, 'Price': product_price})
    return df

df = get_data()
df.head()
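
If the fixed time.sleep(5) ever proves flaky, one option is to wait for the old product grid to go stale after the click instead. Here is a minimal sketch of that idea (it assumes the page re-renders the .veg product elements on every page change):

# Grab a handle on one product element of the current page, click the
# "next page" link, then wait until that old element goes stale rather
# than sleeping for a fixed interval.
old_item = driver.find_element(By.CLASS_NAME, 'veg')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//a[@ng-click='selectPage(page + 1, $event)']"))).click()
WebDriverWait(driver, 20).until(EC.staleness_of(old_item))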
Answered By: KunduK