How to load all results with BeautifulSoup and Selenium, and disable the "automated test software" notification?

Question:

I’m trying to scrape this website, but it only shows 24 results. How do I load all the results, and how do I hide the "Chrome is being controlled by automated test software" notification?

Below is my code:

    # import libraries
    from selenium import webdriver
    import pandas as pd
    import bs4


    # create lists to store the results
    items = []
    prices = []
    volumes = []


    driver = webdriver.Chrome()
    driver.get("https://www.fairprice.com.sg/category/milk-powder")
    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    allelem = soup.find_all('div', class_='sc-1plwklf-0 iknXK product-container')

    # read all item names
    for item in allelem:
        items.append(item.find('span', class_='sc-1bsd7ul-1 eJoyLL').text.strip())

    # read prices
    for price in allelem:
        prices.append(price.find('span', class_='sc-1bsd7ul-1 sc-1svix5t-1 gJhHzP biBzHY').text.strip())

    # read volumes
    for volume in allelem:
        volumes.append(volume.find('span', class_='sc-1bsd7ul-1 eeyOqy').text.strip())

    print(items)
    print(volumes)
    print(prices)

    # create dataframe
    final_array = []
    for item, price, volume in zip(items, prices, volumes):
        final_array.append({'Item': item, 'Volume': volume, 'Price': price})

    # convert to Excel
    df = pd.DataFrame(final_array)
    print(df)
    df.to_excel('ntucv4milk.xlsx', index=False)


Asked By: jjbkd


Answers:

My suggestion is to define three lists (items, prices, volumes) that grow incrementally as you scroll down the page. If you have a list of webelements called `elements`, you can scroll to the last one by running

    driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', elements[-1])
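Here `elements` stands for whatever list of webelements you are tracking; for example, it could be collected with the product-container selector used in the full script below:

    from selenium.webdriver.common.by import By

    elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product"]')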

Then all you have to do is wait until new items are loaded and add them to the three lists. If no new items appear within a given amount of time (max_wait, set to 10 seconds here), then there are probably no more items to load and we can break the loop.

    import time
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service

    chromedriver_path = '...'
    driver = webdriver.Chrome(service=Service(chromedriver_path))

    driver.get('https://www.fairprice.com.sg/category/milk-powder')

    items, prices, volumes = [], [], []
    c = 0  # number of elements already scraped, used to slice off only the new ones
    max_wait = 10
    no_new_items = False

    while True:
        items_new = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] > span[weight="normal"]')
        items   += [item.text.strip()  for item  in items_new[c:]]
        prices  += [price.text.strip() for price in driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product"] span[weight="black"]')[c:]]
        volumes += [vol.text.strip()   for vol   in driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] div>span:first-child')[c:]]
        c = len(items)
        print(c, 'items scraped', end='\r')

        # scroll to the last loaded product so the page fetches the next batch
        driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', items_new[-1])

        items_loaded = items_new.copy()
        start = time.time()
        # wait up to `max_wait` seconds for new elements to be loaded
        while len(items_new) == len(items_loaded):
            items_loaded = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] > span[weight="normal"]')
            if time.time() - start > max_wait:
                no_new_items = True
                break
        if no_new_items:
            break

    df = pd.DataFrame({'item': items, 'price': prices, 'volume': volumes})
    print(df)
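If you want the result in an Excel file, as in the question, you can reuse the same export step at the end:

    df.to_excel('ntucv4milk.xlsx', index=False)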

Output

(screenshot of the resulting dataframe omitted)
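Regarding the second part of the question, the "Chrome is being controlled by automated test software" notification: a common way to hide it (not specific to this site) is to exclude Chrome's enable-automation switch via ChromeOptions. A minimal sketch:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    # hide the "Chrome is being controlled by automated test software" infobar
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)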

Answered By: sound wave