How to load all results with BeautifulSoup and Selenium, and disable the automated test page?
Question:
I’m trying to scrape this website, but it only shows 24 results. How do I load all of the results, and hide the automated test page?
Below is the code:
# import libraries
from selenium import webdriver
import pandas as pd
import bs4

# create lists
items = []
prices = []
volumes = []

driver = webdriver.Chrome()
driver.get("https://www.fairprice.com.sg/category/milk-powder")
soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
allelem = soup.find_all('div', class_='sc-1plwklf-0 iknXK product-container')

# read all item names
for item in allelem:
    items.append(item.find('span', class_='sc-1bsd7ul-1 eJoyLL').text.strip())

# read prices
for price in allelem:
    prices.append(price.find('span', class_='sc-1bsd7ul-1 sc-1svix5t-1 gJhHzP biBzHY').text.strip())

# read volumes
for volume in allelem:
    volumes.append(volume.find('span', class_='sc-1bsd7ul-1 eeyOqy').text.strip())

print(items)
print(volumes)
print(prices)

# create dataframe
final_array = []
for item, price, volume in zip(items, prices, volumes):
    final_array.append({'Item': item, 'Volume': volume, 'Price': price})

# convert to Excel
df = pd.DataFrame(final_array)
print(df)
df.to_excel('ntucv4milk.xlsx', index=False)
end of code
Answers:
My suggestion is to define three lists (items, prices, volumes) that grow incrementally as you scroll down the page. If you have a list of WebElements called elements, you can easily scroll to the last one by running
driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', elements[-1])
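Here elements[-1] is the last product card currently in the DOM; scrolling it into view is what triggers the site to lazy-load the next batch of results.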
Then all you have to do is wait until new items are loaded and add them to the three lists. If no new items appear within a given amount of time (max_wait, set to 10 seconds here), then there are probably no more items to load and we can break the loop. Note that the code below locates elements by their data-testid attributes, which are more stable than the auto-generated class names used in your code.
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

chromedriver_path = '...'
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.get('https://www.fairprice.com.sg/category/milk-powder')

items, prices, volumes = [], [], []
c = 0  # number of items scraped so far
max_wait = 10
no_new_items = False

while True:
    items_new = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] > span[weight="normal"]')
    items += [item.text.strip() for item in items_new[c:]]
    prices += [price.text.strip() for price in driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product"] span[weight="black"]')[c:]]
    volumes += [vol.text.strip() for vol in driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] div>span:first-child')[c:]]
    c = len(items)
    print(c, 'items scraped', end='\r')
    # scroll to the last loaded item to trigger lazy loading
    driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', items_new[-1])
    items_loaded = items_new.copy()
    start = time.time()
    # wait up to `max_wait` seconds for new elements to be loaded
    while len(items_new) == len(items_loaded):
        items_loaded = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="product-name-and-metadata"] > span[weight="normal"]')
        if time.time() - start > max_wait:
            no_new_items = True
            break
    if no_new_items:
        break

df = pd.DataFrame({'item': items, 'price': prices, 'volume': volumes})
print(df)
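As for hiding the automated test page: assuming you mean the "Chrome is being controlled by automated test software" infobar, you can suppress it with ChromeOptions before creating the driver. A minimal sketch (excludeSwitches with enable-automation is a standard Chromium option; chromedriver_path is the same path used above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
# hide the "Chrome is being controlled by automated test software" infobar
options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

The scraping loop above then runs unchanged with this driver.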