Python web scraping with Selenium only extracts first element's data in list of lazy loaded elements
Question:
I’m attempting to use Python and Selenium to scrape the NFTrade web page and fetch all the card IDs and prices for a particular collection of NFTs.
Even though the elements on the page are lazy loaded 75 cards at a time, I’ve managed to set up a while loop that will periodically scroll down until the next batch of cards is loaded up to a maximum card count I’ve designated.
My problem is: when I then go to iterate through my list of cards and extract the card’s ID and price, inside of the function processing each card somehow only the very first card’s data is extracted and added to a list, instead of each individual card’s data being accessed and added to the list.
I really can’t understand why as when I print the card_data
‘s raw HTML inside of the get_card_data
function using print(card_data.get_attribute("innerHTML")
, I can see the cards IDs and prices of each card in the HTML, and yet when I try to extract that self same data using Selenium’s By.XPATH
and get_attribute("innerHTML")
, it only gives me the first card data back.
Here’s the code I’ve been using, please excuse the Python if it’s not the best, it’s not my normal programming language.
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
class NFTradeScraper:
def __init__(self):
options = Options()
options.add_argument('--headless')
options.add_argument('--start-maximized')
self.driver = webdriver.Chrome(options=options)
self.wait = WebDriverWait(self.driver, 5)
def get_latest_card_count(self):
"""Get the count of cards loaded into list of cards."""
return len(self.driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]'))
def get_cards(self, max_card_count = 200):
"""Extracts and returns card ID and price."""
URL="https://nftrade.com/collection/zombienft?search=&sort=min_price_asc&contractAddress=0xc031218cef355994d51cda0911b86f0a0e0dccaa&chainName="
self.driver.get(URL)
last_card_count = 0
while last_card_count < max_card_count:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
last_card_count = self.get_latest_card_count()
cards = self.driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]')
return cards
def get_card_data(self, card_data):
"""Extracts and prints out card specific data."""
card_name_element = self.driver.find_element(By.XPATH, '//div[contains(@class, "Item_itemName__ckoHR")]')
card_name = card_name_element.get_attribute("innerHTML")
card_id = card_name.partition('#')[-1]
card_price_element = card_data.find_element(By.XPATH, '//div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
card_price = card_price_element.get_attribute("innerHTML")
return {
'id': int(card_id), 'price': card_price
}
if __name__ == '__main__':
scraper = NFTradeScraper();
cards = scraper.get_cards(max_card_count=100)
card_data = []
for card in cards:
info = (scraper.get_card_data(card))
card_data.append(info)
pprint(card_data)
Answers:
When you are using xpath and you want to restrict the search inside a particular element (called context node, in this case card_data
), you have to add a dot (.
) in front of the path, otherwise it will search all the document. That’s why you were getting always the first card’s data.
So you have to change the definition of card_name_element
from
self.driver.find_element(By.XPATH, '//div[contains(@class, "Item_itemName__ckoHR")]')
to
card_data.find_element(By.XPATH, './/div[contains(@class, "Item_itemName__ckoHR")]')
and the definition of card_price_element
from
card_data.find_element(By.XPATH, '//div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
to
card_data.find_element(By.XPATH, './/div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
Output after correcting the code
[{'id': 6674, 'price': '0.23'},
{'id': 2382, 'price': '0.25'},
{'id': 9876, 'price': '0.25'},
{'id': 1013, 'price': '0.26'},
{'id': 6975, 'price': '0.27'},
{'id': 3295, 'price': '0.3'},
{'id': 2228, 'price': '0.3'},
{'id': 3839, 'price': '0.3'},
{'id': 153, 'price': '0.3'},
{'id': 6534, 'price': '0.3'}
...
I noticed that some cards don’t have the price, for example
and for these cases your code raises the error NoSuchElementException
when trying to find card_price_element
. To avoid this you have two possibilities: add a try-except block or use execute_script
instead of find_element
.
Usually the execute_script
solution is better because it let you also speed up the code execution. For example, the following code
cards = driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]')
card_data = []
for idx,card in enumerate(cards):
print(f'{idx=}',end='r')
card_name_element = card.find_element(By.XPATH, './/div[contains(@class, "Item_itemName__ckoHR")]')
card_name = card_name_element.get_attribute("innerHTML")
card_id = card_name.partition('#')[-1]
card_price_element = card.find_elements(By.XPATH, './/div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
if card_price_element:
card_price = card_price_element[0].get_attribute("innerHTML")
else:
card_price = None
info = {'id': int(card_id), 'price': card_price}
card_data.append(info)
took 25 seconds to execute on my computer, with 150 cards loaded on the page (i.e. len(cards)=150
). By replacing find_element
with javascript in the for loop we obtain
cards = driver.find_elements(By.CSS_SELECTOR, 'div[class*=itemContent]')
card_data = []
for idx,card in enumerate(cards):
print(f'{idx=}',end='r')
card_name = driver.execute_script('return arguments[0].querySelector("div[class*=itemName]")?.innerText', card)
card_id = card_name.split('#')[-1]
card_price = driver.execute_script('return arguments[0].querySelector("div[class*=itemPriceValueTxt]")?.innerText', card)
info = {'id': int(card_id), 'price': card_price}
card_data.append(info)
which was executed in 5 seconds. Actually, we can turn all the previous code into javascript
card_data = driver.execute_script(
"var id = [];" +
"var price = [];" +
"var all = document.querySelectorAll('div[class*=itemContent]');" +
"for (var i=0, max=all.length; i < max; i++) {" +
" id .push(all[i].querySelector('div[class*=itemName]')?.innerText.split('#').pop()); "+
" price.push(all[i].querySelector('div[class*=itemPriceValueTxt]')?.innerText); "+
"} " +
" return {'id':id,'price':price};"
)
which was executed in about 50 milliseconds, about 500 times faster than the original code!
With javascript we can avoid using try-except blocks thanks to the ?
appearing here
.querySelector(css_selector)?.innerText
What follows ?
is executed only if what precedes it doesn’t return None
. So if querySelector
finds an element then innerText
is executed and we get a string, otherwise execution stops at ?
and we get None
.
As a final suggestion, you can use .text
instead of .get_attribute("innerHTML")
.
I’m attempting to use Python and Selenium to scrape the NFTrade web page and fetch all the card IDs and prices for a particular collection of NFTs.
Even though the elements on the page are lazy loaded 75 cards at a time, I’ve managed to set up a while loop that will periodically scroll down until the next batch of cards is loaded up to a maximum card count I’ve designated.
My problem is: when I then go to iterate through my list of cards and extract the card’s ID and price, inside of the function processing each card somehow only the very first card’s data is extracted and added to a list, instead of each individual card’s data being accessed and added to the list.
I really can’t understand why as when I print the card_data
‘s raw HTML inside of the get_card_data
function using print(card_data.get_attribute("innerHTML")
, I can see the cards IDs and prices of each card in the HTML, and yet when I try to extract that self same data using Selenium’s By.XPATH
and get_attribute("innerHTML")
, it only gives me the first card data back.
Here’s the code I’ve been using, please excuse the Python if it’s not the best, it’s not my normal programming language.
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
class NFTradeScraper:
def __init__(self):
options = Options()
options.add_argument('--headless')
options.add_argument('--start-maximized')
self.driver = webdriver.Chrome(options=options)
self.wait = WebDriverWait(self.driver, 5)
def get_latest_card_count(self):
"""Get the count of cards loaded into list of cards."""
return len(self.driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]'))
def get_cards(self, max_card_count = 200):
"""Extracts and returns card ID and price."""
URL="https://nftrade.com/collection/zombienft?search=&sort=min_price_asc&contractAddress=0xc031218cef355994d51cda0911b86f0a0e0dccaa&chainName="
self.driver.get(URL)
last_card_count = 0
while last_card_count < max_card_count:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
last_card_count = self.get_latest_card_count()
cards = self.driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]')
return cards
def get_card_data(self, card_data):
"""Extracts and prints out card specific data."""
card_name_element = self.driver.find_element(By.XPATH, '//div[contains(@class, "Item_itemName__ckoHR")]')
card_name = card_name_element.get_attribute("innerHTML")
card_id = card_name.partition('#')[-1]
card_price_element = card_data.find_element(By.XPATH, '//div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
card_price = card_price_element.get_attribute("innerHTML")
return {
'id': int(card_id), 'price': card_price
}
if __name__ == '__main__':
scraper = NFTradeScraper();
cards = scraper.get_cards(max_card_count=100)
card_data = []
for card in cards:
info = (scraper.get_card_data(card))
card_data.append(info)
pprint(card_data)
When you are using xpath and you want to restrict the search inside a particular element (called context node, in this case card_data
), you have to add a dot (.
) in front of the path, otherwise it will search all the document. That’s why you were getting always the first card’s data.
So you have to change the definition of card_name_element
from
self.driver.find_element(By.XPATH, '//div[contains(@class, "Item_itemName__ckoHR")]')
to
card_data.find_element(By.XPATH, './/div[contains(@class, "Item_itemName__ckoHR")]')
and the definition of card_price_element
from
card_data.find_element(By.XPATH, '//div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
to
card_data.find_element(By.XPATH, './/div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
Output after correcting the code
[{'id': 6674, 'price': '0.23'},
{'id': 2382, 'price': '0.25'},
{'id': 9876, 'price': '0.25'},
{'id': 1013, 'price': '0.26'},
{'id': 6975, 'price': '0.27'},
{'id': 3295, 'price': '0.3'},
{'id': 2228, 'price': '0.3'},
{'id': 3839, 'price': '0.3'},
{'id': 153, 'price': '0.3'},
{'id': 6534, 'price': '0.3'}
...
I noticed that some cards don’t have the price, for example
and for these cases your code raises the error NoSuchElementException
when trying to find card_price_element
. To avoid this you have two possibilities: add a try-except block or use execute_script
instead of find_element
.
Usually the execute_script
solution is better because it let you also speed up the code execution. For example, the following code
cards = driver.find_elements(By.XPATH, '//div[contains(@class, "Item_itemContent__1XIcH")]')
card_data = []
for idx,card in enumerate(cards):
print(f'{idx=}',end='r')
card_name_element = card.find_element(By.XPATH, './/div[contains(@class, "Item_itemName__ckoHR")]')
card_name = card_name_element.get_attribute("innerHTML")
card_id = card_name.partition('#')[-1]
card_price_element = card.find_elements(By.XPATH, './/div[contains(@class, "Item_itemPriceValueTxt__lblqJ")]')
if card_price_element:
card_price = card_price_element[0].get_attribute("innerHTML")
else:
card_price = None
info = {'id': int(card_id), 'price': card_price}
card_data.append(info)
took 25 seconds to execute on my computer, with 150 cards loaded on the page (i.e. len(cards)=150
). By replacing find_element
with javascript in the for loop we obtain
cards = driver.find_elements(By.CSS_SELECTOR, 'div[class*=itemContent]')
card_data = []
for idx,card in enumerate(cards):
print(f'{idx=}',end='r')
card_name = driver.execute_script('return arguments[0].querySelector("div[class*=itemName]")?.innerText', card)
card_id = card_name.split('#')[-1]
card_price = driver.execute_script('return arguments[0].querySelector("div[class*=itemPriceValueTxt]")?.innerText', card)
info = {'id': int(card_id), 'price': card_price}
card_data.append(info)
which was executed in 5 seconds. Actually, we can turn all the previous code into javascript
card_data = driver.execute_script(
"var id = [];" +
"var price = [];" +
"var all = document.querySelectorAll('div[class*=itemContent]');" +
"for (var i=0, max=all.length; i < max; i++) {" +
" id .push(all[i].querySelector('div[class*=itemName]')?.innerText.split('#').pop()); "+
" price.push(all[i].querySelector('div[class*=itemPriceValueTxt]')?.innerText); "+
"} " +
" return {'id':id,'price':price};"
)
which was executed in about 50 milliseconds, about 500 times faster than the original code!
With javascript we can avoid using try-except blocks thanks to the ?
appearing here
.querySelector(css_selector)?.innerText
What follows ?
is executed only if what precedes it doesn’t return None
. So if querySelector
finds an element then innerText
is executed and we get a string, otherwise execution stops at ?
and we get None
.
As a final suggestion, you can use .text
instead of .get_attribute("innerHTML")
.