Stale Element – Selenium – Python

Question

I’ll start by saying that my code has become quite a mess while I was trying to solve this issue; on other occasions I have been able to resolve stale element errors.

The problem starts after the first player’s stats are stored (everything works as it should up to this point); once the code loops back to find the next player, the issue appears.

I’m not sure if it’s caused by the nested loops or something else.
Throughout the code I try re-assigning the variable that I assume is giving me the issue:
player_stats

The thing is, I previously had it going through 5 players, and I am not sure what happened or when the bug first appeared, as I was working on getting the rounds won and rounds played sorted out.

(We aren’t even able to print("Found playerCol element") on the second go around)

All print statements work until it hangs in the while loop after the first iteration.

Here is the full code (with comments):

import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Start a Firefox session and open the player-statistics overview page.
url = "https://www.hltv.org/stats/players"
driver = webdriver.Firefox()
driver.get(url)

# Wait (up to 15 s) for the "allow all" button of the cookie dialog to become
# clickable, then click it so the dialog is dismissed.
consent_button = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable(
        (By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
    )
)
consent_button.click()

# Locate every player-name cell and every stats cell on the overview page.
stats_locator = (By.CSS_SELECTOR, ".playerCol, .statsDetail")
player_stats = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(stats_locator)
)

# One list of scraped values per player is appended here by the main loop.
players = []

# Main scraping loop.
#
# Walks over the name cells (".playerCol") and stats cells (".statsDetail")
# located on the overview page.  A name cell only records `name`; a stats cell
# whose first token looks like a decimal number is taken as the K/D ratio and
# triggers a visit to that player's detail page and matches page, after which
# the overview page is reloaded.
#
# NOTE(review): the `for` statement iterates the list object that was bound to
# `player_stats` *before* the first navigation.  Re-assigning `player_stats`
# inside the loop does not change what `player_stat` refers to, so after the
# first `driver.get(url)` the remaining `player_stat` references belong to the
# previous page load and are likely to raise StaleElementReferenceException.
for i, player_stat in enumerate(player_stats):
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
        # This loop body runs at most once: both the normal path and the
        # exception handler end in `break`.
        while True:
            player_stats = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
            try:
                if "playerCol" in player_stat.get_attribute("class"):
                    print("Found playerCol element")
                    # Prefer the link text; fall back to the cell text when the cell has no <a>.
                    name = player_stat.find_element(By.CSS_SELECTOR, "a").text if player_stat.find_elements(By.CSS_SELECTOR, "a") else player_stat.text
                    print(f"Name: {name}")
                elif "statsDetail" in player_stat.get_attribute("class"):
                    stats = player_stat.text.split()
                    # Only a decimal number (digits, dot, digits) counts as a K/D ratio.
                    if len(stats) >= 1 and re.search(r"\d+\.\d+", stats[0]):
                        kd_ratio = stats[0]
                break
            except StaleElementReferenceException as e:
                # Re-locate the cells; this rebinds `player_stats` only, not `player_stat`.
                player_stats = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
                player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                print(f"An error occurred while processing match stats: {e}")
                break

        # Extract the player stats
        if "statsDetail" in player_stat.get_attribute("class"):
            stats = player_stat.text.split()
            if len(stats) >= 1 and re.search(r"\d+\.\d+", stats[0]):
                kd_ratio = stats[0]

                # Process match stats for the player
                try:
                    time.sleep(1)
                    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".playerCol, .statsDetail")))
                    # Find the link whose text contains the most recently seen player name.
                    player_link = driver.find_element(By.XPATH, f"//a[contains(text(), '{name}')]")
                    print(player_link.get_attribute('outerHTML'))
                    # Click through JavaScript instead of a native click.
                    driver.execute_script("arguments[0].click();", player_link)
                    time.sleep(1)
                    player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                    player = [name, kd_ratio]

                    # Extract additional player stats
                    headshot_percentage = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Headshot %')]/following-sibling::span"))).text
                    player.append(headshot_percentage)

                    kpr = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Kills / round')]/following-sibling::span"))).text
                    player.append(kpr)

                    dpr = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Deaths / round')]/following-sibling::span"))).text
                    player.append(dpr)

                    # Extract match stats for the player
                    matches_link = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/stats/players/matches/'][data-link-tracking-destination='Click on Matches -> Individual -> Overview [subnavigation]']")))
                    driver.execute_script("arguments[0].click();", matches_link)

                    match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))
                    match_scores = []
                    num_of_matches = 0
                    rounds_won = 0
                    rounds_played = 0
                    # Process match stats for the player
                    # (this inner `i` shadows the outer loop index, which is not used elsewhere)
                    for i, match_stat in enumerate(match_stats):
                        player_name = player[0]
                        player_team = driver.find_element(By.CSS_SELECTOR, ".gtSmartphone-only span:last-of-type").text
                        try:
                            team_name = ""
                            score = ""
                            # Retry until both a team name and a parenthesised score were read
                            # from the row; on any error the rows are re-located and row `i`
                            # is tried again.
                            while team_name == "" or score == "":
                                try:
                                    team = match_stat.find_element(By.CSS_SELECTOR, ".gtSmartphone-only span:last-of-type").text
                                    team_name = team.strip()

                                    score_span = match_stat.find_element(By.XPATH, ".//div[contains(@class, 'gtSmartphone-only')]//*[contains(text(), '(')]")
                                    score_text = score_span.text.strip()

                                    # Capture the digits between literal parentheses, e.g. "(16)" -> "16".
                                    score = re.search(r'\((\d+)\)', score_text).group(1)

                                except Exception:
                                    # `Exception` rather than a bare except, so Ctrl-C can still
                                    # interrupt this retry loop.
                                    time.sleep(1)
                                    match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))
                                    match_stat = match_stats[i]
                            team_data = match_stat.find_elements(By.CSS_SELECTOR, ".gtSmartphone-only span")
                            print("Team data:", team_data[3].text)
                            if team_name.lower() == player_team.lower():
                                player_score = score
                                opposing_team_name = team_data[2].text.strip()
                                print(opposing_team_name)
                                opposing_team_score = team_data[3].text.strip('()')
                                print("Score strip: ", opposing_team_score)
                                rounds_won += int(player_score)
                                rounds_played += int(player_score) + int(opposing_team_score)
                            else:
                                player_score = team_data[1].text.strip('()')
                                print(player_score)
                                opposing_team_score = score
                                print(opposing_team_score)
                                opposing_team_name = team_data[0].text.strip()
                                print(opposing_team_name)
                                rounds_won += int(opposing_team_score)
                                rounds_played += int(player_score) + int(opposing_team_score)

                            match_scores.append((team_name, opposing_team_name, player_score, opposing_team_score))
                            num_of_matches += 1

                            if num_of_matches == 5: # exit loop after 5 iterations
                                break

                        except Exception:
                            # Go back one page and re-locate the elements if the row could not be processed
                            driver.back()
                            player_stats = driver.find_elements(By.CSS_SELECTOR, ".playerCol, .statsDetail")
                            time.sleep(1)
                            match_stats = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.group-2, tr.group-1")))

                except Exception as e:
                    print(f"An error occurred while processing data for player {name}: {e}")
                    continue

                players.append([name, kd_ratio, headshot_percentage, kpr, dpr, rounds_won, rounds_played])
                print(players)
                print(f"{player_name}: {rounds_won} rounds won out of {rounds_played} rounds played in {num_of_matches} matches")
                # Return to the overview page for the next player (this reloads the page).
                driver.get(url)
                time.sleep(1)
    except StaleElementReferenceException as e:
        # A stale reference outside the inner handlers ends the whole scrape.
        print(f"An error occurred while processing match stats: {e}")
        break
# Close the webdriver; everything below only works on the collected `players` rows.
driver.quit()

# Store the data in a Pandas dataframe
df = pd.DataFrame(players, columns=["Name", "K/D", "HS %", "KPR", "DPR", "RW", "RP"])

# Clean the data: keep only the first decimal number found in each scraped
# string (e.g. a value with a trailing "%") and convert it to float.  Cells
# without such a number become NaN.  expand=False makes str.extract return a
# Series, which is what a single-column assignment expects.
for column in ["K/D", "HS %", "KPR", "DPR"]:
    df[column] = df[column].str.extract(r"(\d+\.\d+)", expand=False).astype(float)

# Drop any rows that have missing or invalid data
df.dropna(subset=["Name", "K/D", "HS %", "KPR", "DPR"], inplace=True)

# Save the data to a tab-separated file
df.to_csv("player_stats.csv", index=False, sep='\t')

Asked By: Blue

||

Source

Answer 1

At first I tried to correct your code, but it is too messy and a lot of code is unnecessary, so I gave up and tried instead using faster libraries than selenium.

In general, when your scraping project requires loading a lot of webpages (as in this case) and/or the selenium code is slow, you should consider trying requests or BeautifulSoup. These two libraries are much faster than selenium, but as a drawback they cannot execute JavaScript or render dynamically loaded content. However, the data you want to scrape on hltv.org isn’t loaded by JavaScript, so we can scrape it with requests or BeautifulSoup.

The following code scrapes data for the first 10 players. If you want more, change the value of limit. I had to add time.sleep(1) to avoid being banned by the site.

import re, time, requests, lxml, lxml.html

# Scrape name, K/D, headshot %, kills/round, deaths/round and the rounds
# won/played over the most recent matches for the first `limit` players,
# using plain HTTP requests and lxml XPath queries instead of a browser.
url = 'https://www.hltv.org/stats/players'
html_home = lxml.html.fromstring(requests.get(url).text)

players = {key:[] for key in ["Name", "K/D", "HS %", "KPR", "DPR", "RW", "RP"]}
limit = 10
players["Name"] = html_home.xpath("//td[contains(@class,'playerCol')]//text()")[:limit]
players["K/D"]  = html_home.xpath("//td[contains(@class,'statsDetail')][3]/text()")[:limit]

# Digits between literal parentheses, e.g. "(16)" -> "16".
score_pattern = re.compile(r'\((\d+)\)')

# The links must be read from the parsed overview page (`html_home`).
for idx,href in enumerate(html_home.xpath("//td[contains(@class,'playerCol')]/a/@href")[:limit]): # href is for example '/stats/players/11893/zywoo'

    # Pause between requests to avoid being banned by the site.
    time.sleep(1)
    url = 'https://www.hltv.org' + href
    html_player = lxml.html.fromstring(requests.get(url).text)
    players["HS %"] += [html_player.xpath("//span[contains(text(), 'Headshot %')]/following-sibling::span")[0].text]
    players["KPR"]  += [html_player.xpath("//span[contains(text(), 'Kills / round')]/following-sibling::span")[0].text]
    players["DPR"]  += [html_player.xpath("//span[contains(text(), 'Deaths / round')]/following-sibling::span")[0].text]

    time.sleep(1)
    url = url.replace('/players/', '/players/matches/') # url is for example https://www.hltv.org/stats/players/matches/11893/zywoo
    html_matches = lxml.html.fromstring(requests.get(url).text)
    match_stats = html_matches.xpath("//tr[contains(@class,'group-1') or contains(@class,'group-2')]")
    player_team = html_matches.xpath("//div[@class='gtSmartphone-only']/a/span/text()")[0]
    num_of_matches = 5
    rounds_won = 0
    rounds_played = 0

    for match_stat in match_stats[:num_of_matches]:
        # presumably [team, "(score)", team, "(score)"] — indices 0, 1 and 3 are used that way below
        team_data = match_stat.xpath(".//div[@class='gtSmartphone-only']//span/text()")
        team_name = team_data[0].strip()
        player_score        = score_pattern.search(team_data[1]).group(1)
        opposing_team_score = score_pattern.search(team_data[3]).group(1)
        rounds_played += int(player_score) + int(opposing_team_score)
        # Credit the first score when the first listed team is the player's team,
        # otherwise the second score.
        if team_name.lower() == player_team.lower():
            rounds_won += int(player_score)
        else:
            rounds_won += int(opposing_team_score)

    players["RW"] += [rounds_won]
    players["RP"] += [rounds_played]

    print(f'{players["Name"][idx]}: {rounds_won} rounds won out of {rounds_played} rounds played in {num_of_matches} matches')

My computer took 36 seconds to execute it with limit = 10. Finally, by running pd.DataFrame(players) we get

Answered By: sound wave

Stale Element – Selenium – Python

Question:

Answers: