I get the same output in for loop

Question

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

# Selenium script from the question: collects clinic details from the
# saveface.co.uk search page, then opens every collected profile link to read
# its description and website, and finally writes everything to saveface.csv.
# NOTE(review): the driver path below contains no separators; backslashes were
# presumably lost (e.g. C:\selenium driver\chromedriver.exe) - confirm.
s=Service("C:selenium driverchromedriver.exe")
driver = webdriver.Chrome(service=s)

# One list per output column; each gets one entry per scraped block/profile.
companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []

driver.get("https://www.saveface.co.uk/search/")

# Element lookups below may wait up to 10 seconds before failing.
driver.implicitly_wait(10)

# Every search result card on the currently loaded page.
blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")

# The loop assumes at least 30 result blocks were found; blocks[block] raises
# IndexError otherwise.
for block in range(30):
    
    # NOTE(review): every XPath in this loop starts with "//", which is
    # evaluated from the document root even though find_element is called on
    # blocks[block]. Each lookup therefore returns the first match on the whole
    # page, so all iterations read the same values. A context-relative
    # expression starts with ".//".
    company_name = blocks[block].find_element(By.XPATH, "//h3[@class='resulttitle']").text.strip()
    companies_names.append(company_name)

    person_name = blocks[block].find_element(By.XPATH, "//p[@class='name_wrapper']").text.strip()
    persons_names.append(person_name)

    phone_number = blocks[block].find_element(By.XPATH, "//div[@class='searchContact phone']").text.strip()
    phones_numbers.append(phone_number)

    location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']").text.strip()
    locations.append(location)

    opening_hour = blocks[block].find_element(By.XPATH, "//li[@class='opening-hours']").text.strip()
    opening_hours.append(opening_hour)

    profile = blocks[block].find_element(By.XPATH, "//a[@class='visitpage']").get_attribute("href")
    all_profiles.append(profile)
    
    print(company_name, person_name, phone_number, location, opening_hour, profile)

    # On the last iteration the first pagination link is clicked and the
    # blocks are re-read, but the loop ends right after, so the re-read
    # `blocks` list is never used.
    if block == 29:
        two_page = driver.find_element(By.XPATH, "//a[@class='facetwp-page']")
        two_page.click()
        
        driver.implicitly_wait(10)

        blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")

# Second pass: open each collected profile URL and read the detail fields.
for i in range(len(all_profiles)):
    
    driver.get(all_profiles[i])

    # No try/except here: a profile page without this element stops the script
    # with the exception raised by find_element.
    description = driver.find_element(By.XPATH, "//div[@class='desc-text-left']").text.strip()
    descriptions.append(description)

    website_link = driver.find_element(By.XPATH, "//a[@class='visitwebsite website']").get_attribute("href")
    websites_links.append(website_link)        


# No element lookup follows this call, so it has no visible effect here.
driver.implicitly_wait(10)
driver.close()

# Assemble the columns into one table; all lists must have the same length.
df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)

# Write the table to saveface.csv without the DataFrame index column.
df.to_csv('saveface.csv',index=False)
#print(df)

This is the result:

The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/

Asked By: Zeyad Magdy

||

Source

Answer 1

To restrict the search to the subtree rooted at the context node, your expression should start with .// so you have to replace // with .// in each of the commands

... = blocks[block].find_element(...)

The meaning of // is to search the document from the document’s root, ignoring the context node blocks[block] altogether.

Moreover, notice that not all the blocks have a location as you can see from this image

in this case

location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']")

will raise a NoSuchElementException. To avoid this you have to put the command in a try...except... block

UPDATE

Scraping 400 blocks with selenium takes about 1 minute on my computer; I tried with BeautifulSoup and it takes less than 1 second! The slow part is scraping the profiles, because for each of them we have to download a new webpage; however, it is still way faster with BeautifulSoup.

So I wrote a script without using selenium, just BeautifulSoup (you can install it by running pip install beautifulsoup4 in the terminal)

import requests
from bs4 import BeautifulSoup
# Scrape every clinic listed on the Save Face search pages with
# requests + BeautifulSoup, then open each clinic's profile page to read its
# description and website link. Results accumulate in `data`, a dict of
# equal-length lists (one list per column).
url = 'https://www.saveface.co.uk/search/'
soup = BeautifulSoup(requests.get(url).text, "html.parser")

# Placeholder stored whenever a field cannot be read, so every column keeps
# exactly one entry per clinic.
MISSING = '*missing value*'

# Column name -> CSS selector, evaluated relative to one result block.
css_selector = {
    'company name' : ".title",
    'person name'  : ".name_wrapper",
    'phone number' : ".phone",
    'location'     : ".cls_loc",
    'opening hours': ".opening-hours",
    'profile link' : ".visitpage",
}
data = {key: [] for key in list(css_selector) + ['description', 'website link']}

# The page count is read from the text following 'total_pages":' in the raw
# HTML of the first page.
number_of_pages = int(str(soup).split('total_pages":')[1].split('}')[0])

# `page` is the number of the NEXT page to download: each iteration parses
# the page already held in `soup`, then fetches page `page` if it exists.
for page in range(2, number_of_pages + 2):

    blocks = soup.select('.clientresult')
    for idx, block in enumerate(blocks):
        # '\r' returns to the start of the line so the counter overwrites itself.
        print(f'blocks {idx+1}/{len(blocks)}', end='\r')
        for key, selector in css_selector.items():
            try:
                if 'link' in key:
                    value = block.select_one(selector)['href']
                else:
                    value = block.select_one(selector).text.strip().replace('\r\n', ', ')
            except (AttributeError, TypeError, KeyError):
                # select_one() returns None when nothing matches: None.text
                # raises AttributeError, None['href'] raises TypeError, and a
                # tag without an href attribute raises KeyError.
                value = MISSING
            data[key].append(value)

    if page <= number_of_pages:
        print('\nloading page', page)
        url_page = f'{url}?fwp_paged={page}'
        soup = BeautifulSoup(requests.get(url_page).text, "html.parser")

print('\nno more pages to load, moving to scrape profile links...')

total_profiles = len(data['profile link'])
for idx, profile_url in enumerate(data['profile link']):

    print(f"profile link {idx+1}/{total_profiles}  ", end='\r')

    # A block without a profile link has nothing to download; keep the
    # columns aligned with placeholders instead of requesting an invalid URL.
    if profile_url == MISSING:
        data['description'].append(MISSING)
        data['website link'].append(MISSING)
        continue

    soup_profile = BeautifulSoup(requests.get(profile_url).text, "html.parser")
    try:
        description = soup_profile.select_one('.clinicContent > .description').text.strip()
    except AttributeError:
        description = MISSING
    data['description'].append(description)

    try:
        website = soup_profile.select_one('.visitwebsite')['href']
    except (TypeError, KeyError):
        # No matching element (None['href']) or an element without href.
        website = MISSING
    data['website link'].append(website)

Output (it took about 8 minutes to complete the execution)

blocks 400/400
loading page 2
blocks 109/109
no more pages to load, moving to scrape profile links...
profile link 509/509

Then you can easily create the dataframe by running pd.DataFrame(data)

Answered By: sound wave

Answer 2

This is the new code,
but it returns the same output on every page. Why?

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

# Follow-up Selenium script: same scrape as before, now with context-relative
# XPaths (".//"), a placeholder for missing fields, and an outer loop meant to
# walk through the result pages.
# NOTE(review): the driver path below contains no separators; backslashes were
# presumably lost (e.g. C:\selenium driver\chromedriver.exe) - confirm.
s=Service("C:selenium driverchromedriver.exe")
driver = webdriver.Chrome(service=s)

# One list per output column; each gets one entry per scraped block/profile.
companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []

driver.get("https://www.saveface.co.uk/search/")

# Element lookups below may wait up to 10 seconds before failing.
driver.implicitly_wait(10)

# Pagination links found on the first page; their count drives the outer loop.
pages = driver.find_elements(By.XPATH, ".//a[@class='facetwp-page']")

for page in range(len(pages)+1):

    # Re-read the result cards from whatever the browser currently shows.
    # NOTE(review): nothing here waits for the previous click to finish
    # updating the results - confirm the page content has actually changed.
    blocks = driver.find_elements(By.XPATH, ".//div[@class='result clientresult']")
    
    # Only the first 10 blocks are read. If fewer exist, blocks[block] raises
    # IndexError inside each try, and the bare except stores the placeholder.
    for block in range(10):

        # Each bare `except:` below catches every exception, not only a
        # missing element, and records the placeholder text instead.
        try:    
            company_name = blocks[block].find_element(By.XPATH, ".//h3[@class='resulttitle']").text.strip()
            companies_names.append(company_name)
        except:
            companies_names.append("Not found on the site")

        try:    
            person_name = blocks[block].find_element(By.XPATH, ".//p[@class='name_wrapper']").text.strip()
            persons_names.append(person_name)
        except:
            persons_names.append("Not found on the site")

        try:    
            phone_number = blocks[block].find_element(By.XPATH, ".//div[@class='searchContact phone']").text.strip()
            phones_numbers.append(phone_number)
        except:
            phones_numbers.append("Not found on the site")

        try:    
            location = blocks[block].find_element(By.XPATH, ".//li[@class='cls_loc']").text.strip()
            locations.append(location)
        except:
            locations.append("Not found on the site")

        try:    
            opening_hour = blocks[block].find_element(By.XPATH, ".//li[@class='opening-hours']").text.strip()
            opening_hours.append(opening_hour)
        except:
            opening_hours.append("Not found on the site")
            
        try:    
            profile = blocks[block].find_element(By.XPATH, ".//a[@class='visitpage']").get_attribute("href")
            all_profiles.append(profile)
        except:
            all_profiles.append("Not found on the site")
                
    # NOTE(review): find_element returns the first matching element, so every
    # pass clicks the first `facetwp-page` link rather than the link for the
    # next page. This is the likely reason each page yields the same output -
    # confirm against the live pagination markup.
    two_page = driver.find_element(By.XPATH, ".//a[@class='facetwp-page']")
    two_page.click()
            
# Second pass: open each collected profile URL and read the detail fields.
for i in range(len(all_profiles)):

    # The outer try covers a failing driver.get (presumably including the
    # "Not found on the site" placeholder, which is not a URL); both detail
    # columns then receive the placeholder so the lists stay aligned.
    try:
        
        driver.get(all_profiles[i])

        driver.implicitly_wait(10)
        
        try:    
            description = driver.find_element(By.XPATH, ".//div[@class='desc-text-left']").text.strip()
            descriptions.append(description)
        except:
            descriptions.append("Not found on the site")
            
        try:    
            website_link = driver.find_element(By.XPATH, ".//a[@class='visitwebsite website']").get_attribute("href")
            websites_links.append(website_link)        
        except:
            websites_links.append("Not found on the site")

    except:
        descriptions.append("Not found on the site")
        websites_links.append("Not found on the site")

# No element lookup follows this call, so it has no visible effect here.
driver.implicitly_wait(10)
driver.close()

# Assemble the columns into one table; all lists must have the same length.
df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)

# Write the table to saveface.csv without the DataFrame index column.
df.to_csv('saveface.csv',index=False)
print(df)

Answered By: Zeyad Magdy

I get the same output in for loop

Question:

Answers:

UPDATE