How to parse data after specific text with Python, Selenium, bs4

Question:

On one of the sites I am writing a scraper for, I ran into the following problem:
I need to take all the data from a table, but the fields are not labeled in the HTML and their positions vary from ad to ad.
[screenshot: HTML example]

The table looks like this:
[screenshot: the table]

At first I used XPath for this, but while parsing I found that some fields swap places (for example, engine and registration number) or are missing entirely. So fixed XPaths are not suitable, because mileage data can end up in the engine column of the CSV file.

Is it possible in Selenium or bs4 to first search for a word and then parse the data that follows it?

That is, find the word Engine in the HTML code and then take the data below it.
[screenshot: the HTML text I need]
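In other words, I am looking for something like this (a rough sketch, assuming the value sits in the table cell right after the label text):

from bs4 import BeautifulSoup

soup = BeautifulSoup(driver.page_source, 'lxml')  # driver as in my code below
# Find the table cell whose text is exactly 'Engine', then read the cell after it.
label_cell = soup.find('td', string='Engine')
car_engine = label_cell.find_next('td').get_text(strip=True) if label_cell else None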

My code:

import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth

def collect_data():
    global driver
    options = webdriver.ChromeOptions()
    # Spoof a regular Chrome user agent
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                         'Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Background mode
    # options.add_argument('headless')

    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )

        driver.get(
            url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
        )
        time.sleep(10)
        '''Collect all URLs'''
        soup = BeautifulSoup(driver.page_source, 'lxml')
        car_url_list = []
        total_page = soup.find('span', class_='totPage').text
        print('Ford Mustang')
        print(f'Total pages: {total_page}')
        print(f'Page 1 of {total_page} URL collected')
        r = (int(total_page) + 1)
        count = 1
        for i in range(1, r, 1):
            driver.get(
                url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={i}'
            )
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            car_cards = soup.find_all('a', class_='tricky_link')
            count += 1
            print(f'Page {count} of {total_page} URL collected')
            for car_url in car_cards:
                car_url_list.append(car_url.get('href'))
            with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
                for line in car_url_list:
                    file.write(f'{line}\n')
        count = 0
        row = []

        '''Collect car's data'''

        with open('ford_mustang_url.txt', encoding='utf8') as f:

            r = len(car_url_list)
            print('Total cars: ' + str(r))
            for i in range(r):

                driver.get(f.readline().strip())  # strip the trailing newline
                driver.implicitly_wait(30)
                soup = BeautifulSoup(driver.page_source, 'lxml')
                count += 1

                '''Car Data'''
                car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace('  ', ' ').strip()
                car_price = soup.find('span', class_='GAPrice').find('span').text
                car_year = soup.find('div', class_='mid_border').get('data-year')
                car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
                car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
                car_url = soup.find('link', hreflang='en').get('href')
                # car_engine

                '''If section'''
                if car_reg_number == 'ABC-123':
                    car_reg_number = None

                if car_mileage == '100000000':
                    car_mileage = None

                print(f'{count}. ' + car_name)
                print('Price: ' + f'{car_price}')
                print('Year: ' + f'{car_year}')
                print('Mileage: ' + f'{car_mileage}')
                print('Reg.Number: ' + f'{car_reg_number}')
                print('URL: ' + f'{car_url}\n')

                data = {
                    'Name': car_name,
                    'Price': car_price,
                    'Year': car_year,
                    'Mileage': car_mileage,
                    'Reg.Number': car_reg_number,
                    'URL': car_url,
                }
                row.append(data)

            csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(row)

    except Exception as ex:
        print(ex)
    finally:
        driver.close()
        driver.quit()


def main():
    collect_data()


if __name__ == '__main__':
    main()
Asked By: Tomas Adomson


Answers:

Here is a solution to your problem, not based on Selenium (it's not the right tool for this job), which will produce a dataframe/CSV with all the details you're after:

import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

scraper = cloudscraper.create_scraper()

big_df = pd.DataFrame()
urls_list = []
for x in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    car_links = [x.get('href') for x in soup.select_one('div#listingData').select('a.tricky_link')]
    for link in car_links:
        urls_list.append(link)
for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    dfs = pd.read_html(r.text)  # parse every <table> on the ad page
    df_list = []
    title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
    subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
    df_list.append(('make_model', title))
    df_list.append(('variant', subtitle))
    # Each row of the spec table holds two (label, value) pairs side by side.
    for i, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    # Transpose so the labels become the header row and the values a data row.
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    big_df = big_df.append(correct_df)
big_df.to_csv('finnish_cars.csv')

A couple of notes: the first two cars' descriptions are in Finnish and the rest are in English, so the final df/csv will look a bit odd, but the data will be there. Also, you might get some warnings in the terminal about pd append being deprecated in favour of concat, but those are just warnings; the program will run.
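If you want to avoid that warning, collect the per-car frames in a list and concatenate once at the end; a minimal sketch of the change (frames takes the place of big_df inside the loop):

frames = []
for url in tqdm(urls_list):
    # ... build correct_df for this car exactly as above ...
    frames.append(correct_df)
big_df = pd.concat(frames, ignore_index=True)
big_df.to_csv('finnish_cars.csv')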

You can install cloudscraper with pip install cloudscraper, and tqdm with pip install tqdm. Of course, if you're keen on using Selenium, you can apply the same methods to the HTML obtained from Selenium.
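For example, with a live driver already on the ad page, the same parsing works on its page_source (a sketch, assuming driver is the Selenium instance from the question):

soup = BeautifulSoup(driver.page_source, 'html.parser')
dfs = pd.read_html(driver.page_source)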

Answered By: platipus_on_fire

I have found a solution with Selenium by using if/else:

car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[1]').text

if car_engine == 'Engine':
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[2]').text.split(" ", 2)[0]
else:
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[1]/td[5]').text.split(" ", 2)[0]

For Drive type the same trick needed more positions, so I did this:

# The 'Drive type' label moves between cells from ad to ad, so try each
# known position in turn and take the cell right after the matching label.
base = '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/'
positions = [
    ('tr[2]/td[4]', 'tr[2]/td[5]'),
    ('tr[3]/td[4]', 'tr[3]/td[5]'),
    ('tr[4]/td[1]', 'tr[4]/td[2]'),
    ('tr[3]/td[1]', 'tr[3]/td[2]'),
]
drive_type = None
for label_cell, value_cell in positions:
    if driver.find_element(By.XPATH, base + label_cell).text == 'Drive type':
        drive_type = driver.find_element(By.XPATH, base + value_cell).text
        break
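A shorter variant would be to locate the label cell by its text instead of guessing positions; a sketch, assuming the value always sits in the cell right after the label:

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

try:
    # Find the cell whose text is 'Drive type' anywhere in the ad info block
    # and read the cell that immediately follows it.
    drive_type = driver.find_element(
        By.XPATH,
        '//*[@id="id_adInfo"]//td[normalize-space()="Drive type"]/following-sibling::td[1]'
    ).text
except NoSuchElementException:
    drive_type = None  # the ad has no drive type row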
Answered By: Tomas Adomson