Why is the following only scraping one page? How can I scrape the other pages as well?

Question:

I am trying to scrape multiple pages, but the following code scrapes only one page. How can I scrape the other pages?

import requests
from bs4 import BeautifulSoup

headers ={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
    r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={}".format(page * 10)') 
    soup=BeautifulSoup(r.content, 'lxml')
    tags=  soup.find_all('li', attrs={'class': 's-item'})
for pro in tags:
    title=pro.find('h3',class_='s-item__title').text.encode("utf-8")
    price=pro.find('div',class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
    

    print(title,price)
Asked By: Arslan Aziz


Answers:

I think you should move the for pro in tags loop inside the for page in range(0, 10) loop; as written, the inner loop runs only once, after the last page has been fetched.
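
A minimal sketch of that restructuring, reusing the question's selectors (the URL also needs the formatting fix described in the next answer; eBay's _pgn parameter is the 1-based page number):

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(1, 11):  # _pgn is 1-based
    r = requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={}'.format(page), headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('li', attrs={'class': 's-item'})
    for pro in tags:  # indented inside the page loop, so it runs once per page
        title_tag = pro.find('h3', class_='s-item__title')
        if title_tag:  # some listings lack a title; skip those
            print(title_tag.text)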

Answered By: Ehsan Poursaeed

A few problems: 1) you need the product loop nested inside your loop over pages, and 2) your URL syntax is incorrect (the .format() call ended up inside the string literal).

Give this a try:

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
    r = requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page), headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('li', attrs={'class': 's-item'})
    for pro in tags:
        try:
            title = pro.find('h3', class_='s-item__title').text.encode("utf-8")
            price = pro.find('div', class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
        except AttributeError:
            # skip listings that are missing a title or price
            continue
        print(title, price)
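
As a side note, on Python 3.6+ an f-string builds the same URL a little more readably:

r = requests.get(f'https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}', headers=headers)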
Answered By: chitown88

Now, using the correct locator, I'm getting working output:

Code:

import requests
from bs4 import BeautifulSoup
import pandas as pd


Title = []
p = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
for page in range(0, 10):
    r = requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page), headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('div', attrs={'class': 's-item__info clearfix'})
    for pro in tags:
        title = pro.find('h3', class_='s-item__title').text
        Title.append(title)

        price = pro.find('div', class_='s-item__detail s-item__detail--primary').text
        p.append(price)

        # print(title, price)

df = pd.DataFrame({"Title": Title, "Price": p})
print(df)

Output:

  Title                            Price
0                                                       7S0ponso rPA Eed-1 UJ 0F -1-1  
1    Converse CHUCK TAYLOR All Star Low Top Unisex ...                 $38.95 to $64.95
2    Air Jordan 1 Mid University Gold White Black Y...                          $130.00
3    Air Jordan 1 Mid Metallic Red Gym Red Black Wh...                          $149.95
4    Nike Air Force 1 Low Triple White ‘07 BRAND NE...                           $99.99
..                                                 ...                              ...
465    Salomon S-lab XT 6  Soft Ground Size  Men 11 US                           $50.00
466  Nike Dunk Low Light Bone Tropical Twist (GS) -...                          $170.99
467  Nike Air Force 1 '07 Shoes Black Men's Multi S...                          $100.00
468  12 Mens 13.5 W Reebok Classic Harman Run S Pri...                           $47.99  
469  NIKE SHOX ZOOM AIR Mid High Basketball  Shoes ...                           $50.00  

[470 rows x 2 columns]
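
If you want to persist the results, the same DataFrame can be written to disk with pandas (the filename here is just an example):

df.to_csv("sneakers.csv", index=False)  # hypothetical output filename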
Answered By: Fazlul

To extract results from all pages, not just one or a hardcoded ten, the best solution is an infinite while loop with a check for something (a button, an element) that triggers an exit.

This is more robust than a hardcoded for loop, since the while loop runs until a certain condition is fulfilled. In our case that condition is the presence of the next-page button (the .pagination__next selector): as long as the button exists we advance to the next page, otherwise we exit.

Checking if the button exists and moving to the next page:

if soup.select_one(".pagination__next"):
    params['_pgn'] += 1
else:
    break

The full code with pagination:

from bs4 import BeautifulSoup
import requests
import json

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

params = {
    '_nkw': 'sneakers',      # search query
    '_pgn': 1                # page number
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title": title,
            "price": price,
            "link": link
        })

    # if the next-page button exists, move on; otherwise stop
    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))

Example output:

Extracting page: 1
----------
[
  {
    "title": "Men's shoes DATE 9 (EU 42) sneakers white leather black suede BG148-42",
    "price": "$166.90",
    "link": "https://www.ebay.com/itm/275031117789?hash=item400921cfdd:g:kMsAAOSwXzRhlM3v&amdata=enc%3AAQAHAAAAwNBs47deR8e8PTTlQkvbHokYv%2BebkVKKF1o1v6tMxJ4Yl7GODAYbaWBlBGoY6%2BAXAy1Ay0PP1919S%2FPb9rV4%2FWmGwoiqfd5tR4TcahgUlULiaAR8b1QWZz%2BJle3LwwPh7ZCyrAEeQZu6Z1L6%2FHufDGJTiEV6MEa1aYIT7ErUEY8wdGo2L0D7qHhKtdFqx1piOcA5LtYHLxPBRDqFXmjXvVeq2S2c2Xxvu1JJrB6r%2BEQtuwM3m%2FxgoljOQy48lA26iA%3D%3D%7Ctkp%3ABlBMUPK98f6KYQ"
  },
  {
    "title": "Fashion Running Shoes Men's Walking Athletic Sneakers Tennis Sports Lightweight",
    "price": "$21.99 to $25.99",
    "link": "https://www.ebay.com/itm/374241964856?hash=item57228f3338:g:lVQAAOSwEDdjFgDl&amdata=enc%3AAQAHAAAA4FeR8IbHqB04y4afcJaRYMr%2BNXDhrPrQn%2FtBex%2F76%2Be21v2Q22s89LTKxXzzDChI6PM7kF3HiAjPbP1SVazcZTYhDOA69eyoTeyoKlhQ0g8UF8%2B%2F4Ouvc4kmZ2LIodFQnpmxTXmN6DMby3o60gZOMBGoxyo5AvvjGS9NHmMO0TYn6vHH3RX4ExwuLawJc3Yjx%2BTbyC%2B%2BhMR8ly72gvd2fLu%2F8nVbs8WNqIQj%2BBE0lM07DHLg17o6Ugcy6STNoPWaoU54BaMOYhixu0JkuujfE6h8lgy5%2FLMi1USi4sDPyeP7%7Ctkp%3ABFBM8r3x_oph"
  },
  {
    "title": "adidas Ultraboost 22 Shoes Men's",
    "price": "$133.00 to $190.00",
    "link": "https://www.ebay.com/itm/155163747912?hash=item24207ae648:g:WB0AAOSw2qJjRdhe&amdata=enc%3AAQAHAAAA4Df80B3%2F9%2Bmu1RODyiDdIg14%2B4fbOWSpAXrxoWnz1Uy6mBcGbSuKwlMzISqXwT39a97yuUVxw3Mnl3IgZ6XmZJVAjPPyb40BYs%2F7lBJdBQjRQpwraxSPIK5LY7%2BMLA3krQ29xgfeA8bYFKL4uqmpkgmhNqjhJoCSlB%2B0yQ8w70STypE9D4t%2FYVMznZ%2BpXFILLYwxu%2Bqw46oudcEwT18dtIm0UvueKugPcIjNl%2B7xlPr7ArjO6uMyAnBiG7HLHGukqaJLlP96%2B01brePidilLfNJ0STEvSjECV7txDIG3TXK%2F%7Ctkp%3ABFBM8r3x_oph"
  },
  # ...
]

As an alternative, you can use Ebay Organic Results API from SerpApi. It’s a paid API with a free plan that handles blocks and parsing on their backend.

Example code with pagination:

from serpapi import EbaySearch
from urllib.parse import parse_qsl, urlsplit
import os, json

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "sneakers",               # search query
  # "LH_Sold": "1"                    # shows sold items
}

search = EbaySearch(params)        # where data extraction happens

page_num = 0

data = []

while True:
    results = search.get_dict()     # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "link": link
        })

    page_num += 1
    print(page_num)

    # looks for the next page data (_pgn):
    if "next" in results.get("pagination", {}):
        next_page_query_dict = dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query))
        current_page = results["serpapi_pagination"]["current"]  # 1, 2, 3...

        # e.g. if current_page = 20 and next_page_query_dict["_pgn"] = 20: break
        if int(current_page) == int(next_page_query_dict["_pgn"]):
            break

        # update next page data
        search.params_dict.update(next_page_query_dict)
    else:
        break

print(json.dumps(data, indent=2))

Output:

[
  {
    "price": {
      "raw": "$50.00",
      "extracted": 50.0
    },
    "link": "https://www.ebay.com/itm/155159964475?hash=item2420412b3b:g:sc4AAOSwBj1jOcJe&amdata=enc%3AAQAHAAAAoGu%2FVS9Jqy79HSnvhDrxGfI2MAvPyAO%2BhHiSviy2A%2Bgn%2BYCi8IkFzN2dHBPlVlUgTFe6qGbssF1%2B83svDr2H4K5SnbR79MOdCho1ttV3VXkcgq%2FH0xNnOrr%2Bx5dkpF0jqzJU38Io93wdRt%2B4LZkkQhkXLg0HA%2F%2B%2BRCT9D%2Boyorl0lQBMjpXdDoWsbkcV9eroKOnmMwvQILNGme61V6zFb2Q%3D%7Ctkp%3ABk9SR7iu-f-KYQ"
  },
  {
    "price": {
      "raw": "$168.67",
      "extracted": 168.67
    },
    "link": "https://www.ebay.com/itm/125289511527?var=426318775584&epid=5053638274&hash=item1d2bd63a67:g:yUIAAOSwE0libYFy&amdata=enc%3AAQAHAAAA4F5dnQv6eqwesf4aXjp2fToBBYgqeE8vREnDFNgHPi%2FKvVwgofgKdEoBtIKP1pM5SoyMa63An1rIMvAsn0iA7MOt%2Bcp8StGY%2BKXfWHbM0T1x287nGBdZZbF7YEIHEpvkM70ZfU32l8VYS%2F6mtT2gkci4tdB8kIzALUS29mjiL3PBhuFZWPmJu1SZqvwgVoBSmiaG0q8AfWNB2OZ5XtT%2Fyt56HjCw9WOKRKxMDT%2BhqEm%2FHt72e8ebO2IwR56BEoLUPPBZ9wT8YR1qpTq6J3qkWMPZiQNYOnlsu0BDGT58Q13t%7Ctkp%3ABk9SR7iu-f-KYQ"
  },
  # ...
]
Answered By: Denis Skopa