Why is the following only scraping one page? How can I scrape the other pages as well?
Question:
I am trying to scrape multiple pages, but the following code scrapes only one page. How can I scrape the other pages?
import requests
from bs4 import BeautifulSoup
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={}".format(page * 10)')
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('li', attrs={'class': 's-item'})
for pro in tags:
title=pro.find('h3',class_='s-item__title').text.encode("utf-8")
price=pro.find('div',class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
print(title,price)
Answers:
I think you should move the for pro in tags
loop into the for page in range(0, 10)
loop.
Few problems: 1) you need the for loop within your loop of pages. and 2) your url syntax is incorrect.
Give this a try:
import requests
from bs4 import BeautifulSoup
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page))
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('li', attrs={'class': 's-item'})
for pro in tags:
try:
title=pro.find('h3',class_='s-item__title').text.encode("utf-8")
price=pro.find('div',class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
except:
pass
print(title,price)
Now using correct locator, I’m getting working output:
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
Title = []
p = []
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page), headers = headers)
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('div', attrs={'class': 's-item__info clearfix'})
for pro in tags:
title=pro.find('h3',class_='s-item__title').text
Title.append(title)
price =pro.find('div',class_='s-item__detail s-item__detail--primary').text
p.append(price)
#print(title,price)
df = pd.DataFrame(
{"Title": Title, "Price": p}
)
print(df)
Output:
Title Price
0 7S0ponso rPA Eed-1 UJ 0F -1-1
1 Converse CHUCK TAYLOR All Star Low Top Unisex ... $38.95 to $64.95
2 Air Jordan 1 Mid University Gold White Black Y... $130.00
3 Air Jordan 1 Mid Metallic Red Gym Red Black Wh... $149.95
4 Nike Air Force 1 Low Triple White ‘07 BRAND NE... $99.99
.. ... ...
465 Salomon S-lab XT 6 Soft Ground Size Men 11 US $50.00
466 Nike Dunk Low Light Bone Tropical Twist (GS) -... $170.99
467 Nike Air Force 1 '07 Shoes Black Men's Multi S... $100.00
468 12 Mens 13.5 W Reebok Classic Harman Run S Pri... $47.99
469 NIKE SHOX ZOOM AIR Mid High Basketball Shoes ... $50.00
[470 rows x 2 columns]
To extract results from all pages and not just one or ten pages (i.e hardcoded), the best solution is to use an infinite while
loop and check for something (button, element) that will trigger to exit it.
This solution is better than a hardcoded for
loop since the while
loop will run until a certain condition is fulfilled. In our case, this is the presence of a button on the page (.pagination__next
selector).
Checking if the button exists and moving to the next page:
if soup.select_one(".pagination__next"):
params['_pgn'] += 1
else:
break
Check code with pagination in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
params = {
'_nkw': 'sneakerss', # search query
'_pgn': 1 # page number
}
data = []
while True:
page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
soup = BeautifulSoup(page.text, 'lxml')
print(f"Extracting page: {params['_pgn']}")
print("-" * 10)
for products in soup.select(".s-item__info"):
title = products.select_one(".s-item__title span").text
price = products.select_one(".s-item__price").text
link = products.select_one(".s-item__link")["href"]
data.append({
"title" : title,
"price" : price,
"link" : link
})
if soup.select_one(".pagination__next"):
params['_pgn'] += 1
else:
break
print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
Extracting page: 1
----------
[
{
"title": "Men's shoes DATE 9 (EU 42) sneakers white leather black suede BG148-42",
"price": "$166.90",
"link": "https://www.ebay.com/itm/275031117789?hash=item400921cfdd:g:kMsAAOSwXzRhlM3v&amdata=enc%3AAQAHAAAAwNBs47deR8e8PTTlQkvbHokYv%2BebkVKKF1o1v6tMxJ4Yl7GODAYbaWBlBGoY6%2BAXAy1Ay0PP1919S%2FPb9rV4%2FWmGwoiqfd5tR4TcahgUlULiaAR8b1QWZz%2BJle3LwwPh7ZCyrAEeQZu6Z1L6%2FHufDGJTiEV6MEa1aYIT7ErUEY8wdGo2L0D7qHhKtdFqx1piOcA5LtYHLxPBRDqFXmjXvVeq2S2c2Xxvu1JJrB6r%2BEQtuwM3m%2FxgoljOQy48lA26iA%3D%3D%7Ctkp%3ABlBMUPK98f6KYQ"
},
{
"title": "Fashion Running Shoes Men's Walking Athletic Sneakers Tennis Sports Lightweight",
"price": "$21.99 to $25.99",
"link": "https://www.ebay.com/itm/374241964856?hash=item57228f3338:g:lVQAAOSwEDdjFgDl&amdata=enc%3AAQAHAAAA4FeR8IbHqB04y4afcJaRYMr%2BNXDhrPrQn%2FtBex%2F76%2Be21v2Q22s89LTKxXzzDChI6PM7kF3HiAjPbP1SVazcZTYhDOA69eyoTeyoKlhQ0g8UF8%2B%2F4Ouvc4kmZ2LIodFQnpmxTXmN6DMby3o60gZOMBGoxyo5AvvjGS9NHmMO0TYn6vHH3RX4ExwuLawJc3Yjx%2BTbyC%2B%2BhMR8ly72gvd2fLu%2F8nVbs8WNqIQj%2BBE0lM07DHLg17o6Ugcy6STNoPWaoU54BaMOYhixu0JkuujfE6h8lgy5%2FLMi1USi4sDPyeP7%7Ctkp%3ABFBM8r3x_oph"
},
{
"title": "adidas Ultraboost 22 Shoes Men's",
"price": "$133.00 to $190.00",
"link": "https://www.ebay.com/itm/155163747912?hash=item24207ae648:g:WB0AAOSw2qJjRdhe&amdata=enc%3AAQAHAAAA4Df80B3%2F9%2Bmu1RODyiDdIg14%2B4fbOWSpAXrxoWnz1Uy6mBcGbSuKwlMzISqXwT39a97yuUVxw3Mnl3IgZ6XmZJVAjPPyb40BYs%2F7lBJdBQjRQpwraxSPIK5LY7%2BMLA3krQ29xgfeA8bYFKL4uqmpkgmhNqjhJoCSlB%2B0yQ8w70STypE9D4t%2FYVMznZ%2BpXFILLYwxu%2Bqw46oudcEwT18dtIm0UvueKugPcIjNl%2B7xlPr7ArjO6uMyAnBiG7HLHGukqaJLlP96%2B01brePidilLfNJ0STEvSjECV7txDIG3TXK%2F%7Ctkp%3ABFBM8r3x_oph"
},
# ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It’s a paid API with a free plan that handles blocks and parsing on their backend.
Example code with pagination:
from serpapi import EbaySearch
from urllib.parse import (parse_qsl, urlsplit)
import os, json
params = {
"api_key": os.getenv("API_KEY"), # serpapi api key
"engine": "ebay", # search engine
"ebay_domain": "ebay.com", # ebay domain
"_nkw": "sneakerss", # search query
# "LH_Sold": "1" # shows sold items
}
search = EbaySearch(params) # where data extraction happens
page_num = 0
data = []
while True:
results = search.get_dict() # JSON -> Python dict
if "error" in results:
print(results["error"])
break
for organic_result in results.get("organic_results", []):
link = organic_result.get("link")
price = organic_result.get("price")
data.append({
"price" : price,
"link" : link
})
page_num += 1
print(page_num)
next_page_query_dict = dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query))
current_page = results["serpapi_pagination"]["current"] # 1,2,3...
# looks for the next page data (_pgn):
if "next" in results.get("pagination", {}):
# if current_page = 20 and next_page_query_dict["_pgn"] = 20: break
if int(current_page) == int(next_page_query_dict["_pgn"]):
break
# update next page data
search.params_dict.update(next_page_query_dict)
else:
break
print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$50.00",
"extracted": 50.0
},
"link": "https://www.ebay.com/itm/155159964475?hash=item2420412b3b:g:sc4AAOSwBj1jOcJe&amdata=enc%3AAQAHAAAAoGu%2FVS9Jqy79HSnvhDrxGfI2MAvPyAO%2BhHiSviy2A%2Bgn%2BYCi8IkFzN2dHBPlVlUgTFe6qGbssF1%2B83svDr2H4K5SnbR79MOdCho1ttV3VXkcgq%2FH0xNnOrr%2Bx5dkpF0jqzJU38Io93wdRt%2B4LZkkQhkXLg0HA%2F%2B%2BRCT9D%2Boyorl0lQBMjpXdDoWsbkcV9eroKOnmMwvQILNGme61V6zFb2Q%3D%7Ctkp%3ABk9SR7iu-f-KYQ"
},
{
"price": {
"raw": "$168.67",
"extracted": 168.67
},
"link": "https://www.ebay.com/itm/125289511527?var=426318775584&epid=5053638274&hash=item1d2bd63a67:g:yUIAAOSwE0libYFy&amdata=enc%3AAQAHAAAA4F5dnQv6eqwesf4aXjp2fToBBYgqeE8vREnDFNgHPi%2FKvVwgofgKdEoBtIKP1pM5SoyMa63An1rIMvAsn0iA7MOt%2Bcp8StGY%2BKXfWHbM0T1x287nGBdZZbF7YEIHEpvkM70ZfU32l8VYS%2F6mtT2gkci4tdB8kIzALUS29mjiL3PBhuFZWPmJu1SZqvwgVoBSmiaG0q8AfWNB2OZ5XtT%2Fyt56HjCw9WOKRKxMDT%2BhqEm%2FHt72e8ebO2IwR56BEoLUPPBZ9wT8YR1qpTq6J3qkWMPZiQNYOnlsu0BDGT58Q13t%7Ctkp%3ABk9SR7iu-f-KYQ"
},
# ...
]
I am trying to scrape multiple pages, but the following code scrapes only one page. How can I scrape the other pages?
import requests
from bs4 import BeautifulSoup
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={}".format(page * 10)')
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('li', attrs={'class': 's-item'})
for pro in tags:
title=pro.find('h3',class_='s-item__title').text.encode("utf-8")
price=pro.find('div',class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
print(title,price)
I think you should move the for pro in tags
loop into the for page in range(0, 10)
loop.
Few problems: 1) you need the for loop within your loop of pages. and 2) your url syntax is incorrect.
Give this a try:
import requests
from bs4 import BeautifulSoup
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page))
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('li', attrs={'class': 's-item'})
for pro in tags:
try:
title=pro.find('h3',class_='s-item__title').text.encode("utf-8")
price=pro.find('div',class_='s-item__detail s-item__detail--primary').text.encode("utf-8")
except:
pass
print(title,price)
Now using correct locator, I’m getting working output:
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
Title = []
p = []
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
for page in range(0, 10):
r =requests.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=sneakers&_sacat=0&_pgn={page}'.format(page=page), headers = headers)
soup=BeautifulSoup(r.content, 'lxml')
tags= soup.find_all('div', attrs={'class': 's-item__info clearfix'})
for pro in tags:
title=pro.find('h3',class_='s-item__title').text
Title.append(title)
price =pro.find('div',class_='s-item__detail s-item__detail--primary').text
p.append(price)
#print(title,price)
df = pd.DataFrame(
{"Title": Title, "Price": p}
)
print(df)
Output:
Title Price
0 7S0ponso rPA Eed-1 UJ 0F -1-1
1 Converse CHUCK TAYLOR All Star Low Top Unisex ... $38.95 to $64.95
2 Air Jordan 1 Mid University Gold White Black Y... $130.00
3 Air Jordan 1 Mid Metallic Red Gym Red Black Wh... $149.95
4 Nike Air Force 1 Low Triple White ‘07 BRAND NE... $99.99
.. ... ...
465 Salomon S-lab XT 6 Soft Ground Size Men 11 US $50.00
466 Nike Dunk Low Light Bone Tropical Twist (GS) -... $170.99
467 Nike Air Force 1 '07 Shoes Black Men's Multi S... $100.00
468 12 Mens 13.5 W Reebok Classic Harman Run S Pri... $47.99
469 NIKE SHOX ZOOM AIR Mid High Basketball Shoes ... $50.00
[470 rows x 2 columns]
To extract results from all pages and not just one or ten pages (i.e hardcoded), the best solution is to use an infinite while
loop and check for something (button, element) that will trigger to exit it.
This solution is better than a hardcoded for
loop since the while
loop will run until a certain condition is fulfilled. In our case, this is the presence of a button on the page (.pagination__next
selector).
Checking if the button exists and moving to the next page:
if soup.select_one(".pagination__next"):
params['_pgn'] += 1
else:
break
Check code with pagination in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
params = {
'_nkw': 'sneakerss', # search query
'_pgn': 1 # page number
}
data = []
while True:
page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
soup = BeautifulSoup(page.text, 'lxml')
print(f"Extracting page: {params['_pgn']}")
print("-" * 10)
for products in soup.select(".s-item__info"):
title = products.select_one(".s-item__title span").text
price = products.select_one(".s-item__price").text
link = products.select_one(".s-item__link")["href"]
data.append({
"title" : title,
"price" : price,
"link" : link
})
if soup.select_one(".pagination__next"):
params['_pgn'] += 1
else:
break
print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
Extracting page: 1
----------
[
{
"title": "Men's shoes DATE 9 (EU 42) sneakers white leather black suede BG148-42",
"price": "$166.90",
"link": "https://www.ebay.com/itm/275031117789?hash=item400921cfdd:g:kMsAAOSwXzRhlM3v&amdata=enc%3AAQAHAAAAwNBs47deR8e8PTTlQkvbHokYv%2BebkVKKF1o1v6tMxJ4Yl7GODAYbaWBlBGoY6%2BAXAy1Ay0PP1919S%2FPb9rV4%2FWmGwoiqfd5tR4TcahgUlULiaAR8b1QWZz%2BJle3LwwPh7ZCyrAEeQZu6Z1L6%2FHufDGJTiEV6MEa1aYIT7ErUEY8wdGo2L0D7qHhKtdFqx1piOcA5LtYHLxPBRDqFXmjXvVeq2S2c2Xxvu1JJrB6r%2BEQtuwM3m%2FxgoljOQy48lA26iA%3D%3D%7Ctkp%3ABlBMUPK98f6KYQ"
},
{
"title": "Fashion Running Shoes Men's Walking Athletic Sneakers Tennis Sports Lightweight",
"price": "$21.99 to $25.99",
"link": "https://www.ebay.com/itm/374241964856?hash=item57228f3338:g:lVQAAOSwEDdjFgDl&amdata=enc%3AAQAHAAAA4FeR8IbHqB04y4afcJaRYMr%2BNXDhrPrQn%2FtBex%2F76%2Be21v2Q22s89LTKxXzzDChI6PM7kF3HiAjPbP1SVazcZTYhDOA69eyoTeyoKlhQ0g8UF8%2B%2F4Ouvc4kmZ2LIodFQnpmxTXmN6DMby3o60gZOMBGoxyo5AvvjGS9NHmMO0TYn6vHH3RX4ExwuLawJc3Yjx%2BTbyC%2B%2BhMR8ly72gvd2fLu%2F8nVbs8WNqIQj%2BBE0lM07DHLg17o6Ugcy6STNoPWaoU54BaMOYhixu0JkuujfE6h8lgy5%2FLMi1USi4sDPyeP7%7Ctkp%3ABFBM8r3x_oph"
},
{
"title": "adidas Ultraboost 22 Shoes Men's",
"price": "$133.00 to $190.00",
"link": "https://www.ebay.com/itm/155163747912?hash=item24207ae648:g:WB0AAOSw2qJjRdhe&amdata=enc%3AAQAHAAAA4Df80B3%2F9%2Bmu1RODyiDdIg14%2B4fbOWSpAXrxoWnz1Uy6mBcGbSuKwlMzISqXwT39a97yuUVxw3Mnl3IgZ6XmZJVAjPPyb40BYs%2F7lBJdBQjRQpwraxSPIK5LY7%2BMLA3krQ29xgfeA8bYFKL4uqmpkgmhNqjhJoCSlB%2B0yQ8w70STypE9D4t%2FYVMznZ%2BpXFILLYwxu%2Bqw46oudcEwT18dtIm0UvueKugPcIjNl%2B7xlPr7ArjO6uMyAnBiG7HLHGukqaJLlP96%2B01brePidilLfNJ0STEvSjECV7txDIG3TXK%2F%7Ctkp%3ABFBM8r3x_oph"
},
# ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It’s a paid API with a free plan that handles blocks and parsing on their backend.
Example code with pagination:
from serpapi import EbaySearch
from urllib.parse import (parse_qsl, urlsplit)
import os, json
params = {
"api_key": os.getenv("API_KEY"), # serpapi api key
"engine": "ebay", # search engine
"ebay_domain": "ebay.com", # ebay domain
"_nkw": "sneakerss", # search query
# "LH_Sold": "1" # shows sold items
}
search = EbaySearch(params) # where data extraction happens
page_num = 0
data = []
while True:
results = search.get_dict() # JSON -> Python dict
if "error" in results:
print(results["error"])
break
for organic_result in results.get("organic_results", []):
link = organic_result.get("link")
price = organic_result.get("price")
data.append({
"price" : price,
"link" : link
})
page_num += 1
print(page_num)
next_page_query_dict = dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query))
current_page = results["serpapi_pagination"]["current"] # 1,2,3...
# looks for the next page data (_pgn):
if "next" in results.get("pagination", {}):
# if current_page = 20 and next_page_query_dict["_pgn"] = 20: break
if int(current_page) == int(next_page_query_dict["_pgn"]):
break
# update next page data
search.params_dict.update(next_page_query_dict)
else:
break
print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$50.00",
"extracted": 50.0
},
"link": "https://www.ebay.com/itm/155159964475?hash=item2420412b3b:g:sc4AAOSwBj1jOcJe&amdata=enc%3AAQAHAAAAoGu%2FVS9Jqy79HSnvhDrxGfI2MAvPyAO%2BhHiSviy2A%2Bgn%2BYCi8IkFzN2dHBPlVlUgTFe6qGbssF1%2B83svDr2H4K5SnbR79MOdCho1ttV3VXkcgq%2FH0xNnOrr%2Bx5dkpF0jqzJU38Io93wdRt%2B4LZkkQhkXLg0HA%2F%2B%2BRCT9D%2Boyorl0lQBMjpXdDoWsbkcV9eroKOnmMwvQILNGme61V6zFb2Q%3D%7Ctkp%3ABk9SR7iu-f-KYQ"
},
{
"price": {
"raw": "$168.67",
"extracted": 168.67
},
"link": "https://www.ebay.com/itm/125289511527?var=426318775584&epid=5053638274&hash=item1d2bd63a67:g:yUIAAOSwE0libYFy&amdata=enc%3AAQAHAAAA4F5dnQv6eqwesf4aXjp2fToBBYgqeE8vREnDFNgHPi%2FKvVwgofgKdEoBtIKP1pM5SoyMa63An1rIMvAsn0iA7MOt%2Bcp8StGY%2BKXfWHbM0T1x287nGBdZZbF7YEIHEpvkM70ZfU32l8VYS%2F6mtT2gkci4tdB8kIzALUS29mjiL3PBhuFZWPmJu1SZqvwgVoBSmiaG0q8AfWNB2OZ5XtT%2Fyt56HjCw9WOKRKxMDT%2BhqEm%2FHt72e8ebO2IwR56BEoLUPPBZ9wT8YR1qpTq6J3qkWMPZiQNYOnlsu0BDGT58Q13t%7Ctkp%3ABk9SR7iu-f-KYQ"
},
# ...
]