Looping through the page numbers with Python BeautifulSoup
Question:
Attempting to update my script so that it searches through not only the url provided but all of the pages in range (1-3) and adds them to the CSV. Can anyone spot why my current code would not be working? The addition to pages following 1 are in the following format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for page in range(1, 4):
req = requests.get(url + 'page-' + str(page), headers=headers)
# print(page)
soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Address', 'Price']
thewriter.writerow(header)
for list in lists:
title = list.find('h2').text
price = list.find('p', class_="pp-property-price").text
info = [title, price]
thewriter.writerow(info)
sleep(randint(2,10))
Answers:
You are overwrite req multiple times and end up only analyzing the results of page 2. Put everything inside your loop.
edit: Also the upper limit in range()
is not included, so you probably want to do for page in range(1, 4):
to get the first three pages.
edit full example:
from bs4 import BeautifulSoup
import requests
from csv import writer
url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Address', 'Price']
thewriter.writerow(header)
for page in range(1, 4):
req = requests.get(f"{url}{page}", headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
for li in soup.find_all('li', class_="pp-property-box"):
title = li.find('h2').text
price = li.find('p', class_="pp-property-price").text
info = [title, price]
thewriter.writerow(info)
The solution from bitflip is fine, however a few things I’ll point out to help you.
- try to avoid variable names that are predefined functions in python. For example
list
being one of those.
- while csv writer is a fine package to use, also consider using pandas. You will likely further down the road need to do some data manipulation and what not, so might as well familiarise yourself with the package now. It’s a very powerful tool.
Here’s how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# Check if csv file exists
file_exists = exists('ballymena.csv')
for page in range(1, 4):
rows = []
req = requests.get(url + 'page-' + str(page), headers=headers)
# print(page)
soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")
for li in lists:
title = li.find('h2').text
price = li.find('p', class_="pp-property-price").text
row = {
'Address':title,
'Price':price}
rows.append(row)
df = pd.DataFrame(rows)
# If file doesnt exists, write initial file
if not file_exists:
df.to_csv('ballymena.csv', index=False)
file_exists = True
# If it already exists, ammend to file
else:
df.to_csv('ballymena.csv', mode = 'a', header = False, index = False)
sleep(randint(2,10))
Attempting to update my script so that it searches through not only the url provided but all of the pages in range (1-3) and adds them to the CSV. Can anyone spot why my current code would not be working? The addition to pages following 1 are in the following format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for page in range(1, 4):
req = requests.get(url + 'page-' + str(page), headers=headers)
# print(page)
soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Address', 'Price']
thewriter.writerow(header)
for list in lists:
title = list.find('h2').text
price = list.find('p', class_="pp-property-price").text
info = [title, price]
thewriter.writerow(info)
sleep(randint(2,10))
You are overwrite req multiple times and end up only analyzing the results of page 2. Put everything inside your loop.
edit: Also the upper limit in range()
is not included, so you probably want to do for page in range(1, 4):
to get the first three pages.
edit full example:
from bs4 import BeautifulSoup
import requests
from csv import writer
url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Address', 'Price']
thewriter.writerow(header)
for page in range(1, 4):
req = requests.get(f"{url}{page}", headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
for li in soup.find_all('li', class_="pp-property-box"):
title = li.find('h2').text
price = li.find('p', class_="pp-property-price").text
info = [title, price]
thewriter.writerow(info)
The solution from bitflip is fine, however a few things I’ll point out to help you.
- try to avoid variable names that are predefined functions in python. For example
list
being one of those. - while csv writer is a fine package to use, also consider using pandas. You will likely further down the road need to do some data manipulation and what not, so might as well familiarise yourself with the package now. It’s a very powerful tool.
Here’s how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# Check if csv file exists
file_exists = exists('ballymena.csv')
for page in range(1, 4):
rows = []
req = requests.get(url + 'page-' + str(page), headers=headers)
# print(page)
soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")
for li in lists:
title = li.find('h2').text
price = li.find('p', class_="pp-property-price").text
row = {
'Address':title,
'Price':price}
rows.append(row)
df = pd.DataFrame(rows)
# If file doesnt exists, write initial file
if not file_exists:
df.to_csv('ballymena.csv', index=False)
file_exists = True
# If it already exists, ammend to file
else:
df.to_csv('ballymena.csv', mode = 'a', header = False, index = False)
sleep(randint(2,10))