How to save all links from all pages to csv using python beautiful soup
Question:
Am trying to save all links collected from multiple paginated pages to csv. from print(links) I can see all the links I want to save from multiple pages but unfortunately, when I open csv file, am only finding one URL save. How can I save all URLs I see from the terminal (print(links) to csv?
below is my code:
def scrape_pages(url) -> None:
#max_pages = 10
max_pages = 5 # doing 3 pages for examples sake
current_page = 1
# Loop through all pages dynamically and build the url using the page number suffix the website uses
while current_page <= max_pages:
print(f'{url}page/{current_page}')
# Get each page's html
raw_html1 = requests.get(f'{url}page/{current_page}')
soup1 = BeautifulSoup(raw_html1.text, 'html.parser')
current_page += 1
# Find all table rows and from each table row get the needed data
#root = 'https://wwwxxxxxx.com'
for link1 in soup1.find_all('li',{'class':'xxx'}):
link2 = link1.find('a',href=True)
link3 = 'https://www.xxxxxxx.com'+(link2['href'])
links = []
[links.append(link3) for link2 in link1 ]
for link2 in links:
raw_html = urlopen(link3)
soup = BeautifulSoup(raw_html.read(), 'html.parser')
def getTitle(soup):
return soup.find('h2', class_="xxx").text.strip()
with open('output.csv', 'w', encoding='utf8', newline='') as
f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Title'])
row = [getTitle(soup)]
print(row)
for f_output in row:
csv_output.writerow(row)
# print(product, row, Title)
time.sleep(5) # sleep before scraping next page to not send too
many requests at once
print('nn') # Clearing console up
def main() -> int:
URL = 'https://www.xxxxxx.com/'
scrape_pages(URL)
return 0
if name == ‘main‘:
exit(main())
Answers:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time as t
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
links_list = []
for x in range(1, 3):
r = s.get(f'https://www.myjobmag.com/page/{x}')
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.select_one('ul.job-list').select('li.job-list-li')
for link in links:
try:
title = link.select_one('h2').text.strip()
url = link.select_one('h2').select_one('a').get('href')
r = s.get(f'https://www.myjobmag.com{url}')
soup = BeautifulSoup(r.text, 'html.parser')
key_info = soup.select_one('ul.job-key-info').text.strip()
description = soup.select_one('div.job-details').text.strip()
application_method = soup.select_one('div.mag-b.bm-b-30').text.strip()
links_list.append((title, key_info, description, application_method, url))
print(f'done {title} -- {url}')
t.sleep(5)
except Exception as e:
print(e)
df = pd.DataFrame(links_list, columns = ['title', 'key_info', 'description', 'application_method', 'url'])
df.to_csv('my_wonderful_jobs_list.csv')
This will return a csv file with job title, key info, description, application method, and url.
Am trying to save all links collected from multiple paginated pages to csv. from print(links) I can see all the links I want to save from multiple pages but unfortunately, when I open csv file, am only finding one URL save. How can I save all URLs I see from the terminal (print(links) to csv?
below is my code:
def scrape_pages(url) -> None:
#max_pages = 10
max_pages = 5 # doing 3 pages for examples sake
current_page = 1
# Loop through all pages dynamically and build the url using the page number suffix the website uses
while current_page <= max_pages:
print(f'{url}page/{current_page}')
# Get each page's html
raw_html1 = requests.get(f'{url}page/{current_page}')
soup1 = BeautifulSoup(raw_html1.text, 'html.parser')
current_page += 1
# Find all table rows and from each table row get the needed data
#root = 'https://wwwxxxxxx.com'
for link1 in soup1.find_all('li',{'class':'xxx'}):
link2 = link1.find('a',href=True)
link3 = 'https://www.xxxxxxx.com'+(link2['href'])
links = []
[links.append(link3) for link2 in link1 ]
for link2 in links:
raw_html = urlopen(link3)
soup = BeautifulSoup(raw_html.read(), 'html.parser')
def getTitle(soup):
return soup.find('h2', class_="xxx").text.strip()
with open('output.csv', 'w', encoding='utf8', newline='') as
f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Title'])
row = [getTitle(soup)]
print(row)
for f_output in row:
csv_output.writerow(row)
# print(product, row, Title)
time.sleep(5) # sleep before scraping next page to not send too
many requests at once
print('nn') # Clearing console up
def main() -> int:
URL = 'https://www.xxxxxx.com/'
scrape_pages(URL)
return 0
if name == ‘main‘:
exit(main())
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time as t
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
links_list = []
for x in range(1, 3):
r = s.get(f'https://www.myjobmag.com/page/{x}')
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.select_one('ul.job-list').select('li.job-list-li')
for link in links:
try:
title = link.select_one('h2').text.strip()
url = link.select_one('h2').select_one('a').get('href')
r = s.get(f'https://www.myjobmag.com{url}')
soup = BeautifulSoup(r.text, 'html.parser')
key_info = soup.select_one('ul.job-key-info').text.strip()
description = soup.select_one('div.job-details').text.strip()
application_method = soup.select_one('div.mag-b.bm-b-30').text.strip()
links_list.append((title, key_info, description, application_method, url))
print(f'done {title} -- {url}')
t.sleep(5)
except Exception as e:
print(e)
df = pd.DataFrame(links_list, columns = ['title', 'key_info', 'description', 'application_method', 'url'])
df.to_csv('my_wonderful_jobs_list.csv')
This will return a csv file with job title, key info, description, application method, and url.