Download all pdf files from a website using Python
Question:
I have followed several online guides in an attempt to build a script that can identify and download all pdfs from a website to save me from doing it manually. Here is my code so far:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# NOTE(review): the indentation of the loop bodies below was lost when this
# snippet was pasted; as shown it would raise an IndentationError.
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
# The dot in '.pdf' is an unescaped regex metacharacter; r'\.pdf' was probably
# intended, though the loose pattern still matches the pdf links here.
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
# BUG: some hrefs on this page are already absolute URLs; prepending the base
# unconditionally yields "http://...courses/http://..." -- the 404 in the
# traceback below.
url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)
# download the pdfs to a specified location
for url in url_list:
print(url)
# BUG: stripping ".pdf" saves the file with no extension; also 'E:webscraping'
# presumably lost a backslash ('E:\webscraping') in formatting -- TODO confirm.
fullfilename = os.path.join('E:webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
print(fullfilename)
request.urlretrieve(url, fullfilename)
The code can appear to find all the pdfs (uncomment the print(url_list)
to see this). However, it fails at the download stage. In particular I get this error and I am not able to understand what’s gone wrong:
E:webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:webscrapinghttp://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
File "get_pdfs.py", line 26, in <module>
request.urlretrieve(url, fullfilename)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 532, in open
response = meth(req, response)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 570, in error
return self._call_chain(*args)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 504, in _call_chain
result = func(*args)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Can somebody help me please?
Answers:
A couple of the links already contained the server address, which caused the 404 Not Found error. Also, you should not remove the .pdf extension
from the filename, as the file would otherwise be saved without an extension.
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# Connect to the course page and collect every <a> whose href matches ".pdf".
# (The dot is an unescaped regex metacharacter; the loose pattern still finds
# the pdf links on this page.)
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))

# Build absolute URLs: some hrefs are already absolute, the rest are relative
# to the courses/ directory. Prepending the base unconditionally was the
# cause of the original 404s.
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])
print(url_list)

# Download each pdf, keeping the ".pdf" extension in the saved filename.
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
Check out the following implementation. I’ve used requests
module instead of urllib
to do the download. Moreover, I’ve used .select()
method instead of .find_all()
to avoid using re
.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"

# If there is no such folder, the script will create one automatically.
# makedirs(exist_ok=True) avoids the race between the exists() check and
# mkdir(), and also handles nested paths.
folder_location = r'E:webscraping'
os.makedirs(folder_location, exist_ok=True)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# CSS selector: every anchor whose href *ends with* ".pdf" -- no regex needed.
for link in soup.select("a[href$='.pdf']"):
    # Name each pdf after the last path segment of its link, which is
    # unique on this page.
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        # urljoin resolves relative hrefs against the page URL.
        f.write(requests.get(urljoin(url, link['href'])).content)
Generally, the answers above should work. However, you should evaluate the html source of the webpage you’re trying to work with. For example, some might have the og_url property in the meta tag while others may not have it. This is possible if you’re working with a secure website (let’s say your university’s course web-page). In this case, you will have to extract the pdf links differently.
You can find a good explanation and solution here:
I wrote a new script based on @SIM's answer, with an additional argparse
command-line interface. My full code is as follows:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% Functions
def all_pdf_download(args):
    """Download every .pdf linked from args.link into the chosen folder.

    Folder choice: the current working directory if --save-here was given,
    otherwise args.folder_path (created if it does not exist).
    """
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        # FIX: create the *selected* folder; checking/creating
        # args.folder_path even when saving to the cwd was redundant.
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    # Some sites reject the default library User-Agent, so send a custom one.
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    # Every anchor whose href ends with ".pdf".
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        # Name the pdf files using the last portion of each link,
        # which are unique in this case.
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, filename)
        if args.print_all:
            print("[{}/{}] {}".format(counter + 1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            # urljoin resolves relative hrefs against the page URL.
            f.write(requests.get(urljoin(base_url, link['href'])).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test argparse')
    # -l/--link : page to scan for pdf links (required)
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='write down site name')
    # --print-all : print every filename as it is downloaded.
    # NOTE(review): set_defaults(print_all=True) makes this flag a no-op --
    # printing is always on; presumably intentional, but worth confirming.
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print all filename")
    parser.set_defaults(print_all=True)
    # --save-here : save files into the current working directory
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files here")
    parser.set_defaults(save_here=False)
    # -f/--folder_path : target folder
    # (default: Downloads/ in the user's home directory)
    parser.add_argument('-f', '--folder_path',
                        default=os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    args = parser.parse_args()
    all_pdf_download(args)
For more details and updates, you can refer to my gist: hibetterheyj/all_pdf_dl.py
Best!
Variations to @SIM’s answer for my needs:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# Connect to the Stanford slides page and collect the pdf links.
url = "http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=Compilers&doc=docs/slides.html"
pdfPath = "http://openclassroom.stanford.edu/MainFolder/courses/Compilers/docs/"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))

# Build absolute URLs; relative hrefs are resolved against pdfPath.
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append(pdfPath + el['href'])
# FIX: the "\n" escapes below were mangled to a bare "n" in the paste.
print(f'url_list: {url_list}\n')

# Download the pdfs to a specified location, creating it first --
# urlretrieve fails if the target directory does not exist.
os.makedirs(r'standfordPdfs', exist_ok=True)
for url in url_list:
    print(f'urL: {url}\n')
    fullfilename = os.path.join(r'standfordPdfs', url.replace(pdfPath, ""))
    print(f'fullfilename: {fullfilename}')
    request.urlretrieve(url, fullfilename)
I have followed several online guides in an attempt to build a script that can identify and download all pdfs from a website to save me from doing it manually. Here is my code so far:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# NOTE(review): the indentation of the loop bodies below was lost when this
# snippet was pasted; as shown it would raise an IndentationError.
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
# The dot in '.pdf' is an unescaped regex metacharacter; r'\.pdf' was probably
# intended, though the loose pattern still matches the pdf links here.
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
# BUG: some hrefs on this page are already absolute URLs; prepending the base
# unconditionally yields "http://...courses/http://..." -- the 404 in the
# traceback below.
url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)
# download the pdfs to a specified location
for url in url_list:
print(url)
# BUG: stripping ".pdf" saves the file with no extension; also 'E:webscraping'
# presumably lost a backslash ('E:\webscraping') in formatting -- TODO confirm.
fullfilename = os.path.join('E:webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
print(fullfilename)
request.urlretrieve(url, fullfilename)
The code can appear to find all the pdfs (uncomment the print(url_list)
to see this). However, it fails at the download stage. In particular I get this error and I am not able to understand what’s gone wrong:
E:webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:webscrapinghttp://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
File "get_pdfs.py", line 26, in <module>
request.urlretrieve(url, fullfilename)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 532, in open
response = meth(req, response)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 570, in error
return self._call_chain(*args)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 504, in _call_chain
result = func(*args)
File "C:UsersUserAnaconda3envssnakeliburllibrequest.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Can somebody help me please?
A couple of the links already contained the server address, which caused the 404 Not Found error. Also, you should not remove the .pdf extension
from the filename, as the file would otherwise be saved without an extension.
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# Connect to the course page and collect every <a> whose href matches ".pdf".
# (The dot is an unescaped regex metacharacter; the loose pattern still finds
# the pdf links on this page.)
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))

# Build absolute URLs: some hrefs are already absolute, the rest are relative
# to the courses/ directory. Prepending the base unconditionally was the
# cause of the original 404s.
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])
print(url_list)

# Download each pdf, keeping the ".pdf" extension in the saved filename.
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
Check out the following implementation. I’ve used requests
module instead of urllib
to do the download. Moreover, I’ve used .select()
method instead of .find_all()
to avoid using re
.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"

# If there is no such folder, the script will create one automatically.
# makedirs(exist_ok=True) avoids the race between the exists() check and
# mkdir(), and also handles nested paths.
folder_location = r'E:webscraping'
os.makedirs(folder_location, exist_ok=True)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# CSS selector: every anchor whose href *ends with* ".pdf" -- no regex needed.
for link in soup.select("a[href$='.pdf']"):
    # Name each pdf after the last path segment of its link, which is
    # unique on this page.
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        # urljoin resolves relative hrefs against the page URL.
        f.write(requests.get(urljoin(url, link['href'])).content)
Generally, the answers above should work. However, you should evaluate the html source of the webpage you’re trying to work with. For example, some might have the og_url property in the meta tag while others may not have it. This is possible if you’re working with a secure website (let’s say your university’s course web-page). In this case, you will have to extract the pdf links differently.
You can find a good explanation and solution here:
I wrote a new script based on @SIM's answer, with an additional argparse
command-line interface. My full code is as follows:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% Functions
def all_pdf_download(args):
    """Download every .pdf linked from args.link into the chosen folder.

    Folder choice: the current working directory if --save-here was given,
    otherwise args.folder_path (created if it does not exist).
    """
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        # FIX: create the *selected* folder; checking/creating
        # args.folder_path even when saving to the cwd was redundant.
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    # Some sites reject the default library User-Agent, so send a custom one.
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    # Every anchor whose href ends with ".pdf".
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        # Name the pdf files using the last portion of each link,
        # which are unique in this case.
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, filename)
        if args.print_all:
            print("[{}/{}] {}".format(counter + 1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            # urljoin resolves relative hrefs against the page URL.
            f.write(requests.get(urljoin(base_url, link['href'])).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test argparse')
    # -l/--link : page to scan for pdf links (required)
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='write down site name')
    # --print-all : print every filename as it is downloaded.
    # NOTE(review): set_defaults(print_all=True) makes this flag a no-op --
    # printing is always on; presumably intentional, but worth confirming.
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print all filename")
    parser.set_defaults(print_all=True)
    # --save-here : save files into the current working directory
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files here")
    parser.set_defaults(save_here=False)
    # -f/--folder_path : target folder
    # (default: Downloads/ in the user's home directory)
    parser.add_argument('-f', '--folder_path',
                        default=os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    args = parser.parse_args()
    all_pdf_download(args)
For more details and updates, you can refer to my gist: hibetterheyj/all_pdf_dl.py
Best!
Variations to @SIM’s answer for my needs:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# Connect to the Stanford slides page and collect the pdf links.
url = "http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=Compilers&doc=docs/slides.html"
pdfPath = "http://openclassroom.stanford.edu/MainFolder/courses/Compilers/docs/"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))

# Build absolute URLs; relative hrefs are resolved against pdfPath.
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append(pdfPath + el['href'])
# FIX: the "\n" escapes below were mangled to a bare "n" in the paste.
print(f'url_list: {url_list}\n')

# Download the pdfs to a specified location, creating it first --
# urlretrieve fails if the target directory does not exist.
os.makedirs(r'standfordPdfs', exist_ok=True)
for url in url_list:
    print(f'urL: {url}\n')
    fullfilename = os.path.join(r'standfordPdfs', url.replace(pdfPath, ""))
    print(f'fullfilename: {fullfilename}')
    request.urlretrieve(url, fullfilename)