How to Use Selenium Webdriver to download files via a list of URLs
Question:
I wrote a code that use Selenium Webdriver to download files via a list of URLs but for some reason it didn’t download anything to my assignedn directory. The code works perfectly fine when I only download it one by one but when I use a for loop, it doesn’t work.
This is an example URL: https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf
Here is my code:
download_dir = '/Users/datawizard/files/'
for web in down_link:
try:
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": '/Users/clinton/GRA_2021/scraping_project/pdf/',
"download.prompt_for_download": False,
"download.directory_upgrade": True,
# "safebrowsing.enabled": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}
command_result = driver.execute("send_command", params)
driver.get(url)
except:
print(str(web)+"Link cannot be open")
I am wondering did I do something wrong with the code since it doesn’t give me any error when I ran the code above.
Answers:
You don’t need Selenium to download files, you can download files easily using the request
library
import requests
for web in down_link:
fileName = YOUR_DOWNLOAD_PATH + web.split("=")[1].split("&")[0] + ".pdf" #I created a filename
r = requests.get(web, stream=True)
with open(fileName, 'wb') as f:
for chunk in r.iter_content():
f.write(chunk)
Updated Answer based on Selenium
#replace the below value with your urls list
down_link = [
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf',
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf']
download_dir = "/Users/datawizard/files/"
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
for web in down_link:
driver.get(web)
time.sleep(5) #wait for the download to end, a better handling it's to check if the file exists
driver.quit()
If your files don’t have a unique file name – the above code will replace the existing file with the downloaded one.
I wrote a code that use Selenium Webdriver to download files via a list of URLs but for some reason it didn’t download anything to my assignedn directory. The code works perfectly fine when I only download it one by one but when I use a for loop, it doesn’t work.
This is an example URL: https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf
Here is my code:
download_dir = '/Users/datawizard/files/'
for web in down_link:
try:
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": '/Users/clinton/GRA_2021/scraping_project/pdf/',
"download.prompt_for_download": False,
"download.directory_upgrade": True,
# "safebrowsing.enabled": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}
command_result = driver.execute("send_command", params)
driver.get(url)
except:
print(str(web)+"Link cannot be open")
I am wondering did I do something wrong with the code since it doesn’t give me any error when I ran the code above.
You don’t need Selenium to download files, you can download files easily using the request
library
import requests
for web in down_link:
fileName = YOUR_DOWNLOAD_PATH + web.split("=")[1].split("&")[0] + ".pdf" #I created a filename
r = requests.get(web, stream=True)
with open(fileName, 'wb') as f:
for chunk in r.iter_content():
f.write(chunk)
Updated Answer based on Selenium
#replace the below value with your urls list
down_link = [
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf',
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf']
download_dir = "/Users/datawizard/files/"
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
for web in down_link:
driver.get(web)
time.sleep(5) #wait for the download to end, a better handling it's to check if the file exists
driver.quit()
If your files don’t have a unique file name – the above code will replace the existing file with the downloaded one.