Download large file in python with requests
Question:
Requests is a really nice library. I’d like to use it for downloading big files (>1GB).
The problem is it’s not possible to keep whole file in memory; I need to read it in chunks. And this is a problem with the following code:
import requests
def DownloadFile(url)
local_filename = url.split('/')[-1]
r = requests.get(url)
f = open(local_filename, 'wb')
for chunk in r.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.close()
return
For some reason it doesn’t work this way; it still loads the response into memory before it is saved to a file.
Answers:
Your chunk size could be too large, have you tried dropping that – maybe 1024 bytes at a time? (also, you could use with
to tidy up the syntax)
def DownloadFile(url):
local_filename = url.split('/')[-1]
r = requests.get(url)
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
return
Incidentally, how are you deducing that the response has been loaded into memory?
It sounds as if python isn’t flushing the data to file, from other SO questions you could try f.flush()
and os.fsync()
to force the file write and free memory;
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.flush()
os.fsync(f.fileno())
With the following streaming code, the Python memory usage is restricted regardless of the size of the downloaded file:
def download_file(url):
local_filename = url.split('/')[-1]
# NOTE the stream=True parameter below
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
# If you have chunk encoded response uncomment if
# and set chunk_size parameter to None.
#if chunk:
f.write(chunk)
return local_filename
Note that the number of bytes returned using iter_content
is not exactly the chunk_size
; it’s expected to be a random number that is often far bigger, and is expected to be different in every iteration.
See body-content-workflow and Response.iter_content for further reference.
It’s much easier if you use Response.raw
and shutil.copyfileobj()
:
import requests
import shutil
def download_file(url):
local_filename = url.split('/')[-1]
with requests.get(url, stream=True) as r:
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
This streams the file to disk without using excessive memory, and the code is simple.
Note: According to the documentation, Response.raw
will not decode gzip
and deflate
transfer-encodings, so you will need to do this manually.
Not exactly what OP was asking, but… it’s ridiculously easy to do that with urllib
:
from urllib.request import urlretrieve
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
dst = 'ubuntu-16.04.2-desktop-amd64.iso'
urlretrieve(url, dst)
Or this way, if you want to save it to a temporary file:
from urllib.request import urlopen
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
with urlopen(url) as fsrc, NamedTemporaryFile(delete=False) as fdst:
copyfileobj(fsrc, fdst)
I watched the process:
watch 'ps -p 18647 -o pid,ppid,pmem,rsz,vsz,comm,args; ls -al *.iso'
And I saw the file growing, but memory usage stayed at 17 MB. Am I missing something?
Based on the Roman’s most upvoted comment above, here is my implementation,
Including "download as" and "retries" mechanism:
def download(url: str, file_path='', attempts=2):
"""Downloads a URL content into a file (with large file support by streaming)
:param url: URL to download
:param file_path: Local file name to contain the data downloaded
:param attempts: Number of attempts
:return: New file path. Empty string if the download failed
"""
if not file_path:
file_path = os.path.realpath(os.path.basename(url))
logger.info(f'Downloading {url} content to {file_path}')
url_sections = urlparse(url)
if not url_sections.scheme:
logger.debug('The given url is missing a scheme. Adding http scheme')
url = f'http://{url}'
logger.debug(f'New url: {url}')
for attempt in range(1, attempts+1):
try:
if attempt > 1:
time.sleep(10) # 10 seconds wait time between downloads
with requests.get(url, stream=True) as response:
response.raise_for_status()
with open(file_path, 'wb') as out_file:
for chunk in response.iter_content(chunk_size=1024*1024): # 1MB chunks
out_file.write(chunk)
logger.info('Download finished successfully')
return file_path
except Exception as ex:
logger.error(f'Attempt #{attempt} failed with error: {ex}')
return ''
use wget
module of python instead. Here is a snippet
import wget
wget.download(url)
requests
is good, but how about socket
solution?
def stream_(host):
import socket
import ssl
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
context = ssl.create_default_context(Purpose.CLIENT_AUTH)
with context.wrap_socket(sock, server_hostname=host) as wrapped_socket:
wrapped_socket.connect((socket.gethostbyname(host), 443))
wrapped_socket.send(
"GET / HTTP/1.1rnHost:thiscatdoesnotexist.comrnAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9rnrn".encode())
resp = b""
while resp[-4:-1] != b"rnr":
resp += wrapped_socket.recv(1)
else:
resp = resp.decode()
content_length = int("".join([tag.split(" ")[1] for tag in resp.split("rn") if "content-length" in tag.lower()]))
image = b""
while content_length > 0:
data = wrapped_socket.recv(2048)
if not data:
print("EOF")
break
image += data
content_length -= len(data)
with open("image.jpeg", "wb") as file:
file.write(image)
Here is additional approach for the use-case of async chunked download, without reading all the file content to memory.
It means that both read from the URL and the write to file are implemented with asyncio
libraries (aiohttp
to read from the URL and aiofiles
to write the file).
The following code should work on Python 3.7
and later.
Just edit SRC_URL
and DEST_FILE
variables before copy and paste.
import aiofiles
import aiohttp
import asyncio
async def async_http_download(src_url, dest_file, chunk_size=65536):
async with aiofiles.open(dest_file, 'wb') as fd:
async with aiohttp.ClientSession() as session:
async with session.get(src_url) as resp:
async for chunk in resp.content.iter_chunked(chunk_size):
await fd.write(chunk)
SRC_URL = "/path/to/url"
DEST_FILE = "/path/to/file/on/local/machine"
asyncio.run(async_http_download(SRC_URL, DEST_FILE))
Requests is a really nice library. I’d like to use it for downloading big files (>1GB).
The problem is it’s not possible to keep whole file in memory; I need to read it in chunks. And this is a problem with the following code:
import requests
def DownloadFile(url)
local_filename = url.split('/')[-1]
r = requests.get(url)
f = open(local_filename, 'wb')
for chunk in r.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.close()
return
For some reason it doesn’t work this way; it still loads the response into memory before it is saved to a file.
Your chunk size could be too large, have you tried dropping that – maybe 1024 bytes at a time? (also, you could use with
to tidy up the syntax)
def DownloadFile(url):
local_filename = url.split('/')[-1]
r = requests.get(url)
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
return
Incidentally, how are you deducing that the response has been loaded into memory?
It sounds as if python isn’t flushing the data to file, from other SO questions you could try f.flush()
and os.fsync()
to force the file write and free memory;
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.flush()
os.fsync(f.fileno())
With the following streaming code, the Python memory usage is restricted regardless of the size of the downloaded file:
def download_file(url):
local_filename = url.split('/')[-1]
# NOTE the stream=True parameter below
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
# If you have chunk encoded response uncomment if
# and set chunk_size parameter to None.
#if chunk:
f.write(chunk)
return local_filename
Note that the number of bytes returned using iter_content
is not exactly the chunk_size
; it’s expected to be a random number that is often far bigger, and is expected to be different in every iteration.
See body-content-workflow and Response.iter_content for further reference.
It’s much easier if you use Response.raw
and shutil.copyfileobj()
:
import requests
import shutil
def download_file(url):
local_filename = url.split('/')[-1]
with requests.get(url, stream=True) as r:
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
This streams the file to disk without using excessive memory, and the code is simple.
Note: According to the documentation, Response.raw
will not decode gzip
and deflate
transfer-encodings, so you will need to do this manually.
Not exactly what OP was asking, but… it’s ridiculously easy to do that with urllib
:
from urllib.request import urlretrieve
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
dst = 'ubuntu-16.04.2-desktop-amd64.iso'
urlretrieve(url, dst)
Or this way, if you want to save it to a temporary file:
from urllib.request import urlopen
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
url = 'http://mirror.pnl.gov/releases/16.04.2/ubuntu-16.04.2-desktop-amd64.iso'
with urlopen(url) as fsrc, NamedTemporaryFile(delete=False) as fdst:
copyfileobj(fsrc, fdst)
I watched the process:
watch 'ps -p 18647 -o pid,ppid,pmem,rsz,vsz,comm,args; ls -al *.iso'
And I saw the file growing, but memory usage stayed at 17 MB. Am I missing something?
Based on the Roman’s most upvoted comment above, here is my implementation,
Including "download as" and "retries" mechanism:
def download(url: str, file_path='', attempts=2):
"""Downloads a URL content into a file (with large file support by streaming)
:param url: URL to download
:param file_path: Local file name to contain the data downloaded
:param attempts: Number of attempts
:return: New file path. Empty string if the download failed
"""
if not file_path:
file_path = os.path.realpath(os.path.basename(url))
logger.info(f'Downloading {url} content to {file_path}')
url_sections = urlparse(url)
if not url_sections.scheme:
logger.debug('The given url is missing a scheme. Adding http scheme')
url = f'http://{url}'
logger.debug(f'New url: {url}')
for attempt in range(1, attempts+1):
try:
if attempt > 1:
time.sleep(10) # 10 seconds wait time between downloads
with requests.get(url, stream=True) as response:
response.raise_for_status()
with open(file_path, 'wb') as out_file:
for chunk in response.iter_content(chunk_size=1024*1024): # 1MB chunks
out_file.write(chunk)
logger.info('Download finished successfully')
return file_path
except Exception as ex:
logger.error(f'Attempt #{attempt} failed with error: {ex}')
return ''
use wget
module of python instead. Here is a snippet
import wget
wget.download(url)
requests
is good, but how about socket
solution?
def stream_(host):
import socket
import ssl
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
context = ssl.create_default_context(Purpose.CLIENT_AUTH)
with context.wrap_socket(sock, server_hostname=host) as wrapped_socket:
wrapped_socket.connect((socket.gethostbyname(host), 443))
wrapped_socket.send(
"GET / HTTP/1.1rnHost:thiscatdoesnotexist.comrnAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9rnrn".encode())
resp = b""
while resp[-4:-1] != b"rnr":
resp += wrapped_socket.recv(1)
else:
resp = resp.decode()
content_length = int("".join([tag.split(" ")[1] for tag in resp.split("rn") if "content-length" in tag.lower()]))
image = b""
while content_length > 0:
data = wrapped_socket.recv(2048)
if not data:
print("EOF")
break
image += data
content_length -= len(data)
with open("image.jpeg", "wb") as file:
file.write(image)
Here is additional approach for the use-case of async chunked download, without reading all the file content to memory.
It means that both read from the URL and the write to file are implemented with asyncio
libraries (aiohttp
to read from the URL and aiofiles
to write the file).
The following code should work on Python 3.7
and later.
Just edit SRC_URL
and DEST_FILE
variables before copy and paste.
import aiofiles
import aiohttp
import asyncio
async def async_http_download(src_url, dest_file, chunk_size=65536):
async with aiofiles.open(dest_file, 'wb') as fd:
async with aiohttp.ClientSession() as session:
async with session.get(src_url) as resp:
async for chunk in resp.content.iter_chunked(chunk_size):
await fd.write(chunk)
SRC_URL = "/path/to/url"
DEST_FILE = "/path/to/file/on/local/machine"
asyncio.run(async_http_download(SRC_URL, DEST_FILE))