Downloading a large file in parts using multiple parallel threads


I have a use case where a large remote file needs to be downloaded in parts, using multiple threads.
Each thread must run simultaneously (in parallel), grabbing a specific part of the file.
The expectation is to combine the parts into a single (original) file once all parts have been successfully downloaded.

Perhaps using the requests library could do the job, but then I am not sure how I would multithread this into a solution that combines the chunks together.

from requests import get

url = ''
headers = {"Range": "bytes=0-1000000"}  # first megabyte
r = get(url, headers=headers)

I was also thinking of using curl, with Python orchestrating the downloads, but I am not sure that's the correct way to go. It just seems too complex and strays away from a vanilla Python solution. Something like this:

curl --range 200000000-399999999 -o file.iso.part2
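
If I had to drive that from Python, I imagine it would look roughly like the sketch below (untested; the URL is left blank and the byte ranges are made up for illustration):

import subprocess

url = ''  # the remote file
ranges = ['0-199999999', '200000000-399999999']  # made-up byte ranges

# start one curl process per byte range, all running in parallel
procs = [
    subprocess.Popen(['curl', '--range', r, '-o', f'file.iso.part{i}', url])
    for i, r in enumerate(ranges)
]
for p in procs:
    p.wait()

# stitch the parts back together in order
with open('file.iso', 'wb') as out:
    for i in range(len(ranges)):
        with open(f'file.iso.part{i}', 'rb') as part:
            out.write(part.read())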

Can someone explain how you’d go about something like this? Or post a code example of something that works in Python 3? I usually find the Python-related answers quite easily, but the solution to this problem seems to be eluding me.

Asked By: jjj



You could use grequests to download in parallel.

import grequests

URL = ''
CHUNK_SIZE = 104857600  # 100 MB

HEADERS = []
_start, _stop = 0, 0
for x in range(4):  # file size is > 300 MB, so we download in 4 parts
    _start = _stop
    _stop = CHUNK_SIZE * (x + 1)
    # Range ends are inclusive, so stop one byte early to avoid overlap
    HEADERS.append({"Range": "bytes=%s-%s" % (_start, _stop - 1)})

rs = (grequests.get(URL, headers=h) for h in HEADERS)
downloads = grequests.map(rs)  # fire all requests in parallel

with open('/tmp/debian-10.1.0-amd64-netinst.iso', 'ab') as f:
    for download in downloads:
        f.write(download.content)  # responses come back in request order

PS: I did not check if the ranges are correctly determined and if the downloaded md5sum matches! This should just show in general how it could work.
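
If you want to verify the result afterwards, something along these lines should work (just a sketch; EXPECTED_MD5 is a placeholder for the checksum published alongside the ISO):

import hashlib

EXPECTED_MD5 = ''  # placeholder: take this from the mirror's checksum file

md5 = hashlib.md5()
with open('/tmp/debian-10.1.0-amd64-netinst.iso', 'rb') as f:
    # hash in 1 MB blocks to avoid loading the whole ISO into memory
    for block in iter(lambda: f.read(1048576), b''):
        md5.update(block)

print('OK' if md5.hexdigest() == EXPECTED_MD5 else 'checksum mismatch')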

Answered By: Maurice Meyer

Here is a version using Python 3 with asyncio. It is just an example and can be improved, but you should be able to get everything you need.

  • get_size: Send a HEAD request to get the size of the file
  • download_range: Download a single chunk
  • download: Download all the chunks and merge them

import asyncio
import concurrent.futures
import functools
import requests
import os

URL = ''
OUTPUT = 'video.mp4'

async def get_size(url):
    response = requests.head(url)
    size = int(response.headers['Content-Length'])
    return size

def download_range(url, start, end, output):
    headers = {'Range': f'bytes={start}-{end}'}
    response = requests.get(url, headers=headers)

    with open(output, 'wb') as f:
        for part in response.iter_content(1024):
            f.write(part)

async def download(run, loop, url, output, chunk_size=1000000):
    file_size = await get_size(url)
    chunks = range(0, file_size, chunk_size)

    tasks = [
        run(
            download_range,
            url,
            start,
            start + chunk_size - 1,
            f'{output}.part{i}',
        )
        for i, start in enumerate(chunks)
    ]

    await asyncio.wait(tasks)

    with open(output, 'wb') as o:
        for i in range(len(chunks)):
            chunk_path = f'{output}.part{i}'

            with open(chunk_path, 'rb') as s:
                o.write(s.read())

            os.remove(chunk_path)

if __name__ == '__main__':
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
    loop = asyncio.new_event_loop()
    run = functools.partial(loop.run_in_executor, executor)

    asyncio.set_event_loop(loop)

    loop.run_until_complete(
        download(run, loop, URL, OUTPUT)
    )
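
On Python 3.7+ the event loop setup at the bottom can also be written more compactly with asyncio.run; a minimal sketch, reusing download, URL and OUTPUT from the snippet above:

import asyncio
import concurrent.futures
import functools

async def main():
    loop = asyncio.get_running_loop()
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
    run = functools.partial(loop.run_in_executor, executor)
    await download(run, loop, URL, OUTPUT)

asyncio.run(main())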
Answered By: bug

The best way I found is to use a module called pySmartDL.

Step 1: pip install pySmartDL

Step 2: to download the file, you could use

from pySmartDL import SmartDL
obj = SmartDL(url, destination)
obj.start()

Note: This gives you a download meter by default.

In case you need to hook the download progress to a GUI, you could use

obj = SmartDL(url, dest, progress_bar=False)
obj.start(blocking=False)
while not obj.isFinished():
    download_percentage = round(obj.get_progress()*100, 2)

If you want to use more threads, you can use

obj = SmartDL(url, destination, threads=7)  # by default threads = 5

You can find many more features on the project page.



Project page:

Bugs and Issues:

Answered By: Jishnu

You can also use ThreadPoolExecutor (or ProcessPoolExecutor) from concurrent.futures instead of asyncio. The following shows how to modify bug's answer by using ThreadPoolExecutor:

Bonus: The following snippet also uses tqdm to show a progress bar of the download. If you don't want to use tqdm, just comment out the with tqdm(total=file_size, ...) block below. More information on tqdm is available in its documentation; it can be installed with pip install tqdm. Btw, tqdm can also be used with asyncio.

import requests
import concurrent.futures
from concurrent.futures import as_completed
from tqdm import tqdm
import os

def download_part(url_and_headers_and_partfile):
    url, headers, partfile = url_and_headers_and_partfile
    response = requests.get(url, headers=headers)
    # setting same as below in the main block, but not necessary:
    chunk_size = 1024*1024 

    # Need size to make tqdm work.
    size = 0
    with open(partfile, 'wb') as f:
        for chunk in response.iter_content(chunk_size):
            if chunk:
                size += f.write(chunk)
    return size

def make_headers(start, chunk_size):
    end = start + chunk_size - 1
    return {'Range': f'bytes={start}-{end}'}

url = ''
file_name = 'video.mp4'
response = requests.get(url, stream=True)
file_size = int(response.headers.get('content-length', 0))
chunk_size = 1024*1024

chunks = range(0, file_size, chunk_size)
my_iter = [[url, make_headers(chunk, chunk_size), f'{file_name}.part{i}'] for i, chunk in enumerate(chunks)] 

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    jobs = [executor.submit(download_part, i) for i in my_iter]

    # unit_divisor=1024 so byte counts scale as KiB/MiB
    with tqdm(total=file_size, unit='iB', unit_scale=True, unit_divisor=1024, leave=True, colour='cyan') as bar:
        for job in as_completed(jobs):
            size = job.result()
            bar.update(size)  # advance the bar by the bytes written for that part

with open(file_name, 'wb') as outfile:
    for i in range(len(chunks)):
        chunk_path = f'{file_name}.part{i}'
        with open(chunk_path, 'rb') as s:
            outfile.write(s.read())
        os.remove(chunk_path)

Answered By: Inspired_Blue