Python How to download multiple files in parallel using multiprocessing.pool
Question:
I am trying to download and extract zip files using multiprocessing.Pool.
But every time I execute the script, only 3 zips are downloaded and the remaining files do not appear in the directory (CPU usage also hits 100%). Can someone help me solve this problem or suggest a better approach? Below is the snippet that I have tried. I am completely new to multiprocessing. My goal is to download multiple files in parallel without maxing out the CPU.
import StringIO
import os
import sys
import zipfile
from multiprocessing import Pool, cpu_count
import requests
# Directory containing this script; the archives are unpacked into it.
script_location = os.path.abspath(__file__)
filePath = os.path.dirname(script_location)
print("filePath is %s " % filePath)
sys.path.append(filePath)

# Archives to fetch. Note that movielists_20130821.zip is listed twice.
url = [
    "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/3sources.zip",
]
def download_zips(url):
    """Download the zip archive at *url* and unpack it into ``filePath``.

    The response body is read fully into memory, wrapped in a file-like
    object and opened with ``zipfile.ZipFile``. Nothing is returned;
    progress is reported with ``print``.
    """
    # Last path component of the URL, used only for the progress messages.
    file_name = url.split("/")[-1]
    response = requests.get(url)
    sourceZip = zipfile.ZipFile(StringIO.StringIO(response.content))
    print("\n Downloaded {} ".format(file_name))
    sourceZip.extractall(filePath)
    print("extracted {} \n".format(file_name))
    sourceZip.close()
if __name__ == "__main__":
    # One worker process per CPU core reported by the OS.
    core_count = cpu_count()
    print("There are {} CPUs on this machine ".format(core_count))
    pool = Pool(core_count)
    # map() blocks until every URL in the list has been processed.
    results = pool.map(download_zips, url)
    # No more work will be submitted; wait for the workers to exit.
    pool.close()
    pool.join()
Output below:
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
There are 4 CPUs on this machine
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
Downloaded bbcsport.zip
extracted bbcsport.zip
Downloaded 3sources.zip
extracted 3sources.zip
Downloaded multiview_data_20130124.zip
Downloaded movielists_20130821.zip
Downloaded movielists_20130821.zip
extracted multiview_data_20130124.zip
extracted movielists_20130821.zip
extracted movielists_20130821.zip
Answers:
I’ve made a few minor tweaks to your function and it works fine. Please note that:
- The file ".../movielists_20130821.zip" appears in your list twice, so you’re downloading the same thing twice (maybe a typo?).
- The files ".../multiview_data_20130124.zip", ".../movielists_20130821.zip" and ".../3sources.zip", when extracted, yield a new directory. The file ".../bbcsport.zip", though, when extracted, places its files in the root folder, your current working directory (see image below). Maybe you missed this check?
- I added a try/except block in the download function. Why? Multiprocessing works by creating new (sub)processes to run stuff. If a subprocess throws an exception, the parent process does not catch it. So if any errors occur in this subprocess, they must be logged/handled there.
import sys, os
import zipfile
import requests
from multiprocessing import Pool, cpu_count
from functools import partial
from io import BytesIO
def download_zip(url, filePath):
    """Download the zip archive at *url* and extract it into *filePath*.

    Any failure is caught and printed, so this function never raises and
    always returns ``None``.

    Args:
        url: Address of the zip archive to fetch.
        filePath: Directory the archive members are extracted into.
    """
    # Computed outside the try block so the error message can name the file.
    file_name = url.split("/")[-1]
    try:
        response = requests.get(url)
        # Fail on an HTTP error instead of handing an error page to ZipFile.
        response.raise_for_status()
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(BytesIO(response.content)) as sourceZip:
            print(" Downloaded {} ".format(file_name))
            sourceZip.extractall(filePath)
            print(" extracted {}".format(file_name))
    except Exception as e:
        # Several downloads run at once, so say which one failed.
        print("{} failed: {}".format(file_name, e))
if __name__ == "__main__":
    # Extract next to this script.
    filePath = os.path.dirname(os.path.abspath(__file__))
    print("filePath is %s " % filePath)
    # sys.path.append(filePath) # why do you need this?
    urls = [
        "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
        "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
        "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
        "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
        "http://mlg.ucd.ie/files/datasets/3sources.zip",
    ]
    print("There are {} CPUs on this machine ".format(cpu_count()))
    pool = Pool(cpu_count())
    # Bind the target directory so pool.map only has to supply the URL.
    download_func = partial(download_zip, filePath=filePath)
    try:
        results = pool.map(download_func, urls)
    finally:
        # Always shut the workers down, even if map() raised.
        pool.close()
        pool.join()
I suggest you do it using multithreading, since this is an I/O-bound task, like the following:
import requests, zipfile, io
import concurrent.futures
urls = [
    "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/3sources.zip",
]


def download_zips(url, filePath=None):
    """Download the zip archive at *url* and extract it.

    Args:
        url: Address of the zip archive to fetch.
        filePath: Directory to extract into. ``None`` (the default) makes
            ``ZipFile.extractall`` use the current working directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    file_name = url.split("/")[-1]
    response = requests.get(url)
    # Fail on an HTTP error instead of handing an error page to ZipFile.
    response.raise_for_status()
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(response.content)) as sourceZip:
        print("\n Downloaded {} ".format(file_name))
        sourceZip.extractall(filePath)
        print("extracted {} \n".format(file_name))


with concurrent.futures.ThreadPoolExecutor() as executor:
    # An exception raised in a worker only surfaces when its result is
    # retrieved, so drain the iterator to avoid silently losing failures.
    list(executor.map(download_zips, urls))
I am trying to download and extract zip files using multiprocessing.Pool.
But every time I execute the script, only 3 zips are downloaded and the remaining files do not appear in the directory (CPU usage also hits 100%). Can someone help me solve this problem or suggest a better approach? Below is the snippet that I have tried. I am completely new to multiprocessing. My goal is to download multiple files in parallel without maxing out the CPU.
import StringIO
import os
import sys
import zipfile
from multiprocessing import Pool, cpu_count
import requests
# Directory containing this script; the archives are unpacked into it.
script_location = os.path.abspath(__file__)
filePath = os.path.dirname(script_location)
print("filePath is %s " % filePath)
sys.path.append(filePath)

# Archives to fetch. Note that movielists_20130821.zip is listed twice.
url = [
    "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/3sources.zip",
]
def download_zips(url):
    """Download the zip archive at *url* and unpack it into ``filePath``.

    The response body is read fully into memory, wrapped in a file-like
    object and opened with ``zipfile.ZipFile``. Nothing is returned;
    progress is reported with ``print``.
    """
    # Last path component of the URL, used only for the progress messages.
    file_name = url.split("/")[-1]
    response = requests.get(url)
    sourceZip = zipfile.ZipFile(StringIO.StringIO(response.content))
    print("\n Downloaded {} ".format(file_name))
    sourceZip.extractall(filePath)
    print("extracted {} \n".format(file_name))
    sourceZip.close()
if __name__ == "__main__":
    # One worker process per CPU core reported by the OS.
    core_count = cpu_count()
    print("There are {} CPUs on this machine ".format(core_count))
    pool = Pool(core_count)
    # map() blocks until every URL in the list has been processed.
    results = pool.map(download_zips, url)
    # No more work will be submitted; wait for the workers to exit.
    pool.close()
    pool.join()
Output below:
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
There are 4 CPUs on this machine
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
filePath is C:\Users\Documents\GitHub\Python-Examples-Internet\multi_processing
Downloaded bbcsport.zip
extracted bbcsport.zip
Downloaded 3sources.zip
extracted 3sources.zip
Downloaded multiview_data_20130124.zip
Downloaded movielists_20130821.zip
Downloaded movielists_20130821.zip
extracted multiview_data_20130124.zip
extracted movielists_20130821.zip
extracted movielists_20130821.zip
I’ve made a few minor tweaks to your function and it works fine. Please note that:
- The file ".../movielists_20130821.zip" appears in your list twice, so you’re downloading the same thing twice (maybe a typo?).
- The files ".../multiview_data_20130124.zip", ".../movielists_20130821.zip" and ".../3sources.zip", when extracted, yield a new directory. The file ".../bbcsport.zip", though, when extracted, places its files in the root folder, your current working directory (see image below). Maybe you missed this check?
- I added a try/except block in the download function. Why? Multiprocessing works by creating new (sub)processes to run stuff. If a subprocess throws an exception, the parent process does not catch it. So if any errors occur in this subprocess, they must be logged/handled there.
import sys, os
import zipfile
import requests
from multiprocessing import Pool, cpu_count
from functools import partial
from io import BytesIO
def download_zip(url, filePath):
    """Download the zip archive at *url* and extract it into *filePath*.

    Any failure is caught and printed, so this function never raises and
    always returns ``None``.

    Args:
        url: Address of the zip archive to fetch.
        filePath: Directory the archive members are extracted into.
    """
    # Computed outside the try block so the error message can name the file.
    file_name = url.split("/")[-1]
    try:
        response = requests.get(url)
        # Fail on an HTTP error instead of handing an error page to ZipFile.
        response.raise_for_status()
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(BytesIO(response.content)) as sourceZip:
            print(" Downloaded {} ".format(file_name))
            sourceZip.extractall(filePath)
            print(" extracted {}".format(file_name))
    except Exception as e:
        # Several downloads run at once, so say which one failed.
        print("{} failed: {}".format(file_name, e))
if __name__ == "__main__":
    # Extract next to this script.
    filePath = os.path.dirname(os.path.abspath(__file__))
    print("filePath is %s " % filePath)
    # sys.path.append(filePath) # why do you need this?
    urls = [
        "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
        "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
        "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
        "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
        "http://mlg.ucd.ie/files/datasets/3sources.zip",
    ]
    print("There are {} CPUs on this machine ".format(cpu_count()))
    pool = Pool(cpu_count())
    # Bind the target directory so pool.map only has to supply the URL.
    download_func = partial(download_zip, filePath=filePath)
    try:
        results = pool.map(download_func, urls)
    finally:
        # Always shut the workers down, even if map() raised.
        pool.close()
        pool.join()
I suggest you do it using multithreading, since this is an I/O-bound task, like the following:
import requests, zipfile, io
import concurrent.futures
urls = [
    "http://mlg.ucd.ie/files/datasets/multiview_data_20130124.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/bbcsport.zip",
    "http://mlg.ucd.ie/files/datasets/movielists_20130821.zip",
    "http://mlg.ucd.ie/files/datasets/3sources.zip",
]


def download_zips(url, filePath=None):
    """Download the zip archive at *url* and extract it.

    Args:
        url: Address of the zip archive to fetch.
        filePath: Directory to extract into. ``None`` (the default) makes
            ``ZipFile.extractall`` use the current working directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    file_name = url.split("/")[-1]
    response = requests.get(url)
    # Fail on an HTTP error instead of handing an error page to ZipFile.
    response.raise_for_status()
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(response.content)) as sourceZip:
        print("\n Downloaded {} ".format(file_name))
        sourceZip.extractall(filePath)
        print("extracted {} \n".format(file_name))


with concurrent.futures.ThreadPoolExecutor() as executor:
    # An exception raised in a worker only surfaces when its result is
    # retrieved, so drain the iterator to avoid silently losing failures.
    list(executor.map(download_zips, urls))