Python – download entire directory from Google Cloud Storage
Question:
The following page
https://googlecloudplatform.github.io/google-cloud-python/latest/storage/blobs.html
lists all the API calls that can be used for Python & Google Cloud Storage, yet even the “official” samples on GitHub don’t have a related example.
Finally, downloading a directory with the same method used to download a single file gives the error
Error: [Errno 21] Is a directory:
Answers:
You just have to first list all the files in a directory and then download them one by one:
from google.cloud import storage

bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # Get list of files
for blob in blobs:
    filename = blob.name.replace('/', '_')
    blob.download_to_filename(dl_dir + filename)  # Download
blob.name includes the entire directory structure plus the filename, so if you want the same file name as in the bucket, you might want to extract just the filename first (instead of replacing '/' with '_').
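For example, a minimal sketch (reusing the placeholder bucket and directory names from the snippet above) that keeps only the file's own name:

from os.path import basename
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket('your-bucket-name')
for blob in bucket.list_blobs(prefix='your-bucket-directory/'):
    if blob.name.endswith('/'):  # skip zero-byte "directory" placeholder objects
        continue
    filename = basename(blob.name)  # keep only the file name, drop the path
    blob.download_to_filename('your-local-directory/' + filename)

Keep in mind that files with the same name in different subfolders will overwrite each other with this approach.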
Let's say we want to download FINALFOLDER from the storage path gs://TEST_BUCKET_NAME/FOLDER1/FOLDER2/FINALFOLDER.
After downloading, the final local path will look like: D:\my_blob_data\FINALFOLDER
from os import makedirs
from os.path import join, isdir, isfile, basename

from google.cloud import storage

# if your environment was authenticated, the default config will be picked up
storage_client = storage.Client()  # comment this line if you want to use a service account
# uncomment the line below if you have a service account json
# storage_client = storage.Client.from_service_account_json('creds/sa.json')

bucket_name = 'TEST_BUCKET_NAME'
prefix = 'FOLDER1/FOLDER2'
dst_path = r'D:\my_blob_data'

if not isdir(dst_path):
    makedirs(dst_path)

bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # Get list of files
for blob in blobs:
    blob_name = blob.name
    dst_file_name = blob_name.replace('FOLDER1/FOLDER2', dst_path)  # e.g. 'D:\my_blob_data/FINALFOLDER/file.txt'
    # extract the final directory and create it in the destination path if it does not exist
    dst_dir = dst_file_name.replace('/' + basename(dst_file_name), '')
    if not isdir(dst_dir):
        makedirs(dst_dir)
    # download the blob object
    blob.download_to_filename(dst_file_name)
Refer to this link: https://medium.com/@sandeepsinh/multiple-file-download-form-google-cloud-storage-using-python-and-gcs-api-1dbcab23c44
1 – Add your credentials JSON
2 – List the bucket items
3 – Download
import logging
import os

from google.cloud import storage

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

bucket_name = 'mybucket'
table_id = 'shakespeare'
storage_client = storage.Client.from_service_account_json('/google-cloud/keyfile/service_account.json')

# The "folder" where the files you want to download are
folder = '/google-cloud/download/{}'.format(table_id)
delimiter = '/'
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=table_id, delimiter=delimiter)  # List all objects that satisfy the filter.

# Download the files to a destination
def download_to_local():
    logging.info('File download started... wait for the job to complete.')
    # Create this folder locally if it does not exist
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Iterate over the blobs one by one
    for blob in blobs:
        logging.info('Blobs: {}'.format(blob.name))
        destination_uri = '{}/{}'.format(folder, blob.name)
        blob.download_to_filename(destination_uri)
        logging.info('Exported {} to {}'.format(blob.name, destination_uri))

if __name__ == '__main__':
    download_to_local()
If you want to keep the same directory structure without renaming and also create nested folders, here is a solution for Python 3.5+ based on @ksbg's answer:
from pathlib import Path

from google.cloud import storage

bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # Get list of files
for blob in blobs:
    if blob.name.endswith("/"):
        continue
    file_split = blob.name.split("/")
    directory = "/".join(file_split[0:-1])
    Path(directory).mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(blob.name)
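If you would rather place the files under dl_dir instead of the current working directory, a small variation of the loop above (a sketch, using the same placeholder names) could be:

from pathlib import Path

from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket('your-bucket-name')
for blob in bucket.list_blobs(prefix='your-bucket-directory/'):
    if blob.name.endswith("/"):
        continue
    target = Path('your-local-directory/') / blob.name  # mirror the bucket layout under dl_dir
    target.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(target))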
Using the tensorflow gfile package, here is a recursive function:
- root_dir is the GCS parent folder.
- local_base_dir is the parent folder created locally.
import os

import tensorflow as tf

def copy_recursively(root_dir, local_base_dir):
    if tf.io.gfile.exists(local_base_dir):
        tf.io.gfile.rmtree(local_base_dir)
    tf.io.gfile.mkdir(local_base_dir)

    file_list = tf.io.gfile.glob(root_dir + '/**')
    for item in file_list:
        if not tf.io.gfile.isdir(item):
            fname = item.rsplit('/', 1)[-1]
            if not fname.startswith('.'):
                tf.io.gfile.copy(item,
                                 os.path.join(local_base_dir, fname),
                                 overwrite=False)
        else:
            child_dir = item.rsplit('/', 1)[-1]
            full_dir_path = os.path.join(local_base_dir, child_dir)
            print(f"Setting up child directory: {full_dir_path}")
            copy_recursively(item, full_dir_path)

root_dir = 'gs://.../.../..'
local_base_dir = root_dir.rsplit('/', 1)[-1]

copy_recursively(root_dir, local_base_dir)
Download all files and child directories inside a parent directory, package them locally into a zip archive, and upload the zip to any GCS bucket. Hope this code helps you as well.
import io
import os
from datetime import datetime
from zipfile import ZipFile, ZipInfo

from google.cloud import storage

# The ID of your GCS bucket
bucket_name = "SOURCE_BUCKET"
# The prefix ("folder") of your GCS objects
prefix = 'Fold1/'

archive = io.BytesIO()
with ZipFile(archive, 'w') as zip_file:
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    blobs = source_bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        if blob.name.endswith("/"):
            continue
        filename = blob.name  # .replace('/', '_')
        data = source_bucket.blob(filename).download_as_string()
        zip_file.writestr(ZipInfo(filename), data)

archive.seek(0)
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H:%M:%S")
object_name = "Fold1_" + dt_string + ".zip"

##### save the zip archive locally
with open(object_name, 'wb') as f:
    f.write(archive.getvalue())

##### upload to any bucket
target_bucket = "TARGET_BUCKET"
bucket = storage_client.get_bucket(target_bucket)
blob = storage.Blob(object_name, bucket)
blob.upload_from_file(archive, content_type='application/zip')
Recursively download all the folders in the same relative structure as the source GCS directory:
import os

from google.cloud import storage

def download_gcs_folder_recursively_to_local(blob_folder_path, destination_folder_path,
                                             gcs_project_name, gcs_bucket_name):
    if not blob_folder_path.endswith("/"):
        blob_folder_path = blob_folder_path + "/"
    if not destination_folder_path.endswith("/"):
        destination_folder_path = destination_folder_path + "/"

    storage_client = storage.Client(gcs_project_name)
    bucket = storage_client.get_bucket(gcs_bucket_name)
    blobs = bucket.list_blobs(prefix=blob_folder_path)

    os.makedirs("tmp_cp_folder", exist_ok=True)
    for blob in blobs:
        if blob.name.endswith("/"):
            continue
        tmp_filename = blob.name.replace('/', '_')
        relative_file_path = blob.name[len(blob_folder_path):]
        relative_file_parent_folder = "" if len(relative_file_path.split("/")) == 1 else relative_file_path.rsplit('/', 1)[0]

        os.makedirs(f"{destination_folder_path}{relative_file_parent_folder}", exist_ok=True)
        blob.download_to_filename(f"tmp_cp_folder/{tmp_filename}")
        os.system(f"mv tmp_cp_folder/{tmp_filename} {destination_folder_path}{relative_file_path}")

    os.removedirs("tmp_cp_folder")
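A call would look something like this (the project, bucket and paths below are hypothetical placeholders, echoing names used earlier on this page):

download_gcs_folder_recursively_to_local(
    blob_folder_path='FOLDER1/FOLDER2/FINALFOLDER',
    destination_folder_path='/tmp/FINALFOLDER',
    gcs_project_name='my-project',
    gcs_bucket_name='TEST_BUCKET_NAME')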
Based on one of the earlier solutions: https://stackoverflow.com/a/49749281