Speed up Boto3 file transfer across buckets

Question:

I want to copy a sub-subfolder in an S3 bucket into a different bucket using Python (boto3).
However, the process is painfully slow.

If I copy the folder "by hand" directly in the S3 console (in the browser), the process takes 72 seconds (for a folder with around 140 objects, roughly 1.0 GB in total).

However, if I copy it with boto3, it takes about 9 times as long (653 seconds).

This is the code I am using, re-adapted from the boto3 documentation and various answers here on SO:

import boto3

s3 = boto3.resource('s3')

# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'  
client   = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)

# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)

folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix=folder)

# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [obj['Key'] for obj in response_sub['Contents']
             if obj['Key'].split('.')[-1].lower() in ['jpg', 'jpeg', 'png', 'tiff']]

# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]

# copy the objects one at a time (this loop is where almost all of the time is spent)
for src, dest in zip(files_src, files_dest):
    copy_source = {
        'Bucket': src_bucket_name,
        'Key': src
    }
    dest_bucket.copy(copy_source, dest)

Note that everything up to the final for loop runs in only a couple of seconds.

Any idea how to speed this up? Am I doing something stupid, or should I be using some other way of copying files / entire folders?
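
One alternative, as a minimal sketch only: since each dest_bucket.copy call in the loop above runs one after the other, the per-object copies can be fanned out over a thread pool (boto3 clients are thread-safe, so a single client can be shared across threads). The snippet reuses the variable names from the code above and an arbitrary worker count of 20; the answers below take the related route of boto3's transfer manager.

import concurrent.futures

import boto3

# a plain client for the copies; unlike resources, boto3 clients are thread-safe
copy_client = boto3.client('s3')

def copy_one(src_key, dest_key):
    copy_client.copy({'Bucket': src_bucket_name, 'Key': src_key},
                     dest_bucket_name, dest_key)

# run up to 20 copies concurrently instead of one at a time
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
    futures = [pool.submit(copy_one, src, dest)
               for src, dest in zip(files_src, files_dest)]
    for future in concurrent.futures.as_completed(futures):
        future.result()  # re-raise any copy errors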

Asked By: Fraccalo


Answers:

Thanks to @Suyog Shimpi (who pointed to a similar SO post), I was able to significantly speed up the copying process.

Here is the code, slightly re-adapted from the other post:

import os
import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm

s3 = boto3.resource('s3')

# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'  
client   = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)

# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)

folder = "folder_1/subfolder_1"
response_sub = client.list_objects_v2(Bucket=src_bucket_name, Prefix=folder)

# list files to be copied (select only images, but in this folder there are only images anyway)
files_src = [obj['Key'] for obj in response_sub['Contents']
             if obj['Key'].split('.')[-1].lower() in ['jpg', 'jpeg', 'png', 'tiff']]

# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]

botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)

transfer_config = s3transfer.TransferConfig(
    use_threads=True,
    max_concurrency=20,
)

# note that timing the process is optional (%time is an IPython/Jupyter magic)
# total_size is the total size in bytes of the files to copy; it can be
# obtained with boto3 (as below), or read from the S3 console in the browser
%time
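# one way to obtain total_size from the listing gathered above
total_size = sum(obj['Size'] for obj in response_sub['Contents']
                 if obj['Key'] in files_src)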
progress = tqdm.tqdm(
    desc='upload',
    total=total_size, unit='B', unit_scale=1,
    position=0,
    bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')

s3t = s3transfer.create_transfer_manager(s3client, transfer_config)


for src, dest in zip(files_src, files_dest):
    copy_source = {
        'Bucket': src_bucket_name,
        'Key': src
    }
    s3t.copy(copy_source=copy_source,
             bucket=dest_bucket_name,
             key=dest,
             subscribers=[s3transfer.ProgressCallbackInvoker(progress.update)])

# shut down the transfer manager; shutdown() blocks until all queued copies have finished
s3t.shutdown()
progress.close()
Answered By: Fraccalo

Thanks, Fraccalo, for your solution; it helped me a lot!
I adjusted it a little so that more than 1000 files can be copied (list_objects_v2 returns at most 1000 keys per request):

import boto3
import botocore
import boto3.s3.transfer as s3transfer
import tqdm

s3 = boto3.resource('s3')

# define source bucket
src_bucket_name = 'bucket_1'
prefix = 'folder_1/'  
client   = boto3.client('s3')
src_bucket = s3.Bucket(src_bucket_name)

# define destination bucket
dest_bucket_name = 'bucket_2'
dest_bucket = s3.Bucket(dest_bucket_name)

folder = "folder_1/subfolder_1"
files_src = []
bucket_size = 0
# use paginator to read more than 1000 files
paginator = client.get_paginator('list_objects_v2')
operation_parameters = {'Bucket': src_bucket_name,
                        'Prefix': folder}
page_iterator = paginator.paginate(**operation_parameters)
for page in page_iterator:
    if page.get('Contents'):
        files_src.extend([obj['Key'] for obj in page['Contents']])
        bucket_size += sum(obj['Size'] for obj in page['Contents'])

# list of file names after copy
dest_prefix = 'folder_1/subfolder_1/'
files_dest = [dest_prefix+i for i in files_src]

botocore_config = botocore.config.Config(max_pool_connections=20)
s3client = boto3.client('s3', config=botocore_config)

transfer_config = s3transfer.TransferConfig(
    use_threads=True,
    max_concurrency=20,
)

progress = tqdm.tqdm(
    desc='upload',
    total=bucket_size, unit='B', unit_scale=1,
    position=0,
    bar_format='{desc:<10}{percentage:3.0f}%|{bar:10}{r_bar}')

s3t = s3transfer.create_transfer_manager(s3client, transfer_config)


for src, dest in zip(files_src, files_dest):
    copy_source = {
        'Bucket': src_bucket_name,
        'Key': src
    }
    s3t.copy(copy_source=copy_source,
             bucket=dest_bucket_name,
             key=dest,
             subscribers=[s3transfer.ProgressCallbackInvoker(progress.update)])

# shut down the transfer manager; shutdown() blocks until all queued copies have finished
s3t.shutdown()
progress.close()
Answered By: tfleischer