How to read image file from S3 bucket directly into memory?
Question:
I have the following code
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
# Fetch one JPEG 2000 tile from S3, save it locally, then decode and plot it.
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# download_file writes the object to 'B01.jp2' in the current directory;
# imread then decodes the image from that path.
s3_object.download_file('B01.jp2')
img = mpimg.imread('B01.jp2')
imgplot = plt.imshow(img)
# plt.show() displays the current figure; it does not take the image
# artist as a positional argument.
plt.show()
and it works. The problem is that it downloads the file into the current directory first. Is it possible to read the file and decode it as an image directly in RAM?
Answers:
Greg Merritt’s answer below is a better method.
I’d like to suggest using Python’s NamedTemporaryFile from the tempfile module. It creates a temporary file that is deleted as soon as the file is closed (thanks to @NoamG).
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import tempfile
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# The context manager closes the temporary file on exit, which also deletes
# it, so nothing is left behind on disk.
with tempfile.NamedTemporaryFile() as tmp:
    # Write straight into the already-open handle instead of opening the
    # same path a second time.
    s3_object.download_fileobj(tmp)
    # Push buffered bytes to disk before imread opens the file by name.
    tmp.flush()
    # NOTE(review): opening a NamedTemporaryFile by name while it is still
    # open is platform-dependent (see the tempfile docs) — confirm on Windows.
    img = mpimg.imread(tmp.name)
# ...Do jobs using img
I would suggest using the io module to read the file directly into memory, without having to use a temporary file at all.
For example:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import io
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# download_fileobj writes bytes, so the in-memory buffer must be a binary
# one (BytesIO); a text buffer (StringIO) rejects bytes.
file_stream = io.BytesIO()
s3_object.download_fileobj(file_stream)
# Rewind: after the download the position is at the end of the buffer.
file_stream.seek(0)
# A file-like object has no filename to infer the type from, so the image
# format is passed explicitly.
img = mpimg.imread(file_stream, format='jp2')
# whatever you need to do
You could also use io.BytesIO if your data is binary.
# Look up the object, request it, and pull the whole body into memory.
object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
response = object.get()
body = response.get('Body')
img_data = body.read()
Streaming the image is possible by specifying the file format in imread().
import boto3
from io import BytesIO
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
resource = boto3.resource('s3', region_name='us-east-2')
bucket = resource.Bucket('sentinel-s2-l1c')
image_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')

# Read the object's body into memory and hand it to imread as a binary
# stream; the second argument tells imread which format to decode.
raw_bytes = image_object.get()['Body'].read()
stream = BytesIO(raw_bytes)
image = mpimg.imread(stream, 'jp2')

plt.figure(0)
plt.imshow(image)
Further development of Greg Merritt’s answer to solve all the errors raised in the comment section: it uses BytesIO instead of StringIO, and PIL Image instead of matplotlib.image.
The following functions work with python3 and boto3. The write_image_to_s3 function is a bonus.
from io import BytesIO

import boto3
import numpy as np
from PIL import Image


def read_image_from_s3(bucket, key, region_name='ap-southeast-1'):
    """Load an image file from S3 into a NumPy array.

    Parameters
    ----------
    bucket : string
        Bucket name
    key : string
        Path in s3
    region_name : string
        AWS region used to create the S3 resource

    Returns
    -------
    np array
        Image array
    """
    # Honor the caller's region instead of a hard-coded one.
    s3 = boto3.resource('s3', region_name=region_name)
    # Distinct local names keep the `bucket` argument and the `object`
    # builtin from being rebound.
    response = s3.Bucket(bucket).Object(key).get()
    # Buffer the whole body so PIL receives an ordinary seekable binary file.
    file_stream = BytesIO(response['Body'].read())
    im = Image.open(file_stream)
    return np.array(im)
def write_image_to_s3(img_array, bucket, key, region_name='ap-southeast-1'):
    """Encode an image array as JPEG and upload it to an S3 bucket.

    Parameters
    ----------
    img_array : np array
        Image array; converted with PIL's Image.fromarray
    bucket : string
        Bucket name
    key : string
        Path in s3
    region_name : string
        AWS region used to create the S3 resource

    Returns
    -------
    None
    """
    # Pass the region by keyword so the call does not rely on argument order.
    s3 = boto3.resource('s3', region_name=region_name)
    # Encode into an in-memory buffer; nothing is written to local disk.
    file_stream = BytesIO()
    Image.fromarray(img_array).save(file_stream, format='jpeg')
    # Declare the MIME type so the stored object is labelled as a JPEG.
    s3.Bucket(bucket).Object(key).put(
        Body=file_stream.getvalue(),
        ContentType='image/jpeg',
    )
The temporary file solution by Hyeungshik Jung looks good, but I noticed that the file somehow seems to be downloaded in a lazy fashion. As a result, if you check img.shape you get an empty dimension tuple () even after you have called object.download_fileobj(f). I resolved this issue by applying f.seek(0,2) to the file object – after that, all following operations work properly, e.g. img.shape returns the proper dimensions (704, 1024).
...
tmp = tempfile.NamedTemporaryFile()
with open(tmp.name, 'wb') as handle:
    object.download_fileobj(handle)
    # Move to the end of the file (whence=2) before the image is read back.
    # NOTE(review): presumably this works because seeking forces the
    # buffered bytes out to disk — confirm; handle.flush() would state that
    # intent directly.
    handle.seek(0, 2)
    img = mpimg.imread(tmp.name)
    print(img.shape)
Slightly different approach using client:
import boto3
import io
from matplotlib import pyplot as plt
client = boto3.client("s3")

bucket = 'my_bucket'
key = 'my_key'

# Download the object into an in-memory binary buffer.
outfile = io.BytesIO()
client.download_fileobj(bucket, key, outfile)

# Rewind so the reader starts at the first byte rather than at the end.
outfile.seek(0)

# NOTE(review): no format is passed to imread here — confirm the stored
# image is in a format imread handles by default for file-like objects.
img = plt.imread(outfile)
plt.imshow(img)
plt.show()
I see a lot of good answers here. Here is my code snippet with AWS config, in case you want to quickly test out the solution. Please note that it’s not recommended to put your AWS credentials in the code body; they should come from a .env file or from the AWS keystore instead.
import os
import boto3
from PIL import Image
import io
# Placeholder credentials for a quick test only.
AWS_ACCESS_KEY_ID = 'your-aws-access-key'
AWS_SECRET_ACCESS_KEY = 'your-aws-secret'

s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)


def image_from_s3(bucket, key):
    """Return the S3 object `key` in bucket `bucket` as a PIL image.

    The object's body is read fully into memory and opened from an
    in-memory binary buffer.
    """
    s3_object = s3.Bucket(bucket).Object(key)
    payload = s3_object.get().get('Body').read()
    buffer = io.BytesIO(payload)
    return Image.open(buffer)


# call the function
image_from_s3("your-aws-bucket-name", "file-path")

# example
image_from_s3("my-images", "profile/2022/123.png")
Note that the data you read from S3 is of type bytes, whereas TensorFlow needs a string tensor to decode it into a uint8 image. This method doesn’t need Pillow.
import boto3
import tensorflow as tf
# Let boto3 resolve credentials itself. Copying only the access key and
# secret key into a second session drops the session token, which temporary
# credentials need. As before, the client is created without an explicit
# region.
s3 = boto3.client('s3')

# file_on_s3 : 's3://mybucket/data/sample.jpg'
bucket_name = 'mybucket'
file_key = 'data/sample.jpg'

file_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
# reading the file content in bytes
file_content = file_obj["Body"].read()

# Wrap the encoded bytes in a string tensor and decode them into a
# 3-channel uint8 image.
img = tf.io.decode_image(
    tf.convert_to_tensor(file_content, dtype=tf.string),
    channels=3,
    dtype=tf.dtypes.uint8,
    expand_animations=False,
)
# Convert to float32, then resize to 224x224 with nearest-neighbour sampling.
img = tf.cast(img, tf.float32)
img_array = tf.image.resize(
    img,
    size=(224, 224),
    method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
)
I have the following code
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
# Fetch one JPEG 2000 tile from S3, save it locally, then decode and plot it.
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# download_file writes the object to 'B01.jp2' in the current directory;
# imread then decodes the image from that path.
s3_object.download_file('B01.jp2')
img = mpimg.imread('B01.jp2')
imgplot = plt.imshow(img)
# plt.show() displays the current figure; it does not take the image
# artist as a positional argument.
plt.show()
and it works. The problem is that it downloads the file into the current directory first. Is it possible to read the file and decode it as an image directly in RAM?
Greg Merritt’s answer below is a better method.
I’d like to suggest using Python’s NamedTemporaryFile from the tempfile module. It creates a temporary file that is deleted as soon as the file is closed (thanks to @NoamG).
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import tempfile
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# The context manager closes the temporary file on exit, which also deletes
# it, so nothing is left behind on disk.
with tempfile.NamedTemporaryFile() as tmp:
    # Write straight into the already-open handle instead of opening the
    # same path a second time.
    s3_object.download_fileobj(tmp)
    # Push buffered bytes to disk before imread opens the file by name.
    tmp.flush()
    # NOTE(review): opening a NamedTemporaryFile by name while it is still
    # open is platform-dependent (see the tempfile docs) — confirm on Windows.
    img = mpimg.imread(tmp.name)
# ...Do jobs using img
I would suggest using the io module to read the file directly into memory, without having to use a temporary file at all.
For example:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import io
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('sentinel-s2-l1c')
# Named s3_object rather than `object` so the builtin is not shadowed.
s3_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
# download_fileobj writes bytes, so the in-memory buffer must be a binary
# one (BytesIO); a text buffer (StringIO) rejects bytes.
file_stream = io.BytesIO()
s3_object.download_fileobj(file_stream)
# Rewind: after the download the position is at the end of the buffer.
file_stream.seek(0)
# A file-like object has no filename to infer the type from, so the image
# format is passed explicitly.
img = mpimg.imread(file_stream, format='jp2')
# whatever you need to do
You could also use io.BytesIO if your data is binary.
# Look up the object, request it, and pull the whole body into memory.
object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')
response = object.get()
body = response.get('Body')
img_data = body.read()
Streaming the image is possible by specifying the file format in imread().
import boto3
from io import BytesIO
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
resource = boto3.resource('s3', region_name='us-east-2')
bucket = resource.Bucket('sentinel-s2-l1c')
image_object = bucket.Object('tiles/10/S/DG/2015/12/7/0/B01.jp2')

# Read the object's body into memory and hand it to imread as a binary
# stream; the second argument tells imread which format to decode.
raw_bytes = image_object.get()['Body'].read()
stream = BytesIO(raw_bytes)
image = mpimg.imread(stream, 'jp2')

plt.figure(0)
plt.imshow(image)
Further development of Greg Merritt’s answer to solve all the errors raised in the comment section: it uses BytesIO instead of StringIO, and PIL Image instead of matplotlib.image.
The following functions work with python3 and boto3. The write_image_to_s3 function is a bonus.
from io import BytesIO

import boto3
import numpy as np
from PIL import Image


def read_image_from_s3(bucket, key, region_name='ap-southeast-1'):
    """Load an image file from S3 into a NumPy array.

    Parameters
    ----------
    bucket : string
        Bucket name
    key : string
        Path in s3
    region_name : string
        AWS region used to create the S3 resource

    Returns
    -------
    np array
        Image array
    """
    # Honor the caller's region instead of a hard-coded one.
    s3 = boto3.resource('s3', region_name=region_name)
    # Distinct local names keep the `bucket` argument and the `object`
    # builtin from being rebound.
    response = s3.Bucket(bucket).Object(key).get()
    # Buffer the whole body so PIL receives an ordinary seekable binary file.
    file_stream = BytesIO(response['Body'].read())
    im = Image.open(file_stream)
    return np.array(im)
def write_image_to_s3(img_array, bucket, key, region_name='ap-southeast-1'):
    """Encode an image array as JPEG and upload it to an S3 bucket.

    Parameters
    ----------
    img_array : np array
        Image array; converted with PIL's Image.fromarray
    bucket : string
        Bucket name
    key : string
        Path in s3
    region_name : string
        AWS region used to create the S3 resource

    Returns
    -------
    None
    """
    # Pass the region by keyword so the call does not rely on argument order.
    s3 = boto3.resource('s3', region_name=region_name)
    # Encode into an in-memory buffer; nothing is written to local disk.
    file_stream = BytesIO()
    Image.fromarray(img_array).save(file_stream, format='jpeg')
    # Declare the MIME type so the stored object is labelled as a JPEG.
    s3.Bucket(bucket).Object(key).put(
        Body=file_stream.getvalue(),
        ContentType='image/jpeg',
    )
The temporary file solution by Hyeungshik Jung looks good, but I noticed that the file somehow seems to be downloaded in a lazy fashion. As a result, if you check img.shape you get an empty dimension tuple () even after you have called object.download_fileobj(f). I resolved this issue by applying f.seek(0,2) to the file object – after that, all following operations work properly, e.g. img.shape returns the proper dimensions (704, 1024).
...
tmp = tempfile.NamedTemporaryFile()
with open(tmp.name, 'wb') as handle:
    object.download_fileobj(handle)
    # Move to the end of the file (whence=2) before the image is read back.
    # NOTE(review): presumably this works because seeking forces the
    # buffered bytes out to disk — confirm; handle.flush() would state that
    # intent directly.
    handle.seek(0, 2)
    img = mpimg.imread(tmp.name)
    print(img.shape)
Slightly different approach using client:
import boto3
import io
from matplotlib import pyplot as plt
client = boto3.client("s3")

bucket = 'my_bucket'
key = 'my_key'

# Download the object into an in-memory binary buffer.
outfile = io.BytesIO()
client.download_fileobj(bucket, key, outfile)

# Rewind so the reader starts at the first byte rather than at the end.
outfile.seek(0)

# NOTE(review): no format is passed to imread here — confirm the stored
# image is in a format imread handles by default for file-like objects.
img = plt.imread(outfile)
plt.imshow(img)
plt.show()
I see a lot of good answers here. Here is my code snippet with AWS config, in case you want to quickly test out the solution. Please note that it’s not recommended to put your AWS credentials in the code body; they should come from a .env file or from the AWS keystore instead.
import os
import boto3
from PIL import Image
import io
# Placeholder credentials for a quick test only.
AWS_ACCESS_KEY_ID = 'your-aws-access-key'
AWS_SECRET_ACCESS_KEY = 'your-aws-secret'

s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)


def image_from_s3(bucket, key):
    """Return the S3 object `key` in bucket `bucket` as a PIL image.

    The object's body is read fully into memory and opened from an
    in-memory binary buffer.
    """
    s3_object = s3.Bucket(bucket).Object(key)
    payload = s3_object.get().get('Body').read()
    buffer = io.BytesIO(payload)
    return Image.open(buffer)


# call the function
image_from_s3("your-aws-bucket-name", "file-path")

# example
image_from_s3("my-images", "profile/2022/123.png")
Note that the data you read from S3 is of type bytes, whereas TensorFlow needs a string tensor to decode it into a uint8 image. This method doesn’t need Pillow.
import boto3
import tensorflow as tf
# Let boto3 resolve credentials itself. Copying only the access key and
# secret key into a second session drops the session token, which temporary
# credentials need. As before, the client is created without an explicit
# region.
s3 = boto3.client('s3')

# file_on_s3 : 's3://mybucket/data/sample.jpg'
bucket_name = 'mybucket'
file_key = 'data/sample.jpg'

file_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
# reading the file content in bytes
file_content = file_obj["Body"].read()

# Wrap the encoded bytes in a string tensor and decode them into a
# 3-channel uint8 image.
img = tf.io.decode_image(
    tf.convert_to_tensor(file_content, dtype=tf.string),
    channels=3,
    dtype=tf.dtypes.uint8,
    expand_animations=False,
)
# Convert to float32, then resize to 224x224 with nearest-neighbour sampling.
img = tf.cast(img, tf.float32)
img_array = tf.image.resize(
    img,
    size=(224, 224),
    method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
)