Use pathlib for S3 paths
Question:
I would like to build some functionality to move files between S3 and my local file system, but pathlib
appears to combine repeated slashes, breaking my aws-cli functionality:
>>> from pathlib import Path
>>> str(Path('s3://loc'))
's3:/loc'
How can I manipulate S3 paths in this way?
Answers:
No. pathlib
is for filesystem paths (i.e., paths to files on your computer), while S3 paths are URIs.
You can try combining urllib.parse with pathlib.
from urllib.parse import urlparse, urlunparse
from pathlib import PurePosixPath

# Split the URI first so that only its path component is handed to pathlib;
# the scheme and bucket (netloc) never pass through pathlib, so the double
# slash of "s3://" cannot be collapsed.
s3_url = urlparse('s3://bucket/key')
# PurePosixPath only manipulates the string with "/" separators and, unlike
# PosixPath, can also be instantiated on Windows.
s3_path = PurePosixPath(s3_url.path)
s3_path /= 'hello'
# Re-assemble the URL with the edited path and every other component unchanged.
s3_new_url = urlunparse((s3_url.scheme, s3_url.netloc, s3_path.as_posix(), s3_url.params, s3_url.query, s3_url.fragment))
print(s3_new_url)
It’s quite cumbersome, but it’s what you asked for.
Using s3path
package
The s3path
package makes working with S3 paths a little less painful. It is installable from PyPI or conda-forge. Use the S3Path
class for actual objects in S3 and otherwise use PureS3Path
which shouldn’t actually access S3.
Although the previous answer by metaperture did mention this package, it didn’t include the URI syntax.
Also be aware that this package has certain deficiencies which are reported in its issues.
>>> from s3path import PureS3Path
>>> PureS3Path.from_uri('s3://mybucket/foo/bar') / 'add/me'
PureS3Path('/mybucket/foo/bar/add/me')
>>> _.as_uri()
's3://mybucket/foo/bar/add/me'
Note the instance relationships to pathlib
:
>>> from pathlib import Path, PurePath
>>> from s3path import S3Path, PureS3Path
>>> isinstance(S3Path('/my-bucket/some/prefix'), Path)
True
>>> isinstance(PureS3Path('/my-bucket/some/prefix'), PurePath)
True
Using pathlib.Path
This is a lazier version of the answer by kichik using only pathlib
. This approach is not necessarily recommended. It’s just not always entirely necessary to use urllib.parse
.
>>> from pathlib import Path
>>> orig_s3_path = 's3://mybucket/foo/bar'
>>> orig_path = Path(*Path(orig_s3_path).parts[1:])
>>> orig_path
PosixPath('mybucket/foo/bar')
>>> new_path = orig_path / 'add/me'
>>> new_s3_path = 's3://' + str(new_path)
>>> new_s3_path
's3://mybucket/foo/bar/add/me'
Using os.path.join
For simple joins only, how about os.path.join
?
>>> import os
>>> os.path.join('s3://mybucket/foo/bar', 'add/me')
's3://mybucket/foo/bar/add/me'
>>> os.path.join('s3://mybucket/foo/bar/', 'add/me')
's3://mybucket/foo/bar/add/me'
Windows users can apply .replace(os.sep, '/')
for platform safety.
os.path.normpath
cannot however be naively used:
>>> os.path.normpath('s3://mybucket/foo/bar') # Converts 's3://' to 's3:/'
's3:/mybucket/foo/bar'
Here’s a module that subclasses pathlib.Path for s3 paths: https://pypi.org/project/s3path/
Usage:
>>> from s3path import S3Path
>>> bucket_path = S3Path('/pypi-proxy/')
>>> [path for path in bucket_path.iterdir() if path.is_dir()]
[S3Path('/pypi-proxy/requests/'),
S3Path('/pypi-proxy/boto3/'),
S3Path('/pypi-proxy/botocore/')]
>>> boto3_package_path = S3Path('/pypi-proxy/boto3/boto3-1.4.1.tar.gz')
>>> boto3_package_path.exists()
True
>>> boto3_package_path.is_dir()
False
>>> boto3_package_path.is_file()
True
>>> botocore_index_path = S3Path('/pypi-proxy/botocore/index.html')
>>> with botocore_index_path.open() as f:
...     print(f.read())
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Package Index</title>
</head>
<body>
<a href="botocore-1.4.93.tar.gz">botocore-1.4.93.tar.gz</a><br>
</body>
</html>
"""
It’s useful and simple to extend the str class to handle this
class URL(str):
    """A ``str`` subclass whose ``/`` operator appends a URL path segment.

    Because it is a plain string, the double slash of a scheme such as
    ``s3://`` is never collapsed the way ``pathlib`` collapses it.
    """

    def __truediv__(self, val):
        """Return a new ``URL`` made of this URL and ``val`` joined by one ``/``.

        ``val`` may be a string or any object whose ``str()`` is the segment
        to append.  A trailing slash on this URL and leading slashes on
        ``val`` are not doubled up, so ``URL('s3://bucket/') / '/key'`` gives
        ``'s3://bucket/key'`` rather than ``'s3://bucket///key'``.
        """
        segment = str(val).lstrip('/')
        # Only insert a separator when the base does not already end with one;
        # stripping slashes from the base instead would damage a bare scheme
        # such as "s3://".
        separator = '' if self.endswith('/') else '/'
        return URL(str(self) + separator + segment)
an example usage would be URL('s3://mybucket') / 'test'
→ "s3://mybucket/test"
Using cloudpathlib
Wanted to add this as another option that has nice caching and transparent read/write access in addition to the standard path manipulations.
The cloudpathlib
package provides pathlib methods support for S3 paths in addition to Google Cloud Storage and Azure Blob Storage.
For example:
from cloudpathlib import CloudPath
from itertools import islice
ladi = CloudPath("s3://ladi/Images/FEMA_CAP/2020/70349")
ladi.parent
#> S3Path('s3://ladi/Images/FEMA_CAP/2020')
ladi.bucket
#> 'ladi'
# list first 5 images for this incident
# (islice stops after five entries instead of consuming the whole iterator)
for p in islice(ladi.iterdir(), 5):
    print(p)
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0002_a89f1b79-786f-4dac-9dcc-609fb1a977b1.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0003_02c30af6-911e-4e01-8c24-7644da2b8672.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0004_d37c02b9-01a8-4672-b06f-2690d70e5e6b.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0005_d05609ce-1c45-4de3-b0f1-401c2bb3412c.jpg
Pathy is pretty great for this:
https://github.com/justindujardin/pathy
It uses Smart open under the hood to provide file access to bucket storage, and so is better than s3path for this reason.
You can use Pathy.fluid
to make apis that work on both local files and files in bucket storage
from pathlib import Path as BasePath
from typing import Union
from pathy import Pathy, FluidPath


def process_f(f: Union[str, Pathy, BasePath]) -> FluidPath:
    """Resolve ``f`` with ``Pathy.fluid`` and return the resulting path.

    ``f`` may be a string, a ``Pathy`` path or a regular ``pathlib`` path.
    """
    path = Pathy.fluid(f)
    # now you have a Pathlib you can process that's local or in s3/GCS
    return path
I agree with @jwodder's answer that pathlib is for filesystem paths only. However, out of curiosity, I played a bit with inheritance from pathlib.Path and arrived at quite a viable solution.
import pathlib
class S3Path(pathlib.PosixPath):
    """A ``PosixPath`` that accepts and renders ``s3://bucket/key`` strings.

    Only the ``"s3:/"`` prefix is removed on construction, so the bucket
    becomes the first component of an absolute POSIX path
    (``"s3://some/location"`` is stored as ``"/some/location"``), and
    ``str()`` puts the prefix back.
    """

    # Deliberately a single slash: the second slash of "s3://" is kept as the
    # root of the underlying absolute path.
    s3_schema = "s3:/"

    @classmethod
    def _strip_schema(cls, args):
        """Return ``args`` with the schema removed from a leading string."""
        if args and isinstance(args[0], str) and args[0].startswith(cls.s3_schema):
            return (args[0][len(cls.s3_schema):],) + tuple(args[1:])
        return args

    def __new__(cls, *args, **kwargs):
        """Create the path from ``args`` after dropping the schema prefix."""
        return super().__new__(cls, *cls._strip_schema(args), **kwargs)

    if hasattr(pathlib.PurePath, "with_segments"):
        # Python 3.12+ parses the segments in __init__, which receives the
        # caller's original arguments, so the prefix has to be dropped here
        # as well.  Older versions parse in __new__ and have no pathlib
        # __init__ to delegate to, so the override is only defined when the
        # 3.12 API is present.
        def __init__(self, *args, **kwargs):
            super().__init__(*self._strip_schema(args), **kwargs)

    def __str__(self):
        """Return the path as an ``s3://`` string."""
        # Delegating to the parent class avoids pathlib's private attributes,
        # which differ between Python versions.
        return self.s3_schema + super().__str__()
def test_basic():
    """Check the ``s3://`` round trip, joining with ``/`` and ``.parent``."""
    s3_path_str: str = "s3://some/location"
    s3_path = S3Path(s3_path_str)
    # The string form must come back exactly as it went in.
    assert str(s3_path) == s3_path_str
    s3_path_1 = s3_path / "here"
    assert str(s3_path_1) == s3_path_str + "/here"
    assert s3_path.parent == S3Path("s3://some")
The advantage of it is that you don’t need any pip install dependency. Plus, you could easily adapt it to any other URI-like paths, for example hdfs.
I would like to build some functionality to move files between S3 and my local file system, but pathlib
appears to combine repeated slashes, breaking my aws-cli functionality:
>>> from pathlib import Path
>>> str(Path('s3://loc'))
's3:/loc'
How can I manipulate S3 paths in this way?
No. pathlib
is for filesystem paths (i.e., paths to files on your computer), while S3 paths are URIs.
You can try combining urllib.parse with pathlib.
from urllib.parse import urlparse, urlunparse
from pathlib import PurePosixPath

# Split the URI first so that only its path component is handed to pathlib;
# the scheme and bucket (netloc) never pass through pathlib, so the double
# slash of "s3://" cannot be collapsed.
s3_url = urlparse('s3://bucket/key')
# PurePosixPath only manipulates the string with "/" separators and, unlike
# PosixPath, can also be instantiated on Windows.
s3_path = PurePosixPath(s3_url.path)
s3_path /= 'hello'
# Re-assemble the URL with the edited path and every other component unchanged.
s3_new_url = urlunparse((s3_url.scheme, s3_url.netloc, s3_path.as_posix(), s3_url.params, s3_url.query, s3_url.fragment))
print(s3_new_url)
It’s quite cumbersome, but it’s what you asked for.
Using s3path
package
The s3path
package makes working with S3 paths a little less painful. It is installable from PyPI or conda-forge. Use the S3Path
class for actual objects in S3 and otherwise use PureS3Path
which shouldn’t actually access S3.
Although the previous answer by metaperture did mention this package, it didn’t include the URI syntax.
Also be aware that this package has certain deficiencies which are reported in its issues.
>>> from s3path import PureS3Path
>>> PureS3Path.from_uri('s3://mybucket/foo/bar') / 'add/me'
PureS3Path('/mybucket/foo/bar/add/me')
>>> _.as_uri()
's3://mybucket/foo/bar/add/me'
Note the instance relationships to pathlib
:
>>> from pathlib import Path, PurePath
>>> from s3path import S3Path, PureS3Path
>>> isinstance(S3Path('/my-bucket/some/prefix'), Path)
True
>>> isinstance(PureS3Path('/my-bucket/some/prefix'), PurePath)
True
Using pathlib.Path
This is a lazier version of the answer by kichik using only pathlib
. This approach is not necessarily recommended. It’s just not always entirely necessary to use urllib.parse
.
>>> from pathlib import Path
>>> orig_s3_path = 's3://mybucket/foo/bar'
>>> orig_path = Path(*Path(orig_s3_path).parts[1:])
>>> orig_path
PosixPath('mybucket/foo/bar')
>>> new_path = orig_path / 'add/me'
>>> new_s3_path = 's3://' + str(new_path)
>>> new_s3_path
's3://mybucket/foo/bar/add/me'
Using os.path.join
For simple joins only, how about os.path.join
?
>>> import os
>>> os.path.join('s3://mybucket/foo/bar', 'add/me')
's3://mybucket/foo/bar/add/me'
>>> os.path.join('s3://mybucket/foo/bar/', 'add/me')
's3://mybucket/foo/bar/add/me'
Windows users can apply .replace(os.sep, '/')
for platform safety.
os.path.normpath
cannot however be naively used:
>>> os.path.normpath('s3://mybucket/foo/bar') # Converts 's3://' to 's3:/'
's3:/mybucket/foo/bar'
Here’s a module that subclasses pathlib.Path for s3 paths: https://pypi.org/project/s3path/
Usage:
>>> from s3path import S3Path
>>> bucket_path = S3Path('/pypi-proxy/')
>>> [path for path in bucket_path.iterdir() if path.is_dir()]
[S3Path('/pypi-proxy/requests/'),
S3Path('/pypi-proxy/boto3/'),
S3Path('/pypi-proxy/botocore/')]
>>> boto3_package_path = S3Path('/pypi-proxy/boto3/boto3-1.4.1.tar.gz')
>>> boto3_package_path.exists()
True
>>> boto3_package_path.is_dir()
False
>>> boto3_package_path.is_file()
True
>>> botocore_index_path = S3Path('/pypi-proxy/botocore/index.html')
>>> with botocore_index_path.open() as f:
...     print(f.read())
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Package Index</title>
</head>
<body>
<a href="botocore-1.4.93.tar.gz">botocore-1.4.93.tar.gz</a><br>
</body>
</html>
"""
It’s useful and simple to extend the str class to handle this
class URL(str):
    """A ``str`` subclass whose ``/`` operator appends a URL path segment.

    Because it is a plain string, the double slash of a scheme such as
    ``s3://`` is never collapsed the way ``pathlib`` collapses it.
    """

    def __truediv__(self, val):
        """Return a new ``URL`` made of this URL and ``val`` joined by one ``/``.

        ``val`` may be a string or any object whose ``str()`` is the segment
        to append.  A trailing slash on this URL and leading slashes on
        ``val`` are not doubled up, so ``URL('s3://bucket/') / '/key'`` gives
        ``'s3://bucket/key'`` rather than ``'s3://bucket///key'``.
        """
        segment = str(val).lstrip('/')
        # Only insert a separator when the base does not already end with one;
        # stripping slashes from the base instead would damage a bare scheme
        # such as "s3://".
        separator = '' if self.endswith('/') else '/'
        return URL(str(self) + separator + segment)
an example usage would be URL('s3://mybucket') / 'test'
→ "s3://mybucket/test"
Using cloudpathlib
Wanted to add this as another option that has nice caching and transparent read/write access in addition to the standard path manipulations.
The cloudpathlib
package provides pathlib methods support for S3 paths in addition to Google Cloud Storage and Azure Blob Storage.
For example:
from cloudpathlib import CloudPath
from itertools import islice
ladi = CloudPath("s3://ladi/Images/FEMA_CAP/2020/70349")
ladi.parent
#> S3Path('s3://ladi/Images/FEMA_CAP/2020')
ladi.bucket
#> 'ladi'
# list first 5 images for this incident
# (islice stops after five entries instead of consuming the whole iterator)
for p in islice(ladi.iterdir(), 5):
    print(p)
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0002_a89f1b79-786f-4dac-9dcc-609fb1a977b1.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0003_02c30af6-911e-4e01-8c24-7644da2b8672.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0004_d37c02b9-01a8-4672-b06f-2690d70e5e6b.jpg
#> s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0005_d05609ce-1c45-4de3-b0f1-401c2bb3412c.jpg
Pathy is pretty great for this:
https://github.com/justindujardin/pathy
It uses Smart open under the hood to provide file access to bucket storage, and so is better than s3path for this reason.
You can use Pathy.fluid
to make apis that work on both local files and files in bucket storage
from pathlib import Path as BasePath
from typing import Union
from pathy import Pathy, FluidPath


def process_f(f: Union[str, Pathy, BasePath]) -> FluidPath:
    """Resolve ``f`` with ``Pathy.fluid`` and return the resulting path.

    ``f`` may be a string, a ``Pathy`` path or a regular ``pathlib`` path.
    """
    path = Pathy.fluid(f)
    # now you have a Pathlib you can process that's local or in s3/GCS
    return path
I agree with @jwodder's answer that pathlib is for filesystem paths only. However, out of curiosity, I played a bit with inheritance from pathlib.Path and arrived at quite a viable solution.
import pathlib
class S3Path(pathlib.PosixPath):
    """A ``PosixPath`` that accepts and renders ``s3://bucket/key`` strings.

    Only the ``"s3:/"`` prefix is removed on construction, so the bucket
    becomes the first component of an absolute POSIX path
    (``"s3://some/location"`` is stored as ``"/some/location"``), and
    ``str()`` puts the prefix back.
    """

    # Deliberately a single slash: the second slash of "s3://" is kept as the
    # root of the underlying absolute path.
    s3_schema = "s3:/"

    @classmethod
    def _strip_schema(cls, args):
        """Return ``args`` with the schema removed from a leading string."""
        if args and isinstance(args[0], str) and args[0].startswith(cls.s3_schema):
            return (args[0][len(cls.s3_schema):],) + tuple(args[1:])
        return args

    def __new__(cls, *args, **kwargs):
        """Create the path from ``args`` after dropping the schema prefix."""
        return super().__new__(cls, *cls._strip_schema(args), **kwargs)

    if hasattr(pathlib.PurePath, "with_segments"):
        # Python 3.12+ parses the segments in __init__, which receives the
        # caller's original arguments, so the prefix has to be dropped here
        # as well.  Older versions parse in __new__ and have no pathlib
        # __init__ to delegate to, so the override is only defined when the
        # 3.12 API is present.
        def __init__(self, *args, **kwargs):
            super().__init__(*self._strip_schema(args), **kwargs)

    def __str__(self):
        """Return the path as an ``s3://`` string."""
        # Delegating to the parent class avoids pathlib's private attributes,
        # which differ between Python versions.
        return self.s3_schema + super().__str__()
def test_basic():
    """Check the ``s3://`` round trip, joining with ``/`` and ``.parent``."""
    s3_path_str: str = "s3://some/location"
    s3_path = S3Path(s3_path_str)
    # The string form must come back exactly as it went in.
    assert str(s3_path) == s3_path_str
    s3_path_1 = s3_path / "here"
    assert str(s3_path_1) == s3_path_str + "/here"
    assert s3_path.parent == S3Path("s3://some")
The advantage of it is that you don’t need any pip install dependency. Plus, you could easily adapt it to any other URI-like paths, for example hdfs.