Are pickle-able tuple-factories (with names) possible?
Question:
There are several questions about pickling namedtuples already, however none of the ones I found [1] [2] [3] [4] deals with the case of pickling a namedtuple that is bound on an object instance. Consider the following example
import pickle
from collections import namedtuple
class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.
    """

    def __init__(self, groups: dict[str, list[int]]):
        self.groups = groups
        # The result type is created at runtime and bound only to this
        # instance; this is the attribute plain pickle fails to look up.
        self.group_type = namedtuple("Groups", groups)  # <-- How to replace this?

    def __call__(self, x: tuple) -> tuple:
        """Return a ``Groups`` namedtuple holding one sub-tuple of ``x`` per group."""
        return self.group_type(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )
# Build a splitter and apply it to a sample tuple.
encoder = TupleSplitter({"a": [0, 1, 2], "b": [2, 3, 4]})
encoder((1, 2, 3, 4, 5, 6))
# Pickling the splitter itself is the step that fails.
pickle.dumps(encoder) # <-- PicklingError: attribute lookup Groups on __main__ failed
Question: Is it possible to have pickle-able tuple-factories with attribute names only known at runtime?
NOTE: I am not interested in any answers suggesting using a dictionary here, the return value MUST be a subclass of tuple!
NOTE: I am not interested in any answers proposing dill, cloudpickle or anything of the like. It must work with plain pickle!
Answers:
You will probably need two custom picklers, one for the factory and one for the tuples themselves.
This seems to work, but there may be pitfalls. For example, before pickling, `TupleSplitter.group_type` and the generated tuples’ types are the same; after pickling, they will be different (but equivalent) types. This can be "fixed" by maintaining a registry/cache for `Group`s, but that will have different behaviour in other cases (same type for different splitters with same group names).
If only the factory needs to be pickleable, it should be straightforward (just skip the `_group_pickle` and `copyreg` stuff).
import copyreg
import pickle
from collections import namedtuple
# Namedtuple classes rebuilt while unpickling, keyed by their field names, so
# that repeated loads reuse one class instead of creating a new one each time.
_UNPICKLED_GROUP_TYPES = {}


def _group_pickle(ntup):
    """Reduce a ``Group`` namedtuple to its field names and plain values."""
    return (_group_unpickle, (ntup._fields, tuple(ntup)))


def _group_unpickle(groups, tup):
    """Rebuild a ``Group`` namedtuple from field names ``groups`` and values ``tup``."""
    fields = tuple(groups)
    group_type = _UNPICKLED_GROUP_TYPES.get(fields)
    if group_type is None:
        group_type = namedtuple("Group", fields)
        # The rebuilt class needs the reducer too; without it an unpickled
        # tuple could not be pickled a second time.
        copyreg.pickle(group_type, _group_pickle)
        _UNPICKLED_GROUP_TYPES[fields] = group_type
    return group_type(*tup)


class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.  Both the
    splitter and the tuples it produces can be pickled with plain ``pickle``.
    """

    def __init__(self, groups: dict[str, list[int]]):
        self.groups = groups
        self.group_type = namedtuple("Group", groups)
        # Register a reducer for this runtime-created class so its instances
        # are pickled as (field names, values) rather than by class lookup.
        copyreg.pickle(self.group_type, _group_pickle)

    def __call__(self, x: tuple) -> tuple:
        """Return a ``Group`` namedtuple holding one sub-tuple of ``x`` per group."""
        return self.group_type(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )

    def __reduce__(self):
        """Pickle the splitter as its ``groups`` mapping only."""
        return (self._unpickle, (self.groups,))

    @staticmethod
    def _unpickle(groups):
        """Recreate a splitter (and a new ``Group`` class) from ``groups``."""
        return TupleSplitter(groups)
# Two splitters that use different field names.
encoder = TupleSplitter({"a": [0, 1, 2], "b": [2, 3, 4]})
encoder2 = TupleSplitter({"c": [0, 1, 2], "d": [2, 3, 4]})
# Round-trip, in order: a tuple produced by `encoder`, `encoder` itself
# (then applied to the sample), and a tuple produced by `encoder2`.
print(pickle.loads(pickle.dumps(encoder((1, 2, 3, 4, 5, 6)))))
print(pickle.loads(pickle.dumps(encoder))((1, 2, 3, 4, 5, 6)))
print(pickle.loads(pickle.dumps(encoder2((1, 2, 3, 4, 5, 6)))))
->
Group(a=(1, 2, 3), b=(3, 4, 5)) # pickled tuple from encoder
Group(a=(1, 2, 3), b=(3, 4, 5)) # tuple from pickled encoder
Group(c=(1, 2, 3), d=(3, 4, 5)) # pickled tuple from encoder2
I found an alternative solution, based on a comment made here
The idea is based on 2 tricks:
- Classes that are added to `globals` can always be pickled.
- The pickle lookup is performed by `__qualname__`, and `__name__` appears to be used as a fallback.
The second one allows us to hide a unique but ugly autogenerated classname.
import pickle
from collections import namedtuple
class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.  The generated
    namedtuple class is published in this module's globals under a unique
    name, which lets plain ``pickle`` find it by ``__qualname__``.

    NOTE(review): the unique name embeds ``hash(self)`` and is removed again
    when the splitter is deleted, so pickled tuples presumably only load in
    the same process while the splitter is alive -- confirm this is acceptable.
    """

    def __init__(self, groups: dict[str, list[int]], *, name="Groups"):
        self.groups = groups
        self.tuple = namedtuple(name, groups)

        # create a unique identifier and store it in globals
        self.identifier = f"_{self.tuple.__name__}_{hash(self)}"
        if self.identifier in globals():
            raise RuntimeError(f"A class of name '{self.identifier}' exists!!")
        self.tuple.__qualname__ = self.identifier
        globals()[self.identifier] = self.tuple
        # Record which instance registered the class.  Copies made by pickle
        # or copy inherit this value but have a different id(), and an
        # instance whose __init__ failed above never gets it, so neither will
        # remove the entry in __del__.
        self._owner = id(self)

    def __call__(self, x: tuple) -> tuple:
        """Return the namedtuple holding one sub-tuple of ``x`` per group."""
        return self.tuple(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )

    def __del__(self):
        """Delete the globals entry when the class instance is deleted."""
        if getattr(self, "_owner", None) != id(self):
            # Not the instance that registered the class: leave it alone.
            return
        namespace = globals()
        # Exceptions raised in __del__ are only reported and then ignored, so
        # clean up quietly and only if the entry is still ours.
        if namespace.get(self.identifier) is self.tuple:
            del namespace[self.identifier]
encoder = TupleSplitter({"a": [0, 1], "b": [2]})

groups1 = encoder(("foo1", "bar1", "baz1"))
groups2 = encoder(("foo2", "bar2", "baz2"))

# Round-trip both tuples through plain pickle.
pickle1 = pickle.dumps(groups1)
pickle2 = pickle.dumps(groups2)
tuple1 = pickle.loads(pickle1)
tuple2 = pickle.loads(pickle2)

# The namedtuple class must be the very same object before and after the
# round trip, so compare by identity rather than equality.
assert type(groups1) is type(groups2)
assert type(tuple1) is type(tuple2)
assert type(tuple1) is type(groups1)
assert tuple1 == groups1
There are several questions about pickling namedtuples already, however none of the ones I found [1] [2] [3] [4] deals with the case of pickling a namedtuple that is bound on an object instance. Consider the following example
import pickle
from collections import namedtuple
class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.
    """

    def __init__(self, groups: dict[str, list[int]]):
        self.groups = groups
        # The result type is created at runtime and bound only to this
        # instance; this is the attribute plain pickle fails to look up.
        self.group_type = namedtuple("Groups", groups)  # <-- How to replace this?

    def __call__(self, x: tuple) -> tuple:
        """Return a ``Groups`` namedtuple holding one sub-tuple of ``x`` per group."""
        return self.group_type(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )
# Build a splitter and apply it to a sample tuple.
encoder = TupleSplitter({"a": [0, 1, 2], "b": [2, 3, 4]})
encoder((1, 2, 3, 4, 5, 6))
# Pickling the splitter itself is the step that fails.
pickle.dumps(encoder) # <-- PicklingError: attribute lookup Groups on __main__ failed
Question: Is it possible to have pickle-able tuple-factories with attribute names only known at runtime?
NOTE: I am not interested in any answers suggesting using a dictionary here, the return value MUST be a subclass of tuple!
NOTE: I am not interested in any answers proposing dill, cloudpickle or anything of the like. It must work with plain pickle!
You will probably need two custom picklers, one for the factory and one for the tuples themselves.
This seems to work, but there may be pitfalls. For example, before pickling, `TupleSplitter.group_type` and the generated tuples’ types are the same; after pickling, they will be different (but equivalent) types. This can be "fixed" by maintaining a registry/cache for `Group`s, but that will have different behaviour in other cases (same type for different splitters with same group names).
If only the factory needs to be pickleable, it should be straightforward (just skip the `_group_pickle` and `copyreg` stuff).
import copyreg
import pickle
from collections import namedtuple
# Namedtuple classes rebuilt while unpickling, keyed by their field names, so
# that repeated loads reuse one class instead of creating a new one each time.
_UNPICKLED_GROUP_TYPES = {}


def _group_pickle(ntup):
    """Reduce a ``Group`` namedtuple to its field names and plain values."""
    return (_group_unpickle, (ntup._fields, tuple(ntup)))


def _group_unpickle(groups, tup):
    """Rebuild a ``Group`` namedtuple from field names ``groups`` and values ``tup``."""
    fields = tuple(groups)
    group_type = _UNPICKLED_GROUP_TYPES.get(fields)
    if group_type is None:
        group_type = namedtuple("Group", fields)
        # The rebuilt class needs the reducer too; without it an unpickled
        # tuple could not be pickled a second time.
        copyreg.pickle(group_type, _group_pickle)
        _UNPICKLED_GROUP_TYPES[fields] = group_type
    return group_type(*tup)


class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.  Both the
    splitter and the tuples it produces can be pickled with plain ``pickle``.
    """

    def __init__(self, groups: dict[str, list[int]]):
        self.groups = groups
        self.group_type = namedtuple("Group", groups)
        # Register a reducer for this runtime-created class so its instances
        # are pickled as (field names, values) rather than by class lookup.
        copyreg.pickle(self.group_type, _group_pickle)

    def __call__(self, x: tuple) -> tuple:
        """Return a ``Group`` namedtuple holding one sub-tuple of ``x`` per group."""
        return self.group_type(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )

    def __reduce__(self):
        """Pickle the splitter as its ``groups`` mapping only."""
        return (self._unpickle, (self.groups,))

    @staticmethod
    def _unpickle(groups):
        """Recreate a splitter (and a new ``Group`` class) from ``groups``."""
        return TupleSplitter(groups)
# Two splitters that use different field names.
encoder = TupleSplitter({"a": [0, 1, 2], "b": [2, 3, 4]})
encoder2 = TupleSplitter({"c": [0, 1, 2], "d": [2, 3, 4]})
# Round-trip, in order: a tuple produced by `encoder`, `encoder` itself
# (then applied to the sample), and a tuple produced by `encoder2`.
print(pickle.loads(pickle.dumps(encoder((1, 2, 3, 4, 5, 6)))))
print(pickle.loads(pickle.dumps(encoder))((1, 2, 3, 4, 5, 6)))
print(pickle.loads(pickle.dumps(encoder2((1, 2, 3, 4, 5, 6)))))
->
Group(a=(1, 2, 3), b=(3, 4, 5)) # pickled tuple from encoder
Group(a=(1, 2, 3), b=(3, 4, 5)) # tuple from pickled encoder
Group(c=(1, 2, 3), d=(3, 4, 5)) # pickled tuple from encoder2
I found an alternative solution, based on a comment made here
The idea is based on 2 tricks:
- Classes that are added to `globals` can always be pickled.
- The pickle lookup is performed by `__qualname__`, and `__name__` appears to be used as a fallback.
The second one allows us to hide a unique but ugly autogenerated classname.
import pickle
from collections import namedtuple
class TupleSplitter:
    r"""Splits a tuple into namedtuple, given by the groups.

    ``groups`` maps each field name of the result to the positions of the
    input tuple that are collected, in order, into that field.  The generated
    namedtuple class is published in this module's globals under a unique
    name, which lets plain ``pickle`` find it by ``__qualname__``.

    NOTE(review): the unique name embeds ``hash(self)`` and is removed again
    when the splitter is deleted, so pickled tuples presumably only load in
    the same process while the splitter is alive -- confirm this is acceptable.
    """

    def __init__(self, groups: dict[str, list[int]], *, name="Groups"):
        self.groups = groups
        self.tuple = namedtuple(name, groups)

        # create a unique identifier and store it in globals
        self.identifier = f"_{self.tuple.__name__}_{hash(self)}"
        if self.identifier in globals():
            raise RuntimeError(f"A class of name '{self.identifier}' exists!!")
        self.tuple.__qualname__ = self.identifier
        globals()[self.identifier] = self.tuple
        # Record which instance registered the class.  Copies made by pickle
        # or copy inherit this value but have a different id(), and an
        # instance whose __init__ failed above never gets it, so neither will
        # remove the entry in __del__.
        self._owner = id(self)

    def __call__(self, x: tuple) -> tuple:
        """Return the namedtuple holding one sub-tuple of ``x`` per group."""
        return self.tuple(
            **{key: tuple(x[k] for k in group) for key, group in self.groups.items()}
        )

    def __del__(self):
        """Delete the globals entry when the class instance is deleted."""
        if getattr(self, "_owner", None) != id(self):
            # Not the instance that registered the class: leave it alone.
            return
        namespace = globals()
        # Exceptions raised in __del__ are only reported and then ignored, so
        # clean up quietly and only if the entry is still ours.
        if namespace.get(self.identifier) is self.tuple:
            del namespace[self.identifier]
encoder = TupleSplitter({"a": [0, 1], "b": [2]})

groups1 = encoder(("foo1", "bar1", "baz1"))
groups2 = encoder(("foo2", "bar2", "baz2"))

# Round-trip both tuples through plain pickle.
pickle1 = pickle.dumps(groups1)
pickle2 = pickle.dumps(groups2)
tuple1 = pickle.loads(pickle1)
tuple2 = pickle.loads(pickle2)

# The namedtuple class must be the very same object before and after the
# round trip, so compare by identity rather than equality.
assert type(groups1) is type(groups2)
assert type(tuple1) is type(tuple2)
assert type(tuple1) is type(groups1)
assert tuple1 == groups1