Subclass python set for deterministic iteration order

Question:

I’m looking for a minimal solution that exhibits the same API as set, but has deterministic iteration order. That is, the iteration order should depend only on the existing elements in the set. Here is a skeleton of the solution I came up with:

class MySet(UserDict):
 
    def __contains__(self, element):
        return element in self.data
 
    def __iter__(self):
        # from a comment by @juanpa.arrivillaga
        return iter(sorted(self.data))

    def __str__(self) -> str:
        return "{" + ", ".join([str(k) for k in self._keys()]) + "}"
 
    def add(self, element):
        # arbitraily map all elements to True
        self.data[element] = True
 
    def _keys(self):
        return sorted(self.data.keys())

Can it be improved ?

Asked By: OrenIshShalom

||

Answers:

It’s not clear from the question what precisely your requirement is, but based on your implementation it’s just a set where only iteration is in sorted order (it’s still not semantically ordered, e.g. you shouldn’t be able to access elements positionally and comparison wouldn’t consider order, and insertion order doesn’t matter). In that case I’d implement the existing MutableSet abstract base class and compose around a vanilla set:

from collections.abc import MutableSet
from typing import Any, Iterable, Iterator, Protocol, TypeVar


class Sortable(Protocol):
    def __lt__(self, other) -> bool: ...


T = TypeVar("T", bound=Sortable)


class OrderedSet(MutableSet[T]):

    _values: set[T]

    def __init__(self, values: Iterable[T] = None, /) -> None:
        self._values = set() if values is None else set(values)

    def add(self, value: T) -> None:
        self._values.add(value)

    def discard(self, value: T) -> None:
        self._values.discard(value)

    def __contains__(self, item: Any) -> bool:
        return item in self._values

    def __iter__(self) -> Iterator[T]:
        return iter(sorted(self._values))

    def __len__(self) -> int:
        return len(self._values)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({list(self)!r})"

The only custom behaviour is the use of sorted in the __iter__ implementation. These are the tests I drove it out with:

from unittest import TestCase

from ordered_set import OrderedSet


class TestOrderedSet(TestCase):

    def test_contains(self):
        self.assertNotIn("foo", OrderedSet())
        self.assertIn("foo", OrderedSet(["foo", "bar", "baz"]))

    def test_iteration(self):
        os = OrderedSet(("foo", "bar"))
        os.add("baz")
        self.assertEqual(
            list(os),
            ["bar", "baz", "foo"]
        )

    def test_size(self):
        self.assertEqual(len(OrderedSet()), 0)
        self.assertEqual(len(OrderedSet(["foo", "bar", "baz"])), 3)

    def test_addition(self):
        os = OrderedSet()
        for value in ["foo", "bar", "baz"]:
            os.add(value)
        self.assertEqual(len(os), 3)
        self.assertIn("bar", os)

    def test_removal(self):
        os = OrderedSet(["foo", "bar", "baz"])
        os.remove("baz")
        self.assertEqual(len(os), 2)
        self.assertNotIn("baz", os)

    def test_rejects_keyword_args(self):
        with self.assertRaises(TypeError):
            OrderedSet(values=["foo", "bar", "baz"])

    def test_repr(self):
        self.assertEqual(
            repr(OrderedSet([3, 1, 2])),
            "OrderedSet([1, 2, 3])",
        )

I implement MutableSet rather than extending UserDict because:

  1. it has better semantics, what we’re trying to create is a mutable set;

  2. it keeps you compliant with the Liskov substitution principle, meaning an OrderedSet can be used anywhere any other Set* or MutableSet can:

    • your implementation can be added to but lacks the removal methods: clear, discard, pop, remove;
    • your implementation also lacks the "magic" methods that allow e.g.:
      >>> OrderedSet((1, 3)) | {2, 4}
      OrderedSet([1, 2, 3, 4])
      
  3. it gives us more freedom in implementation. For example, if sorting the set for every iteration is a crucial performance bottleneck, we can keep the same tests and interface and swap over to a bisected list:

    from bisect import bisect_left
    from collections.abc import MutableSet
    from typing import Any, Iterable, Iterator, Protocol, TypeVar
    
    
    class Sortable(Protocol):
        def __lt__(self, other) -> bool: ...
    
    
    T = TypeVar("T", bound=Sortable)
    
    
    class OrderedSet(MutableSet[T]):
    
        _values: list[T]
    
        def __init__(self, values: Iterable[T] = None, /) -> None:
            self._values = [] if values is None else sorted(values)
    
        def add(self, value: T) -> None:
            index = bisect_left(self._values, value)
            if index == len(self) or self._values[index] != value:
                self._values.insert(index, value)
    
        def discard(self, value: T) -> None:
            index = bisect_left(self._values, value)
            if index < len(self) and self._values[index] == value:
                self._values.pop(index)
    
        def __contains__(self, value: Any) -> bool:
            index = bisect_left(self._values, value)
            return index < len(self) and self._values[index] == value
    
        def __iter__(self) -> Iterator[T]:
            return iter(self._values)
    
        def __len__(self) -> int:
            return len(self._values)
    
        def __repr__(self) -> str:
            return f"{self.__class__.__name__}({self._values!r})"
    
    

* but not where any set can, see e.g. Why are set methods like .intersection() not supported on set-like objects?

Answered By: jonrsharpe
Categories: questions Tags:
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.