TypeError when creating Series from custom ExtensionArray

Question:

I’ve created a basic example custom Pandas Extension Type for storing 2D coordinates, with source code below.

I’m able to successfully create arrays of this type using pd.array() which work as expected:

arr = pd.array([(1.5, 2.0), (156, 21), (-120, 98.5)], dtype='coordinate')

<CoordinateArray> [Coordinate(1.5, 2.0), Coordinate(156.0, 21.0), Coordinate(-120.0, 98.5)] Length: 3, dtype: <class '__main__.CoordinateDtype'>

However I am getting the below error when using that array to initialise a Series, or initialising a Series directly and specifying the ‘coordinate’ dtype:

Cell In [58], line 1
----> 1 pd.Series(coords, dtype='coordinate')

File ~/.local/lib/python3.9/site-packages/pandas/core/series.py:474, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    472 manager = get_option("mode.data_manager")
    473 if manager == "block":
--> 474     data = SingleBlockManager.from_array(data, index)
    475 elif manager == "array":
    476     data = SingleArrayManager.from_array(data, index)

File ~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py:1912, in SingleBlockManager.from_array(cls, array, index)
   1907 @classmethod
   1908 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
   1909     """
   1910     Constructor for if we have an array that is not yet a Block.
   1911     """
-> 1912     block = new_block(array, placement=slice(0, len(index)), ndim=1)
   1913     return cls(block, index)

File ~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2181, in new_block(values, placement, ndim)
   2178 klass = get_block_type(values.dtype)
   2180 values = maybe_coerce_values(values)
-> 2181 return klass(values, ndim=ndim, placement=placement)

TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got CoordinateArray)

It seems to be an issue with initialising the Block to hold the data, but I’m not sure why. Extension Type definition:

import numpy as np
import pandas as pd
from functools import total_ordering
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.dtypes import PandasExtensionDtype
from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin

@total_ordering
class Coordinate(object):
    """
    Simple class to represent a 2D coordinate with X and Y components.
    Could extend with more useful methods etc
    """
    def __init__(self, x, y):
        self.x = float(x)
        self.y = float(y)

    def __getitem__(self, index):
        """
        Allows object to act like (x, y) coordinate pair with indexing
        """
        if index == 0:
            return self.x
        elif index == 1:
            return self.y
        else:
            raise KeyError('Invalid coordinate index: {}'.format(index))

    def as_tuple(self):
        """
        Return as (x, y) coordinate pair
        """
        return (self.x, self.y)

    def __len__(self):
        return 2

    def __repr__(self):
        return 'Coordinate({}, {})'.format(self.x, self.y)

    # Operator support
    def __add__(self, other):
        """
        Add scalar value or other coordinate
        """
        if isinstance(other, (int, float)):
            return Coordinate(self.x + other, self.y + other)

        other_coord = create_coordinate(other)
        return Coordinate(self.x + other_coord.x, self.y + other_coord.y)
    
    def __sub__(self, other):
        """
        Subtract scalar value or other coordinate
        """
        if isinstance(other, (int, float)):
            return Coordinate(self.x - other, self.y - other)

        other_coord = create_coordinate(other)
        return Coordinate(self.x - other_coord.x, self.y - other_coord.y)

    def __mul__(self, other):
        if isinstance(other, (int, float)):
            return Coordinate(self.x * other, self.y * other)
        else:
            raise TypeError('Cannot multiply coordinate by {}'.format(type(other)))

    def __neg__(self):
        return Coordinate(-self.x, -self.y)

    def __eq__(self, other):
        other_coord = create_coordinate(other)
        return self.x == other_coord.x and self.y == other_coord.y

    def __lt__(self, other):
        other_coord = create_coordinate(other)
        return self.x < other_coord.x and self.y < other_coord.y 




def create_coordinate(val):
    """
    Factory function for constructing a Coordinate from various
    types of inputs
    """
    if isinstance(val, Coordinate):
        return val

    if isinstance(val, (list, tuple)) and len(val) == 2:
        # Construct from list-like of X,Y value pair
        return Coordinate(val[0], val[1])

    raise ValueError('Invalid value to create Coordinate from: {}'.format(val))
    

@register_extension_dtype
class CoordinateDtype(PandasExtensionDtype):
    """
    Class to describe the custom Coordinate data type
    """
    type = Coordinate       # Scalar type for data
    name = 'coordinate'     # String identifying the data type (for display)
    _metadata = ('name',)   # List of attributes to uniquely identify this data type

    @classmethod
    def construct_array_type(cls):
        """
        Return array type associated with this dtype
        """
        return CoordinateArray

    def __str__(self):
        return self.name


class CoordinateArray(ExtensionArray, ExtensionScalarOpsMixin):
    """
    Custom Extension Array type for an array of Coordinates
    Needs to define:
    - Associated Dtype it is used with
    - How to construct array from sequence of scalars
    - How data is stored and accessed
    - Any custom array methods
    """
    dtype = CoordinateDtype

    def __init__(self, x_values, y_values, copy=False):
        """
        Initialise array of coordinates from component X and Y values 
        (Allows efficient initialisation from existing lists/arrays)
        """
        self.x_values = np.array(x_values, dtype=np.float64, copy=copy)
        self.y_values = np.array(y_values, dtype=np.float64, copy=copy)


    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        # Construct new array from sequence of values (Unzip coordinates into x and y components)
        x_values, y_values = zip(*[create_coordinate(val).as_tuple() for val in scalars])
        return CoordinateArray(x_values, y_values, copy=copy)

    @classmethod
    def from_coordinates(cls, coordinates):
        """
        Construct array from sequence of values (coordinates)
        Can be provided as Coordinate instances or list/tuple like (x, y) pairs
        """
        return cls._from_sequence(coordinates)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate multiple arrays of this dtype
        """
        return CoordinateArray(
            np.concatenate(arr.x_values for arr in to_concat),
            np.concatenate(arr.y_values for arr in to_concat),
        )

    @property
    def nbytes(self):
        """
        The number of bytes needed to store this object in memory.
        """
        return self.x_values.nbytes + self.y_values.nbytes

    def __getitem__(self, item):
        """
        Retrieve single item or slice
        """
        if isinstance(item, int):
            # Get single coordinate
            return Coordinate(self.x_values[item], self.y_values[item])

        else:
            # Get subset from slice  or boolean array
            return CoordinateArray(self.x_values[item], self.y_values[item])

    def __eq__(self, other):
        """
        Perform element-wise equality with a given coordinate value
        """
        if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
            return NotImplemented

        return (self.x_values == other[0]) & (self.y_values == other[1])

    def __len__(self):
        return self.x_values.size

    def isna(self):
        """
        Returns a 1-D array indicating if each value is missing
        """
        return np.isnan(self.x_values)

    def take(self, indices, *, allow_fill=False, fill_value=None):
        """
        Take element from array using boolean index

        """
        from pandas.core.algorithms import take
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
        y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
        return CoordinateArray(x_result, y_result)

    def copy(self):
        """
        Return copy of array
        """
        return CoordinateArray(np.copy(self.x_values), np.copy(self.y_values))


# Register operator overloads using logic defined in Coordinate class
CoordinateArray._add_arithmetic_ops()
CoordinateArray._add_comparison_ops()
Asked By: Finn Andersen

||

Answers:

Found the issue, it was due to CoordinateArray having dtype attribute set to CoordinateDtype class, instead of as a property that returned a CoordinateDtype() instance

Answered By: Finn Andersen
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.