TypeError when creating Series from custom ExtensionArray
Question:
I’ve created a basic example custom Pandas Extension Type for storing 2D coordinates, with source code below.
I’m able to successfully create arrays of this type using pd.array() which work as expected:
arr = pd.array([(1.5, 2.0), (156, 21), (-120, 98.5)], dtype='coordinate')
<CoordinateArray> [Coordinate(1.5, 2.0), Coordinate(156.0, 21.0), Coordinate(-120.0, 98.5)] Length: 3, dtype: <class '__main__.CoordinateDtype'>
However I am getting the below error when using that array to initialise a Series, or initialising a Series directly and specifying the ‘coordinate’ dtype:
Cell In [58], line 1
----> 1 pd.Series(coords, dtype='coordinate')
File ~/.local/lib/python3.9/site-packages/pandas/core/series.py:474, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
472 manager = get_option("mode.data_manager")
473 if manager == "block":
--> 474 data = SingleBlockManager.from_array(data, index)
475 elif manager == "array":
476 data = SingleArrayManager.from_array(data, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py:1912, in SingleBlockManager.from_array(cls, array, index)
1907 @classmethod
1908 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1909 """
1910 Constructor for if we have an array that is not yet a Block.
1911 """
-> 1912 block = new_block(array, placement=slice(0, len(index)), ndim=1)
1913 return cls(block, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2181, in new_block(values, placement, ndim)
2178 klass = get_block_type(values.dtype)
2180 values = maybe_coerce_values(values)
-> 2181 return klass(values, ndim=ndim, placement=placement)
TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got CoordinateArray)
It seems to be an issue with initialising the Block to hold the data, but I’m not sure why. Extension Type definition:
import numpy as np
import pandas as pd
from functools import total_ordering
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.dtypes import PandasExtensionDtype
from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin
@total_ordering
class Coordinate(object):
"""
Simple class to represent a 2D coordinate with X and Y components.
Could extend with more useful methods etc
"""
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def __getitem__(self, index):
"""
Allows object to act like (x, y) coordinate pair with indexing
"""
if index == 0:
return self.x
elif index == 1:
return self.y
else:
raise KeyError('Invalid coordinate index: {}'.format(index))
def as_tuple(self):
"""
Return as (x, y) coordinate pair
"""
return (self.x, self.y)
def __len__(self):
return 2
def __repr__(self):
return 'Coordinate({}, {})'.format(self.x, self.y)
# Operator support
def __add__(self, other):
"""
Add scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x + other, self.y + other)
other_coord = create_coordinate(other)
return Coordinate(self.x + other_coord.x, self.y + other_coord.y)
def __sub__(self, other):
"""
Subtract scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x - other, self.y - other)
other_coord = create_coordinate(other)
return Coordinate(self.x - other_coord.x, self.y - other_coord.y)
def __mul__(self, other):
if isinstance(other, (int, float)):
return Coordinate(self.x * other, self.y * other)
else:
raise TypeError('Cannot multiply coordinate by {}'.format(type(other)))
def __neg__(self):
return Coordinate(-self.x, -self.y)
def __eq__(self, other):
other_coord = create_coordinate(other)
return self.x == other_coord.x and self.y == other_coord.y
def __lt__(self, other):
other_coord = create_coordinate(other)
return self.x < other_coord.x and self.y < other_coord.y
def create_coordinate(val):
"""
Factory function for constructing a Coordinate from various
types of inputs
"""
if isinstance(val, Coordinate):
return val
if isinstance(val, (list, tuple)) and len(val) == 2:
# Construct from list-like of X,Y value pair
return Coordinate(val[0], val[1])
raise ValueError('Invalid value to create Coordinate from: {}'.format(val))
@register_extension_dtype
class CoordinateDtype(PandasExtensionDtype):
"""
Class to describe the custom Coordinate data type
"""
type = Coordinate # Scalar type for data
name = 'coordinate' # String identifying the data type (for display)
_metadata = ('name',) # List of attributes to uniquely identify this data type
@classmethod
def construct_array_type(cls):
"""
Return array type associated with this dtype
"""
return CoordinateArray
def __str__(self):
return self.name
class CoordinateArray(ExtensionArray, ExtensionScalarOpsMixin):
"""
Custom Extension Array type for an array of Coordinates
Needs to define:
- Associated Dtype it is used with
- How to construct array from sequence of scalars
- How data is stored and accessed
- Any custom array methods
"""
dtype = CoordinateDtype
def __init__(self, x_values, y_values, copy=False):
"""
Initialise array of coordinates from component X and Y values
(Allows efficient initialisation from existing lists/arrays)
"""
self.x_values = np.array(x_values, dtype=np.float64, copy=copy)
self.y_values = np.array(y_values, dtype=np.float64, copy=copy)
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
# Construct new array from sequence of values (Unzip coordinates into x and y components)
x_values, y_values = zip(*[create_coordinate(val).as_tuple() for val in scalars])
return CoordinateArray(x_values, y_values, copy=copy)
@classmethod
def from_coordinates(cls, coordinates):
"""
Construct array from sequence of values (coordinates)
Can be provided as Coordinate instances or list/tuple like (x, y) pairs
"""
return cls._from_sequence(coordinates)
@classmethod
def _concat_same_type(cls, to_concat):
"""
Concatenate multiple arrays of this dtype
"""
return CoordinateArray(
np.concatenate(arr.x_values for arr in to_concat),
np.concatenate(arr.y_values for arr in to_concat),
)
@property
def nbytes(self):
"""
The number of bytes needed to store this object in memory.
"""
return self.x_values.nbytes + self.y_values.nbytes
def __getitem__(self, item):
"""
Retrieve single item or slice
"""
if isinstance(item, int):
# Get single coordinate
return Coordinate(self.x_values[item], self.y_values[item])
else:
# Get subset from slice or boolean array
return CoordinateArray(self.x_values[item], self.y_values[item])
def __eq__(self, other):
"""
Perform element-wise equality with a given coordinate value
"""
if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
return NotImplemented
return (self.x_values == other[0]) & (self.y_values == other[1])
def __len__(self):
return self.x_values.size
def isna(self):
"""
Returns a 1-D array indicating if each value is missing
"""
return np.isnan(self.x_values)
def take(self, indices, *, allow_fill=False, fill_value=None):
"""
Take element from array using boolean index
"""
from pandas.core.algorithms import take
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
return CoordinateArray(x_result, y_result)
def copy(self):
"""
Return copy of array
"""
return CoordinateArray(np.copy(self.x_values), np.copy(self.y_values))
# Register operator overloads using logic defined in Coordinate class
CoordinateArray._add_arithmetic_ops()
CoordinateArray._add_comparison_ops()
Answers:
Found the issue, it was due to CoordinateArray
having dtype
attribute set to CoordinateDtype
class, instead of as a property that returned a CoordinateDtype()
instance
I’ve created a basic example custom Pandas Extension Type for storing 2D coordinates, with source code below.
I’m able to successfully create arrays of this type using pd.array() which work as expected:
arr = pd.array([(1.5, 2.0), (156, 21), (-120, 98.5)], dtype='coordinate')
<CoordinateArray> [Coordinate(1.5, 2.0), Coordinate(156.0, 21.0), Coordinate(-120.0, 98.5)] Length: 3, dtype: <class '__main__.CoordinateDtype'>
However I am getting the below error when using that array to initialise a Series, or initialising a Series directly and specifying the ‘coordinate’ dtype:
Cell In [58], line 1
----> 1 pd.Series(coords, dtype='coordinate')
File ~/.local/lib/python3.9/site-packages/pandas/core/series.py:474, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
472 manager = get_option("mode.data_manager")
473 if manager == "block":
--> 474 data = SingleBlockManager.from_array(data, index)
475 elif manager == "array":
476 data = SingleArrayManager.from_array(data, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py:1912, in SingleBlockManager.from_array(cls, array, index)
1907 @classmethod
1908 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1909 """
1910 Constructor for if we have an array that is not yet a Block.
1911 """
-> 1912 block = new_block(array, placement=slice(0, len(index)), ndim=1)
1913 return cls(block, index)
File ~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2181, in new_block(values, placement, ndim)
2178 klass = get_block_type(values.dtype)
2180 values = maybe_coerce_values(values)
-> 2181 return klass(values, ndim=ndim, placement=placement)
TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got CoordinateArray)
It seems to be an issue with initialising the Block to hold the data, but I’m not sure why. Extension Type definition:
import numpy as np
import pandas as pd
from functools import total_ordering
from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.dtypes import PandasExtensionDtype
from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin
@total_ordering
class Coordinate(object):
"""
Simple class to represent a 2D coordinate with X and Y components.
Could extend with more useful methods etc
"""
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def __getitem__(self, index):
"""
Allows object to act like (x, y) coordinate pair with indexing
"""
if index == 0:
return self.x
elif index == 1:
return self.y
else:
raise KeyError('Invalid coordinate index: {}'.format(index))
def as_tuple(self):
"""
Return as (x, y) coordinate pair
"""
return (self.x, self.y)
def __len__(self):
return 2
def __repr__(self):
return 'Coordinate({}, {})'.format(self.x, self.y)
# Operator support
def __add__(self, other):
"""
Add scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x + other, self.y + other)
other_coord = create_coordinate(other)
return Coordinate(self.x + other_coord.x, self.y + other_coord.y)
def __sub__(self, other):
"""
Subtract scalar value or other coordinate
"""
if isinstance(other, (int, float)):
return Coordinate(self.x - other, self.y - other)
other_coord = create_coordinate(other)
return Coordinate(self.x - other_coord.x, self.y - other_coord.y)
def __mul__(self, other):
if isinstance(other, (int, float)):
return Coordinate(self.x * other, self.y * other)
else:
raise TypeError('Cannot multiply coordinate by {}'.format(type(other)))
def __neg__(self):
return Coordinate(-self.x, -self.y)
def __eq__(self, other):
other_coord = create_coordinate(other)
return self.x == other_coord.x and self.y == other_coord.y
def __lt__(self, other):
other_coord = create_coordinate(other)
return self.x < other_coord.x and self.y < other_coord.y
def create_coordinate(val):
"""
Factory function for constructing a Coordinate from various
types of inputs
"""
if isinstance(val, Coordinate):
return val
if isinstance(val, (list, tuple)) and len(val) == 2:
# Construct from list-like of X,Y value pair
return Coordinate(val[0], val[1])
raise ValueError('Invalid value to create Coordinate from: {}'.format(val))
@register_extension_dtype
class CoordinateDtype(PandasExtensionDtype):
"""
Class to describe the custom Coordinate data type
"""
type = Coordinate # Scalar type for data
name = 'coordinate' # String identifying the data type (for display)
_metadata = ('name',) # List of attributes to uniquely identify this data type
@classmethod
def construct_array_type(cls):
"""
Return array type associated with this dtype
"""
return CoordinateArray
def __str__(self):
return self.name
class CoordinateArray(ExtensionArray, ExtensionScalarOpsMixin):
"""
Custom Extension Array type for an array of Coordinates
Needs to define:
- Associated Dtype it is used with
- How to construct array from sequence of scalars
- How data is stored and accessed
- Any custom array methods
"""
dtype = CoordinateDtype
def __init__(self, x_values, y_values, copy=False):
"""
Initialise array of coordinates from component X and Y values
(Allows efficient initialisation from existing lists/arrays)
"""
self.x_values = np.array(x_values, dtype=np.float64, copy=copy)
self.y_values = np.array(y_values, dtype=np.float64, copy=copy)
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
# Construct new array from sequence of values (Unzip coordinates into x and y components)
x_values, y_values = zip(*[create_coordinate(val).as_tuple() for val in scalars])
return CoordinateArray(x_values, y_values, copy=copy)
@classmethod
def from_coordinates(cls, coordinates):
"""
Construct array from sequence of values (coordinates)
Can be provided as Coordinate instances or list/tuple like (x, y) pairs
"""
return cls._from_sequence(coordinates)
@classmethod
def _concat_same_type(cls, to_concat):
"""
Concatenate multiple arrays of this dtype
"""
return CoordinateArray(
np.concatenate(arr.x_values for arr in to_concat),
np.concatenate(arr.y_values for arr in to_concat),
)
@property
def nbytes(self):
"""
The number of bytes needed to store this object in memory.
"""
return self.x_values.nbytes + self.y_values.nbytes
def __getitem__(self, item):
"""
Retrieve single item or slice
"""
if isinstance(item, int):
# Get single coordinate
return Coordinate(self.x_values[item], self.y_values[item])
else:
# Get subset from slice or boolean array
return CoordinateArray(self.x_values[item], self.y_values[item])
def __eq__(self, other):
"""
Perform element-wise equality with a given coordinate value
"""
if isinstance(other, (pd.Index, pd.Series, pd.DataFrame)):
return NotImplemented
return (self.x_values == other[0]) & (self.y_values == other[1])
def __len__(self):
return self.x_values.size
def isna(self):
"""
Returns a 1-D array indicating if each value is missing
"""
return np.isnan(self.x_values)
def take(self, indices, *, allow_fill=False, fill_value=None):
"""
Take element from array using boolean index
"""
from pandas.core.algorithms import take
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
x_result = take(self.x_values, indices, fill_value=fill_value, allow_fill=allow_fill)
y_result = take(self.y_values, indices, fill_value=fill_value, allow_fill=allow_fill)
return CoordinateArray(x_result, y_result)
def copy(self):
"""
Return copy of array
"""
return CoordinateArray(np.copy(self.x_values), np.copy(self.y_values))
# Register operator overloads using logic defined in Coordinate class
CoordinateArray._add_arithmetic_ops()
CoordinateArray._add_comparison_ops()
Found the issue, it was due to CoordinateArray
having dtype
attribute set to CoordinateDtype
class, instead of as a property that returned a CoordinateDtype()
instance