Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Remove the no-longer-needed wrapper layers [WIP] #3209

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
push more methods out of _tdb_handles.py
johnkerl committed Oct 20, 2024
commit 9fcf6b0ec124fcd17940398fc59ea6eb6b0fb43e
79 changes: 58 additions & 21 deletions apis/python/src/tiledbsoma/_soma_array.py
Original file line number Diff line number Diff line change
@@ -3,8 +3,9 @@
#
# Licensed under the MIT License.

from typing import Any, Optional, Sequence, Tuple
from typing import Any, List, Optional, Sequence, Tuple

import numpy as np
import pyarrow as pa
from somacore import options
from typing_extensions import Self
@@ -57,7 +58,11 @@ def schema(self) -> pa.Schema:
Lifecycle:
Maturing.
"""
return self._handle.schema
return self._clib_handle.schema

@property
def ndim(self) -> int:
    """Number of dimensions, i.e. how many dimension names the underlying
    clib handle reports.
    """
    names = self._clib_handle.dimension_names
    return len(names)

def config_options_from_schema(self) -> clib.PlatformConfig:
"""Returns metadata about the array that is not encompassed within the
@@ -91,32 +96,64 @@ def config_options_from_schema(self) -> clib.PlatformConfig:
* cell_order: str
* consolidate_and_vacuum: bool
"""
return self._handle.config_options_from_schema()

def non_empty_domain(self) -> Tuple[Tuple[Any, Any], ...]:
"""
Retrieves the non-empty domain for each dimension, namely the smallest
and largest indices in each dimension for which the array/dataframe has
data occupied. This is nominally the same as the domain used at
creation time, but if for example only a portion of the available domain
has actually had data written, this function will return a tighter
range.
"""
return self._handle.non_empty_domain()
return self._clib_handle.config_options_from_schema()

def _tiledb_array_keys(self) -> Tuple[str, ...]:
    """Return every dimension name followed by every attribute name."""
    dims = self._tiledb_dim_names()
    attrs = self._tiledb_attr_names()
    return dims + attrs

def _tiledb_dim_names(self) -> Tuple[str, ...]:
"""Reads the dimension names from the schema: for example, ['obs_id', 'var_id']."""
return self._handle.dim_names
return tuple(self._clib_handle.dimension_names)

def _tiledb_attr_names(self) -> Tuple[str, ...]:
"""Reads the attribute names from the schema:
for example, the list of column names in a dataframe.
"""
return self._handle.attr_names
return self.attr_names

@property
def dim_names(self) -> Tuple[str, ...]:
    """Dimension names reported by the underlying clib handle, as a tuple."""
    return (*self._clib_handle.dimension_names,)

@property
def attr_names(self) -> Tuple[str, ...]:
    """Names of the schema fields that are not dimensions, in schema order.

    Returns:
        A tuple of field names from ``self.schema`` with every name listed in
        the clib handle's ``dimension_names`` filtered out.
    """
    # Read the dimension names from the handle once, rather than once per
    # schema field, and use a set so each membership test is O(1).
    dim_names = frozenset(self._clib_handle.dimension_names)
    return tuple(f.name for f in self.schema if f.name not in dim_names)

def _cast_domainish(
    self, domainish: List[Any]
) -> Tuple[Tuple[object, object], ...]:
    """Convert a list of per-dimension Arrow-scalar slots into plain tuples.

    Args:
        domainish: One slot per dimension; each slot is a sequence of Arrow
            scalars (the callers in this class pass the results of the clib
            handle's ``domain()``, ``maxdomain()`` and ``non_empty_domain()``).

    Returns:
        A tuple with one inner tuple per slot. Timestamp-typed slots are
        converted through the NumPy scalar type matching the Arrow type;
        every other slot is converted element-wise with ``as_py()``.
    """
    result = []
    for slot in domainish:
        # The Arrow type of the first element decides how the whole slot is
        # converted.
        arrow_type = slot[0].type
        if pa.types.is_timestamp(arrow_type):
            # Rebuild each timestamp from its int64 value plus the Arrow unit
            # using the scalar constructor of the corresponding NumPy dtype.
            pandas_type = np.dtype(arrow_type.to_pandas_dtype())
            result.append(
                tuple(
                    pandas_type.type(e.cast(pa.int64()).as_py(), arrow_type.unit)
                    for e in slot
                )
            )
        else:
            result.append(tuple(e.as_py() for e in slot))

    return tuple(result)

def non_empty_domain(self) -> Tuple[Tuple[Any, Any], ...]:
    """
    Return the non-empty domain of every dimension: the smallest and largest
    index in that dimension at which the array/dataframe actually holds
    data. Nominally this matches the domain given at creation time, but when
    only part of the available domain has been written the range returned
    here is tighter.
    """
    raw_domain = self._clib_handle.non_empty_domain()
    return self._cast_domainish(raw_domain)

def _domain(self) -> Tuple[Tuple[Any, Any], ...]:
"""This is the SOMA domain, not the core domain.
@@ -131,7 +168,7 @@ def _domain(self) -> Tuple[Tuple[Any, Any], ...]:
* Core current domain is new as of core 2.25 and can be
resized up to core (max) domain.
"""
return self._handle.domain
return self._cast_domainish(self._clib_handle.domain())

def _maxdomain(self) -> Tuple[Tuple[Any, Any], ...]:
"""This is the SOMA maxdomain, not the core domain.
@@ -146,7 +183,7 @@ def _maxdomain(self) -> Tuple[Tuple[Any, Any], ...]:
* Core current domain is new as of core 2.25 and can be
resized up to core (max) domain.
"""
return self._handle.maxdomain
return self._cast_domainish(self._clib_handle.maxdomain())

def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> None:
"""Parses the given coords and sets them on the SOMA Reader."""
@@ -156,10 +193,10 @@ def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> No
" not str or bytes"
)

if len(coords) > self._handle.ndim:
if len(coords) > self.ndim:
raise ValueError(
f"coords ({len(coords)} elements) must be shorter than ndim"
f" ({self._handle.ndim})"
f" ({self.ndim})"
)
for i, coord in enumerate(coords):
dim = self.schema.field(i)
@@ -190,7 +227,7 @@ def _set_reader_coord(
if isinstance(coord, slice):
_util.validate_slice(coord)
try:
dom = self._handle.domain[dim_idx]
dom = self._domain()[dim_idx]
lo_hi = _util.slice_to_numeric_range(coord, dom)
except _util.NonNumericDimensionError:
return False # We only handle numeric dimensions here.
119 changes: 0 additions & 119 deletions apis/python/src/tiledbsoma/_tdb_handles.py
Original file line number Diff line number Diff line change
@@ -15,7 +15,6 @@
Dict,
Generic,
Iterator,
List,
Mapping,
MutableMapping,
Optional,
@@ -372,128 +371,10 @@ def _do_initial_reads(self, reader: RawHandle) -> None:
# non–attrs-managed field
self.metadata = MetadataWrapper(self, dict(reader.meta))

@property
def schema(self) -> pa.Schema:
return self._handle.schema

def config_options_from_schema(self) -> clib.PlatformConfig:
return self._handle.config_options_from_schema()

@property
def meta(self) -> "MetadataWrapper":
return self.metadata

@property
def ndim(self) -> int:
return len(self._handle.dimension_names)

def _cast_domainish(
self, domainish: List[Any]
) -> Tuple[Tuple[object, object], ...]:
result = []
for i, slot in enumerate(domainish):

arrow_type = slot[0].type
if pa.types.is_timestamp(arrow_type):
pandas_type = np.dtype(arrow_type.to_pandas_dtype())
result.append(
tuple(
pandas_type.type(e.cast(pa.int64()).as_py(), arrow_type.unit)
for e in slot
)
)
else:
result.append(tuple(e.as_py() for e in slot))

return tuple(result)

@property
def domain(self) -> Tuple[Tuple[object, object], ...]:
return self._cast_domainish(self._handle.domain())

@property
def maxdomain(self) -> Tuple[Tuple[object, object], ...]:
return self._cast_domainish(self._handle.maxdomain())

def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]:
return self._cast_domainish(self._handle.non_empty_domain())

@property
def attr_names(self) -> Tuple[str, ...]:
return tuple(
f.name for f in self.schema if f.name not in self._handle.dimension_names
)

@property
def dim_names(self) -> Tuple[str, ...]:
return tuple(self._handle.dimension_names)

@property
def shape(self) -> Tuple[int, ...]:
"""Not implemented for DataFrame."""
return cast(Tuple[int, ...], tuple(self._handle.shape))

@property
def maxshape(self) -> Tuple[int, ...]:
"""Not implemented for DataFrame."""
return cast(Tuple[int, ...], tuple(self._handle.maxshape))

@property
def maybe_soma_joinid_shape(self) -> Optional[int]:
"""Only implemented for DataFrame."""
raise NotImplementedError

@property
def maybe_soma_joinid_maxshape(self) -> Optional[int]:
"""Only implemented for DataFrame."""
raise NotImplementedError

@property
def tiledbsoma_has_upgraded_shape(self) -> bool:
"""Not implemented for DataFrame."""
raise NotImplementedError

@property
def tiledbsoma_has_upgraded_domain(self) -> bool:
"""Only implemented for DataFrame."""
raise NotImplementedError

def resize(self, newshape: Sequence[Union[int, None]]) -> None:
"""Not implemented for DataFrame."""
raise NotImplementedError

def tiledbsoma_can_resize(
self, newshape: Sequence[Union[int, None]]
) -> StatusAndReason:
"""Not implemented for DataFrame."""
raise NotImplementedError

def tiledbsoma_upgrade_shape(self, newshape: Sequence[Union[int, None]]) -> None:
"""Not implemented for DataFrame."""
raise NotImplementedError

def tiledbsoma_can_upgrade_shape(
self, newshape: Sequence[Union[int, None]]
) -> StatusAndReason:
"""Not implemented for DataFrame."""
raise NotImplementedError

def resize_soma_joinid_shape(self, newshape: int) -> None:
"""Only implemented for DataFrame."""
raise NotImplementedError

def can_resize_soma_joinid_shape(self, newshape: int) -> StatusAndReason:
"""Only implemented for DataFrame."""
raise NotImplementedError

def upgrade_soma_joinid_shape(self, newshape: int) -> None:
"""Only implemented for DataFrame."""
raise NotImplementedError

def can_upgrade_soma_joinid_shape(self, newshape: int) -> StatusAndReason:
"""Only implemented for DataFrame."""
raise NotImplementedError


class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]):
"""Wrapper around a Pybind11 SOMADataFrame handle."""
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
@@ -1931,7 +1931,7 @@ def _write_matrix_to_denseNDArray(

def _read_nonempty_domain(arr: SOMAArray) -> Any:
try:
return arr._handle.non_empty_domain()
return arr.non_empty_domain()
except (SOMAError, RuntimeError):
# This means that we're open in write-only mode.
# Reopen the array in read mode.
@@ -1940,7 +1940,7 @@ def _read_nonempty_domain(arr: SOMAArray) -> Any:

cls = type(arr)
with cls.open(arr.uri, "r", platform_config=None, context=arr.context) as readarr:
return readarr._handle.non_empty_domain()
return readarr.non_empty_domain()


def _find_sparse_chunk_size(