Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++/python] Add GeometryDataFrame creation #3674

Merged
merged 14 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def run(self):
"src/tiledbsoma/soma_object.cc",
"src/tiledbsoma/soma_dataframe.cc",
"src/tiledbsoma/soma_point_cloud_dataframe.cc",
"src/tiledbsoma/soma_geometry_dataframe.cc",
"src/tiledbsoma/soma_dense_ndarray.cc",
"src/tiledbsoma/soma_sparse_ndarray.cc",
"src/tiledbsoma/soma_group.cc",
Expand Down
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@
get_storage_engine,
show_package_versions,
)
from ._geometry_dataframe import GeometryDataFrame
from ._indexer import IntIndexer, tiledbsoma_build_index
from ._measurement import Measurement
from ._multiscale_image import MultiscaleImage
Expand Down Expand Up @@ -209,6 +210,7 @@
"DoesNotExistError",
"Experiment",
"ExperimentAxisQuery",
"GeometryDataFrame",
"get_implementation_version",
"get_implementation",
"get_SOMA_version",
Expand Down
93 changes: 85 additions & 8 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

from . import _arrow_types, _util
from . import pytiledbsoma as clib
from ._constants import SOMA_JOINID
from ._constants import SOMA_GEOMETRY, SOMA_JOINID
from ._exception import SOMAError, map_exception_for_create
from ._read_iters import ManagedQuery, TableReadIter
from ._soma_array import SOMAArray
Expand Down Expand Up @@ -790,7 +790,9 @@


def _canonicalize_schema(
schema: pa.Schema, index_column_names: Sequence[str]
schema: pa.Schema,
index_column_names: Sequence[str],
required_columns: Sequence[str] = [SOMA_JOINID],
) -> pa.Schema:
"""Turns an Arrow schema into the canonical version and checks for errors.

Expand All @@ -807,21 +809,45 @@
raise ValueError(
f"{SOMA_JOINID} field must be of type Arrow int64 but is {joinid_type}"
)
else:
elif SOMA_JOINID in required_columns:
# add SOMA_JOINID
schema = schema.append(pa.field(SOMA_JOINID, pa.int64()))

if SOMA_GEOMETRY in schema.names:
geometry_type = schema.field(SOMA_GEOMETRY).type
if geometry_type != pa.binary() and geometry_type != pa.large_binary():
raise ValueError(

Check warning on line 819 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L817-L819

Added lines #L817 - L819 were not covered by tests
f"{SOMA_GEOMETRY} field must be of type Arrow binary or large_binary but is {geometry_type}"
)
schema.set(

Check warning on line 822 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L822

Added line #L822 was not covered by tests
schema.get_field_index(SOMA_GEOMETRY),
schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"}),
)
elif SOMA_GEOMETRY in required_columns:
# add SOMA_GEOMETRY
schema = schema.append(
pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"})
)

# verify no illegal use of soma_ prefix
for field_name in schema.names:
if field_name.startswith("soma_") and field_name != SOMA_JOINID:
if (
field_name.startswith("soma_")
and field_name != SOMA_JOINID
and field_name != SOMA_GEOMETRY
):
raise ValueError(
f"DataFrame schema may not contain fields with name prefix ``soma_``: got ``{field_name}``"
)

# verify that all index_column_names are present in the schema
schema_names_set = set(schema.names)
for index_column_name in index_column_names:
if index_column_name.startswith("soma_") and index_column_name != SOMA_JOINID:
if (
index_column_name.startswith("soma_")
and index_column_name != SOMA_JOINID
and index_column_name != SOMA_GEOMETRY
):
raise ValueError(
f'index_column_name other than "soma_joinid" must not begin with "soma_"; got "{index_column_name}"'
)
Expand Down Expand Up @@ -864,7 +890,7 @@
index_column_name: str,
pa_type: pa.DataType,
dtype: Any,
) -> Tuple[Tuple[Any, Any], bool]:
) -> Tuple[Tuple[Any, Any], Union[bool, Tuple[bool, ...]]]:
"""Helper function for _build_tiledb_schema. Given a user-specified domain for a
dimension slot -- which may be ``None``, or a two-tuple of which either element
may be ``None`` -- return either what the user specified (if adequate) or
Expand All @@ -873,6 +899,37 @@
Returns the resolved domain together with a flag for whether the underlying
datatype's max range was used; for ``soma_geometry`` the flag is a tuple of
per-axis booleans.
"""
saturated_range = False
if index_column_name == SOMA_GEOMETRY:
# SOMA_GEOMETRY domain should be either a list of None or a list of tuple[float, float]
axes_lo = []
axes_hi = []
if isinstance(slot_domain, list):
f64info: NPFInfo = np.finfo(np.float64)
saturated_multi_range = []
for axis_domain in slot_domain:
if axis_domain is None:
axes_lo.append(f64info.min)
axes_hi.append(f64info.max)
saturated_multi_range.append(True)
elif not isinstance(axis_domain, tuple) or len(axis_domain) != 2:
raise ValueError("Axis domain should be a tuple[float, float]")

Check warning on line 915 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L915

Added line #L915 was not covered by tests
else:
if np.issubdtype(type(axis_domain[0]), NPFloating) or np.issubdtype(
type(axis_domain[1]), NPFloating
):
raise ValueError("Axis domain should be a tuple[float, float]")

Check warning on line 920 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L920

Added line #L920 was not covered by tests

axes_lo.append(axis_domain[0])
axes_hi.append(axis_domain[1])
saturated_multi_range.append(False)
slot_domain = tuple(axes_lo), tuple(axes_hi)
else:
raise ValueError(

Check warning on line 927 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L927

Added line #L927 was not covered by tests
f"{SOMA_GEOMETRY} domain should be either a list of None or a list of tuple[float, float]"
)

return (slot_domain, tuple(saturated_multi_range))

if slot_domain is not None:
# User-specified; go with it when possible
if (
Expand Down Expand Up @@ -1007,6 +1064,9 @@
if isinstance(dtype, np.dtype) and dtype.itemsize == 1:
extent = 1

if index_column_name == SOMA_GEOMETRY:
return extent

# Core string dims have no extent and no (core) domain. We return "" here
# simply so we can pass libtiledbsoma "" for domain and extent, while it
# will (and must) ignore these when creating the TileDB schema.
Expand Down Expand Up @@ -1053,9 +1113,26 @@
# extent exceeds max value representable by domain type. Reduce domain max
# by 1 tile extent to allow for expansion.
def _revise_domain_for_extent(
domain: Tuple[Any, Any], extent: Any, saturated_range: bool
domain: Tuple[Any, Any], extent: Any, saturated_range: Union[bool, Tuple[bool, ...]]
) -> Tuple[Any, Any]:
if saturated_range:
if isinstance(saturated_range, tuple):
# Handle SOMA_GEOMETRY domain with is tuple[list[float], list[float]]
if isinstance(domain[1], tuple):
if len(saturated_range) != len(domain[1]):
raise ValueError(

Check warning on line 1122 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L1122

Added line #L1122 was not covered by tests
"Internal error: Saturatin flag length does not match domain size"
)

return (
domain[0],
[
(dim_max - extent) if saturated_range[idx] else dim_max
for idx, dim_max in enumerate(domain[1])
],
)

raise ValueError("Expected a complex domain")

Check warning on line 1134 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L1134

Added line #L1134 was not covered by tests
elif saturated_range:
return (domain[0], domain[1] - extent)
else:
return domain
Loading