
Commit

Merge pull request #36 from theislab/feature/spatialdata_validator
Add SpatialDataValidator
Zethson authored Oct 16, 2024
2 parents d05d6a4 + 17f1f7b commit 5fc4ade
Showing 5 changed files with 728 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -29,3 +29,4 @@ __pycache__/

Sandbox.ipynb
test.ipynb
*/10ktp__10X__Visium__Mouse__brain__20200623__v1.1.0.zarr
578 changes: 578 additions & 0 deletions scripts/validator_demo.ipynb

Large diffs are not rendered by default.

19 changes: 9 additions & 10 deletions src/spatialdata_db/__init__.py
@@ -1,12 +1,11 @@
from importlib.metadata import version
- from importlib import resources
- import pandas as pd
-
-
- def load_10x_metadata():
-     with resources.open_text("spatialdata_db.utils.data", "datasets_10x.csv") as file:
-         return pd.read_csv(file, sep=";")
-
-
- __all__ = ["load_10x_metadata"]
+ from spatialdata_db.parsing import load_10x_metadata
+ from django.core.exceptions import ImproperlyConfigured
+ from lamin_utils import logger
+ try:
+     from spatialdata_db.lamin_spatialdata_validator import SpatialDataValidator
+ except ImproperlyConfigured:
+     logger.warning("Importing SpatialDataValidator currently requires being connected to a lamindb instance.")
+
+ __all__ = ["load_10x_metadata", "SpatialDataValidator"]
__version__ = version("spatialdata-db")
133 changes: 133 additions & 0 deletions src/spatialdata_db/lamin_spatialdata_validator.py
@@ -0,0 +1,133 @@
import bionty as bt
import pandas as pd
import anndata as ad
import spatialdata as sd

from lamindb.core import AnnDataCurator, DataFrameCurator
from lamindb_setup.core.types import UPathStr
from lnschema_core import Record
from lnschema_core.types import FieldAttr
from lamin_utils import logger, colors

def _add_defaults(data: pd.DataFrame | UPathStr, defaults: dict[str, str] = None) -> None:
    """Adds default values to a Pandas DataFrame if values are missing."""
    if defaults:
        if not isinstance(data, pd.DataFrame):
            data = pd.read_csv(data)  # TODO this parsing is not very safe

        for col, default in defaults.items():
            if col not in data.columns:
                data[col] = default
            else:
                data[col] = data[col].fillna(default)


class SpatialDataMetadataValidator(DataFrameCurator):

    DEFAULT_CATEGORICALS = {
        "assay": bt.ExperimentalFactor.name,
    }

    DEFAULT_VALUES = {
        "assay": "na",
    }

    FIXED_SOURCES = {
        "assay": bt.Source.filter(entity="bionty.ExperimentalFactor", name="efo", version="3.70.0").one()
    }

    def __init__(
        self,
        data: pd.DataFrame | UPathStr,
        categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        defaults: dict[str, str] = DEFAULT_VALUES,
        sources: dict[str, Record] = FIXED_SOURCES,
        organism="human",
    ):
        self.data = data

        _add_defaults(data, defaults)

        super().__init__(
            df=data, categoricals=categoricals, sources=sources, organism=organism
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the global SpatialDataMetadata."""
        return super().validate(organism)

class SpatialDataTableValidator(AnnDataCurator):

    DEFAULT_CATEGORICALS = {
        "celltype": bt.CellType.name,
    }

    DEFAULT_VALUES = {
        "celltype": "normal",
    }

    DEFAULT_SOURCES = {
        "celltype": bt.Source.filter(entity="bionty.CellType", name="cl", version="2023-08-24").one()
    }

    # TODO not every AnnData object will have all of these obs columns present, but at least one of them should.
    # Figure out how to pass the categoricals to the respective tables.

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr = bt.Gene.ensembl_gene_id,
        categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        defaults: dict[str, str] = DEFAULT_VALUES,
        table_key: str,
        organism="human",
    ):
        self.data = data
        self.table_key = table_key

        _add_defaults(data.obs, defaults)

        super().__init__(
            data=data, var_index=var_index, categoricals=categoricals, organism=organism
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the table."""
        return super().validate(organism)


class SpatialDataValidator:
    """Custom curation flow for SpatialData."""

    def __init__(
        self,
        sdata: sd.SpatialData | UPathStr,
        # categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        # defaults: dict[str, str] = None,
        # sources: dict[str, Record] = FIXED_SOURCES,
        organism="human",
    ):
        self.sdata = sdata
        self.organism = organism

        # TODO think about how to integrate the parameters -> some weird nested quirky thing

        self.metadata_validator = SpatialDataMetadataValidator(data=self.sdata.metadata, organism=self.organism)
        self.table_validators = {
            table_key: SpatialDataTableValidator(data=sdata.tables[table_key], table_key=table_key, organism=self.organism)
            for table_key in self.sdata.tables.keys()
        }

    def validate(self, organism: str | None = None) -> bool:
        """Validate a SpatialData object, including its metadata and all tables (AnnData objects)."""
        # TODO this should very clearly state which things could be validated and which could not

        logger.info(f"Validating {colors.green('metadata')}.")
        is_metadata_validated = self.metadata_validator.validate(organism)

        is_tables_validated = True
        for table_key, sdtvalidator in self.table_validators.items():
            logger.info(f"Validating AnnData object with key {colors.green(table_key)}")
            # Keep the overall flag False if any single table fails validation.
            is_tables_validated = sdtvalidator.validate(organism) and is_tables_validated

        return is_metadata_validated and is_tables_validated
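
For reference (not part of this diff), a minimal usage sketch of the new validator, assuming a lamindb instance is connected and the bionty sources referenced above are available; the Zarr path is illustrative (taken from the .gitignore entry) and the organism is an assumption:

import spatialdata as sd

from spatialdata_db import SpatialDataValidator

# Illustrative local Zarr store; any SpatialData object with tables would be handled the same way.
sdata = sd.read_zarr("10ktp__10X__Visium__Mouse__brain__20200623__v1.1.0.zarr")

# Validates the global metadata plus every table (AnnData object) in sdata.tables.
validator = SpatialDataValidator(sdata, organism="mouse")
is_valid = validator.validate()
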
7 changes: 7 additions & 0 deletions src/spatialdata_db/parsing.py
@@ -0,0 +1,7 @@
from importlib import resources
import pandas as pd


def load_10x_metadata():
    """Load the packaged 10x Genomics dataset metadata (datasets_10x.csv) as a DataFrame."""
    with resources.open_text("spatialdata_db.utils.data", "datasets_10x.csv") as file:
        return pd.read_csv(file, sep=";")
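
For reference (not part of this diff), the relocated loader can be used directly; the columns shown depend on the packaged datasets_10x.csv:

from spatialdata_db import load_10x_metadata

# Reads the packaged semicolon-separated datasets_10x.csv into a pandas DataFrame.
datasets_10x = load_10x_metadata()
print(datasets_10x.head())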
