Skip to content

Commit

Permalink
feat(write_labels): store all sparse X and raw.X in canonical form (#602
Browse files Browse the repository at this point in the history
)

* add enforce canonical

* add enforce canonical

* - add unit tests for enforce_canonical_format
- always write to memory.
- add comments

* remove extra log message

* remove returned X

* Bump version: 3.1.2 → 3.1.3-rc.0

* Bump version: 3.1.3-rc.0 → 3.1.3

* pr feedback

* add log message about enforcing canonical

* stating what function is used.

---------

Co-authored-by: nayib-jose-gloria <[email protected]>
  • Loading branch information
Bento007 and nayib-jose-gloria authored Aug 23, 2023
1 parent 226b2fa commit 3a22ca3
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 3.1.2
current_version = 3.1.3
commit = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<prerel>rc)\.(?P<prerelversion>\d+))?
serialize =
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/cellxgene_schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.1.2"
__version__ = "3.1.3"
22 changes: 22 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,25 @@ def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike]) -> ad.AnnData:
sys.exit(1)

return adata


def enforce_canonical_format(adata: ad.AnnData):
    """
    Enforce canonical format for anndata X and raw.X. All operations are done in place.
    Canonical format is required to support h5ad to Seurat file conversion.

    :param adata: AnnData object whose X (and raw.X, when raw is set) is converted in place.
    """

    def _enforce_canonical_format(df, label):
        """Convert df.X to canonical format in place; label names the matrix in log output."""
        X = df.X
        # Objects without a has_canonical_format attribute (e.g. None or a dense array) are left untouched.
        if hasattr(X, "has_canonical_format") and not X.has_canonical_format:
            # this enforces canonical form; see https://docs.scipy.org/doc/scipy/tutorial/sparse.html#canonical-formats
            logger.warning(
                "noncanonical data found in %s; converting to canonical format using sum_duplicates.",
                label,
            )
            X.sum_duplicates()

    # enforce for canonical
    logger.info("enforce canonical format in X")
    _enforce_canonical_format(adata, "X")
    # Explicit None check so the decision does not depend on how the raw object defines truthiness.
    if adata.raw is not None:
        logger.info("enforce canonical format in raw.X")
        _enforce_canonical_format(adata.raw, "raw.X")
19 changes: 10 additions & 9 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from cellxgene_schema import ontology
from cellxgene_schema.validate import ONTOLOGY_CHECKER, Validator

from .utils import getattr_anndata
from .utils import enforce_canonical_format, getattr_anndata

logger = logging.getLogger(__name__)

Expand All @@ -31,14 +31,13 @@ def __init__(self, validator: Validator):
"AnnData object is not valid or hasn't been run through validation. "
"Validate AnnData first before attempting to write labels"
)

if validator.adata.isbacked:
try:
self.adata = validator.adata.to_memory()
except ValueError:
self.adata = validator.adata
else:
self.adata = validator.adata.copy()
try:
# Always reading into memory because the canonical enforcement requires the X matrix to be in memory. Do
# this early to make other label writing operation faster.
self.adata = validator.adata.to_memory()
except ValueError:
# already in memory
self.adata = validator.adata
self.validator = validator
self.schema_def = validator.schema_def
self.errors = []
Expand Down Expand Up @@ -343,6 +342,8 @@ def write_labels(self, add_labels_file: str):
# Update version
self.adata.uns["schema_version"] = self.validator.schema_version

enforce_canonical_format(self.adata)

# Write file
try:
self.adata.write_h5ad(add_labels_file, compression="gzip")
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="cellxgene-schema",
version="3.1.2",
version="3.1.3",
url="https://github.com/chanzuckerberg/single-cell-curation",
license="MIT",
author="Chan Zuckerberg Initiative",
Expand Down
38 changes: 37 additions & 1 deletion cellxgene_schema_cli/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
import numpy as np
import pytest
from cellxgene_schema.utils import map_ontology_term, remove_deprecated_features, replace_ontology_term
from anndata import AnnData
from cellxgene_schema.utils import (
enforce_canonical_format,
map_ontology_term,
remove_deprecated_features,
replace_ontology_term,
)
from fixtures.examples_validate import adata, adata_non_raw
from scipy.sparse import coo_matrix


@pytest.fixture
Expand Down Expand Up @@ -71,3 +79,31 @@ def test_map_ontology_term__(adata_without_raw):
assert all(a == "CL:0000001" for a in donor_1_rows["cell_type_ontology_term_id"])
donor_2_rows = adata_without_raw.obs.loc[adata_without_raw.obs["donor_id"] == "donor_2"]
assert all(a == "CL:0000002" for a in donor_2_rows["cell_type_ontology_term_id"])


@pytest.fixture
def noncanonical_matrix():
    """Return a COO matrix that contains a duplicated (row, col) coordinate."""
    # Row 0 holds the stored values, row 1 the row indices, row 2 the column indices.
    triplets = np.array([[1, 0, 1], [3, 2, 3], [4, 5, 4]])
    values, row_idx, col_idx = triplets
    # Coordinate (3, 4) is listed twice, which is what makes the matrix noncanonical.
    return coo_matrix((values, (row_idx, col_idx)))


class TestEnforceCanonical:
    """Tests for enforce_canonical_format on both noncanonical and already-canonical inputs."""

    def test_adata_with_noncanonical_X_and_raw_X(self, noncanonical_matrix):
        # Builds an AnnData whose X is the noncanonical matrix and checks X after enforcement.
        assert noncanonical_matrix.has_canonical_format is False
        test_adata = AnnData(noncanonical_matrix)
        enforce_canonical_format(test_adata)
        assert test_adata.X.has_canonical_format is True

    def test_adata_with_noncanonical_raw_X(self, noncanonical_matrix):
        # Only raw carries the noncanonical matrix here; checks raw.X after enforcement.
        assert noncanonical_matrix.has_canonical_format is False
        test_adata = AnnData(raw=AnnData(noncanonical_matrix))
        enforce_canonical_format(test_adata)
        assert test_adata.raw.X.has_canonical_format is True

    def test_adata_with_canonical_raw_X(self, adata_with_raw):
        # Run enforcement on the injected fixture itself, so the assertion checks the processed object
        # and the module-level `adata` imported from the fixtures module is not mutated.
        enforce_canonical_format(adata_with_raw)
        assert adata_with_raw.raw.X.has_canonical_format is True

    def test_adata_with_canonical_X(self, adata_without_raw):
        # Same as above: operate on the fixture argument rather than the imported `adata`.
        enforce_canonical_format(adata_without_raw)
        assert adata_without_raw.X.has_canonical_format is True
5 changes: 5 additions & 0 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,11 @@ def test__validate_with_h5ad_valid_and_labels(self):

success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path)

import anndata as ad

adata = ad.read_h5ad(labels_path)
self.assertTrue(adata.X.has_canonical_format)
self.assertTrue(adata.raw.X.has_canonical_format)
self.assertTrue(success)
self.assertListEqual(errors, [])
self.assertTrue(is_seurat_convertible)
Expand Down

0 comments on commit 3a22ca3

Please sign in to comment.