From eeadf417862c7a4d3c6dadab16b080d1e20b140e Mon Sep 17 00:00:00 2001
From: Sidney Bell
Date: Mon, 18 Dec 2023 21:49:31 +0530
Subject: [PATCH 1/6] Copy updates (#6364)
---
frontend/src/views/CensusDirectory/index.tsx | 21 +++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/frontend/src/views/CensusDirectory/index.tsx b/frontend/src/views/CensusDirectory/index.tsx
index ef03329039197..7f6532842da7d 100644
--- a/frontend/src/views/CensusDirectory/index.tsx
+++ b/frontend/src/views/CensusDirectory/index.tsx
@@ -42,8 +42,11 @@ function CensusDirectory() {
provide feedback!
- {/* TODO: add link to notebooks once available */}
- Please see these tutorials for usage details.
+ Please{" "}
+
+ see these tutorials
+ {" "}
+ for usage details.
If you’d like to have your project featured here, please{" "}
@@ -52,15 +55,15 @@ function CensusDirectory() {
{maintainedProjects.length > 0 && (
- CELL×GENE Maintained Projects
+ CELL×GENE Collaboration Projects
- These models and their output embeddings are maintained and
- regularly re-trained by CELL×GENE in close collaboration with their
- creators. Embeddings are accessible via the Census API;
- corresponding models are available via CELL×GENE-maintained links.
+ These models and their output embeddings are ongoing collaborations.
+ CZI and the partner labs are improving the models as the Census
+ resource grows. Embeddings are accessible via the Census API;
+ corresponding models are available for download.
Please{" "}
-
+
contact the CELL×GENE team with feedback
.
@@ -80,7 +83,7 @@ function CensusDirectory() {
available).
For issues accessing these embeddings, please{" "}
-
+
contact the CELL×GENE team
. For feedback on the embeddings themselves, please contact the
From e1fb59fe46a1132c042a44f87cfcd67c7e38b006 Mon Sep 17 00:00:00 2001
From: pablo-gar
Date: Mon, 18 Dec 2023 10:48:40 -0600
Subject: [PATCH 2/6] feat(census-models): fix Geneformer description (#6363)
---
frontend/census-projects.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/frontend/census-projects.json b/frontend/census-projects.json
index d22f1251627fd..edfdc6a6b5a48 100644
--- a/frontend/census-projects.json
+++ b/frontend/census-projects.json
@@ -2,7 +2,7 @@
{
"tier": "maintained",
"title": "Geneformer embeddings fine-tuned for CELLxGENE Census cell subclass classification",
- "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the Geneformer ",
+ "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the pre-trained Geneformer model.",
"primary_contact": {
"name": "CELLxGENE Discover Team",
"email": "soma@chanzuckerberg.com",
From d2db23fb277955b073644216ed1eeeaeffec10a9 Mon Sep 17 00:00:00 2001
From: Trent Smith <1429913+Bento007@users.noreply.github.com>
Date: Mon, 18 Dec 2023 10:48:13 -0800
Subject: [PATCH 3/6] chore: remove parameterize dependency (#6325)
- remove the `parameterized` dependency and use `pytest.mark.parametrize`.
- change test_rollup to use `pytest.mark.parametrize`.
---
requirements-backend.txt | 1 -
.../utils/test_type_conversion_utils.py | 62 +-
.../backend/wmg/api/common/test_rollup.py | 646 +++++++++---------
3 files changed, 353 insertions(+), 356 deletions(-)
diff --git a/requirements-backend.txt b/requirements-backend.txt
index b6cce4ec6b951..4be816affaf37 100644
--- a/requirements-backend.txt
+++ b/requirements-backend.txt
@@ -26,7 +26,6 @@ numba==0.56.2 # required for where's my gene
numpy==1.23.5 # required for where's my gene
owlready2==0.40.0
pandas==1.5.3 # required for where's my gene
-parameterized
psutil==5.9.5
psycopg2-binary>=2.8.5
pyarrow==12.0.0 # required for where's my gene
diff --git a/tests/unit/backend/layers/utils/test_type_conversion_utils.py b/tests/unit/backend/layers/utils/test_type_conversion_utils.py
index 7725a9c0836ef..f59a92fa7ea79 100644
--- a/tests/unit/backend/layers/utils/test_type_conversion_utils.py
+++ b/tests/unit/backend/layers/utils/test_type_conversion_utils.py
@@ -3,8 +3,8 @@
import numpy as np
import pandas as pd
+import pytest
from pandas import DataFrame, Series
-from parameterized import parameterized_class
from scipy import sparse
from backend.common.utils.type_conversion_utils import (
@@ -292,33 +292,35 @@ def __exit__(self, exc_type, exc_val, exc_tb):
]
-@parameterized_class(test_cases)
-class TestTypeInference(unittest.TestCase, AssertNoLog):
- def test_type_inference(self):
- throws = getattr(self, "throws", None)
- if throws:
- with self.assertRaises(throws):
- get_dtype_and_schema_of_array(self.data)
- with self.assertRaises(throws):
- get_encoding_dtype_of_array(self.data)
- with self.assertRaises(throws):
- get_schema_type_hint_of_array(self.data)
-
+@pytest.mark.parametrize("parameters", test_cases)
+def test_type_inference(parameters, caplog):
+ throws = parameters.get("throws", None)
+ if throws:
+ with pytest.raises(throws):
+ get_dtype_and_schema_of_array(parameters["data"])
+ with pytest.raises(throws):
+ get_dtype_and_schema_of_array(parameters["data"])
+ with pytest.raises(throws):
+ get_encoding_dtype_of_array(parameters["data"])
+ with pytest.raises(throws):
+ get_schema_type_hint_of_array(parameters["data"])
+
+ else:
+ logs = parameters.get("logs", None)
+ if logs is not None:
+            # the conversion is expected to emit a log record at the given level
+ with caplog.at_level(logs["level"]):
+ encoding_dtype, schema_hint = get_dtype_and_schema_of_array(parameters["data"])
+ assert encoding_dtype == parameters["expected_encoding_dtype"]
+ assert schema_hint == parameters["expected_schema_hint"]
+ assert logs["output"] in caplog.messages[0]
else:
- logs = getattr(self, "logs", None)
- if logs is not None:
- with self.assertLogs(level=logs["level"]) as logger:
- encoding_dtype, schema_hint = get_dtype_and_schema_of_array(self.data)
- self.assertEqual(encoding_dtype, self.expected_encoding_dtype)
- self.assertEqual(schema_hint, self.expected_schema_hint)
- self.assertIn(logs["output"], logger.output[0])
-
- else:
- with self.assertNoLogs(logging.getLogger(), logging.WARNING):
- encoding_dtype, schema_hint = get_dtype_and_schema_of_array(self.data)
- self.assertEqual(encoding_dtype, self.expected_encoding_dtype)
- self.assertEqual(schema_hint, self.expected_schema_hint)
-
- # also test the other public API
- self.assertEqual(get_encoding_dtype_of_array(self.data), self.expected_encoding_dtype)
- self.assertEqual(get_schema_type_hint_of_array(self.data), self.expected_schema_hint)
+ with caplog.at_level(logging.WARNING):
+ encoding_dtype, schema_hint = get_dtype_and_schema_of_array(parameters["data"])
+ assert encoding_dtype == parameters["expected_encoding_dtype"]
+ assert schema_hint == parameters["expected_schema_hint"]
+ assert len(caplog.messages) == 0
+
+ # also test the other public API
+ assert get_encoding_dtype_of_array(parameters["data"]) == parameters["expected_encoding_dtype"]
+ assert get_schema_type_hint_of_array(parameters["data"]) == parameters["expected_schema_hint"]
diff --git a/tests/unit/backend/wmg/api/common/test_rollup.py b/tests/unit/backend/wmg/api/common/test_rollup.py
index c30d6ab07c625..977d5c2702cd8 100644
--- a/tests/unit/backend/wmg/api/common/test_rollup.py
+++ b/tests/unit/backend/wmg/api/common/test_rollup.py
@@ -2,28 +2,28 @@
In detail, this module tests the public and private functions defined in `backend.wmg.api.common.rollup` module.
"""
-import unittest
+from typing import List
import pandas as pd
+import pytest
from pandas import DataFrame
from pandas.testing import assert_frame_equal
-from parameterized import parameterized
from backend.wmg.api.common.rollup import rollup
-def _create_cell_counts_df_helper(cell_counts_rows: list[list], columns: list[str], index_cols: list[str]) -> DataFrame:
+def _create_cell_counts_df_helper(cell_counts_rows: List[list], columns: List[str], index_cols: List[str]) -> DataFrame:
cell_counts_df = pd.DataFrame(cell_counts_rows, columns=columns)
cell_counts_df = cell_counts_df.set_index(index_cols, verify_integrity=True)
return cell_counts_df
-def _create_gene_expression_df_helper(gene_expr_rows: list[list], columns: list[str]) -> DataFrame:
+def _create_gene_expression_df_helper(gene_expr_rows: List[list], columns: List[str]) -> DataFrame:
gene_expr_df = pd.DataFrame(gene_expr_rows, columns=columns)
return gene_expr_df
-def _cell_counts_df_without_compare_dim(cell_counts_rows: list[list]) -> DataFrame:
+def _cell_counts_df_without_compare_dim(cell_counts_rows: List[list]) -> DataFrame:
cell_counts_col_names = ["tissue_ontology_term_id", "cell_type_ontology_term_id", "n_cells_cell_type"]
cell_counts_index_col_names = ["tissue_ontology_term_id", "cell_type_ontology_term_id"]
return _create_cell_counts_df_helper(
@@ -31,7 +31,7 @@ def _cell_counts_df_without_compare_dim(cell_counts_rows: list[list]) -> DataFra
)
-def _cell_counts_df_with_ethnicity_compare_dim(cell_counts_rows: list[list]) -> DataFrame:
+def _cell_counts_df_with_ethnicity_compare_dim(cell_counts_rows: List[list]) -> DataFrame:
cell_counts_col_names = [
"tissue_ontology_term_id",
"cell_type_ontology_term_id",
@@ -50,7 +50,7 @@ def _cell_counts_df_with_ethnicity_compare_dim(cell_counts_rows: list[list]) ->
)
-def _gene_expression_df_without_compare_dim(gene_expr_rows: list[list]) -> DataFrame:
+def _gene_expression_df_without_compare_dim(gene_expr_rows: List[list]) -> DataFrame:
gene_expr_col_names = [
"gene_ontology_term_id",
"tissue_ontology_term_id",
@@ -63,7 +63,7 @@ def _gene_expression_df_without_compare_dim(gene_expr_rows: list[list]) -> DataF
return _create_gene_expression_df_helper(gene_expr_rows, columns=gene_expr_col_names)
-def _gene_expression_df_with_ethnicity_compare_dim(gene_expr_rows: list[list]) -> DataFrame:
+def _gene_expression_df_with_ethnicity_compare_dim(gene_expr_rows: List[list]) -> DataFrame:
gene_expr_col_names = [
"gene_ontology_term_id",
"tissue_ontology_term_id",
@@ -78,7 +78,294 @@ def _gene_expression_df_with_ethnicity_compare_dim(gene_expr_rows: list[list]) -
return _create_gene_expression_df_helper(gene_expr_rows, columns=gene_expr_col_names)
-class TestHighLevelRollupFunction(unittest.TestCase):
+def _rollup_testcases():
+ """
+    TODO: convert this to use pytest.mark.parametrize to remove the dependency on parameterized
+ Testcases for the `rollup` function.
+
+ An important note about how the expected values are laid out in the testcases:
+
+ 1. Expected values for rows in the rolled up cell counts dataframe are sorted by
+       (tissue_ontology_term_id, cell_type_ontology_term_id, <compare_dimension>)
+
+ 2. Expected values for rows in the rolled up gene expression dataframe are sorted by
+       (tissue_ontology_term_id, cell_type_ontology_term_id, <compare_dimension>, gene_ontology_term_id)
+ """
+ tests = [
+ {
+ "name": "no_compare_dim_all_tissues_have_all_cell_types",
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 300],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 300],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 540],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 540],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_without_compare_dim,
+ "gene_expression_func": _gene_expression_df_without_compare_dim,
+ },
+ {
+ "name": "no_compare_dim_one_ancestor_cell_type_missing_in_one_tissue_but_exists_in_all_others",
+ # Tissue: "UBERON:0000955" MISSING cell type: "CL:0000127" in input cell counts
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 300],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ # cell count for cell type: "CL:0000127" in Tissue: "UBERON:0000955" GETS AGGREGATED because
+ # "CL:0000127" has non-zero cell count for at least one tissue in the input AND at least one
+ # descendant of "CL:0000127" has non-zero cell count for "UBERON:0000955"
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 240],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 540],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_without_compare_dim,
+ "gene_expression_func": _gene_expression_df_without_compare_dim,
+ },
+ {
+ "name": "no_compare_dim_gene_expressed_in_one_tissue_but_not_other",
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 300],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 300],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 540],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 540],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ # Gene "ENSG00000169429" expressed in Tissue "UBERON:0000955" but not expressed
+ # in Tissue "UBERON:0002113"
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_without_compare_dim,
+ "gene_expression_func": _gene_expression_df_without_compare_dim,
+ },
+ {
+ "name": "no_compare_dim_one_of_the_tissues_has_no_gene_expressions_at_all",
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 300],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 300],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", 540],
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000127", 540],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+            # Tissue "UBERON:0002113" has no gene expressions
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_without_compare_dim,
+ "gene_expression_func": _gene_expression_df_without_compare_dim,
+ },
+ {
+ "name": "with_ethnicity_compare_dim_on_single_tissue",
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", "unknown", 300],
+ ["UBERON:0000955", "CL:0000644", "unknown", 70],
+ ["UBERON:0000955", "CL:0002605", "HANCESTRO:0005", 10],
+ ["UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 30],
+ ["UBERON:0000955", "CL:0002605", "multiethnic", 40],
+ ["UBERON:0000955", "CL:0002627", "HANCESTRO:0006", 10],
+ ["UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 20],
+ ["UBERON:0000955", "CL:0002627", "multiethnic", 30],
+ ["UBERON:0000955", "CL:0002627", "unknown", 40],
+ ],
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000127", "HANCESTRO:0008", 50],
+ ["UBERON:0000955", "CL:0000127", "multiethnic", 70],
+ ["UBERON:0000955", "CL:0000127", "unknown", 410],
+ ["UBERON:0000955", "CL:0000644", "unknown", 70],
+ ["UBERON:0000955", "CL:0002605", "HANCESTRO:0005", 10],
+ ["UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 30],
+ ["UBERON:0000955", "CL:0002605", "multiethnic", 40],
+ ["UBERON:0000955", "CL:0002627", "HANCESTRO:0006", 10],
+ ["UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 20],
+ ["UBERON:0000955", "CL:0002627", "multiethnic", 30],
+ ["UBERON:0000955", "CL:0002627", "unknown", 40],
+ ],
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", "unknown", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 1, 1, 30, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "multiethnic", 1, 1, 40, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 1, 1, 20, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", "multiethnic", 1, 1, 30, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "HANCESTRO:0008", 2, 2, 50, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "multiethnic", 1, 1, 40, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0000127", "multiethnic", 1, 1, 30, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "unknown", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", "unknown", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 1, 1, 30, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "multiethnic", 1, 1, 40, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 1, 1, 20, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", "multiethnic", 1, 1, 30, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_with_ethnicity_compare_dim,
+ "gene_expression_func": _gene_expression_df_with_ethnicity_compare_dim,
+ },
+ {
+ "name": "no_compare_dim_all_tissues_have_all_cell_types_except_root_cell_type",
+ "input_cell_counts": [
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "expected_rolled_up_cell_counts": [
+ ["UBERON:0000955", "CL:0000644", 70],
+ ["UBERON:0000955", "CL:0002605", 80],
+ ["UBERON:0000955", "CL:0002627", 90],
+ ["UBERON:0002113", "CL:0000644", 70],
+ ["UBERON:0002113", "CL:0002605", 80],
+ ["UBERON:0002113", "CL:0002627", 90],
+ ],
+ "input_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "expected_rolled_up_gene_expression": [
+ ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
+ ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
+ ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
+ ],
+ "cell_counts_func": _cell_counts_df_without_compare_dim,
+ "gene_expression_func": _gene_expression_df_without_compare_dim,
+ },
+ ]
+ for test in tests:
+ yield (
+ test["name"],
+ test["cell_counts_func"](test["input_cell_counts"]),
+ test["cell_counts_func"](test["expected_rolled_up_cell_counts"]),
+ test["gene_expression_func"](test["input_gene_expression"]),
+ test["gene_expression_func"](test["expected_rolled_up_gene_expression"]),
+ )
+
+
+@pytest.mark.parametrize(
+ "name,input_cell_counts_df,expected_cell_counts_df,input_gene_expr_df," "expected_gene_expr_df", _rollup_testcases()
+)
+def test__rollup(name, input_cell_counts_df, expected_cell_counts_df, input_gene_expr_df, expected_gene_expr_df):
"""
Test that the `rollup` function correctly accumulates (or rolls up) gene-expression
values FOR EACH expressed gene and cell count values up the cell type ANCESTOR paths
@@ -115,324 +402,33 @@ class TestHighLevelRollupFunction(unittest.TestCase):
5. Assert that the cell counts in the rolled up cell counts dataframe hold the correct
rolled up values.
"""
+ # Arrange
+ cell_counts_df_index_list = list(input_cell_counts_df.index.names)
- @staticmethod
- def _rollup_testcases():
- """
- Testcases for the `rollup` function.
-
- An important note about how the expected values are laid out in the testcases:
-
- 1. Expected values for rows in the rolled up cell counts dataframe are sorted by
- (tissue_ontology_term_id, cell_type_ontology_term_id, )
-
- 2. Expected values for rows in the rolled up gene expression dataframe are sorted by
- (tissue_ontology_term_id, cell_type_ontology_term_id, , gene_ontology_term_id)
- """
- tests = [
- {
- "name": "no_compare_dim_all_tissues_have_all_cell_types",
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 300],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 300],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 540],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 540],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "cell_counts_func": _cell_counts_df_without_compare_dim,
- "gene_expression_func": _gene_expression_df_without_compare_dim,
- },
- {
- "name": "no_compare_dim_one_ancestor_cell_type_missing_in_one_tissue_but_exists_in_all_others",
- # Tissue: "UBERON:0000955" MISSING cell type: "CL:0000127" in input cell counts
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 300],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- # cell count for cell type: "CL:0000127" in Tissue: "UBERON:0000955" GETS AGGREGATED because
- # "CL:0000127" has non-zero cell count for at least one tissue in the input AND at least one
- # descendant of "CL:0000127" has non-zero cell count for "UBERON:0000955"
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 240],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 540],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "cell_counts_func": _cell_counts_df_without_compare_dim,
- "gene_expression_func": _gene_expression_df_without_compare_dim,
- },
- {
- "name": "no_compare_dim_gene_expressed_in_one_tissue_but_not_other",
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 300],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 300],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 540],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 540],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- # Gene "ENSG00000169429" expressed in Tissue "UBERON:0000955" but not expressed
- # in Tissue "UBERON:0002113"
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ],
- "cell_counts_func": _cell_counts_df_without_compare_dim,
- "gene_expression_func": _gene_expression_df_without_compare_dim,
- },
- {
- "name": "no_compare_dim_one_of_the_tissues_has_no_gene_expressions_at_all",
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 300],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 300],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000127", 540],
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000127", 540],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- # Tissue issue "UBERON:0002113" has no gene expressions
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", 2, 2, 150, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0000127", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ],
- "cell_counts_func": _cell_counts_df_without_compare_dim,
- "gene_expression_func": _gene_expression_df_without_compare_dim,
- },
- {
- "name": "with_ethnicity_compare_dim_on_single_tissue",
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000127", "unknown", 300],
- ["UBERON:0000955", "CL:0000644", "unknown", 70],
- ["UBERON:0000955", "CL:0002605", "HANCESTRO:0005", 10],
- ["UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 30],
- ["UBERON:0000955", "CL:0002605", "multiethnic", 40],
- ["UBERON:0000955", "CL:0002627", "HANCESTRO:0006", 10],
- ["UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 20],
- ["UBERON:0000955", "CL:0002627", "multiethnic", 30],
- ["UBERON:0000955", "CL:0002627", "unknown", 40],
- ],
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000127", "HANCESTRO:0008", 50],
- ["UBERON:0000955", "CL:0000127", "multiethnic", 70],
- ["UBERON:0000955", "CL:0000127", "unknown", 410],
- ["UBERON:0000955", "CL:0000644", "unknown", 70],
- ["UBERON:0000955", "CL:0002605", "HANCESTRO:0005", 10],
- ["UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 30],
- ["UBERON:0000955", "CL:0002605", "multiethnic", 40],
- ["UBERON:0000955", "CL:0002627", "HANCESTRO:0006", 10],
- ["UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 20],
- ["UBERON:0000955", "CL:0002627", "multiethnic", 30],
- ["UBERON:0000955", "CL:0002627", "unknown", 40],
- ],
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", "unknown", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 1, 1, 30, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "multiethnic", 1, 1, 40, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 1, 1, 20, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", "multiethnic", 1, 1, 30, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "HANCESTRO:0008", 2, 2, 50, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "multiethnic", 1, 1, 40, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0000127", "multiethnic", 1, 1, 30, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000127", "unknown", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", "unknown", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "HANCESTRO:0008", 1, 1, 30, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", "multiethnic", 1, 1, 40, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002627", "HANCESTRO:0008", 1, 1, 20, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", "multiethnic", 1, 1, 30, 1000],
- ],
- "cell_counts_func": _cell_counts_df_with_ethnicity_compare_dim,
- "gene_expression_func": _gene_expression_df_with_ethnicity_compare_dim,
- },
- {
- "name": "no_compare_dim_all_tissues_have_all_cell_types_except_root_cell_type",
- "input_cell_counts": [
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "expected_rolled_up_cell_counts": [
- ["UBERON:0000955", "CL:0000644", 70],
- ["UBERON:0000955", "CL:0002605", 80],
- ["UBERON:0000955", "CL:0002627", 90],
- ["UBERON:0002113", "CL:0000644", 70],
- ["UBERON:0002113", "CL:0002605", 80],
- ["UBERON:0002113", "CL:0002627", 90],
- ],
- "input_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "expected_rolled_up_gene_expression": [
- ["ENSG00000085265", "UBERON:0000955", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0000955", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0000955", "CL:0002627", 1, 1, 90, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0000644", 1, 1, 70, 1000],
- ["ENSG00000085265", "UBERON:0002113", "CL:0002605", 1, 1, 80, 1000],
- ["ENSG00000169429", "UBERON:0002113", "CL:0002627", 1, 1, 90, 1000],
- ],
- "cell_counts_func": _cell_counts_df_without_compare_dim,
- "gene_expression_func": _gene_expression_df_without_compare_dim,
- },
- ]
-
- return [
- (
- test["name"],
- test["cell_counts_func"](test["input_cell_counts"]),
- test["cell_counts_func"](test["expected_rolled_up_cell_counts"]),
- test["gene_expression_func"](test["input_gene_expression"]),
- test["gene_expression_func"](test["expected_rolled_up_gene_expression"]),
- )
- for test in tests
- ]
-
- @parameterized.expand(_rollup_testcases)
- def test__rollup(self, _, input_cell_counts_df, expected_cell_counts_df, input_gene_expr_df, expected_gene_expr_df):
- # Arrange
- cell_counts_df_index_list = list(input_cell_counts_df.index.names)
-
- # Act
-
- # Note that we are creating copies of the input dataframes before passing them as
- # arguments to the `rollup` function so that if the `rollup` function mutates the
- # argument values, the input to the test is not affected.
- rolled_up_gene_expr_df, rolled_up_cell_counts_df = rollup(
- input_gene_expr_df.copy(), input_cell_counts_df.copy()
- )
+ # Act
- # Assert
- rolled_up_cell_counts_df.reset_index(inplace=True)
- expected_cell_counts_df.reset_index(inplace=True)
+ # Note that we are creating copies of the input dataframes before passing them as
+ # arguments to the `rollup` function so that if the `rollup` function mutates the
+ # argument values, the input to the test is not affected.
+ rolled_up_gene_expr_df, rolled_up_cell_counts_df = rollup(input_gene_expr_df.copy(), input_cell_counts_df.copy())
- assert_frame_equal(
- rolled_up_cell_counts_df.reset_index(drop=True),
- expected_cell_counts_df.reset_index(drop=True),
- check_dtype=False,
- )
+ # Assert
+ rolled_up_cell_counts_df.reset_index(inplace=True)
+ expected_cell_counts_df.reset_index(inplace=True)
- # sort the rolled up gene expression dataframe so that the correct rows are compared with
- # the expected gene expression rows in the assert call
- sort_columns_for_rolled_gene_expr_df = list(cell_counts_df_index_list) + ["gene_ontology_term_id"]
- rolled_up_gene_expr_df.sort_values(sort_columns_for_rolled_gene_expr_df, inplace=True)
+ assert_frame_equal(
+ rolled_up_cell_counts_df.reset_index(drop=True),
+ expected_cell_counts_df.reset_index(drop=True),
+ check_dtype=False,
+ )
- assert_frame_equal(
- rolled_up_gene_expr_df.reset_index(drop=True),
- expected_gene_expr_df.reset_index(drop=True),
- check_dtype=False,
- )
+ # sort the rolled up gene expression dataframe so that the correct rows are compared with
+ # the expected gene expression rows in the assert call
+ sort_columns_for_rolled_gene_expr_df = list(cell_counts_df_index_list) + ["gene_ontology_term_id"]
+ rolled_up_gene_expr_df.sort_values(sort_columns_for_rolled_gene_expr_df, inplace=True)
+
+ assert_frame_equal(
+ rolled_up_gene_expr_df.reset_index(drop=True),
+ expected_gene_expr_df.reset_index(drop=True),
+ check_dtype=False,
+ )
From e52f53c7c4f2e4c1e694cccbbeb61825015bf96d Mon Sep 17 00:00:00 2001
From: Severiano Badajoz
Date: Mon, 18 Dec 2023 12:10:30 -0800
Subject: [PATCH 4/6] feat(census-models): clobber and differentiate projects
with same title, add r embedding for maintained, analytics (#6360)
Co-authored-by: Sidney Bell
Co-authored-by: pablo-gar
---
frontend/census-projects.json | 71 +++++++-------
frontend/src/common/analytics/events.ts | 1 +
.../Header/components/Nav/index.tsx | 74 +--------------
frontend/src/types/census-projects.d.ts | 2 +-
.../components/EmbeddingButton/connect.ts | 74 +++++++++++----
.../components/EmbeddingButton/index.tsx | 27 +++---
.../components/EmbeddingButton/style.ts | 0
.../components/EmbeddingButton/types.ts | 7 ++
.../components/ModelButton/index.tsx | 20 ++--
.../Project/ProjectButtons/index.tsx | 83 ++++++++++++++++
.../Project/ProjectButtons/style.tsx | 26 +++++
.../components/EmbeddingButton/types.ts | 6 --
.../components/Project/connect.ts | 66 ++++++++++---
.../components/Project/index.tsx | 94 +++++++++----------
.../components/Project/types.ts | 10 +-
frontend/src/views/CensusDirectory/index.tsx | 57 +++++++----
frontend/src/views/CensusDirectory/style.ts | 16 +---
frontend/src/views/CensusDirectory/utils.ts | 69 +++++++++++++-
18 files changed, 450 insertions(+), 253 deletions(-)
rename frontend/src/views/CensusDirectory/components/Project/{ => ProjectButtons}/components/EmbeddingButton/connect.ts (56%)
rename frontend/src/views/CensusDirectory/components/Project/{ => ProjectButtons}/components/EmbeddingButton/index.tsx (84%)
rename frontend/src/views/CensusDirectory/components/Project/{ => ProjectButtons}/components/EmbeddingButton/style.ts (100%)
create mode 100644 frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/types.ts
rename frontend/src/views/CensusDirectory/components/Project/{ => ProjectButtons}/components/ModelButton/index.tsx (72%)
create mode 100644 frontend/src/views/CensusDirectory/components/Project/ProjectButtons/index.tsx
create mode 100644 frontend/src/views/CensusDirectory/components/Project/ProjectButtons/style.tsx
delete mode 100644 frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/types.ts
diff --git a/frontend/census-projects.json b/frontend/census-projects.json
index edfdc6a6b5a48..3d7a04d0a5e6d 100644
--- a/frontend/census-projects.json
+++ b/frontend/census-projects.json
@@ -1,37 +1,4 @@
[
- {
- "tier": "maintained",
- "title": "Geneformer embeddings fine-tuned for CELLxGENE Census cell subclass classification",
- "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the pre-trained Geneformer model.",
- "primary_contact": {
- "name": "CELLxGENE Discover Team",
- "email": "soma@chanzuckerberg.com",
- "affiliation": "CZI"
- },
- "DOI": "10.1038/s41586-023-06139-9",
- "publication_info": "",
- "publication_link": "",
- "project_page": "",
- "additional_information": "Beginning with the geneformer-12L-30M pretrained model published by Theodoris et al. (huggingface.co/ctheodoris/Geneformer), a BertForSequenceClassification model was trained to predict cell subclass (as annotated in CELLxGENE Discover see https://cellxgene.cziscience.com/collections). Embeddings were then generated using Geneformer’s EmbExtractor module with emb_layer=0.\nFor full details and a reproducible workflow please see: https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/models/geneformer/README.md",
- "model_link": "s3://cellxgene-contrib-public/models/geneformer/2023-12-15/homo_sapiens/fined-tuned-model/",
- "data_type": "obs_embedding",
- "obsm_layer": "geneformer",
- "census_version": "2023-12-15",
- "experiment_name": "homo_sapiens",
- "measurement_name": "RNA",
- "n_cells": 62998417,
- "n_columns": 512,
- "n_features": 512,
- "notebook_links": [
- [
- "Using trained model",
- "https://chanzuckerberg.github.io/cellxgene-census/notebooks/analysis_demo/comp_bio_geneformer_prediction.html"
- ]
- ],
- "submission_date": "2023-11-06",
- "last_updated": null,
- "revised_by": null
- },
{
"tier": "maintained",
"title": "scVI integrated-embeddings with explicit modeling of batch effects",
@@ -130,6 +97,40 @@
"last_updated": null,
"revised_by": null
},
+ {
+ "tier": "maintained",
+ "title": "Geneformer embeddings fine-tuned for CELLxGENE Census cell subclass classification",
+ "description": "Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.\nThese cell embeddings are derived from a Geneformer model CZI fine-tuned for cell subclass classification. As the fine-tuning procedure remains experimental and wasn’t performed by the Geneformer authors, these embeddings should not be used to assess performance of the pre-trained Geneformer model.",
+ "primary_contact": {
+ "name": "CELLxGENE Discover Team",
+ "email": "soma@chanzuckerberg.com",
+ "affiliation": "CZI"
+ },
+ "DOI": "10.1038/s41586-023-06139-9",
+ "publication_info": "",
+ "publication_link": "",
+ "project_page": "",
+ "additional_information": "Beginning with the geneformer-12L-30M pretrained model published by Theodoris et al. (huggingface.co/ctheodoris/Geneformer), a BertForSequenceClassification model was trained to predict cell subclass (as annotated in CELLxGENE Discover see https://cellxgene.cziscience.com/collections). Embeddings were then generated using Geneformer’s EmbExtractor module with emb_layer=0.\nFor full details and a reproducible workflow please see: https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/models/geneformer/README.md",
+ "model_link": "s3://cellxgene-contrib-public/models/geneformer/2023-12-15/homo_sapiens/fined-tuned-model/",
+ "data_type": "obs_embedding",
+ "obsm_layer": "geneformer",
+ "census_version": "2023-12-15",
+ "experiment_name": "homo_sapiens",
+ "measurement_name": "RNA",
+ "n_cells": 62998417,
+ "n_columns": 512,
+ "n_features": 512,
+ "notebook_links": [
+ [
+ "Using trained model",
+ "https://chanzuckerberg.github.io/cellxgene-census/notebooks/analysis_demo/comp_bio_geneformer_prediction.html"
+ ]
+ ],
+ "submission_date": "2023-11-06",
+ "last_updated": null,
+ "revised_by": null
+ },
+
{
"tier": "community",
"title": "PINNACLE: Contextual AI Model for Single-Cell Protein Biology",
@@ -213,12 +214,12 @@
"additional_contacts": [
{
"name": "Jialong Jiang",
- "email": "jiangjl@caltech.edu" ,
+ "email": "jiangjl@caltech.edu",
"affiliation": "Thomson Lab, Caltech"
},
{
"name": "Yingying Gong",
- "email": "ygong@caltech.edu" ,
+ "email": "ygong@caltech.edu",
"affiliation": "Thomson Lab, Caltech"
}
],
diff --git a/frontend/src/common/analytics/events.ts b/frontend/src/common/analytics/events.ts
index 3caf14324404c..cc5824ed36bcb 100644
--- a/frontend/src/common/analytics/events.ts
+++ b/frontend/src/common/analytics/events.ts
@@ -60,6 +60,7 @@ export enum EVENTS {
CENSUS_EMBEDDING_COPIED = "CENSUS_EMBEDDING_COPIED",
CENSUS_PROJECT_LINK_CLICKED = "CENSUS_PROJECT_LINK_CLICKED",
CENSUS_EMBEDDING_NOTEBOOK_CLICKED = "CENSUS_EMBEDDING_NOTEBOOK_CLICKED",
+ CENSUS_MODELS_TUTORIALS_CLICKED = "CENSUS_MODELS_TUTORIALS_CLICKED",
DATASETS_CLICK_NAV = "DATASETS_CLICK_NAV",
COLLECTIONS_CLICK_NAV = "COLLECTIONS_CLICK_NAV",
DOCUMENTATION_CLICK_NAV = "DOCUMENTATION_CLICK_NAV",
diff --git a/frontend/src/components/Header/components/Nav/index.tsx b/frontend/src/components/Header/components/Nav/index.tsx
index 04cf29f1b74eb..a073d03275976 100644
--- a/frontend/src/components/Header/components/Nav/index.tsx
+++ b/frontend/src/components/Header/components/Nav/index.tsx
@@ -18,79 +18,7 @@ import { CENSUS_LINK } from "./constants";
import { Props } from "./types";
export default function Nav({ className, pathname }: Props): JSX.Element {
- const isCensusDirectory = isRouteActive(pathname, ROUTES.CENSUS_DIRECTORY);
-
- return !isCensusDirectory ? (
-
-
-
- {
- track(EVENTS.COLLECTIONS_CLICK_NAV);
- }}
- text="Collections"
- />
-
-
-
-
- {
- track(EVENTS.DATASETS_CLICK_NAV);
- }}
- text="Datasets"
- />
-
-
-
-
- {
- track(EVENTS.WMG_CLICK_NAV);
- }}
- text="Gene Expression"
- />
-
-
-
-
- {
- track(EVENTS.CELL_GUIDE_CLICK_NAV);
- }}
- text="Cell Guide"
- />
-
-
-
-
-
- {
- track(EVENTS.CENSUS_DOCUMENTATION_CLICK_NAV);
- }}
- rel="noopener"
- target="_self"
- text="Census"
- />
-
-
- ) : (
+ return (
<>
diff --git a/frontend/src/types/census-projects.d.ts b/frontend/src/types/census-projects.d.ts
index f905e0e95a467..236fc7b71d099 100644
--- a/frontend/src/types/census-projects.d.ts
+++ b/frontend/src/types/census-projects.d.ts
@@ -3,7 +3,7 @@ declare module "census-projects.json" {
extends Partial {
notebook_links?: [string, string][];
tier: "community" | "maintained";
- obs_matrix: string;
+ obsm_layer: string;
project_page: string;
}
const content: StaticProject[];
diff --git a/frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/connect.ts b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/connect.ts
similarity index 56%
rename from frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/connect.ts
rename to frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/connect.ts
index 57777acae45e7..726a426f1c18a 100644
--- a/frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/connect.ts
+++ b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/connect.ts
@@ -1,10 +1,9 @@
-import { useCallback, useState } from "react";
+import { useCallback, useEffect, useState } from "react";
import { track } from "src/common/analytics";
import { EVENTS } from "src/common/analytics/events";
import { EmbeddingButtonProps } from "./types";
-import { Project } from "src/common/queries/censusDirectory";
-import { StaticProject } from "census-projects.json";
import { getProjectTier } from "src/views/CensusDirectory/utils";
+import { UnionProject } from "../../../types";
// The div contains two lines of the word copy
const NUMBER_OF_EXTRA_LINES = 2;
@@ -13,13 +12,18 @@ const NUMBER_OF_PADDING_LINES = 1;
// Total amount of padding around the highlighted line
const LINE_HIGHLIGHT_BACKGROUND_PADDING = 8;
-function pythonCodeSnippet(project: StaticProject | Project): string {
+const MAINTAINED_PYTHON_NOTEBOOK_LINK =
+ "https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_access_maintained_embeddings.html";
+const MAINTAINED_R_NOTEBOOK_LINK =
+ "https://chanzuckerberg.github.io/cellxgene-census/r/articles/census_access_maintained_embeddings.html";
+const HOSTED_PYTHON_NOTEBOOK_LINK =
+ "https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_embedding.html";
+
+function pythonCodeSnippet(project: UnionProject, uri: string): string {
const censusVersion = project.census_version;
const organism = project.experiment_name;
const measurement = project.measurement_name;
- const uri = `"s3://cellxgene-contrib-archive/contrib/cell-census/${project.id}"`;
-
return project.tier === "maintained"
? ` import cellxgene_census
@@ -28,25 +32,46 @@ function pythonCodeSnippet(project: StaticProject | Project): string {
census,
organism = "${organism}",
measurement_name = "${measurement}",
- obs_value_filter = "tissue == 'tongue'",
- obsm_layers = "${project.obs_matrix}"
+ obs_value_filter = "tissue_general == 'central nervous system'",
+ obsm_layers = ["${project.obsm_layer}"]
)`
: ` import cellxgene_census
from cellxgene_census.experimental import get_embedding
- embedding_uri = ${uri}
+ embedding_uri = \\
+ "${uri}"
census = cellxgene_census.open_soma(census_version="${censusVersion}")
adata = cellxgene_census.get_anndata(
census,
organism = "${organism}",
measurement_name = "${measurement}",
- obs_value_filter = "tissue == 'tongue'",
+ obs_value_filter = "tissue_general == 'central nervous system'",
)
- embeddings = get_embedding("${censusVersion}", embedding_uri, adata.obs["soma_joinid"])
+ embeddings = get_embedding("${censusVersion}", embedding_uri, adata.obs["soma_joinid"].to_numpy())
adata.obsm["emb"] = embeddings`;
}
+function rCodeSnippet(project: UnionProject): string {
+ const censusVersion = project.census_version;
+ const organism = project.experiment_name;
+
+ return project.tier === "maintained"
+ ? ` library("cellxgene.census")
+ library("Seurat")
+
+ census <- open_soma(census_version = "${censusVersion}")
+ seurat_obj <- get_seurat(
+ census,
+ organism = "${organism}",
+ obs_value_filter = "tissue_general == 'central nervous system'",
+ obs_column_names = c("cell_type"),
+ obsm_layers = c("${project.obsm_layer}")
+ )
+ `
+ : "";
+}
+
export const useConnect = ({ project }: EmbeddingButtonProps) => {
const [isOpen, setIsOpen] = useState(false);
const [isCopied, setIsCopied] = useState(false);
@@ -65,10 +90,12 @@ export const useConnect = ({ project }: EmbeddingButtonProps) => {
setIsOpen(!isOpen);
}, [isOpen, projectTier, project.title]);
- const codeSnippet = language === "python" ? pythonCodeSnippet(project) : "";
+ const uri = `s3://cellxgene-contrib-public/contrib/cell-census/soma/${project.census_version}/${project.id}`;
- // These can be derived from the static S3 namespace + the accessor_id or will be a static url provided in json blob
- const uri = `s3://cellxgene-contrib-archive/contrib/cell-census/${project.id}`;
+ const codeSnippet =
+ language === "python"
+ ? pythonCodeSnippet(project, uri)
+ : rCodeSnippet(project);
const codeSnippetRef = useCallback(
(node: HTMLDivElement) => {
@@ -83,8 +110,7 @@ export const useConnect = ({ project }: EmbeddingButtonProps) => {
const lineIndex = lines.findIndex((line: string) => line.includes(uri));
setURITopPosition(
- newLineHeight * (lineIndex + 1) +
- NUMBER_OF_PADDING_LINES +
+ newLineHeight * (lineIndex + 1 + NUMBER_OF_PADDING_LINES) +
LINE_HIGHLIGHT_BACKGROUND_PADDING / 2
);
setLineHeight(newLineHeight + LINE_HIGHLIGHT_BACKGROUND_PADDING);
@@ -93,6 +119,21 @@ export const useConnect = ({ project }: EmbeddingButtonProps) => {
[uri]
);
+ const [notebookLink, setNotebookLink] = useState("");
+ useEffect(() => {
+   if (projectTier === "maintained") {
+     setNotebookLink(
+       language === "python"
+         ? MAINTAINED_PYTHON_NOTEBOOK_LINK
+         : MAINTAINED_R_NOTEBOOK_LINK
+     );
+   } else {
+     // No hosted R notebook link is defined, so reset to the initial empty
+     // value instead of keeping a link chosen for a previous language.
+     setNotebookLink(language === "python" ? HOSTED_PYTHON_NOTEBOOK_LINK : "");
+   }
+ }, [language, projectTier]);
+
const handleCopyMouseEnter = () => setIsCopied(false);
return {
@@ -104,6 +145,7 @@ export const useConnect = ({ project }: EmbeddingButtonProps) => {
uri,
uriTopPosition,
lineHeight,
+ notebookLink,
codeSnippetRef,
setLanguage,
handleButtonClick,
diff --git a/frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/index.tsx b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/index.tsx
similarity index 84%
rename from frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/index.tsx
rename to frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/index.tsx
index b2c9232166339..4b0a9e251a65f 100644
--- a/frontend/src/views/CensusDirectory/components/Project/components/EmbeddingButton/index.tsx
+++ b/frontend/src/views/CensusDirectory/components/Project/ProjectButtons/components/EmbeddingButton/index.tsx
@@ -8,18 +8,18 @@ import { track } from "src/common/analytics";
import { EVENTS } from "src/common/analytics/events";
import Highlight from "react-highlight";
import { RadioGroup } from "@mui/material";
-import Link from "next/link";
import { StyledDialogContent, Label, CodeSnippet, Break } from "./style";
-import { StyledButton } from "src/components/CreateCollectionModal/style";
+import { StyledButton } from "../../style";
function EmbeddingButton(props: EmbeddingButtonProps) {
- const { project } = props;
+ const { project, uniqueMetadata } = props;
const {
isOpen,
language,
codeSnippet,
projectTier,
uri,
+ notebookLink,
codeSnippetRef,
uriTopPosition,
lineHeight,
@@ -49,7 +49,11 @@ function EmbeddingButton(props: EmbeddingButtonProps) {
row
>
-
+