diff --git a/backend/wmg/api/v2.py b/backend/wmg/api/v2.py
index a36af2daafd44..3caaf13ce5eca 100644
--- a/backend/wmg/api/v2.py
+++ b/backend/wmg/api/v2.py
@@ -47,6 +47,8 @@ def primary_filter_dimensions():
 @tracer.wrap()
 def query():
     request = connexion.request.json
+    sanitize_api_query_dict(request["filter"])
+
     is_rollup = request.get("is_rollup", True)
     compare = request.get("compare", None)
 
@@ -81,6 +83,13 @@ def query():
 
         cell_counts = q.cell_counts(criteria, compare_dimension=compare)
 
+    # For schema-4 we filter out comma-delimited values for `self_reported_ethnicity_ontology_term_id`
+    # from being included in the grouping and rollup logic per functional requirements:
+    # See: https://github.com/chanzuckerberg/single-cell/issues/596
+    if (compare is not None) and compare == "self_reported_ethnicity_ontology_term_id":
+        expression_summary = df_not_containing_comma_delimited_ethnicity_values(expression_summary)
+        cell_counts = df_not_containing_comma_delimited_ethnicity_values(cell_counts)
+
     with ServerTiming.time("build response"):
         if expression_summary.shape[0] > 0 or cell_counts.shape[0] > 0:
             group_by_terms = ["tissue_ontology_term_id", "cell_type_ontology_term_id", compare] if compare else None
@@ -113,6 +122,8 @@ def query():
 @tracer.wrap()
 def filters():
     request = connexion.request.json
+    sanitize_api_query_dict(request["filter"])
+
     criteria = WmgFiltersQueryCriteria(**request["filter"])
 
     with ServerTiming.time("load snapshot"):
@@ -167,6 +178,55 @@ def markers():
     )
 
 
+def df_not_containing_comma_delimited_ethnicity_values(input_df: DataFrame) -> DataFrame:
+    """
+    Return a new dataframe with only the rows that DO NOT contain comma-delimited
+    values in the `self_reported_ethnicity_ontology_term_id` column.
+
+    Parameters
+    ----------
+    input_df: DataFrame
+        A dataframe that contains the `self_reported_ethnicity_ontology_term_id` column
+
+    Returns
+    -------
+    A dataframe containing only the rows that do not have a comma-delimited value
+    for the `self_reported_ethnicity_ontology_term_id` column
+    """
+    return input_df[~input_df.self_reported_ethnicity_ontology_term_id.str.contains(",")]
+
+
+def sanitize_api_query_dict(query_dict: Any):
+    """
+    Remove invalid values in the query dictionary encoding the query API
+    request body.
+
+    The assumption is that this function is called at the beginning of the
+    API function. This usage also helps mitigate query injection attacks.
+
+    NOTE: This is a destructive operation in that it mutates `query_dict`.
+
+    Parameters
+    ----------
+    query_dict : json object
+        The query dictionary to sanitize.
+
+    Returns
+    -------
+    None because this function mutates the function argument
+    """
+
+    # Sanitize `self_reported_ethnicity_ontology_term_ids` by removing
+    # comma-delimited values because WMG does not support filtering and grouping
+    # by ethnicity terms that encode mixed ethnicities as a single comma-delimited
+    # string value
+    if "self_reported_ethnicity_ontology_term_ids" in query_dict:
+        ethnicity_term_ids = query_dict["self_reported_ethnicity_ontology_term_ids"]
+
+        ethnicity_term_ids_to_keep = [x for x in ethnicity_term_ids if "," not in x]
+        query_dict["self_reported_ethnicity_ontology_term_ids"] = ethnicity_term_ids_to_keep
+
+
 def fetch_datasets_metadata(snapshot: WmgSnapshot, dataset_ids: Iterable[str]) -> List[Dict]:
     return [
         snapshot.dataset_metadata.get(dataset_id, dict(id=dataset_id, label="", collection_id="", collection_label=""))
@@ -218,6 +278,12 @@ def build_filter_dims_values(criteria: WmgFiltersQueryCriteria, snapshot: WmgSna
             else find_dim_option_values(criteria, snapshot, dim)
         )
 
+    # For schema-4 we filter out comma-delimited values for `self_reported_ethnicity_ontology_term_id`
+    # from the options list per functional requirements:
+    # See: https://github.com/chanzuckerberg/single-cell/issues/596
+    ethnicity_term_ids = dims["self_reported_ethnicity_ontology_term_id"]
+    dims["self_reported_ethnicity_ontology_term_id"] = [term_id for term_id in ethnicity_term_ids if "," not in term_id]
+
     response_filter_dims_values = dict(
         datasets=fetch_datasets_metadata(snapshot, dims["dataset_id"]),
         disease_terms=build_ontology_term_id_label_mapping(dims["disease_ontology_term_id"]),
diff --git a/tests/unit/backend/wmg/api/common/test_wmg_api_helpers.py b/tests/unit/backend/wmg/api/common/test_wmg_api_helpers.py
new file mode 100644
index 0000000000000..4e0c17743e2bc
--- /dev/null
+++ b/tests/unit/backend/wmg/api/common/test_wmg_api_helpers.py
@@ -0,0 +1,50 @@
+"""This module tests the helper functions used in `backend.wmg.api.v2.py`.
+""" + +import pytest + +from backend.wmg.api.v2 import sanitize_api_query_dict + + +@pytest.mark.parametrize( + "input_query,expected_sanitized_query", + [ + ( + { + "organism_ontology_term_id": "NCBITaxon:9606", + "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"], + }, + { + "organism_ontology_term_id": "NCBITaxon:9606", + "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"], + }, + ), + ( + { + "organism_ontology_term_id": "NCBITaxon:9606", + "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008", "HANCESTRO:0008,HANCESTOR:0021"], + }, + { + "organism_ontology_term_id": "NCBITaxon:9606", + "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"], + }, + ), + ( + { + "organism_ontology_term_id": "NCBITaxon:9606", + "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008,HANCESTOR:0021"], + }, + {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []}, + ), + ( + {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []}, + {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []}, + ), + ({"organism_ontology_term_id": "NCBITaxon:9606"}, {"organism_ontology_term_id": "NCBITaxon:9606"}), + ], +) +def test_sanitize_api_query_dict(input_query, expected_sanitized_query): + # NOTE: `sanitize_api_query_dict()` mutates the function argument + sanitize_api_query_dict(input_query) + + assert input_query == expected_sanitized_query diff --git a/tests/unit/backend/wmg/api/test_v2.py b/tests/unit/backend/wmg/api/test_v2.py index 39c938e8f81cc..410e9c3ed47b5 100644 --- a/tests/unit/backend/wmg/api/test_v2.py +++ b/tests/unit/backend/wmg/api/test_v2.py @@ -1,5 +1,6 @@ import json import unittest +from typing import Dict, List from unittest.mock import patch from pytest import approx @@ -25,13 +26,13 @@ exclude_dev_stage_and_ethnicity_for_secondary_filter_test, forward_cell_type_ordering, load_realistic_test_snapshot, + ont_term_id_gen_schema4_ethnicity_variation, reverse_cell_type_ordering, ) from tests.unit.backend.wmg.test_query import generate_expected_marker_gene_data_with_pandas TEST_SNAPSHOT = "realistic-test-snapshot" - # this should only be used for generating expected outputs when using the test snapshot (see test_snapshot.py) def generate_expected_term_id_labels_dictionary( *, @@ -298,6 +299,133 @@ def generate_test_inputs_and_expected_outputs( ) +def gen_expected_output_ethnicity_compare_dim( + *, + genes: list[str], + cell_types: list[str], + tissues: list[str], + ethnicities: list[str], + dim_size: int, + me: float, + cell_count_per_row_cell_counts_cube: int, + cell_ordering_func=forward_cell_type_ordering, +) -> tuple: + """ + Generates expected outputs for the /wmg/v2/query endpoint, + where 'self_reported_ethnicity_ontology_term_id' is the compare dimension AND + grouping by 'self_reported_ethnicity_ontology_term_id' is done against an + EXPECTED list of unique values for 'self_reported_ethnicity_ontology_term_id'. + + This is especially useful to test against schema-4 values for ethnicity where + comma-delimited values should be excluded from the grouping. This function could + could be more generally useful for testing when expected grouping by the + ethnicity terms should be done against an expected list of values. 
+
+    Arguments
+    ---------
+    genes: list of gene ontology term IDs
+
+    cell_types: list of cell_type ontology term IDs
+
+    tissues: list of tissue ontology term IDs
+
+    ethnicities: list of ethnicity ontology term IDs (the compare_dim terms)
+
+    dim_size: size of each dimension of the test cube
+
+    me: mean expression value to use for each (gene, tissue, cell_type) combination (scalar)
+
+    cell_count_per_row_cell_counts_cube: num of cells per row in cell_counts cube (scalar)
+
+    Returns
+    -------
+    tuple of (expected_expression_summary, expected_term_id_labels)
+
+    expected_expression_summary: dictionary containing the expected expression summary values
+
+    expected_term_id_labels: dictionary containing the expected term ID labels
+    """
+
+    expected_combinations_per_tissue = 1
+
+    # It is possible that len(ethnicities) != dim_size because
+    # we remove comma-delimited ethnicity ontology term IDs from
+    # downstream processing like grouping and rollup.
+    for dim in expression_summary_non_indexed_dims:
+        if dim != "self_reported_ethnicity_ontology_term_id":
+            expected_combinations_per_tissue *= dim_size
+        else:
+            expected_combinations_per_tissue *= len(ethnicities)
+
+    cell_count_tissue = cell_count_per_row_cell_counts_cube * expected_combinations_per_tissue
+
+    expected_combinations_per_tissue_cell_type = expected_combinations_per_tissue // len(cell_types)
+    nnz_gene_tissue_cell_type = expected_combinations_per_tissue_cell_type
+    cell_count_tissue_cell_type = expected_combinations_per_tissue_cell_type * cell_count_per_row_cell_counts_cube
+    cell_counts_tissue_cell_type_ethnicity = cell_count_tissue_cell_type // len(ethnicities)
+    nnz_gene_tissue_cell_type_ethnicity = nnz_gene_tissue_cell_type // len(ethnicities)
+
+    expected_term_id_labels = generate_expected_term_id_labels_dictionary(
+        genes=genes,
+        tissues=tissues,
+        cell_types=cell_types,
+        cell_count_tissue_cell_type=cell_count_tissue_cell_type,
+        compare_terms=ethnicities,
+        cell_counts_tissue_cell_type_compare_dim=cell_counts_tissue_cell_type_ethnicity,
+        cell_ordering_func=cell_ordering_func,
+    )
+    expected_expression_summary = generate_expected_expression_summary_dictionary(
+        genes=genes,
+        tissues=tissues,
+        cell_count_tissue=cell_count_tissue,
+        cell_types=cell_types,
+        cell_count_tissue_cell_type=cell_count_tissue_cell_type,
+        nnz_gene_tissue_cell_type=nnz_gene_tissue_cell_type,
+        compare_terms=ethnicities,
+        cell_counts_tissue_cell_type_compare_dim=cell_counts_tissue_cell_type_ethnicity,
+        nnz_gene_tissue_cell_type_compare_dim=nnz_gene_tissue_cell_type_ethnicity,
+        me=me,
+    )
+
+    return (
+        expected_expression_summary,
+        expected_term_id_labels,
+    )
+
+
+def sort_filter_options(filter_options: Dict[str, List[Dict[str, str]]]):
+    """
+    This utility function sorts the data structure of ontology term IDs
+    to enable equality checks in test assertions.
+
+    NOTE: This function mutates `filter_options` by sorting it in-place
+
+    Parameters
+    ----------
+    filter_options : An object that contains values for each dimension
+
+    Returns
+    -------
+    None because this function mutates the function argument
+    """
+    for key, value_list in filter_options.items():
+        if key != "datasets":
+            # The value_list for keys != "datasets" contains
+            # a list of dictionaries where each dictionary is
+            # a SINGLE key-value pair. Therefore, we sort the
+            # list of dictionaries by their key.
+            # We do this by extracting a tuple of keys for each
+            # dictionary. Since these dictionaries only contain a
+            # single key-value pair, the length of the tuple is
+            # guaranteed to be 1
+            value_list.sort(key=lambda x: tuple(x)[0])
+        else:
+            # for value_list of keys == "datasets", sort
+            # the value_list by the value of the "id" key
+            # in each dictionary in the value_list
+            value_list.sort(key=lambda x: x["id"])
+
+
 # TODO(prathap): Write tests that mock backend.wmg.api.v2.get_dot_plot_data() and
 # backend.wmg.api.v2.rollup() so that we can test backend.wmg.api.v2.query() with
 # rollup operations.
@@ -394,6 +522,7 @@ def test__query_single_primary_dims__returns_200_and_correct_response(
         dim_size = 1
         with create_temp_wmg_snapshot(
             dim_size=dim_size,
+            dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation,
             expression_summary_vals_fn=all_ones_expression_summary_values,
             cell_counts_generator_fn=all_tens_cell_counts_values,
         ) as snapshot:
@@ -423,8 +552,11 @@ def test__query_single_primary_dims__returns_200_and_correct_response(
                 "expression_summary": expected_expression_summary,
                 "term_id_labels": expected_term_id_labels,
             }
+
+            actual_response = json.loads(response.data)
+
             self.assert_equality_nested_dict_with_floats(
-                expected=expected_response, actual=json.loads(response.data), key_path=[]
+                expected=expected_response, actual=actual_response, key_path=[]
             )
 
     @patch("backend.wmg.api.v2.gene_term_label")
     @patch("backend.wmg.api.v2.ontology_term_label")
     @patch("backend.wmg.api.v2.load_snapshot")
@@ -436,6 +568,7 @@ def test__query_no_genes__returns_200_and_correct_response(
         self, load_snapshot, ontology_term_label, gene_term_label
     ):
         dim_size = 3
         with create_temp_wmg_snapshot(
             dim_size=dim_size,
+            dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation,
             expression_summary_vals_fn=all_ones_expression_summary_values,
             cell_counts_generator_fn=all_tens_cell_counts_values,
         ) as snapshot:
@@ -481,6 +614,7 @@ def test__query_request_multi_primary_dims_only__returns_200_and_correct_respons
         dim_size = 3
         with create_temp_wmg_snapshot(
             dim_size=dim_size,
+            dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation,
             expression_summary_vals_fn=all_ones_expression_summary_values,
             cell_counts_generator_fn=all_tens_cell_counts_values,
         ) as snapshot:
@@ -558,6 +692,71 @@ def test__query_request_multi_primary_dims_only_with_compare__returns_200_and_co
                 expected=expected, actual=json.loads(response.data), key_path=[]
             )
 
+    @patch("backend.wmg.api.v2.gene_term_label")
+    @patch("backend.wmg.api.v2.ontology_term_label")
+    @patch("backend.wmg.api.v2.load_snapshot")
+    def test__schema4__query_request_with_compare_by_ethnicity__excludes_comma_delimited_ethnicity_values(
+        self, load_snapshot, ontology_term_label, gene_term_label
+    ):
+        dim_size = 3
+        with create_temp_wmg_snapshot(
+            dim_size=dim_size,
+            dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation,
+            expression_summary_vals_fn=all_ones_expression_summary_values,
+            cell_counts_generator_fn=all_tens_cell_counts_values,
+        ) as snapshot:
+            # set up API endpoints to use a mocked cube containing all stat values of 1, for a deterministic
+            # expected query response
+            load_snapshot.return_value = snapshot
+            ontology_term_label.side_effect = lambda ontology_term_id: f"{ontology_term_id}_label"
+            gene_term_label.side_effect = lambda gene_term_id: f"{gene_term_id}_label"
+
+            genes = ["gene_ontology_term_id_0", "gene_ontology_term_id_2"]
+            organism = "organism_ontology_term_id_0"
+
+            cell_types = [f"cell_type_ontology_term_id_{i}" for i in range(dim_size)]
+
+            # WMG V2 API does not allow filtering by tissues and therefore the query result
+            # includes all tissues
+            all_tissues = [f"tissue_ontology_term_id_{i}" for i in range(dim_size)]
[f"tissue_ontology_term_id_{i}" for i in range(dim_size)] + + # The cube test data with comma-delimited ethnicities is generated such that + # if the number of values for the ethnicity column is N, it will generate + # (N - 1) single value ethnicity term IDs (suffixed with 0 to N-2) and + # 1 comma-delimited ethnicity term ID. + # Since we are excluding the the comma-delimited term ID from the grouping, the + # output is a function of the single valued ethnicity term IDs. + ethnicities = [f"self_reported_ethnicity_ontology_term_id_{i}" for i in range(dim_size - 1)] + + (expected_expression_summary, expected_term_id_labels) = gen_expected_output_ethnicity_compare_dim( + genes=genes, + cell_types=cell_types, + tissues=all_tissues, + ethnicities=ethnicities, + dim_size=dim_size, + me=1.0, + cell_count_per_row_cell_counts_cube=10, + ) + + request = { + "filter": dict(gene_ontology_term_ids=genes, organism_ontology_term_id=organism), + "compare": "self_reported_ethnicity", + } + + response = self.app.post("/wmg/v2/query", json=request) + + self.assertEqual(200, response.status_code) + + expected = { + "snapshot_id": "dummy-snapshot", + "expression_summary": expected_expression_summary, + "term_id_labels": expected_term_id_labels, + } + + self.assert_equality_nested_dict_with_floats( + expected=expected, actual=json.loads(response.data), key_path=[] + ) + @patch("backend.wmg.api.v2.gene_term_label") @patch("backend.wmg.api.v2.ontology_term_label") @patch("backend.wmg.api.v2.load_snapshot") @@ -567,6 +766,7 @@ def test__query_explicit_cell_ordering__returns_correct_cell_ordering( dim_size = 2 with create_temp_wmg_snapshot( dim_size=dim_size, + dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation, expression_summary_vals_fn=all_ones_expression_summary_values, cell_counts_generator_fn=all_tens_cell_counts_values, cell_ordering_generator_fn=reverse_cell_type_ordering, @@ -603,6 +803,7 @@ def test__query_total_cell_count_per_cell_type(self, load_snapshot, ontology_ter dim_size = 2 with create_temp_wmg_snapshot( dim_size=dim_size, + dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation, expression_summary_vals_fn=all_ones_expression_summary_values, cell_counts_generator_fn=lambda coords: all_X_cell_counts_values(coords, expected_count), cell_ordering_generator_fn=reverse_cell_type_ordering, @@ -758,6 +959,181 @@ def test__filter_request_with_filter_dims__returns_valid_filter_dims__base_case( } self.assertEqual(json.loads(response.data)["filter_dims"], expected_filters) + @patch("backend.wmg.api.v2.fetch_datasets_metadata") + @patch("backend.wmg.api.v2.gene_term_label") + @patch("backend.wmg.api.v2.ontology_term_label") + @patch("backend.wmg.api.v2.load_snapshot") + def test__schema4__filter_request_with_empty_criteria__excludes_comma_delimited_ethnicity_values( + self, load_snapshot, ontology_term_label, gene_term_label, fetch_datasets_metadata + ): + # In this test we expect comma-delimited ethnicity values to NOT BE INCLUDED + # in the list of generated options for ethnicities + dim_size = 3 + with create_temp_wmg_snapshot( + dim_size=dim_size, dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation + ) as snapshot: + ontology_term_label.side_effect = lambda ontology_term_id: f"{ontology_term_id}_label" + gene_term_label.side_effect = lambda gene_term_id: f"{gene_term_id}_label" + fetch_datasets_metadata.return_value = mock_datasets_metadata([f"dataset_id_{i}" for i in range(dim_size)]) + load_snapshot.return_value = snapshot 
+            filter_dict = dict(
+                cell_type_ontology_term_ids=[],
+                dataset_ids=[],
+                disease_ontology_term_ids=[],
+                development_stage_ontology_term_ids=[],
+                organism_ontology_term_id="organism_ontology_term_id_0",
+                publication_citations=[],
+                self_reported_ethnicity_ontology_term_ids=[],
+                sex_ontology_term_ids=[],
+                tissue_ontology_term_ids=[],
+            )
+
+            filter_request = dict(filter=filter_dict)
+
+            response = self.app.post("/wmg/v2/filters", json=filter_request)
+            actual_filter_options = json.loads(response.data)["filter_dims"]
+
+            # sorts 'actual_filter_options' in-place
+            sort_filter_options(actual_filter_options)
+
+            expected_filter_options = {
+                "cell_type_terms": [
+                    {"cell_type_ontology_term_id_0": "cell_type_ontology_term_id_0_label"},
+                    {"cell_type_ontology_term_id_1": "cell_type_ontology_term_id_1_label"},
+                    {"cell_type_ontology_term_id_2": "cell_type_ontology_term_id_2_label"},
+                ],
+                "datasets": [
+                    {
+                        "collection_id": "dataset_id_0_coll_id",
+                        "collection_label": "dataset_id_0_coll_name",
+                        "id": "dataset_id_0",
+                        "label": "dataset_id_0_name",
+                    },
+                    {
+                        "collection_id": "dataset_id_1_coll_id",
+                        "collection_label": "dataset_id_1_coll_name",
+                        "id": "dataset_id_1",
+                        "label": "dataset_id_1_name",
+                    },
+                    {
+                        "collection_id": "dataset_id_2_coll_id",
+                        "collection_label": "dataset_id_2_coll_name",
+                        "id": "dataset_id_2",
+                        "label": "dataset_id_2_name",
+                    },
+                ],
+                "development_stage_terms": [
+                    {"development_stage_ontology_term_id_0": "development_stage_ontology_term_id_0_label"},
+                    {"development_stage_ontology_term_id_1": "development_stage_ontology_term_id_1_label"},
+                    {"development_stage_ontology_term_id_2": "development_stage_ontology_term_id_2_label"},
+                ],
+                "disease_terms": [],
+                "publication_citations": [],
+                # NOTE: the generated ethnicity options do not include comma-delimited ethnicity values
+                # even though those values exist in the test cube
+                "self_reported_ethnicity_terms": [
+                    {"self_reported_ethnicity_ontology_term_id_0": "self_reported_ethnicity_ontology_term_id_0_label"},
+                    {"self_reported_ethnicity_ontology_term_id_1": "self_reported_ethnicity_ontology_term_id_1_label"},
+                ],
+                "sex_terms": [],
+                "tissue_terms": [
+                    {"tissue_ontology_term_id_0": "tissue_ontology_term_id_0_label"},
+                    {"tissue_ontology_term_id_1": "tissue_ontology_term_id_1_label"},
+                    {"tissue_ontology_term_id_2": "tissue_ontology_term_id_2_label"},
+                ],
+            }
+
+            self.assertEqual(actual_filter_options, expected_filter_options)
+
+    @patch("backend.wmg.api.v2.fetch_datasets_metadata")
+    @patch("backend.wmg.api.v2.gene_term_label")
+    @patch("backend.wmg.api.v2.ontology_term_label")
+    @patch("backend.wmg.api.v2.load_snapshot")
+    def test__schema4__filter_request_with_nonempty_criteria__excludes_comma_delimited_ethnicity_values(
+        self, load_snapshot, ontology_term_label, gene_term_label, fetch_datasets_metadata
+    ):
+        # In this test we expect comma-delimited ethnicity values to NOT BE INCLUDED
+        # in the list of generated options for ethnicities regardless of what filter options
+        # were selected.
+        dim_size = 3
+        with create_temp_wmg_snapshot(
+            dim_size=dim_size, dim_ontology_term_ids_generator_fn=ont_term_id_gen_schema4_ethnicity_variation
+        ) as snapshot:
+            ontology_term_label.side_effect = lambda ontology_term_id: f"{ontology_term_id}_label"
+            gene_term_label.side_effect = lambda gene_term_id: f"{gene_term_id}_label"
+            fetch_datasets_metadata.return_value = mock_datasets_metadata([f"dataset_id_{i}" for i in range(dim_size)])
+            load_snapshot.return_value = snapshot
+
+            # Non-empty selection criteria
+            filter_dict = dict(
+                cell_type_ontology_term_ids=["cell_type_ontology_term_id_2"],
+                dataset_ids=[],
+                disease_ontology_term_ids=[],
+                development_stage_ontology_term_ids=[],
+                organism_ontology_term_id="organism_ontology_term_id_0",
+                publication_citations=[],
+                self_reported_ethnicity_ontology_term_ids=["self_reported_ethnicity_ontology_term_id_0"],
+                sex_ontology_term_ids=[],
+                tissue_ontology_term_ids=["tissue_ontology_term_id_1"],
+            )
+
+            filter_request = dict(filter=filter_dict)
+
+            response = self.app.post("/wmg/v2/filters", json=filter_request)
+            actual_filter_options = json.loads(response.data)["filter_dims"]
+
+            # sorts 'actual_filter_options' in-place
+            sort_filter_options(actual_filter_options)
+
+            expected_filter_options = {
+                "cell_type_terms": [
+                    {"cell_type_ontology_term_id_0": "cell_type_ontology_term_id_0_label"},
+                    {"cell_type_ontology_term_id_1": "cell_type_ontology_term_id_1_label"},
+                    {"cell_type_ontology_term_id_2": "cell_type_ontology_term_id_2_label"},
+                ],
+                "datasets": [
+                    {
+                        "collection_id": "dataset_id_0_coll_id",
+                        "collection_label": "dataset_id_0_coll_name",
+                        "id": "dataset_id_0",
+                        "label": "dataset_id_0_name",
+                    },
+                    {
+                        "collection_id": "dataset_id_1_coll_id",
+                        "collection_label": "dataset_id_1_coll_name",
+                        "id": "dataset_id_1",
+                        "label": "dataset_id_1_name",
+                    },
+                    {
+                        "collection_id": "dataset_id_2_coll_id",
+                        "collection_label": "dataset_id_2_coll_name",
+                        "id": "dataset_id_2",
+                        "label": "dataset_id_2_name",
+                    },
+                ],
+                "development_stage_terms": [
+                    {"development_stage_ontology_term_id_0": "development_stage_ontology_term_id_0_label"},
+                    {"development_stage_ontology_term_id_1": "development_stage_ontology_term_id_1_label"},
+                    {"development_stage_ontology_term_id_2": "development_stage_ontology_term_id_2_label"},
+                ],
+                "disease_terms": [],
+                "publication_citations": [],
+                # NOTE: the generated ethnicity options do not include comma-delimited ethnicity values
+                # even though those values exist in the test cube
+                "self_reported_ethnicity_terms": [
+                    {"self_reported_ethnicity_ontology_term_id_0": "self_reported_ethnicity_ontology_term_id_0_label"},
+                    {"self_reported_ethnicity_ontology_term_id_1": "self_reported_ethnicity_ontology_term_id_1_label"},
+                ],
+                "sex_terms": [],
+                "tissue_terms": [
+                    {"tissue_ontology_term_id_0": "tissue_ontology_term_id_0_label"},
+                    {"tissue_ontology_term_id_1": "tissue_ontology_term_id_1_label"},
+                    {"tissue_ontology_term_id_2": "tissue_ontology_term_id_2_label"},
+                ],
+            }
+
+            self.assertEqual(actual_filter_options, expected_filter_options)
+
     @patch("backend.wmg.api.v2.fetch_datasets_metadata")
     @patch("backend.wmg.api.v2.gene_term_label")
     @patch("backend.wmg.api.v2.ontology_term_label")
     @patch("backend.wmg.api.v2.load_snapshot")
diff --git a/tests/unit/backend/wmg/fixtures/test_snapshot.py b/tests/unit/backend/wmg/fixtures/test_snapshot.py
index eb19e8f293172..1bd86859d3f1f 100644
--- a/tests/unit/backend/wmg/fixtures/test_snapshot.py
+++ b/tests/unit/backend/wmg/fixtures/test_snapshot.py
@@ -48,6 +48,27 @@ def simple_ontology_terms_generator(dimension_name: str, n_terms: int) -> List[s
     return [f"{dimension_name}_{i}" for i in range(n_terms)]
 
 
+def ont_term_id_gen_schema4_ethnicity_variation(dimension_name: str, n_terms: int) -> List[str]:
+    """
+    Generates ontology term IDs for all dimensions with special treatment for
+    `self_reported_ethnicity_ontology_term_id` to include the schema4-specific format.
+    """
+
+    # For schema4, `self_reported_ethnicity_ontology_term_id` can contain
+    # comma-delimited values. This scheme simply appends a predetermined
+    # comma-delimited value, "self_reported_ethnicity_ontology_term_id_x,self_reported_ethnicity_ontology_term_id_y",
+    # to the end of the list of simple ontology term IDs
+    if dimension_name == "self_reported_ethnicity_ontology_term_id":
+        schema4_term_id = "self_reported_ethnicity_ontology_term_id_x,self_reported_ethnicity_ontology_term_id_y"
+        # generate simple ontology term IDs for the first `n_terms-1` values
+        term_ids = simple_ontology_terms_generator(dimension_name, n_terms - 1)
+        term_ids.append(schema4_term_id)
+        return term_ids
+
+    # For all other dimensions compute a simple list of term IDs
+    return simple_ontology_terms_generator(dimension_name, n_terms)
+
+
 def semi_real_dimension_values_generator(dimension_name: str, dim_size: int) -> List[str]:
     """
     Returns a set of ontology term ids, sampled from real ontologies. While these ontology terms are
@@ -253,8 +274,10 @@ def load_realistic_test_snapshot_tmpdir(snapshot_name: str) -> WmgSnapshot:
 @contextlib.contextmanager
 def create_temp_wmg_snapshot(
+    *,
     dim_size=3,
     snapshot_name="dummy-snapshot",
+    dim_ontology_term_ids_generator_fn: Callable[[str, int], List[str]] = simple_ontology_terms_generator,
     expression_summary_vals_fn: Callable[[List[Tuple]], Dict[str, List]] = random_expression_summary_values,
     exclude_logical_coord_fn: Callable[[NamedTuple], bool] = None,
     cell_counts_generator_fn: Callable[[List[Tuple]], List] = random_cell_counts_values,
@@ -264,6 +287,7 @@ def create_temp_wmg_snapshot(
     expression_summary_cube_dir, cell_counts_cube_dir = create_cubes(
         cube_dir,
         dim_size,
+        dim_ontology_term_ids_generator_fn=dim_ontology_term_ids_generator_fn,
         exclude_logical_coord_fn=exclude_logical_coord_fn,
         expression_summary_vals_fn=expression_summary_vals_fn,
         cell_counts_fn=cell_counts_generator_fn,