Skip to content

Commit

Permalink
feat(schema4): Filter out comma-delimited ethnicity term IDs from input datastructures (#6126)
Browse files Browse the repository at this point in the history
  • Loading branch information
prathapsridharan authored Nov 2, 2023
1 parent c8a0f86 commit db66f11
Show file tree
Hide file tree
Showing 4 changed files with 518 additions and 2 deletions.
66 changes: 66 additions & 0 deletions backend/wmg/api/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def primary_filter_dimensions():
@tracer.wrap()
def query():
request = connexion.request.json
sanitize_api_query_dict(request["filter"])

is_rollup = request.get("is_rollup", True)
compare = request.get("compare", None)

Expand Down Expand Up @@ -81,6 +83,13 @@ def query():

cell_counts = q.cell_counts(criteria, compare_dimension=compare)

# For schema-4 we filter out comma-delimited values for `self_reported_ethnicity_ontology_term_id`
# from being included in the grouping and rollup logic per functional requirements:
# See: https://github.com/chanzuckerberg/single-cell/issues/596
if (compare is not None) and compare == "self_reported_ethnicity_ontology_term_id":
expression_summary = df_not_containing_comma_delimited_ethnicity_values(expression_summary)
cell_counts = df_not_containing_comma_delimited_ethnicity_values(cell_counts)

with ServerTiming.time("build response"):
if expression_summary.shape[0] > 0 or cell_counts.shape[0] > 0:
group_by_terms = ["tissue_ontology_term_id", "cell_type_ontology_term_id", compare] if compare else None
Expand Down Expand Up @@ -113,6 +122,8 @@ def query():
@tracer.wrap()
def filters():
request = connexion.request.json
sanitize_api_query_dict(request["filter"])

criteria = WmgFiltersQueryCriteria(**request["filter"])

with ServerTiming.time("load snapshot"):
Expand Down Expand Up @@ -167,6 +178,55 @@ def markers():
)


def df_not_containing_comma_delimited_ethnicity_values(input_df: DataFrame) -> DataFrame:
    """
    Return a new dataframe with only the rows that DO NOT contain comma-delimited
    values in the `self_reported_ethnicity_ontology_term_id` column.

    Rows whose ethnicity value is missing (NaN/None) contain no comma and are
    therefore kept. The input dataframe is not modified.

    Parameters
    ----------
    input_df: Dataframe
        A dataframe that contains `self_reported_ethnicity_ontology_term_id` column

    Returns
    -------
    A dataframe containing only the rows that do not have a comma-delimited value
    for the `self_reported_ethnicity_ontology_term_id` column
    """
    ethnicity_term_ids = input_df["self_reported_ethnicity_ontology_term_id"]

    # `regex=False`: the comma is a literal separator, not a pattern.
    # `na=False`: treat missing values as "no comma" so the mask is strictly
    # boolean; without it the mask may contain missing entries and `~` fails.
    has_comma = ethnicity_term_ids.str.contains(",", regex=False, na=False)
    return input_df[~has_comma]


def sanitize_api_query_dict(query_dict: Any):
    """
    Strip unsupported values from the dictionary that encodes the query API
    request body.

    Currently this only affects `self_reported_ethnicity_ontology_term_ids`:
    any entry containing a comma is dropped, because WMG does not support
    filtering or grouping by mixed ethnicities encoded as a single
    comma-delimited string.

    This is expected to be called at the start of an API function, before the
    request body is used.

    NOTE: This is a destructive operation in that it mutates `query_dict`.

    Parameters
    ----------
    query_dict : json object
        The query dictionary to sanitize.

    Returns
    -------
    None because this function mutates the function argument
    """
    ethnicity_key = "self_reported_ethnicity_ontology_term_ids"

    # Nothing to sanitize when the caller did not filter on ethnicity.
    if ethnicity_key not in query_dict:
        return

    # Rebind the key to a fresh list holding only the single-term IDs.
    query_dict[ethnicity_key] = [term_id for term_id in query_dict[ethnicity_key] if "," not in term_id]


def fetch_datasets_metadata(snapshot: WmgSnapshot, dataset_ids: Iterable[str]) -> List[Dict]:
return [
snapshot.dataset_metadata.get(dataset_id, dict(id=dataset_id, label="", collection_id="", collection_label=""))
Expand Down Expand Up @@ -218,6 +278,12 @@ def build_filter_dims_values(criteria: WmgFiltersQueryCriteria, snapshot: WmgSna
else find_dim_option_values(criteria, snapshot, dim)
)

# For schema-4 we filter out comma-delimited values for `self_reported_ethnicity_ontology_term_id`
# from the options list per functional requirements:
# See: https://github.com/chanzuckerberg/single-cell/issues/596
ethnicity_term_ids = dims["self_reported_ethnicity_ontology_term_id"]
dims["self_reported_ethnicity_ontology_term_id"] = [term_id for term_id in ethnicity_term_ids if "," not in term_id]

response_filter_dims_values = dict(
datasets=fetch_datasets_metadata(snapshot, dims["dataset_id"]),
disease_terms=build_ontology_term_id_label_mapping(dims["disease_ontology_term_id"]),
Expand Down
50 changes: 50 additions & 0 deletions tests/unit/backend/wmg/api/common/test_wmg_api_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""This module tests the helper functions defined in `backend.wmg.api.v2`.
"""

import pytest

from backend.wmg.api.v2 import sanitize_api_query_dict


@pytest.mark.parametrize(
    "input_query,expected_sanitized_query",
    [
        # Only single-term ethnicity IDs: nothing is removed.
        (
            {
                "organism_ontology_term_id": "NCBITaxon:9606",
                "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"],
            },
            {
                "organism_ontology_term_id": "NCBITaxon:9606",
                "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"],
            },
        ),
        # Mix of single-term and comma-delimited IDs: only the single-term ID survives.
        (
            {
                "organism_ontology_term_id": "NCBITaxon:9606",
                "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008", "HANCESTRO:0008,HANCESTRO:0021"],
            },
            {
                "organism_ontology_term_id": "NCBITaxon:9606",
                "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008"],
            },
        ),
        # Only comma-delimited IDs: the list becomes empty but the key is kept.
        (
            {
                "organism_ontology_term_id": "NCBITaxon:9606",
                "self_reported_ethnicity_ontology_term_ids": ["HANCESTRO:0008,HANCESTRO:0021"],
            },
            {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []},
        ),
        # Empty ethnicity list: left untouched.
        (
            {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []},
            {"organism_ontology_term_id": "NCBITaxon:9606", "self_reported_ethnicity_ontology_term_ids": []},
        ),
        # No ethnicity key at all: the query is left untouched.
        ({"organism_ontology_term_id": "NCBITaxon:9606"}, {"organism_ontology_term_id": "NCBITaxon:9606"}),
    ],
)
def test_sanitize_api_query_dict(input_query, expected_sanitized_query):
    """Check that comma-delimited ethnicity term IDs are removed from the query dict in place."""
    # NOTE: `sanitize_api_query_dict()` mutates the function argument
    sanitize_api_query_dict(input_query)

    assert input_query == expected_sanitized_query
Loading

0 comments on commit db66f11

Please sign in to comment.