diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py index 3b33977cd..97fd23129 100644 --- a/chord_metadata_service/patients/tests/test_api.py +++ b/chord_metadata_service/patients/tests/test_api.py @@ -2,6 +2,7 @@ import json import csv import io +import random from django.conf import settings from django.urls import reverse from django.test import override_settings @@ -317,19 +318,22 @@ class PublicFilteringIndividualsTest(APITestCase): """ Test for api/public GET filtering """ response_threshold = CONFIG_PUBLIC_TEST["rules"]["count_threshold"] - random_range = 137 + num_individuals = 137 + random_seed = 341 # do not change this please :)) @staticmethod def response_threshold_check(response): return response['count'] if 'count' in response else settings.INSUFFICIENT_DATA_AVAILABLE def setUp(self): - individuals = [c.generate_valid_individual() for _ in range(self.random_range)] # random range + individuals = [c.generate_valid_individual() for _ in range(self.num_individuals)] for individual in individuals: Individual.objects.create(**individual) p = ph_m.Procedure.objects.create(**ph_c.VALID_PROCEDURE_1) ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(Individual.objects.all()[0], p)) + random.seed(self.random_seed) + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) def test_public_filtering_sex(self): # sex field search @@ -589,6 +593,20 @@ def test_public_filtering_mapping_for_search_filter(self): self.assertEqual(1, response_obj["count"]) +class PublicFilteringIndividualsTestSmallCellCount(PublicFilteringIndividualsTest): + num_individuals = 3 # below configured test threshold + # rest of the methods are inherited + + @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST) + def test_public_overview_sex(self): + response = self.client.get('/api/public_search_fields') + self.assertEqual(response.status_code, status.HTTP_200_OK) + response_obj = response.json() + + # overview for sex should have 0 entries due to small cell counts + self.assertEqual(len(response_obj["sections"][0]["fields"][0]["options"]), 0) # path to sex field + + class PublicAgeRangeFilteringIndividualsTest(APITestCase): """ Test for api/public GET filtering """ diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py index be0160657..0e175327e 100644 --- a/chord_metadata_service/restapi/utils.py +++ b/chord_metadata_service/restapi/utils.py @@ -579,6 +579,9 @@ def get_field_options(field_props: dict) -> list[Any]: options = field_props["config"].get("enum") # Special case: no list of values specified if options is None: + # We must be careful here not to leak 'small cell' values as options + # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this + # should be treated as if the field isn't in the database at all. options = get_distinct_field_values(field_props) elif field_props["datatype"] == "number": options = [label for floor, ceil, label in labelled_range_generator(field_props)] @@ -594,8 +597,15 @@ def get_field_options(field_props: dict) -> list[Any]: def get_distinct_field_values(field_props: dict) -> list[Any]: + # We must be careful here not to leak 'small cell' values as options + # - e.g., if there are three individuals with sex=UNKNOWN_SEX, this + # should be treated as if the field isn't in the database at all. + model, field = get_model_and_field(field_props["mapping"]) - return model.objects.values_list(field, flat=True).order_by(field).distinct() + threshold = get_threshold() + + values_with_counts = model.objects.values_list(field).annotate(count=Count(field, distinct=True)) + return [val for val, count in values_with_counts if count > threshold] def filter_queryset_field_value(qs, field_props, value: str):