From 3ff159d0e9c499b187a062ac12229e5ceccc868d Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 13:39:43 -0400 Subject: [PATCH 01/10] Update STR data pipeline for new data Major changes here are: * Instead of a single `reference_region`, STRs now have a list of `reference_regions` with a single one designated the `main_reference_region` * Allele size distributions and genotype distributions were previously represented with an attempt to represent multidimensional data with a number of nested structs, which was serviceable when there were only one or two dimensions we might want to filter on, but was getting increasingly convoluted. Since this new data expands the number of dimensions further, rather than build on the former schema and confuse things more, these distributions are now represented with a flattened list of structs each of which represents a single subset of the distribution. --- .../gnomad_v3_short_tandem_repeats.py | 529 ++++-------------- .../pipelines/export_to_elasticsearch.py | 2 +- .../gnomad_v3_short_tandem_repeats.py | 4 +- 3 files changed, 106 insertions(+), 429 deletions(-) diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py index e27fa872d..359f778f5 100644 --- a/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v3/gnomad_v3_short_tandem_repeats.py @@ -1,6 +1,4 @@ -import itertools import json -from collections import defaultdict import hail as hl @@ -8,12 +6,8 @@ def _parse_region_id(region_id): [chrom, position_range] = region_id.split(":") chrom = chrom[3:] - [start, stop] = map(int, position_range.split("-")) - return { - "chrom": chrom, - "start": start, - "stop": stop, - } + [start, stop] = list(map(int, position_range.split("-"))) + return {"chrom": chrom, "start": start, "stop": stop, "reference_genome": "GRCh38"} def _prepare_histogram(histogram): @@ -23,161 +17,6 @@ def _prepare_histogram(histogram): ) -def _population_sort_key(pop): - pop_id = pop["id"] - if pop_id == "XX" or pop_id == "XY": - return ("zzz", pop_id) - - if "_" in pop_id: - return tuple(pop_id.split("_")) - - return (pop_id, "") - - -def _get_total_histogram(histogram): - total = defaultdict(int) - for v in histogram.values(): - for k, n in v.items(): - total[k] += n - - return total - - -def _prepare_allele_size_distribution_populations(locus): - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountHistogram"].keys())) - - distributions = sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population - } - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population and k.split("/")[1] == "XX" - } - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[0] == population and k.split("/")[1] == "XY" - } - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_histogram( - _get_total_histogram( - {k: v for k, v in locus["AlleleCountHistogram"].items() if k.split("/")[1] == sex} - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - -def _prepare_allele_size_distribution_repeat_units(locus): - repeat_units = sorted(set(key.split("/")[2] for key in locus["AlleleCountHistogram"].keys())) - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountHistogram"].keys())) - - distributions = sorted( - [ - { - "repeat_unit": repeat_unit, - "distribution": _prepare_histogram( - _get_total_histogram( - {k: v for k, v in locus["AlleleCountHistogram"].items() if k.split("/")[2] == repeat_unit} - ) - ), - "populations": sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[2] == repeat_unit and k.split("/")[0] == population - } - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_histogram( - locus["AlleleCountHistogram"].get(f"{population}/XX/{repeat_unit}", {}) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_histogram( - locus["AlleleCountHistogram"].get(f"{population}/XY/{repeat_unit}", {}) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_histogram( - _get_total_histogram( - { - k: v - for k, v in locus["AlleleCountHistogram"].items() - if k.split("/")[2] == repeat_unit and k.split("/")[1] == sex - } - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ), - } - for repeat_unit in repeat_units - ], - key=lambda r: (len(r["repeat_unit"]), r["repeat_unit"]), - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - def _prepare_age_distribution(locus): age_bins = [ ("<20", None, 20), @@ -205,192 +44,6 @@ def _prepare_age_distribution(locus): ] -def _prepare_genotype_distribution_histogram(histogram): - return sorted( - ([*(int(n) for n in n_repeats.split("/")), n_samples] for n_repeats, n_samples in histogram.items()), - key=lambda value: (value[0], value[1]), - ) - - -def _filter_genotype_distribution_histogram(histogram, repeat_units=None, population=None, sex=None): - predicates = [] - if repeat_units: - predicates.append( - lambda key: tuple(sorted(key.split("/")[2:4])) in (repeat_units, tuple(reversed(repeat_units))) - ) - if population: - predicates.append(lambda key: key.split("/")[0] == population) - if sex: - predicates.append(lambda key: key.split("/")[1] == sex) - - filtered_histogram = {k: v for k, v in histogram.items() if all(predicate(k) for predicate in predicates)} - - if not repeat_units: - return filtered_histogram - - return dict( - itertools.chain( - ((k, v) for k, v in filtered_histogram.items() if tuple(k.split("/")[2:4]) == repeat_units), - ( - (f"{k}-reversed", {"/".join(reversed(vk.split("/"))): vv for vk, vv in v.items()}) - for k, v in filtered_histogram.items() - if tuple(k.split("/")[2:4]) == tuple(reversed(repeat_units)) - and tuple(k.split("/")[2:4]) != repeat_units - ), - ) - ) - - -def _prepare_genotype_distribution_populations(locus): - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountScatterPlot"].keys())) - - distributions = sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population - ) - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population, sex="XX" - ) - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], population=population, sex="XY" - ) - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram(locus["AlleleCountScatterPlot"], sex=sex) - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - -def _prepare_genotype_distribution_repeat_units(locus): - repeat_unit_pairs = sorted( - set(tuple(sorted(key.split("/")[2:4])) for key in locus["AlleleCountScatterPlot"].keys()) - ) - populations = sorted(set(key.split("/")[0] for key in locus["AlleleCountScatterPlot"].keys())) - - distributions = sorted( - [ - { - "repeat_units": list(repeat_unit_pair), - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], repeat_units=repeat_unit_pair - ) - ) - ), - "populations": sorted( - list( - itertools.chain.from_iterable( - [ - { - "id": population, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - ) - ) - ), - }, - { - "id": f"{population}_XX", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - sex="XX", - ) - ) - ), - }, - { - "id": f"{population}_XY", - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], - repeat_units=repeat_unit_pair, - population=population, - sex="XY", - ) - ) - ), - }, - ] - for population in populations - ) - ) - + [ - { - "id": sex, - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram( - _filter_genotype_distribution_histogram( - locus["AlleleCountScatterPlot"], repeat_units=repeat_unit_pair, sex=sex - ) - ) - ), - } - for sex in ["XX", "XY"] - ], - key=_population_sort_key, - ), - } - for repeat_unit_pair in repeat_unit_pairs - ], - key=lambda r: ( - len(r["repeat_units"][0]), - len(r["repeat_units"][1]), - r["repeat_units"][0], - r["repeat_units"][1], - ), - ) - - return [distribution for distribution in distributions if distribution["distribution"]] - - def _prepare_disease_repeat_size_classifications(disease): ranges = [] @@ -414,6 +67,63 @@ def _prepare_disease_repeat_size_classifications(disease): } +def _parse_allele_count_histogram_section(cohort_key, distribution): + [ancestry_group, sex, repunit, quality_description, q_score] = cohort_key.split("/") + return { + "ancestry_group": ancestry_group, + "sex": sex, + "repunit": repunit, + "quality_description": quality_description.lower(), + "q_score": float(q_score), + "distribution": [{"repunit_count": int(k), "frequency": v} for k, v in distribution.items()], + } + + +def _prepare_allele_size_distribution(allele_count_histogram): + return [_parse_allele_count_histogram_section(k, v) for (k, v) in allele_count_histogram.items()] + + +def _parse_allele_scatter_plot_item(item): + (key, value) = item + [short_allele_repunit_count, long_allele_repunit_count] = key.split("/") + return { + "short_allele_repunit_count": int(short_allele_repunit_count), + "long_allele_repunit_count": int(long_allele_repunit_count), + "frequency": value, + } + + +def _parse_allele_scatter_plot_distribution(distribution): + return list(map(_parse_allele_scatter_plot_item, distribution.items())) + + +def _parse_allele_scatter_plot_histogram_section(cohort_key, distribution): + [ancestry_group, sex, short_allele_repunit, long_allele_repunit, quality_description, q_score] = cohort_key.split( + "/" + ) + return { + "ancestry_group": ancestry_group, + "sex": sex, + "short_allele_repunit": short_allele_repunit, + "long_allele_repunit": long_allele_repunit, + "quality_description": quality_description, + "q_score": float(q_score), + "distribution": _parse_allele_scatter_plot_distribution(distribution), + } + + +def _prepare_genotype_distribution(allele_scatter_plot_histogram): + return [_parse_allele_scatter_plot_histogram_section(k, v) for k, v in allele_scatter_plot_histogram.items()] + + +def _parse_reference_regions(regions): + # "regions" may be a single string or list of strings + + if isinstance(regions, str): + return [_parse_region_id(regions)] + return list(map(_parse_region_id, regions)) + + def prepare_gnomad_v3_short_tandem_repeats(path): with hl.hadoop_open(path) as input_file: data = json.load(input_file) @@ -435,7 +145,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): for disease in locus["Diseases"] ], "stripy_id": locus["STRipyName"] if "STRipyName" in locus else None, - "reference_region": {"reference_genome": "GRCh38", **_parse_region_id(locus["ReferenceRegion"])}, + "main_reference_region": _parse_region_id(locus["MainReferenceRegion"]), + "reference_regions": _parse_reference_regions(locus["ReferenceRegion"]), "reference_repeat_unit": locus["ReferenceRepeatUnit"], "repeat_units": sorted( ( @@ -456,18 +167,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): ), key=lambda r: (len(r["repeat_unit"]), r["repeat_unit"]), ), - "allele_size_distribution": { - "distribution": _prepare_histogram(_get_total_histogram(locus["AlleleCountHistogram"])), - "populations": _prepare_allele_size_distribution_populations(locus), - "repeat_units": _prepare_allele_size_distribution_repeat_units(locus), - }, - "genotype_distribution": { - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram(locus["AlleleCountScatterPlot"]) - ), - "populations": _prepare_genotype_distribution_populations(locus), - "repeat_units": _prepare_genotype_distribution_repeat_units(locus), - }, + "allele_size_distribution": _prepare_allele_size_distribution(locus["AlleleCountHistogram"]), + "genotype_distribution": _prepare_genotype_distribution(locus["AlleleCountScatterPlot"]), "age_distribution": _prepare_age_distribution(locus), "adjacent_repeats": sorted( [ @@ -482,20 +183,12 @@ def prepare_gnomad_v3_short_tandem_repeats(path): set(k.split("/")[2] for k in adjacent_repeat["AlleleCountHistogram"].keys()), key=lambda repeat_unit: (len(repeat_unit), repeat_unit), ), - "allele_size_distribution": { - "distribution": _prepare_histogram( - _get_total_histogram(adjacent_repeat["AlleleCountHistogram"]) - ), - "populations": _prepare_allele_size_distribution_populations(adjacent_repeat), - "repeat_units": _prepare_allele_size_distribution_repeat_units(adjacent_repeat), - }, - "genotype_distribution": { - "distribution": _prepare_genotype_distribution_histogram( - _get_total_histogram(adjacent_repeat["AlleleCountScatterPlot"]) - ), - "populations": _prepare_genotype_distribution_populations(adjacent_repeat), - "repeat_units": _prepare_genotype_distribution_repeat_units(adjacent_repeat), - }, + "allele_size_distribution": _prepare_allele_size_distribution( + adjacent_repeat["AlleleCountHistogram"] + ), + "genotype_distribution": _prepare_genotype_distribution( + adjacent_repeat["AlleleCountScatterPlot"] + ), "age_distribution": _prepare_age_distribution(adjacent_repeat), } for adjacent_repeat_id, adjacent_repeat in locus.get("AdjacentRepeats", {}).items() @@ -506,7 +199,31 @@ def prepare_gnomad_v3_short_tandem_repeats(path): for locus in data.values() ] - return hl.Table.parallelize( + allele_size_distribution_schema = hl.tarray( + hl.tstruct( + ancestry_group=hl.tstr, + sex=hl.tstr, + repunit=hl.tstr, + quality_description=hl.tstr, + q_score=hl.tfloat, + distribution=hl.tarray(hl.tstruct(repunit_count=hl.tint, frequency=hl.tint)), + ) + ) + genotype_distribution_schema = hl.tarray( + hl.tstruct( + ancestry_group=hl.tstr, + sex=hl.tstr, + short_allele_repunit=hl.tstr, + long_allele_repunit=hl.tstr, + quality_description=hl.tstr, + q_score=hl.tfloat, + distribution=hl.tarray( + hl.tstruct(short_allele_repunit_count=hl.tint, long_allele_repunit_count=hl.tint, frequency=hl.tfloat) + ), + ) + ) + + ds = hl.Table.parallelize( ds, hl.tstruct( id=hl.tstr, @@ -521,31 +238,14 @@ def prepare_gnomad_v3_short_tandem_repeats(path): notes=hl.tstr, ) ), - reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), + main_reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), + reference_regions=hl.tarray( + hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint) + ), reference_repeat_unit=hl.tstr, repeat_units=hl.tarray(hl.tstruct(repeat_unit=hl.tstr, classification=hl.tstr)), - allele_size_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_unit=hl.tstr, - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - ) - ), - ), - genotype_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_units=hl.tarray(hl.tstr), - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - ) - ), - ), + allele_size_distribution=allele_size_distribution_schema, + genotype_distribution=genotype_distribution_schema, age_distribution=hl.tarray( hl.tstruct(age_range=hl.tarray(hl.tint), distribution=hl.tarray(hl.tarray(hl.tint))) ), @@ -556,32 +256,8 @@ def prepare_gnomad_v3_short_tandem_repeats(path): reference_region=hl.tstruct(reference_genome=hl.tstr, chrom=hl.tstr, start=hl.tint, stop=hl.tint), reference_repeat_unit=hl.tstr, repeat_units=hl.tarray(hl.tstr), - allele_size_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_unit=hl.tstr, - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray( - hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint))) - ), - ) - ), - ), - genotype_distribution=hl.tstruct( - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray(hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint)))), - repeat_units=hl.tarray( - hl.tstruct( - repeat_units=hl.tarray(hl.tstr), - distribution=hl.tarray(hl.tarray(hl.tint)), - populations=hl.tarray( - hl.tstruct(id=hl.tstr, distribution=hl.tarray(hl.tarray(hl.tint))) - ), - ) - ), - ), + allele_size_distribution=allele_size_distribution_schema, + genotype_distribution=genotype_distribution_schema, age_distribution=hl.tarray( hl.tstruct(age_range=hl.tarray(hl.tint), distribution=hl.tarray(hl.tarray(hl.tint))) ), @@ -590,3 +266,4 @@ def prepare_gnomad_v3_short_tandem_repeats(path): ), n_partitions=1, ) + return ds diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py index 9ddfa8b7b..e91b9d2d3 100644 --- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py +++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py @@ -284,7 +284,7 @@ def add_liftover_document_id(ds): ), "args": { "index": "gnomad_v3_short_tandem_repeats", - "index_fields": ["id", "gene.ensembl_id", "reference_region"], + "index_fields": ["id", "gene.ensembl_id", "main_reference_region"], "id_field": "id", "num_shards": 1, }, diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py index 04848c871..f303e7ed7 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py @@ -8,8 +8,8 @@ pipeline.add_task( "prepare_short_tandem_repeats", prepare_gnomad_v3_short_tandem_repeats, - "/gnomad_v3/gnomad_v3_short_tandem_repeats.ht", - {"path": "gs://gcp-public-data--gnomad/release/3.1.3/json/gnomAD_STR_distributions__2022_01_20.json.gz"}, + "/gnomad_v4/gnomad_v4_short_tandem_repeats.ht", + {"path": "gs://gnomad-browser-data-pipeline/phil-scratch/gnomAD_STR_distributions__gnomad-v2__2024_06_28.json"}, ) ############################################### From 32853c923f4259f5a413d8b96ef08fc22cef307b Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 16:45:50 -0400 Subject: [PATCH 02/10] Update GraphQL schema for STRs to reflect new ES schema --- .../graphql/types/short-tandem-repeat.graphql | 66 +++++++++---------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/graphql-api/src/graphql/types/short-tandem-repeat.graphql b/graphql-api/src/graphql/types/short-tandem-repeat.graphql index 52dd64ae0..2ba0f2caf 100644 --- a/graphql-api/src/graphql/types/short-tandem-repeat.graphql +++ b/graphql-api/src/graphql/types/short-tandem-repeat.graphql @@ -31,43 +31,39 @@ type ShortTandemRepeatRepeatUnit { classification: String! } -type ShortTandemRepeatAlleleSizeDistributionPopulation { - id: String! - distribution: [[Int!]!]! -} - -type ShortTandemRepeatAlleleSizeDistributionRepeatUnit { - repeat_unit: String! - distribution: [[Int!]!]! - populations: [ShortTandemRepeatAlleleSizeDistributionPopulation!]! -} - -type ShortTandemRepeatAlleleSizeDistribution { +type ShortTandemRepeatAgeDistributionBin { + age_range: [Int]! distribution: [[Int!]!]! - populations: [ShortTandemRepeatAlleleSizeDistributionPopulation!]! - repeat_units: [ShortTandemRepeatAlleleSizeDistributionRepeatUnit!]! } -type ShortTandemRepeatGenotypeDistributionPopulation { - id: String! - distribution: [[Int!]!]! +type ShortTandemRepeatAlleleSizeItem { + repunit_count: Int! + frequency: Int! } -type ShortTandemRepeatGenotypeDistributionRepeatUnit { - repeat_units: [String!]! - distribution: [[Int!]!]! - populations: [ShortTandemRepeatGenotypeDistributionPopulation!]! +type ShortTandemRepeatAlleleSizeDistributionCohort { + ancestry_group: String! + sex: String! + repunit: String! + quality_description: String! + q_score: Float! + distribution: [ShortTandemRepeatAlleleSizeItem!]! } -type ShortTandemRepeatGenotypeDistribution { - distribution: [[Int!]!]! - populations: [ShortTandemRepeatGenotypeDistributionPopulation!]! - repeat_units: [ShortTandemRepeatGenotypeDistributionRepeatUnit!]! +type ShortTandemRepeatGenotypeItem { + short_allele_repunit_count: Int! + long_allele_repunit_count: Int! + frequency: Int! } -type ShortTandemRepeatAgeDistributionBin { - age_range: [Int]! - distribution: [[Int!]!]! +type ShortTandemRepeatGenotypeDistributionCohort { + ancestry_group: String! + sex: String! + short_allele_repunit: String! + long_allele_repunit: String! + quality_description: String! + q_score: Float! + distribution: [ShortTandemRepeatGenotypeItem!]! } type ShortTandemRepeatAdjacentRepeat { @@ -75,8 +71,8 @@ type ShortTandemRepeatAdjacentRepeat { reference_region: ShortTandemRepeatReferenceRegion! reference_repeat_unit: String! repeat_units: [String!]! - allele_size_distribution: ShortTandemRepeatAlleleSizeDistribution! - genotype_distribution: ShortTandemRepeatGenotypeDistribution! + allele_size_distribution: [ShortTandemRepeatAlleleSizeDistributionCohort!]! + genotype_distribution: [ShortTandemRepeatGenotypeDistributionCohort!]! age_distribution: [ShortTandemRepeatAgeDistributionBin!] } @@ -85,7 +81,8 @@ type ShortTandemRepeat { gene: ShortTandemRepeatGene! associated_diseases: [ShortTandemRepeatAssociatedDisease!]! stripy_id: String! - reference_region: ShortTandemRepeatReferenceRegion! + main_reference_region: ShortTandemRepeatReferenceRegion! + reference_regions: [ShortTandemRepeatReferenceRegion!]! reference_repeat_unit: String! } @@ -94,11 +91,12 @@ type ShortTandemRepeatDetails { gene: ShortTandemRepeatGene! associated_diseases: [ShortTandemRepeatAssociatedDisease!]! stripy_id: String - reference_region: ShortTandemRepeatReferenceRegion! + main_reference_region: ShortTandemRepeatReferenceRegion! + reference_regions: [ShortTandemRepeatReferenceRegion!]! reference_repeat_unit: String! repeat_units: [ShortTandemRepeatRepeatUnit!]! - allele_size_distribution: ShortTandemRepeatAlleleSizeDistribution! - genotype_distribution: ShortTandemRepeatGenotypeDistribution! + allele_size_distribution: [ShortTandemRepeatAlleleSizeDistributionCohort!]! + genotype_distribution: [ShortTandemRepeatGenotypeDistributionCohort!]! age_distribution: [ShortTandemRepeatAgeDistributionBin!] adjacent_repeats: [ShortTandemRepeatAdjacentRepeat!]! } From 9797f784da76cb7db9bd8a994e9c29e54d3efd9e Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Wed, 17 Jul 2024 15:39:28 -0400 Subject: [PATCH 03/10] Rework STR page frontend Key changes: * Allele size distribution plot can now show, by use of stacked bars, breakdown of each bucket by population, quality, or sex. This also involved replacing some of our custom logic with calls to the visx family of libraries. * More options for y-scaling of allele size distribution plot. * Assorted refactoring, type specifying, and similar cleanup --- .gitignore | 3 + browser/package.json | 3 + ...ShortTandemRepeatAdjacentRepeatSection.tsx | 117 +- .../ShortTandemRepeatAgeDistributionPlot.tsx | 81 +- ...TandemRepeatAlleleSizeDistributionPlot.tsx | 388 +- .../ShortTandemRepeatAttributes.tsx | 90 +- .../ShortTandemRepeatColorBySelect.tsx | 53 + ...emRepeatGenotypeDistributionBinDetails.tsx | 88 +- ...rtTandemRepeatGenotypeDistributionPlot.tsx | 125 +- ...tGenotypeDistributionRepeatUnitsSelect.tsx | 98 +- .../ShortTandemRepeatPage.tsx | 534 +- .../ShortTandemRepeatPageContainer.tsx | 102 +- .../ShortTandemRepeatPopulationOptions.tsx | 82 +- .../ShortTandemRepeatReads.tsx | 174 +- .../ShortTandemRepeatPage.spec.tsx.snap | 11524 ++++++++++++---- ...ortTandemRepeatPageContainer.spec.tsx.snap | 1976 ++- .../shortTandemRepeatHelpers.ts | 263 +- .../src/__factories__/ShortTandemRepeat.ts | 51 +- dataset-metadata/gnomadPopulations.ts | 9 +- graphql-api/src/graphql/resolvers/va.ts | 16 +- pnpm-lock.yaml | 90 + .../create_short_tandem_repeat_reads_db.py | 125 +- reads/src/datasets.js | 4 +- 23 files changed, 11008 insertions(+), 4988 deletions(-) create mode 100644 browser/src/ShortTandemRepeatPage/ShortTandemRepeatColorBySelect.tsx diff --git a/.gitignore b/.gitignore index 6c612328a..0835549af 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,6 @@ hail-*.log # Playright test dirs /tests/playwright/ /playwright/.cache/ + +# Reads metadata databases +reads/*.db diff --git a/browser/package.json b/browser/package.json index 6fa0a5d3b..dfa60d0e1 100644 --- a/browser/package.json +++ b/browser/package.json @@ -24,6 +24,9 @@ "@hot-loader/react-dom": "^17.0.0", "@visx/axis": "^3.0.0", "@visx/group": "^3.0.0", + "@visx/legend": "^3.12.0", + "@visx/scale": "^3.12.0", + "@visx/shape": "^3.12.0", "core-js": "3.5.0", "css-loader": "^6.7.3", "d3-array": "^1.2.4", diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx index 81c9bcde4..0be687c94 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAdjacentRepeatSection.tsx @@ -1,5 +1,4 @@ -import { max, min } from 'd3-array' -import React, { useState } from 'react' +import React, { SetStateAction, useState, Dispatch } from 'react' import { Modal, Select } from '@gnomad/ui' @@ -7,8 +6,14 @@ import ControlSection from '../VariantPage/ControlSection' import ShortTandemRepeatPopulationOptions from './ShortTandemRepeatPopulationOptions' import { ShortTandemRepeatAdjacentRepeat } from './ShortTandemRepeatPage' -import ShortTandemRepeatAlleleSizeDistributionPlot from './ShortTandemRepeatAlleleSizeDistributionPlot' -import ShortTandemRepeatGenotypeDistributionPlot from './ShortTandemRepeatGenotypeDistributionPlot' +import ShortTandemRepeatAlleleSizeDistributionPlot, { + ScaleType, + Sex, + ColorBy, +} from './ShortTandemRepeatAlleleSizeDistributionPlot' +import ShortTandemRepeatGenotypeDistributionPlot, { + Bin as GenotypeBin, +} from './ShortTandemRepeatGenotypeDistributionPlot' import ShortTandemRepeatGenotypeDistributionBinDetails from './ShortTandemRepeatGenotypeDistributionBinDetails' import ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect from './ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect' import ShortTandemRepeatAdjacentRepeatAttributes from './ShortTandemRepeatAdjacentRepeatAttributes' @@ -16,37 +21,49 @@ import { getSelectedAlleleSizeDistribution, getSelectedGenotypeDistribution, getGenotypeDistributionPlotAxisLabels, + genotypeRepunitPairs, + maxAlleleSizeDistributionRepeats, + maxGenotypeDistributionRepeats, } from './shortTandemRepeatHelpers' +import { PopulationId } from '@gnomad/dataset-metadata/gnomadPopulations' type Props = { adjacentRepeat: ShortTandemRepeatAdjacentRepeat - populationIds: string[] - selectedPopulationId: string - onSelectPopulationId: (...args: any[]) => any - selectedScaleType: string - onSelectScaleType: (...args: any[]) => any + selectedScaleType: ScaleType + selectedPopulation: PopulationId | '' + selectedSex: Sex | '' + selectedColorBy: ColorBy | '' + populations: PopulationId[] + selectedGenotypeDistributionBin: GenotypeBin | null + setSelectedGenotypeDistributionBin: Dispatch> + setSelectedScaleType: Dispatch> + setSelectedPopulation: Dispatch> + setSelectedSex: Dispatch> } const ShortTandemRepeatAdjacentRepeatSection = ({ adjacentRepeat, - populationIds, - selectedPopulationId, - onSelectPopulationId, + populations, selectedScaleType, - onSelectScaleType, + selectedPopulation, + selectedSex, + selectedColorBy, + setSelectedScaleType, + setSelectedPopulation, + setSelectedSex, }: Props) => { const [selectedRepeatUnit, setSelectedRepeatUnit] = useState( adjacentRepeat.repeat_units.length === 1 ? adjacentRepeat.repeat_units[0] : '' ) + const genotypeDistributionPairs = genotypeRepunitPairs(adjacentRepeat) + const defaultGenotypeDistributionRepeatUnits = + genotypeDistributionPairs.length === 1 ? genotypeDistributionPairs[0] : '' const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = - useState( - adjacentRepeat.genotype_distribution.repeat_units.length === 1 - ? adjacentRepeat.genotype_distribution.repeat_units[0].repeat_units.join(' / ') - : '' - ) + useState(defaultGenotypeDistributionRepeatUnits) - const [selectedGenotypeDistributionBin, setSelectedGenotypeDistributionBin] = useState(null) + const [selectedGenotypeDistributionBin, setSelectedGenotypeDistributionBin] = + useState(null) return (
@@ -55,25 +72,25 @@ const ShortTandemRepeatAdjacentRepeatSection = ({

Allele Size Distribution

Genotype Distribution

max(d.slice(0, 2))), - max(adjacentRepeat.genotype_distribution.distribution, (d: any) => min(d.slice(0, 2))), - ]} + maxRepeats={maxGenotypeDistributionRepeats(adjacentRepeat)} genotypeDistribution={getSelectedGenotypeDistribution(adjacentRepeat, { selectedRepeatUnits: selectedGenotypeDistributionRepeatUnits, - selectedPopulationId, + selectedPopulation, + selectedSex, })} - onSelectBin={(bin: any) => { + onSelectBin={(bin: GenotypeBin) => { if (bin.xRange[0] !== bin.xRange[1] || bin.yRange[0] !== bin.yRange[1]) { setSelectedGenotypeDistributionBin(bin) } }} + xRanges={[]} + yRanges={[]} /> {selectedGenotypeDistributionBin && ( )} diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx index 8fa875668..c328be5e8 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAgeDistributionPlot.tsx @@ -1,12 +1,12 @@ import { max } from 'd3-array' import { scaleBand, scaleLog } from 'd3-scale' -import PropTypes from 'prop-types' import React from 'react' import { withSize } from 'react-sizeme' import styled from 'styled-components' import { AxisBottom, AxisLeft } from '@visx/axis' import { TooltipAnchor } from '@gnomad/ui' +import { PlotRange, AgeDistributionItem } from './ShortTandemRepeatPage' // The 100% width/height container is necessary the component // to size to fit its container vs staying at its initial size. @@ -19,9 +19,9 @@ const GraphWrapper = styled.div` const labelProps = { fontSize: 14, textAnchor: 'middle', -} +} as const -const ageRangeLabel = (ageRange: any) => { +const ageRangeLabel = (ageRange: [number | null, number | null]) => { const [minAge, maxAge] = ageRange if (minAge === null) { @@ -33,9 +33,15 @@ const ageRangeLabel = (ageRange: any) => { return `${minAge}-${maxAge}` } +type Props = { + ageDistribution: AgeDistributionItem[] + maxRepeats: number + ranges: PlotRange[] + size: { width: number } +} + const ShortTandemRepeatAgeDistributionPlot = withSize()( - // @ts-expect-error TS(2339) FIXME: Property 'ageDistribution' does not exist on type ... Remove this comment to see the full error message - ({ ageDistribution, maxRepeats, ranges, size: { width } }) => { + ({ ageDistribution, maxRepeats, ranges = [], size: { width } }: Props) => { const height = Math.min(width, 300) const margin = { @@ -53,7 +59,7 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( const yNumBins = ageDistribution.length - const data = Array.from(Array(xNumBins * yNumBins).keys()).map((n: any) => { + const data = Array.from(Array(xNumBins * yNumBins).keys()).map((n) => { const xBinIndex = Math.floor(n / yNumBins) const yBinIndex = n % yNumBins @@ -76,22 +82,19 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( } }) - ageDistribution.forEach((ageBin: any, yBinIndex: any) => { - // @ts-expect-error TS(7031) FIXME: Binding element 'repeats' implicitly has an 'any' ... Remove this comment to see the full error message + ageDistribution.forEach((ageBin, yBinIndex) => { ageBin.distribution.forEach(([repeats, nAlleles]) => { const xBinIndex = Math.floor(repeats / xBinSize) data[xBinIndex * yNumBins + yBinIndex].count += nAlleles }) }) - const xScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const xScale = scaleBand() .domain(Array.from(Array(xNumBins).keys())) .range([0, plotWidth]) const xBandwidth = xScale.bandwidth() - const yScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const yScale = scaleBand() .domain(Array.from(Array(yNumBins).keys())) .range([plotHeight, 0]) const yBandwidth = yScale.bandwidth() @@ -99,7 +102,7 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( const xMaxNumLabels = Math.floor(plotWidth / 20) const xLabelInterval = Math.max(Math.round(xNumBins / xMaxNumLabels), 1) - const xTickFormat = (binIndex: any) => { + const xTickFormat = (binIndex: number) => { if (binIndex % xLabelInterval !== 0) { return '' } @@ -111,16 +114,12 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( return `${binIndex * xBinSize} - ${binIndex * xBinSize + xBinSize - 1}` } - const yTickFormat = (binIndex: any) => { + const yTickFormat = (binIndex: number) => { return ageRangeLabel(ageDistribution[binIndex].age_range) } const opacityScale = scaleLog() - // @ts-expect-error TS(2345) FIXME: Argument of type '(string | number | undefined)[]'... Remove this comment to see the full error message - .domain([ - 1, - max(ageDistribution, (ageBin: any) => max(ageBin.distribution, (d: any) => d[1])), - ]) + .domain([1, max(ageDistribution, (ageBin) => max(ageBin.distribution, (d) => d[1])) || 2]) .range([0.1, 1]) return ( @@ -129,7 +128,6 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( {data - .filter((d: any) => d.count !== 0) - .map((d: any) => { + .filter((d) => d.count !== 0) + .map((d) => { return ( {ranges - .filter((range: any) => range.start !== range.stop) - .filter((range: any) => range.start <= maxRepeats) - .map((range: any, rangeIndex: any) => { + .filter((range) => range.start !== range.stop) + .filter((range) => range.start <= maxRepeats) + .map((range, rangeIndex) => { const startBinIndex = Math.floor(range.start / xBinSize) const startX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(startBinIndex) + + (xScale(startBinIndex) || 0) + ((range.start - startBinIndex * xBinSize) / xBinSize) * xBandwidth let stopX if (range.stop <= maxRepeats) { const stopBinIndex = Math.floor(range.stop / xBinSize) stopX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(stopBinIndex) + + (xScale(stopBinIndex) || 0) + ((range.stop - stopBinIndex * xBinSize) / xBinSize) * xBandwidth } else { stopX = plotWidth @@ -306,27 +300,4 @@ const ShortTandemRepeatAgeDistributionPlot = withSize()( ShortTandemRepeatAgeDistributionPlot.displayName = 'ShortTandemRepeatAgeDistributionPlot' -ShortTandemRepeatAgeDistributionPlot.propTypes = { - // @ts-expect-error TS(2322) FIXME: Type '{ ageDistribution: PropTypes.Requireable<(Pr... Remove this comment to see the full error message - ageDistribution: PropTypes.arrayOf( - PropTypes.shape({ - age_range: PropTypes.arrayOf(PropTypes.number).isRequired, - distribution: PropTypes.arrayOf(PropTypes.arrayOf(PropTypes.number)).isRequired, - }) - ), - maxRepeats: PropTypes.number.isRequired, - ranges: PropTypes.arrayOf( - PropTypes.shape({ - start: PropTypes.number.isRequired, - stop: PropTypes.number.isRequired, - label: PropTypes.string.isRequired, - }) - ), -} - -ShortTandemRepeatAgeDistributionPlot.defaultProps = { - // @ts-expect-error TS(2322) FIXME: Type '{ ranges: never[]; }' is not assignable to t... Remove this comment to see the full error message - ranges: [], -} - export default ShortTandemRepeatAgeDistributionPlot diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAlleleSizeDistributionPlot.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAlleleSizeDistributionPlot.tsx index 392583544..e0bb7e1cc 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAlleleSizeDistributionPlot.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAlleleSizeDistributionPlot.tsx @@ -1,12 +1,16 @@ import { max } from 'd3-array' -import { scaleBand, scaleLinear, scaleLog } from 'd3-scale' -import PropTypes from 'prop-types' +import { scaleBand, scaleLinear, scaleLog, scaleOrdinal } from 'd3-scale' import React, { useMemo } from 'react' import { withSize } from 'react-sizeme' import styled from 'styled-components' import { AxisBottom, AxisLeft } from '@visx/axis' +import { BarStack, Bar } from '@visx/shape' +import { AnyD3Scale } from '@visx/scale' +import { LegendOrdinal } from '@visx/legend' import { TooltipAnchor } from '@gnomad/ui' +import { GNOMAD_POPULATION_NAMES, PopulationId } from '@gnomad/dataset-metadata/gnomadPopulations' +import { colorByLabels } from './ShortTandemRepeatColorBySelect' // The 100% width/height container is necessary the component // to size to fit its container vs staying at its initial size. @@ -16,15 +20,140 @@ const GraphWrapper = styled.div` height: 100%; /* stylelint-disable-line unit-whitelist */ ` -const TooltipTrigger = styled.rect` +const BarWithHoverEffect = styled(Bar)` pointer-events: visible; &:hover { - fill: rgba(0, 0, 0, 0.05); + fill-opacity: 0.7; } ` -const tickFormat = (n: any) => { +export type ScaleType = + | 'linear' + | 'linear-truncated-50' + | 'linear-truncated-200' + | 'linear-truncated-1000' + | 'log' + +export const genotypeQualityKeys = [ + 'low', + 'medium-low', + 'medium', + 'medium-high', + 'high', + 'not-reviewed', +] as const + +export type GenotypeQuality = (typeof genotypeQualityKeys)[number] + +export const qScoreKeys = [ + '0', + '0.1', + '0.2', + '0.3', + '0.4', + '0.5', + '0.6', + '0.7', + '0.8', + '0.9', + '1', +] as const + +export type QScoreBin = (typeof qScoreKeys)[number] +export type ColorByValue = GenotypeQuality | QScoreBin | Sex | PopulationId | '' + +export type AlleleSizeDistributionItem = { + repunit_count: number + frequency: number + colorByValue: ColorByValue +} + +export type Sex = 'XX' | 'XY' + +export type ColorBy = 'quality_description' | 'q_score' | 'population' | 'sex' + +const defaultColor = '#73ab3d' +const colorMap: Record> = { + '': { + '': defaultColor, + }, + quality_description: { + low: '#d73027', + 'medium-low': '#fc8d59', + medium: '#fee08b', + 'medium-high': '#d9ef8b', + high: '#1a9850', + 'not-reviewed': '#aaaaaa', + }, + q_score: { + '0': '#ff0000', + '0.1': '#ff3300', + '0.2': '#ff6600', + '0.3': '#ff9900', + '0.4': '#ffcc00', + '0.5': '#ffff00', + '0.6': '#ccff33', + '0.7': '#99ff66', + '0.8': '#66ff99', + '0.9': '#33ffcc', + '1': '#00ff00', + }, + sex: { + XX: '#F7C3CC', + XY: '#6AA6CE', + }, + population: { + nfe: '#6AA6CE', + afr: '#941494', + fin: '#012F6C', + amr: '#EF1E24', + ami: '#ff7f00', + asj: '#FF7E4F', + eas: '#128B44', + mid: '#f781bf', + oth: '#ABB8B9', + sas: '#FE9A10', + }, +} as const + +const qualityDescriptionLabels: Record = { + low: 'Low', + 'medium-low': 'Medium-low', + medium: 'Medium', + 'medium-high': 'Medium-high', + high: 'High', + 'not-reviewed': 'Not reviewed', +} + +const qScoreLabels: Record = { + '0': '0 to 0.05', + '0.1': '0.05 to 0.15', + '0.2': '0.15 to 0.25', + '0.3': '0.25 to 0.35', + '0.4': '0.35 to 0.45', + '0.5': '0.45 to 0.55', + '0.6': '0.55 to 0.65', + '0.7': '0.65 to 0.75', + '0.8': '0.75 to 0.85', + '0.9': '0.85 to 0.95', + '1': '0.95 to 1', +} + +const fixedLegendLabels: Partial>> = { + quality_description: qualityDescriptionLabels, + q_score: qScoreLabels, + population: GNOMAD_POPULATION_NAMES, +} + +const legendLabel = (colorBy: ColorBy, key: string) => fixedLegendLabels[colorBy]?.[key] || key + +const legendLabels = (colorBy: ColorBy, keys: string[]) => + keys.map((key) => legendLabel(colorBy, key)) + +const colorForValue = (colorBy: ColorBy | '', value: string) => + colorMap[colorBy]?.[value] || defaultColor +const tickFormat = (n: number) => { if (n >= 1e9) { return `${(n / 1e9).toPrecision(3)}B` } @@ -40,23 +169,71 @@ const tickFormat = (n: any) => { const labelProps = { fontSize: 14, textAnchor: 'middle', +} as const + +type Range = { start: number; stop: number; label: string } + +type Props = { + maxRepeats: number + alleleSizeDistribution: AlleleSizeDistributionItem[] + colorBy: ColorBy | '' + repeatUnitLength: number | null + scaleType: ScaleType + ranges?: Range[] + size: { width: number } +} + +type Bin = Partial> & { + index: number + label: string + fullFrequency: number +} + +const legendKeys: Record = { + quality_description: [...genotypeQualityKeys], + q_score: [...qScoreKeys], + sex: ['XX', 'XY'], + population: ['nfe', 'afr', 'fin', 'amr', 'ami', 'asj', 'eas', 'mid', 'oth', 'sas'], +} + +const LegendFromColorBy = ({ colorBy }: { colorBy: ColorBy | '' }) => { + if (colorBy === '') { + return null + } + + const keys = legendKeys[colorBy] + const labels = legendLabels(colorBy, [...keys]) + const colors = keys.map((key) => colorMap[colorBy][key]) + const scale = scaleOrdinal().domain(labels).range(colors) + return ( + + ) +} + +const tooltipContent = (data: Bin, colorBy: ColorBy | '', key: ColorByValue | ''): string => { + const repeatText = data.label === '1' ? '1 repeat' : `${data.label} repeats` + const alleles = data[key] || 0 + const alleleText = alleles === 1 ? '1 allele' : `${alleles} alleles` + const colorByText = + colorBy === '' ? '' : `, ${colorByLabels[colorBy]} is ${legendLabel(colorBy, key)}` + return `${repeatText}${colorByText}: ${alleleText}` } const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( ({ - // @ts-expect-error TS(2339) FIXME: Property 'maxRepeats' does not exist on type '{}'. maxRepeats, - // @ts-expect-error TS(2339) FIXME: Property 'alleleSizeDistribution' does not exist o... Remove this comment to see the full error message alleleSizeDistribution, - // @ts-expect-error TS(2339) FIXME: Property 'repeatUnitLength' does not exist on type... Remove this comment to see the full error message + colorBy, repeatUnitLength, - // @ts-expect-error TS(2339) FIXME: Property 'size' does not exist on type '{}'. size: { width }, - // @ts-expect-error TS(2339) FIXME: Property 'scaleType' does not exist on type '{}'. - scaleType, - // @ts-expect-error TS(2339) FIXME: Property 'ranges' does not exist on type '{}'. - ranges, - }) => { + scaleType = 'linear', + ranges = [], + }: Props) => { const height = 300 const margin = { @@ -72,37 +249,64 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( const binSize = Math.max(1, Math.ceil(maxRepeats / (plotWidth / 10))) const nBins = Math.floor(maxRepeats / binSize) + 1 - const data = useMemo(() => { - const d = Array.from(Array(nBins).keys()).map((n: any) => ({ - binIndex: n, - label: binSize === 1 ? `${n}` : `${n * binSize} - ${n * binSize + binSize - 1}`, - count: 0, - })) + const binLabels: string[] = [...Array(nBins).keys()].map((binIndex) => + binSize === 1 ? `${binIndex}` : `${binIndex * binSize} - ${binIndex * binSize + binSize - 1}` + ) + + const emptyBins: Bin[] = Array.from(Array(nBins)).map((_, binIndex) => ({ + label: binLabels[binIndex], + index: binIndex, + fullFrequency: 0, + })) - // @ts-expect-error TS(7031) FIXME: Binding element 'repeatCount' implicitly has an 'a... Remove this comment to see the full error message - alleleSizeDistribution.forEach(([repeatCount, nAlleles]) => { - const binIndex = Math.floor(repeatCount / binSize) - d[binIndex].count += nAlleles - }) + const data: Bin[] = useMemo(() => { + const binsByColorByValue = alleleSizeDistribution.reduce((acc, item) => { + const binIndex = Math.floor(item.repunit_count / binSize) + const oldBin: Bin = acc[binIndex] + const oldFrequency = oldBin[item.colorByValue] || 0 + const newFrequency = oldFrequency + item.frequency + const newBin: Bin = { + ...oldBin, + [item.colorByValue]: newFrequency, + fullFrequency: oldBin.fullFrequency + item.frequency, + } + return { ...acc, [binIndex]: newBin } + }, emptyBins) + return Object.values(binsByColorByValue) + }, [alleleSizeDistribution, binSize, emptyBins]) - return d - }, [alleleSizeDistribution, nBins, binSize]) + const keys = useMemo(() => { + const keySet: Record = data + .flatMap((bin) => Object.keys(bin)) + .reduce((acc, key) => ({ ...acc, [key]: true }), {}) + return Object.keys(keySet).filter( + (key) => key !== 'index' && key !== 'label' && key !== 'fullFrequency' + ) + }, [data]) + // maps binIndex and colorByValue to a y and y start - const xScale = scaleBand() - .domain(data.map((d: any) => d.binIndex)) + const xScale = scaleBand() + .domain(data.map((d) => d.index)) .range([0, plotWidth]) const xBandwidth = xScale.bandwidth() - let yScale: any + let yScale: AnyD3Scale if (scaleType === 'log') { - const maxLog = Math.ceil(Math.log10(max(data, (d: any) => d.count) || 1)) + const maxLog = Math.ceil(Math.log10(max(data, (d) => d.fullFrequency) || 1)) yScale = scaleLog() .domain([1, 10 ** maxLog]) - .range([plotHeight - 10, 0]) + .range([plotHeight, 0]) + .clamp(true) + } else if (scaleType === 'linear-truncated-50') { + yScale = scaleLinear().domain([0, 50]).range([plotHeight, 0]).clamp(true) + } else if (scaleType === 'linear-truncated-200') { + yScale = scaleLinear().domain([0, 200]).range([plotHeight, 0]).clamp(true) + } else if (scaleType === 'linear-truncated-1000') { + yScale = scaleLinear().domain([0, 1000]).range([plotHeight, 0]).clamp(true) } else { yScale = scaleLinear() - .domain([0, max(data, (d: any) => d.count) || 1]) + .domain([0, max(data, (d) => d.fullFrequency) || 1]) .range([plotHeight, 0]) } @@ -117,8 +321,7 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( const readLengthBinIndex = Math.floor(readLengthInRepeats / binSize) // Read length line should be drawn at the center of the range for its value. readLengthX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(readLengthBinIndex) + + (xScale(readLengthBinIndex) || 0) + ((readLengthInRepeats - readLengthBinIndex * binSize) / binSize) * xBandwidth + xBandwidth / binSize / 2 } @@ -126,18 +329,17 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( return ( + - // @ts-expect-error TS(7015) FIXME: Element implicitly has an 'any' type because index... Remove this comment to see the full error message - (binIndex as any) % labelInterval === 0 ? data[binIndex].label : '' + tickFormat={(binIndex: number) => + binIndex % labelInterval === 0 ? binLabels[binIndex] : '' } tickLabelProps={ binSize === 1 @@ -157,8 +359,7 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( fontSize: 10, textAnchor: 'end', transform: `translate(0, 0), rotate(-40 ${ - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(binIndex) + xBandwidth / 2 + (xScale(binIndex) || 0) + xBandwidth / 2 }, 0)`, } } @@ -168,7 +369,6 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( (Number.isInteger(Math.log10(n)) ? tickFormat(n) : '') - : tickFormat + ? (n: unknown) => + Number.isInteger(Math.log10(n as number)) ? tickFormat(n as number) : '' + : (n: unknown) => tickFormat(n as number) } tickLabelProps={() => ({ dx: '-0.25em', @@ -200,54 +400,57 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( /> )} - {data.map((d: any) => { - const y = d.count === 0 ? plotHeight : yScale(d.count) - return ( - - - - - - - ) - })} + colorForValue(colorBy, key.toString())} + x={(bin) => bin.index} + y0={(point) => point[0] || 0} + y1={(point) => point[1] || 0} + > + {(stacks) => + stacks.map((stack) => + stack.bars.map((bar) => { + const tooltip = tooltipContent( + bar.bar.data, + colorBy, + bar.key as ColorByValue | '' + ) + return ( + + + + + + + + ) + }) + ) + } + {ranges - .filter((range: any) => range.start !== range.stop) - .filter((range: any) => range.start <= maxRepeats) - .map((range: any, rangeIndex: any) => { + .filter((range) => range.start !== range.stop) + .filter((range) => range.start <= maxRepeats) + .map((range, rangeIndex) => { const startBinIndex = Math.floor(range.start / binSize) const startX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(startBinIndex) + + (xScale(startBinIndex) || 0) + ((range.start - startBinIndex * binSize) / binSize) * xBandwidth let stopX if (range.stop <= maxRepeats) { const stopBinIndex = Math.floor(range.stop / binSize) stopX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(stopBinIndex) + + (xScale(stopBinIndex) || 0) + ((range.stop - stopBinIndex * binSize) / binSize) * xBandwidth } else { stopX = plotWidth @@ -350,25 +553,4 @@ const ShortTandemRepeatAlleleSizeDistributionPlot = withSize()( ShortTandemRepeatAlleleSizeDistributionPlot.displayName = 'ShortTandemRepeatAlleleSizeDistributionPlot' -ShortTandemRepeatAlleleSizeDistributionPlot.propTypes = { - // @ts-expect-error TS(2322) FIXME: Type '{ maxRepeats: PropTypes.Validator; a... Remove this comment to see the full error message - maxRepeats: PropTypes.number.isRequired, - alleleSizeDistribution: PropTypes.arrayOf(PropTypes.arrayOf(PropTypes.number)).isRequired, - repeatUnitLength: PropTypes.number, - scaleType: PropTypes.oneOf(['linear', 'log']), - ranges: PropTypes.arrayOf( - PropTypes.shape({ - start: PropTypes.number.isRequired, - stop: PropTypes.number.isRequired, - label: PropTypes.string.isRequired, - }) - ), -} - -ShortTandemRepeatAlleleSizeDistributionPlot.defaultProps = { - // @ts-expect-error TS(2322) FIXME: Type '{ scaleType: string; ranges: never[]; }' is ... Remove this comment to see the full error message - scaleType: 'linear', - ranges: [], -} - export default ShortTandemRepeatAlleleSizeDistributionPlot diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAttributes.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAttributes.tsx index ec8457f4f..17c619a6a 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAttributes.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatAttributes.tsx @@ -4,34 +4,32 @@ import AttributeList, { AttributeListItem } from '../AttributeList' import InlineList from '../InlineList' import Link from '../Link' -import { ShortTandemRepeat } from './ShortTandemRepeatPage' +import { ShortTandemRepeat, RepeatUnitClassification } from './ShortTandemRepeatPage' type ShortTandemRepeatRepeatUnitsProps = { shortTandemRepeat: ShortTandemRepeat } const ShortTandemRepeatRepeatUnits = ({ shortTandemRepeat }: ShortTandemRepeatRepeatUnitsProps) => { - const repeatUnitsByClassification = {} + const repeatUnitsByClassification: Partial> = {} shortTandemRepeat.repeat_units.forEach((repeatUnit) => { - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message if (repeatUnitsByClassification[repeatUnit.classification] === undefined) { - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message repeatUnitsByClassification[repeatUnit.classification] = [] } - // @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message - repeatUnitsByClassification[repeatUnit.classification].push(repeatUnit.repeat_unit) + repeatUnitsByClassification[repeatUnit.classification]!.push(repeatUnit.repeat_unit) }) if ( - !(repeatUnitsByClassification as any).pathogenic && - !(repeatUnitsByClassification as any).benign + !repeatUnitsByClassification.pathogenic && + !repeatUnitsByClassification.benign && + repeatUnitsByClassification.unknown ) { return ( 1 ? 's' : ''}`} + label={`Repeat unit${repeatUnitsByClassification.unknown.length > 1 ? 's' : ''}`} > ( + items={repeatUnitsByClassification.unknown.map((repeatUnit: string) => ( {repeatUnit === shortTandemRepeat.reference_repeat_unit && shortTandemRepeat.repeat_units.length > 1 @@ -39,23 +37,54 @@ const ShortTandemRepeatRepeatUnits = ({ shortTandemRepeat }: ShortTandemRepeatRe : repeatUnit} ))} - label={`Repeat unit${(repeatUnitsByClassification as any).unknown.length > 1 ? 's' : ''}`} + label={`Repeat unit${repeatUnitsByClassification.unknown.length > 1 ? 's' : ''}`} /> ) } + if ( + repeatUnitsByClassification.pathogenic && + repeatUnitsByClassification.pathogenic.length === 1 && + !repeatUnitsByClassification.benign && + !repeatUnitsByClassification.unknown + ) { + return ( + <> + {repeatUnitsByClassification.pathogenic && ( + 1 ? 's' : ''}`} + > + ( + + {repeatUnit === shortTandemRepeat.reference_repeat_unit && + shortTandemRepeat.repeat_units.length > 1 + ? `${repeatUnit} (reference)` + : repeatUnit} + + ))} + label={`Pathogenic repeat unit${ + repeatUnitsByClassification.pathogenic.length > 1 ? 's' : '' + }`} + /> + + )} + + ) + } + return ( <> - {(repeatUnitsByClassification as any).pathogenic && ( + {repeatUnitsByClassification.pathogenic && ( 1 ? 's' : '' + repeatUnitsByClassification.pathogenic.length > 1 ? 's' : '' }`} tooltip="These repeat units have been reported in the literature as pathogenic when they expand beyond a certain threshold." > ( + items={repeatUnitsByClassification.pathogenic.map((repeatUnit: string) => ( {repeatUnit === shortTandemRepeat.reference_repeat_unit && shortTandemRepeat.repeat_units.length > 1 @@ -64,20 +93,18 @@ const ShortTandemRepeatRepeatUnits = ({ shortTandemRepeat }: ShortTandemRepeatRe ))} label={`Pathogenic repeat unit${ - (repeatUnitsByClassification as any).pathogenic.length > 1 ? 's' : '' + repeatUnitsByClassification.pathogenic.length > 1 ? 's' : '' }`} /> )} - {(repeatUnitsByClassification as any).benign && ( + {repeatUnitsByClassification.benign && ( 1 ? 's' : '' - }`} + label={`Benign repeat unit${repeatUnitsByClassification.benign.length > 1 ? 's' : ''}`} tooltip="These repeat units are regarded in the literature as benign, even when expanded." > ( + items={repeatUnitsByClassification.benign.map((repeatUnit: string) => ( {repeatUnit === shortTandemRepeat.reference_repeat_unit && shortTandemRepeat.repeat_units.length > 1 @@ -85,21 +112,17 @@ const ShortTandemRepeatRepeatUnits = ({ shortTandemRepeat }: ShortTandemRepeatRe : repeatUnit} ))} - label={`Benign repeat unit${ - (repeatUnitsByClassification as any).benign.length > 1 ? 's' : '' - }`} + label={`Benign repeat unit${repeatUnitsByClassification.benign.length > 1 ? 's' : ''}`} /> )} - {(repeatUnitsByClassification as any).unknown && ( + {repeatUnitsByClassification.unknown && ( 1 ? 's' : '' - }`} + label={`Other repeat unit${repeatUnitsByClassification.unknown.length > 1 ? 's' : ''}`} tooltip="These are the other repeat units detected at this locus within gnomAD samples by the call_non_ref_pathogenic_motifs.py script." > ( + items={repeatUnitsByClassification.unknown.map((repeatUnit: string) => ( {repeatUnit === shortTandemRepeat.reference_repeat_unit && shortTandemRepeat.repeat_units.length > 1 @@ -107,9 +130,7 @@ const ShortTandemRepeatRepeatUnits = ({ shortTandemRepeat }: ShortTandemRepeatRe : repeatUnit} ))} - label={`Other repeat unit${ - (repeatUnitsByClassification as any).unknown.length > 1 ? 's' : '' - }`} + label={`Other repeat unit${repeatUnitsByClassification.unknown.length > 1 ? 's' : ''}`} /> )} @@ -132,10 +153,11 @@ const ShortTandemRepeatAttributes = ({ shortTandemRepeat }: ShortTandemRepeatAtt {shortTandemRepeat.gene.region} - {shortTandemRepeat.reference_region.chrom}-{shortTandemRepeat.reference_region.start}- - {shortTandemRepeat.reference_region.stop} + {shortTandemRepeat.main_reference_region.chrom}: + {shortTandemRepeat.main_reference_region.start}- + {shortTandemRepeat.main_reference_region.stop} diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatColorBySelect.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatColorBySelect.tsx new file mode 100644 index 000000000..074dd4f5b --- /dev/null +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatColorBySelect.tsx @@ -0,0 +1,53 @@ +import React, { Dispatch, SetStateAction } from 'react' +import styled from 'styled-components' + +import { Select } from '@gnomad/ui' +import { ColorBy, ScaleType } from './ShortTandemRepeatAlleleSizeDistributionPlot' + +const Label = styled.label` + padding-right: 1em; +` + +type Props = { + id: string + selectedColorBy: ColorBy | '' + setSelectedColorBy: (newColorBy: ColorBy | '') => void + setSelectedScaleType: Dispatch> +} + +export const colorByLabels: Record = { + quality_description: 'GQ: manual review', + q_score: 'GQ: Q score', + sex: 'Sex', + population: 'Population', +} + +const ShortTandemRepeatColorBySelect = ({ + id, + selectedColorBy, + setSelectedColorBy, + setSelectedScaleType, +}: Props) => { + return ( + + ) +} + +export default ShortTandemRepeatColorBySelect diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionBinDetails.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionBinDetails.tsx index ac497fac3..a7a05a006 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionBinDetails.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionBinDetails.tsx @@ -2,13 +2,22 @@ import React from 'react' import { List, ListItem } from '@gnomad/ui' -import { ShortTandemRepeat, ShortTandemRepeatAdjacentRepeat } from './ShortTandemRepeatPage' +import { + ShortTandemRepeat, + ShortTandemRepeatAdjacentRepeat, + GenotypeDistributionItem, +} from './ShortTandemRepeatPage' + import { getSelectedGenotypeDistribution } from './shortTandemRepeatHelpers' +import { Sex } from './ShortTandemRepeatAlleleSizeDistributionPlot' + type Props = { shortTandemRepeatOrAdjacentRepeat: ShortTandemRepeat | ShortTandemRepeatAdjacentRepeat - selectedPopulationId: string - selectedRepeatUnits: string + selectedPopulation: string | '' + selectedSex: Sex | '' + selectedRepeatUnits: string[] | '' + repeatUnitPairs: string[][] bin: { label: string xRange: number[] @@ -18,71 +27,88 @@ type Props = { const ShortTandemRepeatGenotypeDistributionBinDetails = ({ shortTandemRepeatOrAdjacentRepeat, - selectedPopulationId, + selectedPopulation, + selectedSex, selectedRepeatUnits, + repeatUnitPairs, bin, }: Props) => { const genotypeDistribution = getSelectedGenotypeDistribution(shortTandemRepeatOrAdjacentRepeat, { - selectedPopulationId, + selectedPopulation, selectedRepeatUnits, + selectedSex, }) - const isInBin = (d: any) => - bin.xRange[0] <= d[0] && d[0] <= bin.xRange[1] && bin.yRange[0] <= d[1] && d[1] <= bin.yRange[1] + const isInBin = (item: GenotypeDistributionItem) => + bin.xRange[0] <= item.long_allele_repunit_count && + item.long_allele_repunit_count <= bin.xRange[1] && + bin.yRange[0] <= item.short_allele_repunit_count && + item.short_allele_repunit_count <= bin.yRange[1] return ( <> {/* @ts-expect-error TS(2745) FIXME: This JSX tag's 'children' prop expects type 'never... Remove this comment to see the full error message */} - {/* @ts-expect-error TS(7031) FIXME: Binding element 'x' implicitly has an 'any' type. */} - {genotypeDistribution.filter(isInBin).map(([x, y, n]) => ( - // @ts-expect-error TS(2769) FIXME: No overload matches this call. - - {x} repeats / {y} repeats: {n} individuals - - ))} + {genotypeDistribution + .filter(isInBin) + .map(({ long_allele_repunit_count, short_allele_repunit_count, frequency }) => ( + // @ts-expect-error TS(2769) FIXME: No overload matches this call. + + {long_allele_repunit_count} repeats / {short_allele_repunit_count} repeats:{' '} + {frequency} individuals + + ))} {!selectedRepeatUnits && ( <>

Repeat Units

{/* @ts-expect-error TS(2745) FIXME: This JSX tag's 'children' prop expects type 'never... Remove this comment to see the full error message */} - {shortTandemRepeatOrAdjacentRepeat.genotype_distribution.repeat_units - .map((repeatUnitsDistribution) => repeatUnitsDistribution.repeat_units) + {repeatUnitPairs .map((repeatUnits) => ({ repeatUnits, distribution: getSelectedGenotypeDistribution(shortTandemRepeatOrAdjacentRepeat, { - selectedPopulationId, - selectedRepeatUnits: repeatUnits.join(' / '), + selectedPopulation, + selectedSex, + selectedRepeatUnits: repeatUnits, }), })) - .flatMap(({ repeatUnits, distribution }: any) => [ + .flatMap(({ repeatUnits, distribution }) => [ { repeatUnits, - distribution: distribution.filter((d: any) => d[0] >= d[1]).filter(isInBin), + distribution: distribution + .filter((d) => d.long_allele_repunit_count >= d.short_allele_repunit_count) + .filter(isInBin), }, { repeatUnits: [...repeatUnits].reverse(), distribution: distribution - .filter((d: any) => d[0] < d[1]) - .map((d: any) => [d[1], d[0], d[2]]) + .filter((d) => d.long_allele_repunit_count < d.short_allele_repunit_count) + .map((d) => ({ + ...d, + long_allele_repunit_count: d.short_allele_repunit_count, + short_allele_repunit_count: d.long_allele_repunit_count, + })) .filter(isInBin), }, ]) - .filter(({ distribution }: any) => distribution.length > 0) - .map(({ repeatUnits, distribution }: any) => ( + .map(({ repeatUnits, distribution }) => ( // @ts-expect-error TS(2769) FIXME: No overload matches this call. {repeatUnits.join(' / ')} {/* @ts-expect-error TS(2745) FIXME: This JSX tag's 'children' prop expects type 'never... Remove this comment to see the full error message */} - {/* @ts-expect-error TS(7031) FIXME: Binding element 'x' implicitly has an 'any' type. */} - {distribution.map(([x, y, n]) => ( - // @ts-expect-error TS(2769) FIXME: No overload matches this call. - - {x} repeats / {y} repeats: {n} individuals - - ))} + {distribution.map( + ({ short_allele_repunit_count, long_allele_repunit_count, frequency }) => ( + // @ts-expect-error TS(2769) FIXME: No overload matches this call. + + {long_allele_repunit_count} repeats / {short_allele_repunit_count}{' '} + repeats: {frequency} individuals + + ) + )} ))} diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionPlot.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionPlot.tsx index 7ce7ef7d4..ff016df69 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionPlot.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionPlot.tsx @@ -1,12 +1,12 @@ import { max } from 'd3-array' import { scaleBand, scaleLog } from 'd3-scale' -import PropTypes from 'prop-types' import React from 'react' import { withSize } from 'react-sizeme' import styled from 'styled-components' import { AxisBottom, AxisLeft } from '@visx/axis' import { TooltipAnchor } from '@gnomad/ui' +import { GenotypeDistributionItem } from './ShortTandemRepeatPage' // The 100% width/height container is necessary the component // to size to fit its container vs staying at its initial size. @@ -19,25 +19,39 @@ const GraphWrapper = styled.div` const labelProps = { fontSize: 14, textAnchor: 'middle', +} as const + +type PlotRange = { start: number; stop: number; label: string } + +type Props = { + axisLabels: string[] + maxRepeats: [number, number] + genotypeDistribution: GenotypeDistributionItem[] + xRanges: PlotRange[] + yRanges: PlotRange[] + onSelectBin: (bin: Bin) => void + size: { width: number } +} + +export type Bin = { + label: string + xBinIndex: number + yBinIndex: number + xRange: number[] + yRange: number[] + count: number } const ShortTandemRepeatGenotypeDistributionPlot = withSize()( ({ - // @ts-expect-error TS(2339) FIXME: Property 'axisLabels' does not exist on type '{}'. axisLabels, - // @ts-expect-error TS(2339) FIXME: Property 'maxRepeats' does not exist on type '{}'. maxRepeats, - // @ts-expect-error TS(2339) FIXME: Property 'genotypeDistribution' does not exist on ... Remove this comment to see the full error message genotypeDistribution, - // @ts-expect-error TS(2339) FIXME: Property 'size' does not exist on type '{}'. size: { width }, - // @ts-expect-error TS(2339) FIXME: Property 'xRanges' does not exist on type '{}'. - xRanges, - // @ts-expect-error TS(2339) FIXME: Property 'yRanges' does not exist on type '{}'. - yRanges, - // @ts-expect-error TS(2339) FIXME: Property 'onSelectBin' does not exist on type '{}'... Remove this comment to see the full error message - onSelectBin, - }) => { + xRanges = [], + yRanges = [], + onSelectBin = () => {}, + }: Props) => { const height = Math.min(width, 500) const margin = { @@ -56,7 +70,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( const yBinSize = Math.max(1, Math.ceil(maxRepeats[1] / (plotHeight / 10))) const yNumBins = Math.floor(maxRepeats[1] / yBinSize) + 1 - const data = Array.from(Array(xNumBins * yNumBins).keys()).map((n: any) => { + const data = Array.from(Array(xNumBins * yNumBins).keys()).map((n) => { const xBinIndex = Math.floor(n / yNumBins) const yBinIndex = n % yNumBins @@ -78,7 +92,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( ? `${yBinIndex}` : `${yBinIndex * yBinSize} - ${yBinIndex * yBinSize + yBinSize - 1}` - return { + const result: Bin = { label: `${xLabel} repeats in ${axisLabels[0]} / ${yLabel} repeats in ${axisLabels[1]}`, xBinIndex, yBinIndex, @@ -86,23 +100,23 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( yRange, count: 0, } + return result }) - // @ts-expect-error TS(7031) FIXME: Binding element 'repeats1' implicitly has an 'any'... Remove this comment to see the full error message - genotypeDistribution.forEach(([repeats1, repeats2, nAlleles]) => { - const xBinIndex = Math.floor(repeats1 / xBinSize) - const yBinIndex = Math.floor(repeats2 / yBinSize) - data[xBinIndex * yNumBins + yBinIndex].count += nAlleles - }) + genotypeDistribution.forEach( + ({ short_allele_repunit_count, long_allele_repunit_count, frequency }) => { + const xBinIndex = Math.floor(long_allele_repunit_count / xBinSize) + const yBinIndex = Math.floor(short_allele_repunit_count / yBinSize) + data[xBinIndex * yNumBins + yBinIndex].count += frequency + } + ) - const xScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const xScale = scaleBand() .domain(Array.from(Array(xNumBins).keys())) .range([0, plotWidth]) const xBandwidth = xScale.bandwidth() - const yScale = scaleBand() - // @ts-expect-error TS(2345) FIXME: Argument of type 'number[]' is not assignable to p... Remove this comment to see the full error message + const yScale = scaleBand() .domain(Array.from(Array(yNumBins).keys())) .range([plotHeight, 0]) const yBandwidth = yScale.bandwidth() @@ -110,7 +124,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( const xMaxNumLabels = Math.floor(plotWidth / 20) const xLabelInterval = Math.max(Math.round(xNumBins / xMaxNumLabels), 1) - const xTickFormat = (binIndex: any) => { + const xTickFormat = (binIndex: number) => { if (binIndex % xLabelInterval !== 0) { return '' } @@ -122,7 +136,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( return `${binIndex * xBinSize} - ${binIndex * xBinSize + xBinSize - 1}` } - const yTickFormat = (binIndex: any) => { + const yTickFormat = (binIndex: number) => { if (yBinSize === 1) { return `${binIndex}` } @@ -131,8 +145,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( } const opacityScale = scaleLog() - // @ts-expect-error TS(2345) FIXME: Argument of type '(string | number | undefined)[]'... Remove this comment to see the full error message - .domain([1, max(genotypeDistribution, (d: any) => d[2])]) + .domain([1, max(genotypeDistribution, (d) => d.frequency) || 2]) .range([0.1, 1]) return ( @@ -141,7 +154,6 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( {data - .filter((d: any) => d.count !== 0) - .map((d: any) => { + .filter((d) => d.count !== 0) + .map((d) => { return ( {xRanges - .filter((range: any) => range.start !== range.stop) - .filter((range: any) => range.start <= maxRepeats[0]) - .map((range: any, rangeIndex: any, ranges: any) => { + .filter((range) => range.start !== range.stop) + .filter((range) => range.start <= maxRepeats[0]) + .map((range, rangeIndex, ranges) => { const startBinIndex = Math.floor(range.start / xBinSize) const startX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(startBinIndex) + + (xScale(startBinIndex) || 0) + ((range.start - startBinIndex * xBinSize) / xBinSize) * xBandwidth let stopX if (range.stop <= maxRepeats[0]) { const stopBinIndex = Math.floor(range.stop / xBinSize) stopX = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - xScale(stopBinIndex) + + (xScale(stopBinIndex) || 0) + ((range.stop - stopBinIndex * xBinSize) / xBinSize) * xBandwidth } else { stopX = plotWidth @@ -319,21 +327,19 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( {yRanges - .filter((range: any) => range.start !== range.stop) - .filter((range: any) => range.start <= maxRepeats[1]) - .map((range: any, rangeIndex: any, ranges: any) => { + .filter((range) => range.start !== range.stop) + .filter((range) => range.start <= maxRepeats[1]) + .map((range, rangeIndex, ranges) => { const startBinIndex = Math.floor(range.start / yBinSize) const startY = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - yScale(startBinIndex) + + (yScale(startBinIndex) || 0) + (1 - (range.start - startBinIndex * yBinSize) / yBinSize) * yBandwidth let stopY if (range.stop <= maxRepeats[1]) { const stopBinIndex = Math.floor(range.stop / yBinSize) stopY = - // @ts-expect-error TS(2532) FIXME: Object is possibly 'undefined'. - yScale(stopBinIndex) + + (yScale(stopBinIndex) || 0) + (1 - (range.stop - stopBinIndex * yBinSize) / yBinSize) * yBandwidth } else { stopY = 0 @@ -403,30 +409,7 @@ const ShortTandemRepeatGenotypeDistributionPlot = withSize()( ShortTandemRepeatGenotypeDistributionPlot.displayName = 'ShortTandemRepeatGenotypeDistributionPlot' -ShortTandemRepeatGenotypeDistributionPlot.propTypes = { - // @ts-expect-error TS(2322) FIXME: Type '{ axisLabels: PropTypes.Validator<(string | ... Remove this comment to see the full error message - axisLabels: PropTypes.arrayOf(PropTypes.string).isRequired, - maxRepeats: PropTypes.arrayOf(PropTypes.number).isRequired, - genotypeDistribution: PropTypes.arrayOf(PropTypes.arrayOf(PropTypes.number)).isRequired, - xRanges: PropTypes.arrayOf( - PropTypes.shape({ - start: PropTypes.number.isRequired, - stop: PropTypes.number.isRequired, - label: PropTypes.string.isRequired, - }) - ), - yRanges: PropTypes.arrayOf( - PropTypes.shape({ - start: PropTypes.number.isRequired, - stop: PropTypes.number.isRequired, - label: PropTypes.string.isRequired, - }) - ), - onSelectBin: PropTypes.func, -} - ShortTandemRepeatGenotypeDistributionPlot.defaultProps = { - // @ts-expect-error TS(2322) FIXME: Type '{ xRanges: never[]; yRanges: never[]; onSele... Remove this comment to see the full error message xRanges: [], yRanges: [], onSelectBin: () => {}, diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect.tsx index 381f96e90..0150e3dd8 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect.tsx @@ -1,37 +1,35 @@ -import React from 'react' +import React, { Dispatch, SetStateAction } from 'react' import { Select } from '@gnomad/ui' +import { ShortTandemRepeat, ShortTandemRepeatAdjacentRepeat } from './ShortTandemRepeatPage' +import { genotypeRepunitPairs, isAdjacentRepeat } from './shortTandemRepeatHelpers' type Props = { - shortTandemRepeatOrAdjacentRepeat: { - id: string - associated_diseases?: any[] - reference_repeat_unit: string - genotype_distribution: { - repeat_units: { - repeat_units?: string[] - }[] - } - repeat_units: any[] - } - value: string - onChange: (...args: any[]) => any + shortTandemRepeatOrAdjacentRepeat: ShortTandemRepeat | ShortTandemRepeatAdjacentRepeat + selectedRepeatUnits: string[] | '' + setSelectedRepeatUnits: Dispatch> } const ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect = ({ shortTandemRepeatOrAdjacentRepeat, - value, - onChange, + selectedRepeatUnits, + setSelectedRepeatUnits, }: Props) => { // Adjacent repeats do not have classifications for repeat units. - const isAdjacentRepeat = !shortTandemRepeatOrAdjacentRepeat.associated_diseases - const repeatUnitClassifications = isAdjacentRepeat + const repeatUnitClassifications: Record = isAdjacentRepeat( + shortTandemRepeatOrAdjacentRepeat + ) ? {} : shortTandemRepeatOrAdjacentRepeat.repeat_units.reduce( (acc, repeatUnit) => ({ ...acc, [repeatUnit.repeat_unit]: repeatUnit.classification }), {} ) + const repunitPairs = genotypeRepunitPairs(shortTandemRepeatOrAdjacentRepeat) + + if (repunitPairs.length === 1) { + return null + } return ( diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx index 833e19460..b30f0ff43 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPage.tsx @@ -1,4 +1,3 @@ -import { max, min } from 'd3-array' import React, { useState } from 'react' import styled from 'styled-components' @@ -14,8 +13,18 @@ import ShortTandemRepeatAgeDistributionPlot from './ShortTandemRepeatAgeDistribu import ShortTandemRepeatAssociatedDiseasesTable from './ShortTandemRepeatAssociatedDiseasesTable' import ShortTandemRepeatAttributes from './ShortTandemRepeatAttributes' import ShortTandemRepeatPopulationOptions from './ShortTandemRepeatPopulationOptions' -import ShortTandemRepeatAlleleSizeDistributionPlot from './ShortTandemRepeatAlleleSizeDistributionPlot' -import ShortTandemRepeatGenotypeDistributionPlot from './ShortTandemRepeatGenotypeDistributionPlot' +import ShortTandemRepeatColorBySelect from './ShortTandemRepeatColorBySelect' +import ShortTandemRepeatAlleleSizeDistributionPlot, { + ColorBy, + GenotypeQuality, + QScoreBin, + Sex, + ScaleType, + AlleleSizeDistributionItem, +} from './ShortTandemRepeatAlleleSizeDistributionPlot' +import ShortTandemRepeatGenotypeDistributionPlot, { + Bin as GenotypeBin, +} from './ShortTandemRepeatGenotypeDistributionPlot' import ShortTandemRepeatGenotypeDistributionBinDetails from './ShortTandemRepeatGenotypeDistributionBinDetails' import ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect from './ShortTandemRepeatGenotypeDistributionRepeatUnitsSelect' import ShortTandemRepeatReads from './ShortTandemRepeatReads' @@ -23,52 +32,66 @@ import { getSelectedAlleleSizeDistribution, getSelectedGenotypeDistribution, getGenotypeDistributionPlotAxisLabels, + maxAlleleSizeDistributionRepeats, + maxGenotypeDistributionRepeats, + genotypeRepunitPairs, } from './shortTandemRepeatHelpers' import ShortTandemRepeatAdjacentRepeatSection from './ShortTandemRepeatAdjacentRepeatSection' +import { PopulationId } from '@gnomad/dataset-metadata/gnomadPopulations' -type ShortTandemRepeatRepeatUnit = { - repeat_unit: string +type ShortTandemRepeatReferenceRegion = { + chrom: string + start: number + stop: number +} + +export type AlleleSizeDistributionCohort = { + ancestry_group: PopulationId + sex: Sex + repunit: string + quality_description: GenotypeQuality + q_score: QScoreBin + distribution: AlleleSizeDistributionItem[] +} + +export type GenotypeDistributionItem = { + short_allele_repunit_count: number + long_allele_repunit_count: number + frequency: number +} + +export type GenotypeDistributionCohort = { + ancestry_group: string + sex: Sex + short_allele_repunit: string + long_allele_repunit: string + quality_description: GenotypeQuality + q_score: QScoreBin + distribution: GenotypeDistributionItem[] +} + +export type AgeDistributionItem = { + age_range: [number | null, number | null] distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] } export type ShortTandemRepeatAdjacentRepeat = { id: string - reference_region: { - chrom: string - start: number - stop: number - } + reference_region: ShortTandemRepeatReferenceRegion reference_repeat_unit: string repeat_units: string[] - allele_size_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: ShortTandemRepeatRepeatUnit[] - } - genotype_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: { - repeat_units: string[] - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - }[] - } + allele_size_distribution: AlleleSizeDistributionCohort[] + genotype_distribution: GenotypeDistributionCohort[] +} + +export type PlotRange = { + label: string + start: number + stop: number } +export type RepeatUnitClassification = 'benign' | 'pathogenic' | 'unknown' + export type ShortTandemRepeat = { id: string gene: { @@ -89,39 +112,16 @@ export type ShortTandemRepeat = { notes: string | null }[] stripy_id: string | null - reference_region: { - chrom: string - start: number - stop: number - } + main_reference_region: ShortTandemRepeatReferenceRegion + reference_regions: ShortTandemRepeatReferenceRegion[] reference_repeat_unit: string repeat_units: { repeat_unit: string - classification: string + classification: RepeatUnitClassification }[] - allele_size_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: ShortTandemRepeatRepeatUnit[] - } - genotype_distribution: { - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - repeat_units: { - repeat_units: string[] - distribution: number[][] - populations: { - id: string - distribution: number[][] - }[] - }[] - } + allele_size_distribution: AlleleSizeDistributionCohort[] + genotype_distribution: GenotypeDistributionCohort[] + age_distribution: AgeDistributionItem[] adjacent_repeats: ShortTandemRepeatAdjacentRepeat[] } @@ -140,50 +140,55 @@ const FlexWrapper = styled.div` width: 100%; ` -const parseCombinedPopulationId = (combinedPopulationId: any) => { - let population - let sex - if (combinedPopulationId.includes('_')) { - ;[population, sex] = combinedPopulationId.split('_') - } else if (combinedPopulationId === 'XX' || combinedPopulationId === 'XY') { - population = null - sex = combinedPopulationId - } else { - population = combinedPopulationId - sex = null - } - return { population, sex } -} - type ShortTandemRepeatPageProps = { datasetId: DatasetId shortTandemRepeat: ShortTandemRepeat } +// Stacked bar plots only make sense when the y scale factor stays constant +// throughout, so log scale is only allowed when there's only one bar per +// column, that is, when not breaking down the data into subsets. +const logScaleAllowed = (colorBy: ColorBy | '') => colorBy === '' + const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepeatPageProps) => { - const [selectedRepeatUnit, setSelectedRepeatUnit] = useState( - shortTandemRepeat.allele_size_distribution.repeat_units.length === 1 - ? shortTandemRepeat.allele_size_distribution.repeat_units[0].repeat_unit + const { allele_size_distribution } = shortTandemRepeat + + const alleleSizeDistributionRepunits = [ + ...new Set(allele_size_distribution.map((cohort) => cohort.repunit)), + ].sort() + const genotypeDistributionRepunitPairs = genotypeRepunitPairs(shortTandemRepeat) + + const defaultAlleleSizeRepunit = + alleleSizeDistributionRepunits.length === 1 ? alleleSizeDistributionRepunits[0] : '' + const defaultGenotypeDistributionRepunits = + genotypeDistributionRepunitPairs.length === 1 ? genotypeDistributionRepunitPairs[0] : '' + const defaultDisease = + shortTandemRepeat.associated_diseases.length > 0 + ? shortTandemRepeat.associated_diseases[0].name : '' - ) - const [selectedPopulationId, setSelectedPopulationId] = useState('') - const [selectedScaleType, setSelectedScaleType] = useState('linear') - - const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = - useState( - shortTandemRepeat.genotype_distribution.repeat_units.length === 1 - ? shortTandemRepeat.genotype_distribution.repeat_units[0].repeat_units.join(' / ') - : '' - ) + const [selectedPopulation, setSelectedPopulation] = useState('') + const [selectedSex, setSelectedSex] = useState('') + const [selectedScaleType, setSelectedScaleType] = useState('linear') + const [selectedColorBy, rawSetSelectedColorBy] = useState('') - const [selectedDisease, setSelectedDisease] = useState( - shortTandemRepeat.associated_diseases[0].name - ) + const setSelectedColorBy = (newColorBy: ColorBy | '') => { + if (selectedScaleType === 'log' && !logScaleAllowed(newColorBy)) { + setSelectedScaleType('linear') + } + rawSetSelectedColorBy(newColorBy) + } - const [showAdjacentRepeats, setShowAdjacentRepeats] = useState(false) + const [selectedAlleleSizeRepeatUnit, setSelectedAlleleSizeRepeatUnit] = + useState(defaultAlleleSizeRepunit) + const [selectedGenotypeDistributionRepeatUnits, setSelectedGenotypeDistributionRepeatUnits] = + useState(defaultGenotypeDistributionRepunits) + const [selectedDisease, setSelectedDisease] = useState(defaultDisease) + const [showAdjacentRepeats, setShowAdjacentRepeats] = useState(false) - const populationIds = shortTandemRepeat.allele_size_distribution.populations.map((pop) => pop.id) + const populations = [ + ...new Set(shortTandemRepeat.allele_size_distribution.map((cohort) => cohort.ancestry_group)), + ].sort() const allRepeatUnitsByClassification: Record = {} shortTandemRepeat.repeat_units.forEach((repeatUnit) => { @@ -196,16 +201,14 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe // This uses repeat units from shortTandemRepeat.allele_size_distribution.repeat_units because // shortTandemRepeat.repeat_units may include repeat units that do not appear in gnomAD. const repeatUnitsFoundInGnomad = new Set( - shortTandemRepeat.allele_size_distribution.repeat_units.map( - (repeatUnit) => repeatUnit.repeat_unit - ) + shortTandemRepeat.allele_size_distribution.map((cohort) => cohort.repunit) ) const repeatUnitsFoundInGnomadByClassification: Record = {} Object.keys(allRepeatUnitsByClassification).forEach((classification) => { repeatUnitsFoundInGnomadByClassification[classification] = allRepeatUnitsByClassification[ classification - ].filter((repeatUnit: any) => repeatUnitsFoundInGnomad.has(repeatUnit)) + ].filter((repeatUnit) => repeatUnitsFoundInGnomad.has(repeatUnit)) }) const allRepeatUnitsFoundInGnomadArePathogenic = Object.keys( @@ -224,7 +227,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe ? diseaseToPlot.repeat_size_classifications : [] - const plotRanges = repeatSizeClassificationsToPlot.map((classification) => { + const plotRanges: PlotRange[] = repeatSizeClassificationsToPlot.map((classification) => { return { label: classification.classification, start: classification.min !== null ? classification.min : 0, @@ -232,7 +235,19 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe } }) - const [selectedGenotypeDistributionBin, setSelectedGenotypeDistributionBin] = useState(null) + const [selectedGenotypeDistributionBin, setSelectedGenotypeDistributionBin] = + useState(null) + + const maxAlleleRepeats = maxAlleleSizeDistributionRepeats(shortTandemRepeat) + + const isRepunitSelectionPathogenic = ( + selectedRepeatUnits: string[] | '', + selectionIndex: number + ) => + (selectedRepeatUnits === '' && allRepeatUnitsFoundInGnomadArePathogenic) || + (allRepeatUnitsByClassification.pathogenic || []).includes( + selectedGenotypeDistributionRepeatUnits[selectionIndex] + ) return ( <> @@ -259,12 +274,21 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe STRipy + {/* @ts-expect-error TS(2745) FIXME: This JSX tag's 'children' prop expects type 'never... Remove this comment to see the full error message */} + + {/* @ts-expect-error TS(2786) FIXME: 'ExternalLink' cannot be used as a JSX component. */} + + STRchive + + )} -

Related Loci

+

TRs in gnomAD

- Table of tandem repeat loci in gnomAD + Known disease-associated TRs

@@ -282,26 +306,25 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Allele Size Distribution + - + + + )} @@ -419,7 +441,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe { + onChange={(e: { target: { value: string } }) => { setSelectedDisease(e.target.value) }} > @@ -534,11 +544,9 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe {((selectedGenotypeDistributionRepeatUnits === '' && !allRepeatUnitsFoundInGnomadArePathogenic) || - !selectedGenotypeDistributionRepeatUnits - .split(' / ') - .every((repeatUnit) => - ((allRepeatUnitsByClassification as any).pathogenic || []).includes(repeatUnit) - )) && ( + !(selectedGenotypeDistributionRepeatUnits as string[]).every((repeatUnit) => + ((allRepeatUnitsByClassification as any).pathogenic || []).includes(repeatUnit) + )) && (

Note This plot includes non-pathogenic repeat units. Use the “Repeat units” menu to view specific repeat units. @@ -548,7 +556,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe {selectedGenotypeDistributionBin && ( )} @@ -570,13 +580,8 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Age Distribution {!allRepeatUnitsFoundInGnomadArePathogenic && ( @@ -586,7 +591,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe )}

- {shortTandemRepeat.adjacent_repeats.length > 0 && ( + {false && (

Adjacent Repeats @@ -597,11 +602,16 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe ) }) @@ -623,7 +633,7 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe Read Data{' '} 1 + alleleSizeDistributionRepunits.length > 1 ? 'str-read-data-multiple-repeat-units' : 'str-read-data' } @@ -631,18 +641,20 @@ const ShortTandemRepeatPage = ({ datasetId, shortTandemRepeat }: ShortTandemRepe

diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPageContainer.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPageContainer.tsx index ba8d0cd18..210a7bccf 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPageContainer.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPageContainer.tsx @@ -31,7 +31,14 @@ query ${operationName}($strId: String!, $datasetId: DatasetId!) { } notes } - reference_region { + main_reference_region { + reference_genome + chrom + start + stop + } + reference_regions { + reference_genome chrom start stop @@ -42,33 +49,27 @@ query ${operationName}($strId: String!, $datasetId: DatasetId!) { classification } allele_size_distribution { - distribution - populations { - id - distribution - } - repeat_units { - repeat_unit - distribution - populations { - id - distribution - } + ancestry_group + sex + repunit + quality_description + q_score + distribution { + repunit_count + frequency } } genotype_distribution { - distribution - populations { - id - distribution - } - repeat_units { - repeat_units - distribution - populations { - id - distribution - } + ancestry_group + sex + short_allele_repunit + long_allele_repunit + quality_description + q_score + distribution { + short_allele_repunit_count + long_allele_repunit_count + frequency } } age_distribution { @@ -79,41 +80,36 @@ query ${operationName}($strId: String!, $datasetId: DatasetId!) { adjacent_repeats { id reference_region { + reference_genome chrom start stop } reference_repeat_unit repeat_units - allele_size_distribution { - distribution - populations { - id - distribution - } - repeat_units { - repeat_unit - distribution - populations { - id - distribution - } - } + } + allele_size_distribution { + ancestry_group + sex + repunit + quality_description + q_score + distribution { + repunit_count + frequency } - genotype_distribution { - distribution - populations { - id - distribution - } - repeat_units { - repeat_units - distribution - populations { - id - distribution - } - } + } + genotype_distribution { + ancestry_group + sex + short_allele_repunit + long_allele_repunit + quality_description + q_score + distribution { + short_allele_repunit_count + long_allele_repunit_count + frequency } } } diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPopulationOptions.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPopulationOptions.tsx index f217fc478..e232ad206 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPopulationOptions.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatPopulationOptions.tsx @@ -1,9 +1,11 @@ -import React from 'react' +import React, { Dispatch, SetStateAction } from 'react' import styled from 'styled-components' import { Select } from '@gnomad/ui' -import { GNOMAD_POPULATION_NAMES } from '@gnomad/dataset-metadata/gnomadPopulations' +import { PopulationId, GNOMAD_POPULATION_NAMES } from '@gnomad/dataset-metadata/gnomadPopulations' + +import { Sex } from './ShortTandemRepeatAlleleSizeDistributionPlot' const Wrapper = styled.div` @media (max-width: 600px) { @@ -17,74 +19,64 @@ const Wrapper = styled.div` } ` +const Label = styled.label` + padding-right: 1em; +` + type Props = { id: string - populationIds: string[] - selectedPopulationId: string - onSelectPopulationId: (...args: any[]) => any + populations: PopulationId[] + selectedPopulation: PopulationId | '' + selectedSex: Sex | '' + setSelectedPopulation: Dispatch> + setSelectedSex: Dispatch> } const ShortTandemRepeatPopulationOptions = ({ id, - populationIds, - selectedPopulationId, - onSelectPopulationId, + populations, + selectedPopulation, + selectedSex, + setSelectedPopulation, + setSelectedSex, }: Props) => { - const selectedAncestralPopulation = - selectedPopulationId === 'XX' || selectedPopulationId === 'XY' - ? '' - : selectedPopulationId.split('_')[0] - - let selectedSex = '' - if (selectedPopulationId.endsWith('XX')) { - selectedSex = 'XX' - } else if (selectedPopulationId.endsWith('XY')) { - selectedSex = 'XY' - } + const populationsSortedByName = populations.sort((group1, group2) => + GNOMAD_POPULATION_NAMES[group1].localeCompare(GNOMAD_POPULATION_NAMES[group2]) + ) return ( - ) } diff --git a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatReads.tsx b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatReads.tsx index 4cf2e151d..87bfa4a04 100644 --- a/browser/src/ShortTandemRepeatPage/ShortTandemRepeatReads.tsx +++ b/browser/src/ShortTandemRepeatPage/ShortTandemRepeatReads.tsx @@ -3,7 +3,7 @@ import styled from 'styled-components' import { Button, Input, Select } from '@gnomad/ui' -import { GNOMAD_POPULATION_NAMES } from '@gnomad/dataset-metadata/gnomadPopulations' +import { GNOMAD_POPULATION_NAMES, PopulationId } from '@gnomad/dataset-metadata/gnomadPopulations' import AttributeList, { AttributeListItem } from '../AttributeList' import Delayed from '../Delayed' @@ -43,7 +43,7 @@ type ShortTandemRepeatReadProps = { lower: number } }[] - population: string + population: PopulationId sex: string age?: string pcr_protocol: string @@ -58,7 +58,6 @@ const ShortTandemRepeatRead = ({ read }: ShortTandemRepeatReadProps) => {
- {/* @ts-expect-error TS(7053) FIXME: Element implicitly has an 'any' type because expre... Remove this comment to see the full error message */} {GNOMAD_POPULATION_NAMES[read.population]} {read.sex} @@ -117,11 +116,10 @@ const ShortTandemRepeatReadContainer = ({ ) } - if (error) { + if (error || !read) { return Unable to load read } - // @ts-expect-error TS(2322) FIXME: Type 'null' is not assignable to type '{ alleles: ... Remove this comment to see the full error message return } @@ -194,15 +192,7 @@ const fetchReads = ({ datasetId, shortTandemRepeatId, filter, limit, offset }: a type ShortTandemRepeatReadsProps = { datasetId: string shortTandemRepeat: ShortTandemRepeat - filter: { - population?: string - sex?: string - alleles?: { - repeat_unit?: string - min_repeats?: number - max_repeat?: number - }[] - } + filter: Filters } const ShortTandemRepeatReads = ({ @@ -210,12 +200,12 @@ const ShortTandemRepeatReads = ({ shortTandemRepeat, filter, }: ShortTandemRepeatReadsProps) => { - const fetchReadsTimer = useRef(null) + const fetchReadsTimer = useRef | null>(null) const fetchNumReadsMemoized = useCallback(() => { - // @ts-expect-error TS(2769) FIXME: No overload matches this call. - clearTimeout(fetchReadsTimer.current) + if (fetchReadsTimer.current) { + clearTimeout(fetchReadsTimer.current) + } return new Promise((resolve: any, reject: any) => { - // @ts-expect-error TS(2322) FIXME: Type 'Timeout' is not assignable to type 'null'. fetchReadsTimer.current = setTimeout(() => { fetchNumReads({ datasetId, shortTandemRepeatId: shortTandemRepeat.id, filter }).then( resolve, @@ -224,8 +214,7 @@ const ShortTandemRepeatReads = ({ }, 300) }) }, [datasetId, shortTandemRepeat, filter]) - const { isLoading, response: numReads, error } = useRequest(fetchNumReadsMemoized) - + const { isLoading, response, error } = useRequest(fetchNumReadsMemoized) const readsStore = useRef(new Map()) const [readIndex, setReadIndex] = useState(0) @@ -283,6 +272,8 @@ const ShortTandemRepeatReads = ({ return Unable to load read data } + const numReads: number = response as unknown as number + if (numReads === 0) { return No matching samples found } @@ -307,18 +298,15 @@ const ShortTandemRepeatReads = ({ min={1} max={numReads} onChange={(e: any) => { - // @ts-expect-error TS(2531) FIXME: Object is possibly 'null'. - setReadIndex(Math.max(0, Math.min(numReads - 1, Number(e.target.value) - 1))) + setReadIndex(Math.max(0, Math.min(numReads! - 1, Number(e.target.value) - 1))) }} style={{ width: '10ch' }} />{' '} - {/* @ts-expect-error TS(2531) FIXME: Object is possibly 'null'. */} - of {numReads.toLocaleString()} + of {numReads!.toLocaleString()} + +
+ +

+ For more information about Tandem Repeats in gnomAD, read our + + + blog post + + . +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+ + AFF2 + + + GCC + + 5'-UTR + + Imaginary inheritance mode + + + FRAXE mental retardation + +
+ + AR + + + GCA + + coding: polyglutamine + + X-linked recessive + + + Spinal and bulbar muscular atrophy + +
+ + ARX_1 + + + NGC + + coding: polyalanine + + X-linked recessive, Z-linked recessive + + + Developmental and epileptic encephalopathy-1 + + , + + X-linked mental retardation with or without seizures + +
+ + ATN1 + + + CAG + + Made-up region + + Autosomal dominant + + + Dentatorubral-pallidoluysian atrophy + +
+ + ATXN1 + + + TGC + + Ersatz region + + Autosomal miscellaneous + + + Spinocerebellar ataxia 1 + +
+ + ATXN10 + + + ATTCT + + intron + + Autosomal recessive + + + Spinocerebellar ataxia 10 + +
+ + ATXN2 + + + GCT + + Fake region + + Autosomal dominant, Autosomal recessive + + + Made-up disease 1 + + , + + Spinocerebellar ataxia 2 + +
+
+ + + , + "container":
+
+ +

+ For more information about Tandem Repeats in gnomAD, read our + + + blog post + + . +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+ + AFF2 + + + GCC + + 5'-UTR + + Imaginary inheritance mode + + + FRAXE mental retardation + +
+ + AR + + + GCA + + coding: polyglutamine + + X-linked recessive + + + Spinal and bulbar muscular atrophy + +
+ + ARX_1 + + + NGC + + coding: polyalanine + + X-linked recessive, Z-linked recessive + + + Developmental and epileptic encephalopathy-1 + + , + + X-linked mental retardation with or without seizures + +
+ + ATN1 + + + CAG + + Made-up region + + Autosomal dominant + + + Dentatorubral-pallidoluysian atrophy + +
+ + ATXN1 + + + TGC + + Ersatz region + + Autosomal miscellaneous + + + Spinocerebellar ataxia 1 + +
+ + ATXN10 + + + ATTCT + + intron + + Autosomal recessive + + + Spinocerebellar ataxia 10 + +
+ + ATXN2 + + + GCT + + Fake region + + Autosomal dominant, Autosomal recessive + + + Made-up disease 1 + + , + + Spinocerebellar ataxia 2 + +
+
+
+
, + "debug": [Function], + "findAllByAltText": [Function], + "findAllByDisplayValue": [Function], + "findAllByLabelText": [Function], + "findAllByPlaceholderText": [Function], + "findAllByRole": [Function], + "findAllByTestId": [Function], + "findAllByText": [Function], + "findAllByTitle": [Function], + "findByAltText": [Function], + "findByDisplayValue": [Function], + "findByLabelText": [Function], + "findByPlaceholderText": [Function], + "findByRole": [Function], + "findByTestId": [Function], + "findByText": [Function], + "findByTitle": [Function], + "getAllByAltText": [Function], + "getAllByDisplayValue": [Function], + "getAllByLabelText": [Function], + "getAllByPlaceholderText": [Function], + "getAllByRole": [Function], + "getAllByTestId": [Function], + "getAllByText": [Function], + "getAllByTitle": [Function], + "getByAltText": [Function], + "getByDisplayValue": [Function], + "getByLabelText": [Function], + "getByPlaceholderText": [Function], + "getByRole": [Function], + "getByTestId": [Function], + "getByText": [Function], + "getByTitle": [Function], + "queryAllByAltText": [Function], + "queryAllByDisplayValue": [Function], + "queryAllByLabelText": [Function], + "queryAllByPlaceholderText": [Function], + "queryAllByRole": [Function], + "queryAllByTestId": [Function], + "queryAllByText": [Function], + "queryAllByTitle": [Function], + "queryByAltText": [Function], + "queryByDisplayValue": [Function], + "queryByLabelText": [Function], + "queryByPlaceholderText": [Function], + "queryByRole": [Function], + "queryByTestId": [Function], + "queryByText": [Function], + "queryByTitle": [Function], + "rerender": [Function], + "unmount": [Function], +} +`; diff --git a/browser/src/useTableSort.tsx b/browser/src/useTableSort.tsx index 1a53c5526..a9240e680 100644 --- a/browser/src/useTableSort.tsx +++ b/browser/src/useTableSort.tsx @@ -81,13 +81,21 @@ const useTableSort = ( return { headers, sortedRowData } } -type NumberHolder = { - [K in Key]: number +type Holder = { + [K in Key]: Value } +type NumberHolder = Holder +type StringHolder = Holder + export const numericCompareFunction = (key: Key) => >(a: RowData, b: RowData) => a[key] - b[key] +export const stringCompareFunction = + (key: Key) => + >(a: RowData, b: RowData) => + b[key].toLowerCase().localeCompare(a[key].toLowerCase()) + export default useTableSort From 0fb8f73ace18669ee1f16b52cc70f194a5b5ed81 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Mon, 15 Jul 2024 16:27:14 -0400 Subject: [PATCH 06/10] DONTMERGE rig index --- graphql-api/src/queries/short-tandem-repeat-queries.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphql-api/src/queries/short-tandem-repeat-queries.ts b/graphql-api/src/queries/short-tandem-repeat-queries.ts index d4b1eb2d2..e883835d4 100644 --- a/graphql-api/src/queries/short-tandem-repeat-queries.ts +++ b/graphql-api/src/queries/short-tandem-repeat-queries.ts @@ -5,7 +5,8 @@ import { fetchAllSearchResults } from './helpers/elasticsearch-helpers' const SHORT_TANDEM_REPEAT_INDICES = { gnomad_r3: 'gnomad_v3_short_tandem_repeats', - gnomad_r4: 'gnomad_v3_short_tandem_repeats', + // TK + gnomad_r4: 'gnomad_v3_short_tandem_repeats-2024-07-15--17-34', } const SUMMARY_FIELDS = [ From e7a4b59e6aed17d637ebb4f1a30a914a9fd64b87 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Fri, 28 Feb 2025 15:38:11 -0500 Subject: [PATCH 07/10] Update paths for reads metadata and visualizations --- reads/src/datasets.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reads/src/datasets.js b/reads/src/datasets.js index 1d66a2d89..5474d8b2a 100644 --- a/reads/src/datasets.js +++ b/reads/src/datasets.js @@ -41,11 +41,11 @@ const variantDatasets = { const shortTandemRepeatDatasets = { gnomad_r3: { - dbPath: '/readviz/datasets/gnomad_r3_short_tandem_repeats/str_reads.db', - publicPath: '/reads/gnomad_r3/short_tandem_repeats', + dbPath: '/readviz/datasets/gnomad_r4_short_tandem_repeats/str_reads.db', + publicPath: 'https://storage.googleapis.com/gnomad-str-public/release_2024_07/readviz_v2', }, gnomad_r4: { - dbPath: './v4_str_reads.db', + dbPath: '/readviz/datasets/gnomad_r4_short_tandem_repeats/str_reads.db', publicPath: 'https://storage.googleapis.com/gnomad-str-public/release_2024_07/readviz_v2', }, } From 387491792bf255d311546464701335e1a238e63a Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Fri, 28 Feb 2025 16:14:01 -0500 Subject: [PATCH 08/10] fix typo in reads update docs --- reads/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reads/README.md b/reads/README.md index 8fda1edbf..bc5ccf879 100644 --- a/reads/README.md +++ b/reads/README.md @@ -41,7 +41,7 @@ tabix -p bed /path/to/gencode.bed.bgz ``` gcloud compute instances create \ --machine-type e2-standard-8 \ - --zone --machine-type + --zone gcloud compute instances attach-disk \ --disk \ From 23a6c74c8386d17807342aa9bf8cb4ef4fd5d5a0 Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Fri, 28 Feb 2025 16:59:31 -0500 Subject: [PATCH 09/10] Update STR distribution data file location --- .../data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py index f303e7ed7..94c0ae5fb 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v3_short_tandem_repeats.py @@ -9,7 +9,9 @@ "prepare_short_tandem_repeats", prepare_gnomad_v3_short_tandem_repeats, "/gnomad_v4/gnomad_v4_short_tandem_repeats.ht", - {"path": "gs://gnomad-browser-data-pipeline/phil-scratch/gnomAD_STR_distributions__gnomad-v2__2024_06_28.json"}, + { + "path": "gs://gnomad-browser-data-pipeline/inputs/secondary-analyses/strs/2024_07_24/gnomAD_STR_distributions__gnomad-v2__2024_07_24.json" + }, ) ############################################### From 136631a274446f958bb282379b242e83fd6eb54c Mon Sep 17 00:00:00 2001 From: Phil Darnowsky Date: Fri, 28 Feb 2025 17:20:29 -0500 Subject: [PATCH 10/10] Update STR index --- graphql-api/src/queries/short-tandem-repeat-queries.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/graphql-api/src/queries/short-tandem-repeat-queries.ts b/graphql-api/src/queries/short-tandem-repeat-queries.ts index e883835d4..86f02d31a 100644 --- a/graphql-api/src/queries/short-tandem-repeat-queries.ts +++ b/graphql-api/src/queries/short-tandem-repeat-queries.ts @@ -4,9 +4,8 @@ import { UserVisibleError } from '../errors' import { fetchAllSearchResults } from './helpers/elasticsearch-helpers' const SHORT_TANDEM_REPEAT_INDICES = { - gnomad_r3: 'gnomad_v3_short_tandem_repeats', - // TK - gnomad_r4: 'gnomad_v3_short_tandem_repeats-2024-07-15--17-34', + gnomad_r3: 'gnomad_v3_short_tandem_repeats-2025-02-28--22-16', + gnomad_r4: 'gnomad_v3_short_tandem_repeats-2025-02-28--22-16', } const SUMMARY_FIELDS = [