# src/talos/example_config.toml

[GeneratePanelData]
# URL of the panels endpoint on the PanelApp instance to use
panelapp = 'https://panelapp.agha.umccr.org/api/v1/panels'
# ID of the panel in that PanelApp instance to use as a base; by default this is the 'Mendeliome'
default_panel = 137
# these genes are removed from the base panel, but will be included if they occur in a phenotype-matched panel
require_pheno_match = ['FLG', 'GJB2', 'F2', 'F5']
# genes to remove from any panel, phenotype-matched or otherwise (effectively a denylist)
forbidden_genes = ['a', 'list', 'of', 'forbidden', 'genes']  # placeholder values; use symbols, ENSG IDs, or a mix
# IDs of panels to apply to all participants in this analysis, regardless of phenotype
# each ID must exist in the relevant PanelApp instance
# NOTE(review): [1, 2, 3] looks like placeholder data - confirm before use
forced_panels = [1, 2, 3]

# integer number of months; when parsing panel data, this value determines whether a gene is 'recent'
# we find the date each gene became Green/Ready, and if that date is within this many months of today
# we treat the gene as new. New/Recent genes are highlighted in the report with a star next to the symbol
within_x_months = 6

[FindGeneSymbolMap]
# size of each chunk of work; values much higher than this run into throttling issues
# NOTE(review): presumably the number of gene IDs sent per lookup request - confirm against the consumer
chunk_size = 800

[ValidateMOI]
# thresholds for the different filters applied during the MOI (mode of inheritance) checks
gnomad_dominant = 0.001

# if the population frequency annotations contain allele count and hemi-count, these thresholds are used;
# if those annotations are absent, these additional tests are skipped
gnomad_max_homs_dominant = 0
gnomad_max_homs_recessive = 1
gnomad_max_ac_dominant = 10
gnomad_max_hemi = 1
callset_af_sv_dominant = 0.01

# optionally, ignore some categories. Categories named here are stripped from the variants upon ingestion
# (uncomment the line below to enable this)
#ignore_categories = ['categoryboolean6']

# for these categories, require a phenotype-gene match
# each entry is the final part of the Category name, e.g. categorydetailspm5 is "pm5", and categorybooleansv1 is "sv1"
# (apologies for the inconsistency)
phenotype_match = ['6']

[RunHailFiltering]
# settings controlling how the VCF variants are parsed and how AnalysisVariant objects are populated
csq_string = [
    'consequence',
    'symbol',
    'gene',
    'feature',
    'mane_select',
    'biotype',
    'exon',
    'hgvsc',
    'hgvsp',
    'cdna_position',
    'cds_position',
    'protein_position',
    'ensp',
    'lof',
    'sift',
    'polyphen',
    'am_class',
    'am_pathogenicity',
]

# settings for the hail operations: consequence (CSQ) sets and filter thresholds
ac_threshold = 0.01
additional_csq = ['missense_variant']
af_semi_rare = 0.01
callset_af_sv_recessive = 0.03
critical_csq = [
    'frameshift_variant',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'start_lost',
    'stop_gained',
    'stop_lost',
    'transcript_ablation',
]
max_parent_ab = 0.05
minimum_depth = 10
spliceai = 0.5

[RunHailFiltering.cores]
# NOTE(review): presumably the number of CPU cores to request when filtering small variants - confirm against the consumer
small_variants = 8

[categories]
# maps each category's short identifier to a human-readable description
# the identifiers follow the same "final part of the Category name" form used by ValidateMOI.phenotype_match
# note: TOML reads digit-only bare keys such as 1 as the string "1", not as an integer
1 = 'ClinVar Pathogenic'
3 = 'High Impact Variant'
4 = 'de Novo'
5 = 'High SpliceAI Score'
6 = 'AlphaMissense P/LP'
pm5 = 'ACMG PM5 - missense in same residue as known pathogenic'
sv1 = 'Predicted LOF SV'

[CreateTalosHTML]
# every value in this section is a placeholder describing what is expected;
# replace each one with a real path, URL, or project ID before use
external_labels = "path to labels file"
seqr_lookup = "e.g. gs://cpg-COHORT-test/reanalysis/parsed_seqr.json"
seqr_instance = "e.g. https://seqr.populationgenomics.org.au"
seqr_project = "e.g. COHORT_project_id"

[HPOFlagging]
# this section relates to phenotype-matching the final variant set

# set this to true to do a semantic term comparison when phenotype matching
# this runs an approximate semantic similarity test between participant and gene HPO terms, using SemSimian
# if false, we will always do a set intersection on HPO terms
semantic_match = true

# minimum similarity score when doing a SemSimian termset similarity test
min_similarity = 14.0