generated from populationgenomics/cpg-python-template-repo
-
Notifications
You must be signed in to change notification settings - Fork 4
/
example_config.toml
84 lines (70 loc) · 3.45 KB
/
example_config.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
[GeneratePanelData]
# The PanelApp instance to use
panelapp = "https://panelapp.agha.umccr.org/api/v1/panels"
# ID of the panel in that PanelApp instance to use as a base; by default this is the 'Mendeliome'
default_panel = 137
# These genes are removed from the base panel, but are included if they occur in a phenotype-matched panel
require_pheno_match = ["FLG", "GJB2", "F2", "F5"]
# Genes to remove from any panel, matched or otherwise (effectively a blacklist).
# Entries may be gene symbols, ENSG IDs, or a mix of both
forbidden_genes = ["a", "list", "of", "forbidden", "genes"]
# IDs of panels to apply to all participants in this analysis, regardless of phenotype.
# Each ID must exist in the relevant PanelApp instance
forced_panels = [1, 2, 3]
# Integer number of months used to decide whether a gene is 'recent' when parsing panel data:
# the date each gene became Green/Ready is found, and if that date is within this many months of today
# the gene is treated as new. New/recent genes are highlighted in the report with a star next to the symbol
within_x_months = 6

[FindGeneSymbolMap]
# Size of each chunk of work; values much higher than this run into throttling issues
# NOTE(review): presumably the number of genes looked up per request - confirm against the consumer
chunk_size = 800
[ValidateMOI]
# Thresholds for the different filters applied during the MOI checks
gnomad_dominant = 0.001
# If the population frequency annotations contain allele count and hemi-count, these are used;
# if those annotations are absent, these additional tests are skipped
gnomad_max_homs_dominant = 0
gnomad_max_homs_recessive = 1
gnomad_max_ac_dominant = 10
gnomad_max_hemi = 1
callset_af_sv_dominant = 0.01
# Optionally ignore some categories: categories named here are stripped from the variants upon ingestion
#ignore_categories = ['categoryboolean6']
# For these categories, require a phenotype-gene match.
# Each entry is the final part of the Category name, e.g. categorydetailspm5 is "pm5", and categorybooleansv1 is "sv1"
# (apologies for the inconsistency)
phenotype_match = ['6']
[RunHailFiltering]
# Variables affecting how the VCF variants are parsed and how AnalysisVariant objects are populated.
# Element order is preserved exactly as originally written
csq_string = [
    "consequence",
    "symbol",
    "gene",
    "feature",
    "mane_select",
    "biotype",
    "exon",
    "hgvsc",
    "hgvsp",
    "cdna_position",
    "cds_position",
    "protein_position",
    "ensp",
    "lof",
    "sift",
    "polyphen",
    "am_class",
    "am_pathogenicity",
]
# Variables for the Hail operations, including CSQ sets and filter thresholds
ac_threshold = 0.01
additional_csq = ["missense_variant"]
af_semi_rare = 0.01
callset_af_sv_recessive = 0.03
critical_csq = [
    "frameshift_variant",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "start_lost",
    "stop_gained",
    "stop_lost",
    "transcript_ablation",
]
max_parent_ab = 0.05
minimum_depth = 10
spliceai = 0.5

[RunHailFiltering.cores]
small_variants = 8

[categories]
# Maps each category identifier to a descriptive label
# NOTE(review): keys appear to be the final part of the Category name (e.g. "pm5", "sv1") - confirm against the consumer
1 = "ClinVar Pathogenic"
3 = "High Impact Variant"
4 = "de Novo"
5 = "High SpliceAI Score"
6 = "AlphaMissense P/LP"
pm5 = "ACMG PM5 - missense in same residue as known pathogenic"
sv1 = "Predicted LOF SV"

[CreateTalosHTML]
# NOTE: every value in this section is descriptive placeholder text, not a working default;
# replace each one with a real value for your deployment (the "e.g. " prefix is part of the placeholder)
# Path to a labels file
external_labels = "path to labels file"
# Example shows a gs:// path to a parsed seqr JSON file
seqr_lookup = "e.g. gs://cpg-COHORT-test/reanalysis/parsed_seqr.json"
# Example shows the base URL of a seqr instance
seqr_instance = "e.g. https://seqr.populationgenomics.org.au"
# Example shows a project identifier; NOTE(review): presumably the project ID within that seqr instance - confirm
seqr_project = "e.g. COHORT_project_id"
[HPOFlagging]
# This section relates to phenotype-matching the final variant set.
# Set semantic_match to true to do a semantic term comparison when phenotype matching:
# this runs a semantic similarity test between participant and gene HPOs, through SemSimian.
# If false, a set intersection on HPO terms is always used instead
semantic_match = true
# Minimum similarity score when doing a semsimian termset similarity test
min_similarity = 14.0