From cf54ad5c81321212795246501a2f0c940826173d Mon Sep 17 00:00:00 2001 From: epiercehoffman Date: Thu, 18 Mar 2021 16:26:27 -0400 Subject: [PATCH] Module04b: parametrize sample overlap and minimum var count outlier threshold for regeno filtering (#133) --- wdl/CombineReassess.wdl | 12 ++++++++++-- wdl/Module04b.wdl | 8 ++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/wdl/CombineReassess.wdl b/wdl/CombineReassess.wdl index 1f5abd3c9..1f9a9408a 100644 --- a/wdl/CombineReassess.wdl +++ b/wdl/CombineReassess.wdl @@ -8,6 +8,8 @@ workflow CombineReassess { File regeno_file File regeno_sample_ids_lookup Array[File] vcfs + Int min_var_per_sample_outlier_threshold + Float regeno_sample_overlap String sv_pipeline_base_docker String sv_pipeline_docker RuntimeAttr? runtime_attr_vcf2bed @@ -28,6 +30,8 @@ workflow CombineReassess { regeno_file = regeno_file, regeno_sample_ids_lookup = regeno_sample_ids_lookup, samplelist = samplelist, + min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold, + regeno_sample_overlap = regeno_sample_overlap, runtime_attr_override = runtime_attr_merge_list_creassess, sv_pipeline_base_docker = sv_pipeline_base_docker } @@ -82,6 +86,8 @@ task MergeList { File regeno_file Array[File] nonempty_txt File regeno_sample_ids_lookup + Int min_var_per_sample_outlier_threshold + Float regeno_sample_overlap String sv_pipeline_base_docker RuntimeAttr? 
runtime_attr_override } @@ -130,7 +136,9 @@ task MergeList { count(line) counts=np.array([int(dct[x]) for x in dct.keys()]) def reject_outliers(data, m=3): - return data[abs(data - np.mean(data)) > m * np.std(data)] + deviation_threshold = m * np.std(data) + data_mean = np.mean(data) + return data[np.logical_and(abs(data - data_mean) > deviation_threshold, data > ~{min_var_per_sample_outlier_threshold})] outliers=reject_outliers(counts) outlier_samples=set([x for x in dct.keys() if dct[x] in outliers]) with open("reassess_nonzero_overlap.txt",'w') as g, open("reassesss_by_var.txt",'r') as f: @@ -146,7 +154,7 @@ task MergeList { overlap_over_expected=str(len(regeno_in_expected)/len(expected)) g.write(dat[0]+"\t"+",".join(regeno)+'\t'+",".join(expected)+'\t'+overlap_over_regeno+'\t'+overlap_over_expected+"\n") CODE - awk '{if($4>0.7 && $5>0.7)print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt + awk '{if($4>~{regeno_sample_overlap} && $5>~{regeno_sample_overlap})print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt # the OR clause below is to ignore return code = 1 because that isn't an error, it just means there were 0 matched lines # (but don't ignore real error codes > 1) fgrep -w -f regeno_var_filtered.txt ~{regeno_file}> regeno.filtered.bed || [[ $? 
== 1 ]] diff --git a/wdl/Module04b.wdl b/wdl/Module04b.wdl index c2b9af92b..fed634eb5 100644 --- a/wdl/Module04b.wdl +++ b/wdl/Module04b.wdl @@ -24,8 +24,10 @@ workflow Module04b { String cohort # Cohort name or project prefix for all cohort-level outputs File contig_list Array[File] regeno_coverage_medians # one file per batch - Float regeno_max_allele_freq = 0.01 - Int regeno_allele_count_threshold = 3 + Float regeno_max_allele_freq = 0.01 # Rare variant filter for regenotyping candidates: must be < AF threshold (this parameter) or <= AC threshold (below) + Int regeno_allele_count_threshold = 3 # Rare variant filter for regenotyping candidates: must be < AF threshold (above) or <= AC threshold (this parameter) + Int min_var_per_sample_outlier_threshold = 3 # Threshold at or below which a regeno SV count per sample is never considered an outlier (needed when counts are sparse) + Float regeno_sample_overlap = 0.7 # Sample overlap fraction between raw and regenotyped calls that must be exceeded (strictly greater than) for a variant to be kept RuntimeAttr? runtime_attr_cluster_merged_depth_beds RuntimeAttr? runtime_attr_regeno_raw_combined_depth @@ -188,6 +190,8 @@ workflow Module04b { regeno_file = MergeList.master_regeno, regeno_sample_ids_lookup = ConcatSampleIdLookupBed.concat_bed, vcfs = Genotype_2.genotyped_vcf, + min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold, + regeno_sample_overlap = regeno_sample_overlap, sv_pipeline_docker = sv_pipeline_docker, sv_pipeline_base_docker = sv_pipeline_base_docker, runtime_attr_merge_list_creassess = runtime_attr_merge_list_creassess,