Skip to content

Commit

Permalink
Add TrainGCNV input specifying subset list of samples for training (#294)
Browse files Browse the repository at this point in the history
  • Loading branch information
epiercehoffman authored Feb 11, 2022
1 parent ab8a855 commit a770cc7
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 8 deletions.
25 changes: 20 additions & 5 deletions wdl/TrainGCNV.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ workflow TrainGCNV {
File reference_index # Index (.fai), must be in same dir as fasta
File reference_dict # Dictionary (.dict), must be in same dir as fasta
# Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker
# Assumes all other inputs correspond to the full sample list. Intended for Terra
Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
Int subsample_seed = 42
# Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided
Array[String]? sample_ids_training_subset

# Condense read counts
Int? condense_num_bins
Expand Down Expand Up @@ -85,7 +89,7 @@ workflow TrainGCNV {
String linux_docker
String gatk_docker
String condense_counts_docker
String? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
String? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples
# Runtime configuration overrides
RuntimeAttr? condense_counts_runtime_attr
Expand All @@ -100,20 +104,31 @@ workflow TrainGCNV {
RuntimeAttr? runtime_attr_explode
}
if (defined(n_samples_subsample)) {
if (defined(sample_ids_training_subset)) {
call util.GetSubsampledIndices {
input:
all_strings = write_lines(samples),
subset_strings = write_lines(select_first([sample_ids_training_subset])),
prefix = cohort,
sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
}
}

if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) {
call util.RandomSubsampleStringArray {
input:
strings = samples,
strings = write_lines(samples),
seed = subsample_seed,
subset_size = select_first([n_samples_subsample]),
prefix = cohort,
sv_pipeline_base_docker = select_first([sv_pipeline_base_docker])
}
}

Array[Int] sample_indices = select_first([RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])
Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))])

scatter (i in sample_indices) {
String sample_ids_ = samples[i]
call cov.CondenseReadCounts as CondenseReadCounts {
input:
counts = count_files[i],
Expand All @@ -138,7 +153,7 @@ workflow TrainGCNV {
preprocessed_intervals = CountsToIntervals.out,
filter_intervals = filter_intervals,
counts = CondenseReadCounts.out,
count_entity_ids = select_first([RandomSubsampleStringArray.subsampled_strings_array, samples]),
count_entity_ids = sample_ids_,
cohort_entity_id = cohort,
contig_ploidy_priors = contig_ploidy_priors,
num_intervals_per_scatter = num_intervals_per_scatter,
Expand Down
59 changes: 56 additions & 3 deletions wdl/Utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ task RunQC {

task RandomSubsampleStringArray {
input {
Array[String] strings
File strings
Int seed
Int subset_size
String prefix
Expand All @@ -172,7 +172,7 @@ task RandomSubsampleStringArray {
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
mem_gb: 1,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
Expand All @@ -185,7 +185,7 @@ task RandomSubsampleStringArray {
set -euo pipefail
python3 <<CODE
import random
string_array = ['~{sep="','" strings}']
string_array = [line.rstrip() for line in open("~{strings}", 'r')]
array_len = len(string_array)
if ~{subset_size} > array_len:
raise ValueError("Subsample quantity ~{subset_size} cannot > array length %d" % array_len)
Expand Down Expand Up @@ -218,6 +218,59 @@ task RandomSubsampleStringArray {
}
}
# Maps a subset of sample IDs back to their positional indices in the full
# sample list, so downstream arrays (e.g. per-sample count files) can be
# selected by index. Fails fast if any subset entry is absent from the full list.
task GetSubsampledIndices {
  input {
    File all_strings      # full sample list, one ID per line
    File subset_strings   # IDs to keep; must all appear in all_strings
    String prefix         # output filename prefix
    String sv_pipeline_base_docker
    RuntimeAttr? runtime_attr_override
  }

  String subsample_indices_filename = "~{prefix}.subsample_indices.list"

  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 1,
    disk_gb: 10,
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
  }
  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

  command <<<
    set -euo pipefail
    python3 <<CODE
    with open("~{all_strings}") as fh:
        full_list = [ln.rstrip() for ln in fh]
    with open("~{subset_strings}") as fh:
        wanted = {ln.rstrip() for ln in fh}
    # Every requested ID must exist in the full list; otherwise abort.
    if wanted.difference(full_list):
        raise ValueError("Subset list must be a subset of full list")
    # Emit 0-based indices (in full-list order) of the retained samples.
    with open("~{subsample_indices_filename}", 'w') as out:
        for idx, sample_id in enumerate(full_list):
            if sample_id in wanted:
                out.write(f"{idx}\n")
    CODE
  >>>

  output {
    # 0-based positions of the subset samples within the full list
    Array[Int] subsample_indices_array = read_lines(subsample_indices_filename)
  }

  runtime {
    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
    docker: sv_pipeline_base_docker
    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
  }
}
task SubsetPedFile {
input {
File ped_file
Expand Down

0 comments on commit a770cc7

Please sign in to comment.