Skip to content

Commit

Permalink
Manually rollback the change containing a coverage calculation.
Browse the repository at this point in the history
PiperOrigin-RevId: 700505817
  • Loading branch information
akolesnikov authored and copybara-github committed Nov 27, 2024
1 parent 87764b2 commit 486f8a6
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 187 deletions.
2 changes: 1 addition & 1 deletion deeptrio/make_examples_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def test_regions_and_exclude_regions_flags_with_trio_options(self):
FLAGS.exclude_regions = '20:10,010,000-10,100,000'

options = make_examples.default_options(add_flags=True)
_, _, regions_from_options = (
_, regions_from_options = (
make_examples_core.processing_regions_from_options(options)
)
self.assertCountEqual(
Expand Down
188 changes: 5 additions & 183 deletions deepvariant/make_examples_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,41 +1401,6 @@ def small_model_variant_caller(
):
self._small_model_variant_caller = small_model_variant_caller

def _get_mean_coverage_per_sample(self) -> List[float]:
"""Returns the mean coverage per sample if set in options.
Fetches the per sample mean coverage if provided in the options with
downsample fraction applied. The default is 0.0 for all samples if not set.
Returns:
A list of floats representing mean coverages per sample.
"""
if self.mean_coverage_per_sample:
return self.mean_coverage_per_sample

if not any([
sample_options.mean_coverage > 0
for sample_options in self.options.sample_options
]):
# If no mean coverage is set, return 0.0 for all samples.
if 'mean_coverage' in self.options.channel_list:
logging.warning(
'No mean coverage is set in options but "mean_coverage" is in'
' channel_list. This is not expected. Mean coverage will be set to'
' the default value of 0.0.'
)
return [0.0] * len(self.samples)

mean_coverage_per_sample = []
for sample in self.samples:
cur_sample_mean_coverage = sample.options.mean_coverage
# Apply downsample fraction to mean coverage if applicable.
if sample.options.downsample_fraction:
cur_sample_mean_coverage *= sample.options.downsample_fraction
mean_coverage_per_sample.append(cur_sample_mean_coverage)

return mean_coverage_per_sample

def _make_direct_phasing_obj(self) -> direct_phasing.DirectPhasing:
return direct_phasing.DirectPhasing()

Expand Down Expand Up @@ -1690,7 +1655,6 @@ def writes_examples_in_region(
region. If the region contains no examples, return None.
"""
before_make_pileup_images = time.time()
mean_coverage_per_sample = self._get_mean_coverage_per_sample()
example_shape = None
# Create A tf.Example proto, which includes the candidate variant, the
# pileup image, and, if in training mode, the truth variants and labels
Expand Down Expand Up @@ -1743,7 +1707,7 @@ def writes_examples_in_region(
reads_per_sample,
sample_order,
role,
mean_coverage_per_sample,
[0.0] * len(self.samples),
)
)

Expand All @@ -1770,7 +1734,7 @@ def writes_examples_in_region(
reads_per_sample,
sample_order,
role,
mean_coverage_per_sample,
[0.0] * len(self.samples),
)
)

Expand Down Expand Up @@ -1959,74 +1923,6 @@ def find_candidate_positions(self, region: range_pb2.Range) -> Iterator[int]:
# Mark the end of partition
yield END_OF_PARTITION

def precompute_mean_coverage_per_sample(
self, sample_regions: List[range_pb2.Range]
):
"""Precomputes the estimation of mean coverage per sample.
If the value is already provided via flag, it will be used instead of
computing it.
Args:
sample_regions: List of regions to use for estimating mean coverage.
"""
start = time.time()

self.mean_coverage_per_sample = [0 for _ in self.samples]
if not self.initialized:
self.initialize()

n_positions = [0 for _ in self.samples]
n_reads_per_sample = [0 for _ in self.samples]
for sample_index, sample in enumerate(self.samples):
# Do not recompute mean coverage if it is already provided via flag.
if self.samples[sample_index].options.mean_coverage:
continue
logging_with_options(
self.options,
'Mean_coverage not set for sample %s. Estimating mean coverage...'
% self.samples[sample_index].options.name,
)
logging_with_options(
self.options,
'Using %d regions to estimate mean coverage.' % len(sample_regions),
)
for region in sample_regions:
allele_counter = self._make_allele_counter_for_region(region, [])
if sample.in_memory_sam_reader is not None:
for read in self.region_reads_norealign(
region=region,
sam_readers=sample.sam_readers,
reads_filenames=sample.options.reads_filenames,
):
allele_counter.add(read, sample.options.name)

summary_counts = allele_counter.summary_counts()
positions_in_region = len(summary_counts)
reads_in_region = sum(
[summary.total_read_count for summary in summary_counts]
)
if reads_in_region > 0:
n_positions[sample_index] += positions_in_region
n_reads_per_sample[sample_index] += reads_in_region
if n_positions[sample_index] > 0:
self.mean_coverage_per_sample[sample_index] = (
n_reads_per_sample[sample_index] / n_positions[sample_index]
)
logging_with_options(
self.options,
'Mean coverage estimation for sample %s: %f'
% (
self.samples[sample_index].options.name,
self.mean_coverage_per_sample[sample_index],
),
)
logging_with_options(
self.options,
'Overhead for precomputing mean coverage: %d seconds'
% (time.time() - start),
)

def _only_contains_alts_above_threshold(
self, variant: variants_pb2.Variant, threshold: float
) -> bool:
Expand Down Expand Up @@ -2871,9 +2767,7 @@ def load_candidate_positions(candidate_path: Any) -> List[int]:

def processing_regions_from_options(
options: deepvariant_pb2.MakeExamplesOptions,
) -> Tuple[
List[range_pb2.Range], List[range_pb2.Range], Optional[ranges.RangeSet]
]:
) -> Tuple[List[range_pb2.Range], Optional[ranges.RangeSet]]:
"""Computes the calling regions from our options.
This function does all of the work needed to read our input files and region
Expand Down Expand Up @@ -2976,74 +2870,6 @@ def processing_regions_from_options(

region_list = list(regions)

def sample_regions_for_coverage_estimation(
options: deepvariant_pb2.MakeExamplesOptions,
) -> List[range_pb2.Range]:
"""Returns a list of regions sampled from the BAM file to use for mean coverage estimation.
Uses the calling regions if options.sample_mean_coverage_on_calling_regions
is set.
Otherwise, uses: UNION(ref_contigs, sample_contigs) - exclude_contigs.
Samples NUM_REGIONS_FOR_MEAN_COVERAGE regions from
NUM_LOCI_FOR_MEAN_COVERAGE places in the genome from the above set. If there
are less than NUM_REGIONS_FOR_MEAN_COVERAGE regions, returns all regions.
Args:
options: deepvariant.MakeExamplesOptions proto containing information
about our input data sources.
Returns:
A list of regions to use for mean coverage estimation.
"""
sample_regions = regions_to_process(
contigs=contigs,
partition_size=options.allele_counter_options.partition_size,
calling_regions=calling_regions
if options.sample_mean_coverage_on_calling_regions
else None,
task_id=None,
num_shards=None,
candidates=None,
)
sample_regions_list = list(sample_regions)
if (
len(sample_regions_list) < NUM_REGIONS_FOR_MEAN_COVERAGE
and 'mean_coverage' in options.pic_options.channels
):
logging.warning(
'calling_regions has %d regions, which is less than %d. '
'This may result in inaccurate mean coverage if estimated.',
len(sample_regions_list),
NUM_REGIONS_FOR_MEAN_COVERAGE,
)
else:
# Sample regions to calculate mean coverage by random sampling
# NUM_LOCI_FOR_MEAN_COVERAGE places in the genome and expanding
# consecutively to reach NUM_REGIONS_FOR_MEAN_COVERAGE regions.
# See internal#comment46 for more details.
random_generator = np.random.RandomState(options.random_seed)

num_regions_per_locus = int(
NUM_REGIONS_FOR_MEAN_COVERAGE / NUM_LOCI_FOR_MEAN_COVERAGE
)
sample_regions_indexes = []
for sample_start_positions in random_generator.choice(
range(len(sample_regions_list)), NUM_LOCI_FOR_MEAN_COVERAGE
):
sample_regions_indexes += list(
range(
sample_start_positions,
min(
sample_start_positions + num_regions_per_locus,
len(sample_regions_list),
),
)
)
sample_regions_list = np.take(sample_regions_list, sample_regions_indexes)
return sample_regions_list

sample_regions_list = sample_regions_for_coverage_estimation(options)

# When using VcfCandidateImporter, it is safe to skip regions without
# candidates as long as gVCF output is not needed. There is a tradeoff
# though because it takes time to read the VCF, which is only worth it if
Expand Down Expand Up @@ -3077,8 +2903,8 @@ def sample_regions_for_coverage_estimation(
trim_runtime(time_elapsed), len(region_list), len(filtered_regions)
),
)
return filtered_regions, sample_regions_list, None
return region_list, sample_regions_list, calling_regions
return filtered_regions, None
return region_list, calling_regions


def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions):
Expand All @@ -3089,7 +2915,6 @@ def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions):
logging_with_options(options, 'Preparing inputs')
(
regions,
sample_regions,
calling_regions,
) = processing_regions_from_options(options)
main_sample = options.sample_options[options.main_sample_index]
Expand Down Expand Up @@ -3161,9 +2986,6 @@ def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions):
% (time.time() - before_initializing_inputs),
)

if 'mean_coverage' in options.pic_options.channels:
region_processor.precompute_mean_coverage_per_sample(sample_regions)

running_timer = timer.TimerStart()
# Ideally this would use dv_constants.NUM_CLASSES, which requires generalizing
# deepvariant_pb2.MakeExamplesStats to use an array for the class counts.
Expand Down
6 changes: 3 additions & 3 deletions deepvariant/make_examples_core_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,7 @@ def test_regions_and_exclude_regions_flags(self):
FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000'

options = make_examples.default_options(add_flags=True)
_, _, regions_from_options = (
_, regions_from_options = (
make_examples_core.processing_regions_from_options(options)
)
self.assertCountEqual(
Expand Down Expand Up @@ -847,7 +847,7 @@ def test_mixed_exclude_regions_flags(self):
+ bed_path
)
options = make_examples.default_options(add_flags=True)
_, _, regions_from_options = (
_, regions_from_options = (
make_examples_core.processing_regions_from_options(options)
)
self.assertCountEqual(
Expand All @@ -869,7 +869,7 @@ def test_regions_exclude_n_reference(self):
FLAGS.discard_non_dna_regions = True

options = make_examples.default_options(add_flags=True)
_, _, regions_from_options = (
_, regions_from_options = (
make_examples_core.processing_regions_from_options(options)
)
self.assertCountEqual(
Expand Down

0 comments on commit 486f8a6

Please sign in to comment.