From 486f8a66b8d71bc086b820a96ec97deaa5c79716 Mon Sep 17 00:00:00 2001 From: koles Date: Tue, 26 Nov 2024 17:23:08 -0800 Subject: [PATCH] Manually rollback the change containing a coverage calculation. PiperOrigin-RevId: 700505817 --- deeptrio/make_examples_test.py | 2 +- deepvariant/make_examples_core.py | 188 +------------------------ deepvariant/make_examples_core_test.py | 6 +- 3 files changed, 9 insertions(+), 187 deletions(-) diff --git a/deeptrio/make_examples_test.py b/deeptrio/make_examples_test.py index 0ccb30ee..eeec4d36 100644 --- a/deeptrio/make_examples_test.py +++ b/deeptrio/make_examples_test.py @@ -1394,7 +1394,7 @@ def test_regions_and_exclude_regions_flags_with_trio_options(self): FLAGS.exclude_regions = '20:10,010,000-10,100,000' options = make_examples.default_options(add_flags=True) - _, _, regions_from_options = ( + _, regions_from_options = ( make_examples_core.processing_regions_from_options(options) ) self.assertCountEqual( diff --git a/deepvariant/make_examples_core.py b/deepvariant/make_examples_core.py index 575de079..a195ebc7 100644 --- a/deepvariant/make_examples_core.py +++ b/deepvariant/make_examples_core.py @@ -1401,41 +1401,6 @@ def small_model_variant_caller( ): self._small_model_variant_caller = small_model_variant_caller - def _get_mean_coverage_per_sample(self) -> List[float]: - """Returns the mean coverage per sample if set in options. - - Fetches the per sample mean coverage if provided in the options with - downsample fraction applied. The default is 0.0 for all samples if not set. - - Returns: - A list of floats representing mean coverages per sample. - """ - if self.mean_coverage_per_sample: - return self.mean_coverage_per_sample - - if not any([ - sample_options.mean_coverage > 0 - for sample_options in self.options.sample_options - ]): - # If no mean coverage is set, return 0.0 for all samples. - if 'mean_coverage' in self.options.channel_list: - logging.warning( - 'No mean coverage is set in options but "mean_coverage" is in' - ' channel_list. This is not expected. Mean coverage will be set to' - ' the default value of 0.0.' - ) - return [0.0] * len(self.samples) - - mean_coverage_per_sample = [] - for sample in self.samples: - cur_sample_mean_coverage = sample.options.mean_coverage - # Apply downsample fraction to mean coverage if applicable. - if sample.options.downsample_fraction: - cur_sample_mean_coverage *= sample.options.downsample_fraction - mean_coverage_per_sample.append(cur_sample_mean_coverage) - - return mean_coverage_per_sample - def _make_direct_phasing_obj(self) -> direct_phasing.DirectPhasing: return direct_phasing.DirectPhasing() @@ -1690,7 +1655,6 @@ def writes_examples_in_region( region. If the region contains no examples, return None. """ before_make_pileup_images = time.time() - mean_coverage_per_sample = self._get_mean_coverage_per_sample() example_shape = None # Create A tf.Example proto, which includes the candidate variant, the # pileup image, and, if in training mode, the truth variants and labels @@ -1743,7 +1707,7 @@ def writes_examples_in_region( reads_per_sample, sample_order, role, - mean_coverage_per_sample, + [0.0] * len(self.samples), ) ) @@ -1770,7 +1734,7 @@ def writes_examples_in_region( reads_per_sample, sample_order, role, - mean_coverage_per_sample, + [0.0] * len(self.samples), ) ) @@ -1959,74 +1923,6 @@ def find_candidate_positions(self, region: range_pb2.Range) -> Iterator[int]: # Mark the end of partition yield END_OF_PARTITION - def precompute_mean_coverage_per_sample( - self, sample_regions: List[range_pb2.Range] - ): - """Precomputes the estimation of mean coverage per sample. - - If the value is already provided via flag, it will be used instead of - computing it. - - Args: - sample_regions: List of regions to use for estimating mean coverage. - """ - start = time.time() - - self.mean_coverage_per_sample = [0 for _ in self.samples] - if not self.initialized: - self.initialize() - - n_positions = [0 for _ in self.samples] - n_reads_per_sample = [0 for _ in self.samples] - for sample_index, sample in enumerate(self.samples): - # Do not recompute mean coverage if it is already provided via flag. - if self.samples[sample_index].options.mean_coverage: - continue - logging_with_options( - self.options, - 'Mean_coverage not set for sample %s. Estimating mean coverage...' - % self.samples[sample_index].options.name, - ) - logging_with_options( - self.options, - 'Using %d regions to estimate mean coverage.' % len(sample_regions), - ) - for region in sample_regions: - allele_counter = self._make_allele_counter_for_region(region, []) - if sample.in_memory_sam_reader is not None: - for read in self.region_reads_norealign( - region=region, - sam_readers=sample.sam_readers, - reads_filenames=sample.options.reads_filenames, - ): - allele_counter.add(read, sample.options.name) - - summary_counts = allele_counter.summary_counts() - positions_in_region = len(summary_counts) - reads_in_region = sum( - [summary.total_read_count for summary in summary_counts] - ) - if reads_in_region > 0: - n_positions[sample_index] += positions_in_region - n_reads_per_sample[sample_index] += reads_in_region - if n_positions[sample_index] > 0: - self.mean_coverage_per_sample[sample_index] = ( - n_reads_per_sample[sample_index] / n_positions[sample_index] - ) - logging_with_options( - self.options, - 'Mean coverage estimation for sample %s: %f' - % ( - self.samples[sample_index].options.name, - self.mean_coverage_per_sample[sample_index], - ), - ) - logging_with_options( - self.options, - 'Overhead for precomputing mean coverage: %d seconds' - % (time.time() - start), - ) - def _only_contains_alts_above_threshold( self, variant: variants_pb2.Variant, threshold: float ) -> bool: @@ -2871,9 +2767,7 @@ def load_candidate_positions(candidate_path: Any) -> List[int]: def processing_regions_from_options( options: deepvariant_pb2.MakeExamplesOptions, -) -> Tuple[ - List[range_pb2.Range], List[range_pb2.Range], Optional[ranges.RangeSet] -]: +) -> Tuple[List[range_pb2.Range], Optional[ranges.RangeSet]]: """Computes the calling regions from our options. This function does all of the work needed to read our input files and region @@ -2976,74 +2870,6 @@ def processing_regions_from_options( region_list = list(regions) - def sample_regions_for_coverage_estimation( - options: deepvariant_pb2.MakeExamplesOptions, - ) -> List[range_pb2.Range]: - """Returns a list of regions sampled from the BAM file to use for mean coverage estimation. - - Uses the calling regions if options.sample_mean_coverage_on_calling_regions - is set. - Otherwise, uses: UNION(ref_contigs, sample_contigs) - exclude_contigs. - Samples NUM_REGIONS_FOR_MEAN_COVERAGE regions from - NUM_LOCI_FOR_MEAN_COVERAGE places in the genome from the above set. If there - are less than NUM_REGIONS_FOR_MEAN_COVERAGE regions, returns all regions. - - Args: - options: deepvariant.MakeExamplesOptions proto containing information - about our input data sources. - - Returns: - A list of regions to use for mean coverage estimation. - """ - sample_regions = regions_to_process( - contigs=contigs, - partition_size=options.allele_counter_options.partition_size, - calling_regions=calling_regions - if options.sample_mean_coverage_on_calling_regions - else None, - task_id=None, - num_shards=None, - candidates=None, - ) - sample_regions_list = list(sample_regions) - if ( - len(sample_regions_list) < NUM_REGIONS_FOR_MEAN_COVERAGE - and 'mean_coverage' in options.pic_options.channels - ): - logging.warning( - 'calling_regions has %d regions, which is less than %d. ' - 'This may result in inaccurate mean coverage if estimated.', - len(sample_regions_list), - NUM_REGIONS_FOR_MEAN_COVERAGE, - ) - else: - # Sample regions to calculate mean coverage by random sampling - # NUM_LOCI_FOR_MEAN_COVERAGE places in the genome and expanding - # consecutively to reach NUM_REGIONS_FOR_MEAN_COVERAGE regions. - # See internal#comment46 for more details. - random_generator = np.random.RandomState(options.random_seed) - - num_regions_per_locus = int( - NUM_REGIONS_FOR_MEAN_COVERAGE / NUM_LOCI_FOR_MEAN_COVERAGE - ) - sample_regions_indexes = [] - for sample_start_positions in random_generator.choice( - range(len(sample_regions_list)), NUM_LOCI_FOR_MEAN_COVERAGE - ): - sample_regions_indexes += list( - range( - sample_start_positions, - min( - sample_start_positions + num_regions_per_locus, - len(sample_regions_list), - ), - ) - ) - sample_regions_list = np.take(sample_regions_list, sample_regions_indexes) - return sample_regions_list - - sample_regions_list = sample_regions_for_coverage_estimation(options) - # When using VcfCandidateImporter, it is safe to skip regions without # candidates as long as gVCF output is not needed. There is a tradeoff # though because it takes time to read the VCF, which is only worth it if @@ -3077,8 +2903,8 @@ def sample_regions_for_coverage_estimation( trim_runtime(time_elapsed), len(region_list), len(filtered_regions) ), ) - return filtered_regions, sample_regions_list, None - return region_list, sample_regions_list, calling_regions + return filtered_regions, None + return region_list, calling_regions def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions): @@ -3089,7 +2915,6 @@ def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions): logging_with_options(options, 'Preparing inputs') ( regions, - sample_regions, calling_regions, ) = processing_regions_from_options(options) main_sample = options.sample_options[options.main_sample_index] @@ -3161,9 +2986,6 @@ def make_examples_runner(options: deepvariant_pb2.MakeExamplesOptions): % (time.time() - before_initializing_inputs), ) - if 'mean_coverage' in options.pic_options.channels: - region_processor.precompute_mean_coverage_per_sample(sample_regions) - running_timer = timer.TimerStart() # Ideally this would use dv_constants.NUM_CLASSES, which requires generalizing # deepvariant_pb2.MakeExamplesStats to use an array for the class counts. diff --git a/deepvariant/make_examples_core_test.py b/deepvariant/make_examples_core_test.py index b3862167..2ed31c63 100644 --- a/deepvariant/make_examples_core_test.py +++ b/deepvariant/make_examples_core_test.py @@ -818,7 +818,7 @@ def test_regions_and_exclude_regions_flags(self): FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000' options = make_examples.default_options(add_flags=True) - _, _, regions_from_options = ( + _, regions_from_options = ( make_examples_core.processing_regions_from_options(options) ) self.assertCountEqual( @@ -847,7 +847,7 @@ def test_mixed_exclude_regions_flags(self): + bed_path ) options = make_examples.default_options(add_flags=True) - _, _, regions_from_options = ( + _, regions_from_options = ( make_examples_core.processing_regions_from_options(options) ) self.assertCountEqual( @@ -869,7 +869,7 @@ def test_regions_exclude_n_reference(self): FLAGS.discard_non_dna_regions = True options = make_examples.default_options(add_flags=True) - _, _, regions_from_options = ( + _, regions_from_options = ( make_examples_core.processing_regions_from_options(options) ) self.assertCountEqual(