Addressing the issue raised in #811
This change ensures that PL values are calculated according to the ploidy in gVCF reference blocks.

PiperOrigin-RevId: 698266848
akolesnikov authored and copybara-github committed Nov 20, 2024
1 parent d420e2f commit 05f0933
Showing 6 changed files with 104 additions and 15 deletions.
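
For orientation, a worked sketch of what the new haploid path computes (based on the diff below; not part of the commit): on haploid contigs the HET genotype is given an effectively impossible log10 probability of -999, so the normalized PLs only weigh hom-ref against hom-alt. With p_error = 0.01, 9 REF-supporting reads, and 1 ALT read, this reproduces the new haploid expectation row in variant_caller_test.py. The normalize() helper below is a stand-in for genomics_math.normalize_log10_probs.

# Worked sketch of the haploid reference-confidence math added in
# variant_caller.py; normalize() stands in for genomics_math.normalize_log10_probs.
import math

import numpy as np

LOG_10 = math.log(10.0)
IMPOSSIBLE_PROBABILITY_LOG10 = 999.0

def normalize(log10_probs):
  # Rescale so the linear-space probabilities sum to 1, keeping log10 form.
  log10_probs = np.array(log10_probs)
  return log10_probs - np.log10(np.sum(10.0 ** log10_probs))

p_error = 0.01
n_ref, n_alts = 9, 1  # the new test row: 10 reads total, 1 supporting an ALT

logp = math.log(p_error) / LOG_10      # log10(0.01)  = -2.0
log1p = math.log1p(-p_error) / LOG_10  # log10(0.99) ~= -0.00436

log10_p_ref = n_ref * log1p + n_alts * logp
log10_p_het = -IMPOSSIBLE_PROBABILITY_LOG10  # haploid: HET is ruled out
log10_p_hom_alt = n_ref * logp + n_alts * log1p

print(normalize([log10_p_ref, log10_p_het, log10_p_hom_alt]))
# ~[0.0, -996.960717, -15.965082], matching the haploid case in the test table.
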
3 changes: 3 additions & 0 deletions deepvariant/make_examples_core.py
@@ -188,6 +188,7 @@ def assign_sample_name(sample_name_flag: str, reads_filenames: str) -> str:
def make_vc_options(
sample_name: str, flags_obj: flags.FlagValues
) -> deepvariant_pb2.VariantCallerOptions:
haploid_contigs_str = flags_obj.haploid_contigs or ''
return deepvariant_pb2.VariantCallerOptions(
min_count_snps=flags_obj.vsc_min_count_snps,
min_count_indels=flags_obj.vsc_min_count_indels,
@@ -208,6 +209,8 @@ def make_vc_options(
phase_reads_region_padding_pct=dv_constants.PHASE_READS_REGION_PADDING_PCT,
track_ref_reads=flags_obj.track_ref_reads,
small_model_vaf_context_window_size=flags_obj.small_model_vaf_context_window_size,
haploid_contigs=haploid_contigs_str.split(','),
par_regions_bed=flags_obj.par_regions_bed,
)


21 changes: 21 additions & 0 deletions deepvariant/make_examples_options.py
@@ -736,6 +736,27 @@
'If True, output phase information to the candidates.',
)

_HAPLOID_CONTIGS = flags.DEFINE_string(
'haploid_contigs',
None,
(
'Optional list of non-autosomal chromosomes. For all listed chromosomes, '
'HET probabilities are not considered. The list can be either comma '
'or space-separated.'
),
)

_PAR_REGIONS = flags.DEFINE_string(
'par_regions_bed',
None,
(
'Optional BED file containing the human pseudoautosomal regions (PAR). '
'Variants in these regions are unaffected by the genotype reallocation '
'applied to contigs supplied by the --haploid_contigs flag.'
),
)
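
Read together, here is a sketch of how these two flag values are consumed downstream in this diff: make_examples_core.py splits haploid_contigs on commas, and variant_caller.py loads par_regions_bed into a RangeSet and exempts overlapping positions. The contig names, BED interval, and positions below are illustrative only.

# Sketch of the downstream use of --haploid_contigs and --par_regions_bed.
# Contig names, BED contents, and positions are illustrative, not prescriptive.
import tempfile

from third_party.nucleus.util import ranges  # import path as used in this repo

haploid_contigs = 'chrX,chrY'.split(',')  # e.g. --haploid_contigs=chrX,chrY

# --par_regions_bed points at a BED file of pseudoautosomal regions.
with tempfile.NamedTemporaryFile('w', suffix='.bed', delete=False) as bed_file:
  bed_file.write('chrX\t10000\t2781479\n')  # an illustrative PAR1-like interval
par_regions = ranges.RangeSet.from_bed(bed_file.name, enable_logging=False)

def is_haploid_site(reference_name, position):
  # Mirrors the check added to make_gvcfs(): haploid contig, but not in a PAR.
  return reference_name in haploid_contigs and not (
      par_regions and par_regions.overlaps(reference_name, position)
  )

print(is_haploid_site('chrX', 5_000_000))  # True: chrX, outside the PAR interval
print(is_haploid_site('chrX', 100_000))    # False: inside the PAR interval
print(is_haploid_site('chr1', 5_000_000))  # False: chr1 is not listed
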


def shared_flags_to_options(
add_flags,
6 changes: 5 additions & 1 deletion deepvariant/protos/deepvariant.proto
@@ -371,7 +371,7 @@ message CallVariantsOutput {
}

// Options to control how our candidate VariantCaller works.
// Next ID: 19
// Next ID: 21
message VariantCallerOptions {
// Alleles occurring at least this many times in our AlleleCount are
// considered candidate variants.
@@ -433,6 +433,10 @@ message VariantCallerOptions {

// Small model context window size
int32 small_model_vaf_context_window_size = 18;

repeated string haploid_contigs = 19;

string par_regions_bed = 20;
}

// Options to control how we label variant calls.
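
For reference, a minimal sketch (hypothetical values, assuming the deepvariant.protos.deepvariant_pb2 import used elsewhere in the codebase) of populating the two new fields, as make_vc_options() in make_examples_core.py now does:

# Hypothetical values; field numbers 19 and 20 follow the proto above.
from deepvariant.protos import deepvariant_pb2

options = deepvariant_pb2.VariantCallerOptions(
    haploid_contigs=['chrX', 'chrY'],           # repeated string, field 19
    par_regions_bed='/path/to/GRCh38_PAR.bed',  # string, field 20
)
print(list(options.haploid_contigs))  # ['chrX', 'chrY']
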
59 changes: 48 additions & 11 deletions deepvariant/variant_caller.py
@@ -44,6 +44,7 @@
from deepvariant.python import variant_calling_multisample
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.util import genomics_math
from third_party.nucleus.util import ranges
from third_party.nucleus.util import variantcall_utils
from third_party.nucleus.util import vcf_constants

@@ -69,6 +70,8 @@

LOG_10 = math.log(10.0)

IMPOSSIBLE_PROBABILITY_LOG10 = 999.0


def _rescale_read_counts_if_necessary(
n_ref_reads, n_total_reads, max_allowed_reads
@@ -127,21 +130,30 @@ def __init__(self, options, use_cache_table, max_cache_coverage):
self.options
)

self.par_regions = None
if self.options.par_regions_bed:
self.par_regions = ranges.RangeSet.from_bed(
self.options.par_regions_bed, enable_logging=False
)

self.max_cache_coverage = max_cache_coverage
# pylint: disable=g-complex-comprehension
if use_cache_table:
self.table = [
[
self._calc_reference_confidence(n_ref, n_total)
for n_ref in range(n_total + 1)
[
self._calc_reference_confidence(n_ref, n_total, is_haploid)
for n_ref in range(n_total + 1)
]
for n_total in range(self.max_cache_coverage + 1)
]
for n_total in range(self.max_cache_coverage + 1)
for is_haploid in [False, True]
]
else:
self.table = None
# pylint: enable=g-complex-comprehension

def reference_confidence(self, n_ref, n_total):
def reference_confidence(self, n_ref, n_total, is_haploid=False):
"""Computes the confidence that a site in the genome has no variation.
Computes this confidence using only the counts of the number of reads
@@ -190,21 +202,22 @@ def reference_confidence(self, n_ref, n_total):
reference allele.
n_total: int >= 0 and >= n_ref: The number of reads supporting any allele
at this site.
is_haploid: bool. If True, the site is treated as haploid.
Returns:
A tuple of two values. The first is an integer value for the GQ (genotype
quality) and the second is an array-like of the log10 probabilities for
each of the three genotype configurations.
"""
if self.table is None:
return self._calc_reference_confidence(n_ref, n_total)
return self._calc_reference_confidence(n_ref, n_total, is_haploid)
else:
ref_index, total_index = _rescale_read_counts_if_necessary(
n_ref, n_total, self.max_cache_coverage
)
return self.table[total_index][ref_index]
return self.table[is_haploid][total_index][ref_index]

def _calc_reference_confidence(self, n_ref, n_total):
def _calc_reference_confidence(self, n_ref, n_total, is_haploid=False):
"""Performs the calculation described in reference_confidence()."""
if n_ref < 0:
raise ValueError('n_ref={} must be >= 0'.format(n_ref))
@@ -214,16 +227,23 @@ def _calc_reference_confidence(self, n_ref, n_total):
raise ValueError(
'ploidy={} but we only support ploidy=2'.format(self.options.ploidy)
)

if n_total == 0:
# No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
if is_haploid:
# No coverage case - all likelihoods are log10 of 1/2, 0, 1/2.
log10_probs = genomics_math.normalize_log10_probs(
[-1.0, -IMPOSSIBLE_PROBABILITY_LOG10, -1.0]
)
else:
# No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
else:
n_alts = n_total - n_ref
logp = math.log(self.options.p_error) / LOG_10
log1p = math.log1p(-self.options.p_error) / LOG_10
log10_p_ref = n_ref * log1p + n_alts * logp
log10_p_het = -n_total * math.log(self.options.ploidy) / LOG_10
if is_haploid:
log10_p_het = -IMPOSSIBLE_PROBABILITY_LOG10
log10_p_hom_alt = n_ref * logp + n_alts * log1p
log10_probs = genomics_math.normalize_log10_probs(
[log10_p_ref, log10_p_het, log10_p_hom_alt]
@@ -261,6 +281,12 @@ def make_gvcfs(self, allele_count_summaries, include_med_dp=False):
coordinate-sorted order containing gVCF records.
"""

par_regions = None
if self.options.par_regions_bed:
par_regions = ranges.RangeSet.from_bed(
self.options.par_regions_bed, enable_logging=False
)

def with_gq_and_likelihoods(summary_counts):
"""Returns summary_counts along with GQ and genotype likelihoods.
@@ -294,7 +320,18 @@ def with_gq_and_likelihoods(summary_counts):
else:
n_ref = summary_counts.ref_supporting_read_count
n_total = summary_counts.total_read_count
raw_gq, likelihoods = self.reference_confidence(n_ref, n_total)
is_haploid = (
summary_counts.reference_name in self.options.haploid_contigs
and not (
par_regions
and par_regions.overlaps(
summary_counts.reference_name, summary_counts.position
)
)
)
raw_gq, likelihoods = self.reference_confidence(
n_ref, n_total, is_haploid
)
quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
has_valid_gl = np.amax(likelihoods) == likelihoods[0]
return _GVCF(
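
A toy sketch (placeholder values only) of the reshaped confidence cache from the __init__ hunk above: the table gains an outer is_haploid dimension, and reference_confidence() indexes it as self.table[is_haploid][total_index][ref_index], which works because Python booleans index as 0 and 1.

# Toy model of the new cache layout: table[is_haploid][n_total][n_ref].
def fake_confidence(n_ref, n_total, is_haploid):
  # Placeholder for _calc_reference_confidence(); returns a label, not (GQ, PLs).
  return ('haploid' if is_haploid else 'diploid', n_ref, n_total)

max_cache_coverage = 2
table = [
    [
        [fake_confidence(n_ref, n_total, is_haploid)
         for n_ref in range(n_total + 1)]
        for n_total in range(max_cache_coverage + 1)
    ]
    for is_haploid in [False, True]
]

# Booleans index like integers: False -> table[0], True -> table[1].
print(table[False][2][1])  # ('diploid', 1, 2)
print(table[True][2][1])   # ('haploid', 1, 2)
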
28 changes: 25 additions & 3 deletions deepvariant/variant_caller_test.py
@@ -103,7 +103,8 @@ def fake_allele_counter(self, start_pos, counts):
allele_counter.counts.return_value = counts
return allele_counter

# R code to produce the testdata expectation table.
# R code to produce the testdata expectation table. Haploid test cases were
# created manually.
# expected <- function(n_ref, n_alt, perr, max_gq = 100) {
# p_ref <- dbinom(n_alt, n_ref, perr)
# p_het <- dbinom(n_alt, n_ref, 0.5)
@@ -187,12 +188,33 @@ def fake_allele_counter(self, start_pos, counts):
[80, 0, 0.01, 100, [0.000000, -23.733215, -159.650816], 100],
[90, 0, 0.01, 100, [0.000000, -26.699867, -179.607168], 100],
[100, 0, 0.01, 100, [0.000000, -29.666519, -199.563519], 100],
# Test haploid case.
[
10,
8,
0.01,
100,
[-11.97381, -9.949651e02, -0.0000000000004609646],
0,
True,
],
[10, 1, 0.01, 100, [0.0, -996.960717, -15.965082], 100, True],
[10, 5, 0.01, 100, [-0.30103, -989.2792, -0.3010300], 3, True],
)
def test_ref_calc(
self, total_n, alt_n, p_error, max_gq, expected_likelihoods, expected_gq
self,
total_n,
alt_n,
p_error,
max_gq,
expected_likelihoods,
expected_gq,
is_haploid=False,
):
caller = PlaceholderVariantCaller(p_error, max_gq)
gq, likelihoods = caller.reference_confidence(total_n - alt_n, total_n)
gq, likelihoods = caller.reference_confidence(
total_n - alt_n, total_n, is_haploid
)
npt.assert_allclose(expected_likelihoods, likelihoods, atol=1e-6)
self.assertEqual(expected_gq, gq)

Expand Down
2 changes: 2 additions & 0 deletions scripts/run_deepvariant.py
@@ -742,6 +742,8 @@ def create_all_commands_and_logfiles(intermediate_results_dir):
gvcf=nonvariant_site_tfrecord_path,
regions=_REGIONS.value,
sample_name=_SAMPLE_NAME.value,
haploid_contigs=_HAPLOID_CONTIGS.value,
par_regions_bed=_PAR_REGIONS.value,
)
)
