Skip to content

Commit 05f0933

Browse files
akolesnikov and copybara-github
authored and committed
Addressing the issue raised in #811
This change ensures that PL value is calculated according to the ploidy in reference blocks of gVCF. PiperOrigin-RevId: 698266848
1 parent d420e2f commit 05f0933

File tree

6 files changed

+104
-15
lines changed

6 files changed

+104
-15
lines changed

deepvariant/make_examples_core.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ def assign_sample_name(sample_name_flag: str, reads_filenames: str) -> str:
188188
def make_vc_options(
189189
sample_name: str, flags_obj: flags.FlagValues
190190
) -> deepvariant_pb2.VariantCallerOptions:
191+
haploid_contigs_str = flags_obj.haploid_contigs or ''
191192
return deepvariant_pb2.VariantCallerOptions(
192193
min_count_snps=flags_obj.vsc_min_count_snps,
193194
min_count_indels=flags_obj.vsc_min_count_indels,
@@ -208,6 +209,8 @@ def make_vc_options(
208209
phase_reads_region_padding_pct=dv_constants.PHASE_READS_REGION_PADDING_PCT,
209210
track_ref_reads=flags_obj.track_ref_reads,
210211
small_model_vaf_context_window_size=flags_obj.small_model_vaf_context_window_size,
212+
haploid_contigs=haploid_contigs_str.split(','),
213+
par_regions_bed=flags_obj.par_regions_bed,
211214
)
212215

213216

deepvariant/make_examples_options.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,27 @@
736736
'If True, output phase information to the candidates.',
737737
)
738738

739+
# Names of the contigs to be treated as haploid (e.g. sex chromosomes). The
# help text is built from adjacent string literals, which Python joins with no
# separator, so every fragment except the last must end with a space.
# NOTE(review): the help text advertises comma- or space-separated lists, but
# make_vc_options splits this value on ',' only - confirm that space-separated
# input is handled before relying on it.
_HAPLOID_CONTIGS = flags.DEFINE_string(
    'haploid_contigs',
    None,
    (
        'Optional list of non autosomal chromosomes. For all listed chromosomes '
        'HET probabilities are not considered. The list can be either comma '
        'or space-separated.'
    ),
)
748+
749+
# Path to a BED file of pseudoautosomal regions (PAR). Positions inside these
# regions are exempt from the haploid handling applied to --haploid_contigs.
# The help text is built from adjacent string literals, which Python joins with
# no separator, so every fragment except the last must end with a space.
_PAR_REGIONS = flags.DEFINE_string(
    'par_regions_bed',
    None,
    (
        'Optional BED file containing Human Pseudoautosomal Region (PAR) '
        'regions. '
        'Variants within this region are unaffected by genotype reallocation '
        'applied on regions supplied by --haploid_contigs flag.'
    ),
)
759+
739760

740761
def shared_flags_to_options(
741762
add_flags,

deepvariant/protos/deepvariant.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ message CallVariantsOutput {
371371
}
372372

373373
// Options to control how our candidate VariantCaller works.
374-
// Next ID: 19
374+
// Next ID: 21
375375
message VariantCallerOptions {
376376
// Alleles occurring at least this many times in our AlleleCount are
377377
// considered candidate variants.
@@ -433,6 +433,10 @@ message VariantCallerOptions {
433433

434434
// Small model context window size
435435
int32 small_model_vaf_context_window_size = 18;
436+
437+
repeated string haploid_contigs = 19;
438+
439+
string par_regions_bed = 20;
436440
}
437441

438442
// Options to control how we label variant calls.

deepvariant/variant_caller.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from deepvariant.python import variant_calling_multisample
4545
from third_party.nucleus.protos import variants_pb2
4646
from third_party.nucleus.util import genomics_math
47+
from third_party.nucleus.util import ranges
4748
from third_party.nucleus.util import variantcall_utils
4849
from third_party.nucleus.util import vcf_constants
4950

@@ -69,6 +70,8 @@
6970

7071
LOG_10 = math.log(10.0)
7172

73+
IMPOSSIBLE_PROBABILITY_LOG10 = 999.0
74+
7275

7376
def _rescale_read_counts_if_necessary(
7477
n_ref_reads, n_total_reads, max_allowed_reads
@@ -127,21 +130,30 @@ def __init__(self, options, use_cache_table, max_cache_coverage):
127130
self.options
128131
)
129132

133+
self.par_regions = None
134+
if self.options.par_regions_bed:
135+
self.par_regions = ranges.RangeSet.from_bed(
136+
self.options.par_regions_bed, enable_logging=False
137+
)
138+
130139
self.max_cache_coverage = max_cache_coverage
131140
# pylint: disable=g-complex-comprehension
132141
if use_cache_table:
133142
self.table = [
134143
[
135-
self._calc_reference_confidence(n_ref, n_total)
136-
for n_ref in range(n_total + 1)
144+
[
145+
self._calc_reference_confidence(n_ref, n_total, is_haploid)
146+
for n_ref in range(n_total + 1)
147+
]
148+
for n_total in range(self.max_cache_coverage + 1)
137149
]
138-
for n_total in range(self.max_cache_coverage + 1)
150+
for is_haploid in [False, True]
139151
]
140152
else:
141153
self.table = None
142154
# pylint: enable=g-complex-comprehension
143155

144-
def reference_confidence(self, n_ref, n_total):
156+
def reference_confidence(self, n_ref, n_total, is_haploid=False):
145157
"""Computes the confidence that a site in the genome has no variation.
146158
147159
Computes this confidence using only the counts of the number of reads
@@ -190,21 +202,22 @@ def reference_confidence(self, n_ref, n_total):
190202
reference allele.
191203
n_total: int >= 0 and >= n_ref: The number of reads supporting any allele
192204
at this site.
205+
is_haploid: bool. If True, the position is treated as haploid, i.e. the heterozygous genotype is considered impossible.
193206
194207
Returns:
195208
A tuple of two values. The first is an integer value for the GQ (genotype
196209
quality) and the second is an array-like of the log10 probabilities for
197210
each of the three genotype configurations.
198211
"""
199212
if self.table is None:
200-
return self._calc_reference_confidence(n_ref, n_total)
213+
return self._calc_reference_confidence(n_ref, n_total, is_haploid)
201214
else:
202215
ref_index, total_index = _rescale_read_counts_if_necessary(
203216
n_ref, n_total, self.max_cache_coverage
204217
)
205-
return self.table[total_index][ref_index]
218+
return self.table[is_haploid][total_index][ref_index]
206219

207-
def _calc_reference_confidence(self, n_ref, n_total):
220+
def _calc_reference_confidence(self, n_ref, n_total, is_haploid=False):
208221
"""Performs the calculation described in reference_confidence()."""
209222
if n_ref < 0:
210223
raise ValueError('n_ref={} must be >= 0'.format(n_ref))
@@ -214,16 +227,23 @@ def _calc_reference_confidence(self, n_ref, n_total):
214227
raise ValueError(
215228
'ploidy={} but we only support ploidy=2'.format(self.options.ploidy)
216229
)
217-
218230
if n_total == 0:
219-
# No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
220-
log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
231+
if is_haploid:
232+
# No coverage, haploid case - HET is treated as impossible, so the likelihoods are log10 of 1/2, ~0, 1/2.
233+
log10_probs = genomics_math.normalize_log10_probs(
234+
[-1.0, -IMPOSSIBLE_PROBABILITY_LOG10, -1.0]
235+
)
236+
else:
237+
# No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
238+
log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
221239
else:
222240
n_alts = n_total - n_ref
223241
logp = math.log(self.options.p_error) / LOG_10
224242
log1p = math.log1p(-self.options.p_error) / LOG_10
225243
log10_p_ref = n_ref * log1p + n_alts * logp
226244
log10_p_het = -n_total * math.log(self.options.ploidy) / LOG_10
245+
if is_haploid:
246+
log10_p_het = -IMPOSSIBLE_PROBABILITY_LOG10
227247
log10_p_hom_alt = n_ref * logp + n_alts * log1p
228248
log10_probs = genomics_math.normalize_log10_probs(
229249
[log10_p_ref, log10_p_het, log10_p_hom_alt]
@@ -261,6 +281,12 @@ def make_gvcfs(self, allele_count_summaries, include_med_dp=False):
261281
coordinate-sorted order containing gVCF records.
262282
"""
263283

284+
par_regions = None
285+
if self.options.par_regions_bed:
286+
par_regions = ranges.RangeSet.from_bed(
287+
self.options.par_regions_bed, enable_logging=False
288+
)
289+
264290
def with_gq_and_likelihoods(summary_counts):
265291
"""Returns summary_counts along with GQ and genotype likelihoods.
266292
@@ -294,7 +320,18 @@ def with_gq_and_likelihoods(summary_counts):
294320
else:
295321
n_ref = summary_counts.ref_supporting_read_count
296322
n_total = summary_counts.total_read_count
297-
raw_gq, likelihoods = self.reference_confidence(n_ref, n_total)
323+
is_haploid = (
324+
summary_counts.reference_name in self.options.haploid_contigs
325+
and not (
326+
par_regions
327+
and par_regions.overlaps(
328+
summary_counts.reference_name, summary_counts.position
329+
)
330+
)
331+
)
332+
raw_gq, likelihoods = self.reference_confidence(
333+
n_ref, n_total, is_haploid
334+
)
298335
quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
299336
has_valid_gl = np.amax(likelihoods) == likelihoods[0]
300337
return _GVCF(

deepvariant/variant_caller_test.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ def fake_allele_counter(self, start_pos, counts):
103103
allele_counter.counts.return_value = counts
104104
return allele_counter
105105

106-
# R code to produce the testdata expectation table.
106+
# R code to produce the testdata expectation table. Haploid test cases were
107+
# created manually.
107108
# expected <- function(n_ref, n_alt, perr, max_gq = 100) {
108109
# p_ref <- dbinom(n_alt, n_ref, perr)
109110
# p_het <- dbinom(n_alt, n_ref, 0.5)
@@ -187,12 +188,33 @@ def fake_allele_counter(self, start_pos, counts):
187188
[80, 0, 0.01, 100, [0.000000, -23.733215, -159.650816], 100],
188189
[90, 0, 0.01, 100, [0.000000, -26.699867, -179.607168], 100],
189190
[100, 0, 0.01, 100, [0.000000, -29.666519, -199.563519], 100],
191+
# Test haploid case.
192+
[
193+
10,
194+
8,
195+
0.01,
196+
100,
197+
[-11.97381, -9.949651e02, -0.0000000000004609646],
198+
0,
199+
True,
200+
],
201+
[10, 1, 0.01, 100, [0.0, -996.960717, -15.965082], 100, True],
202+
[10, 5, 0.01, 100, [-0.30103, -989.2792, -0.3010300], 3, True],
190203
)
191204
def test_ref_calc(
192-
self, total_n, alt_n, p_error, max_gq, expected_likelihoods, expected_gq
205+
self,
206+
total_n,
207+
alt_n,
208+
p_error,
209+
max_gq,
210+
expected_likelihoods,
211+
expected_gq,
212+
is_haploid=False,
193213
):
194214
caller = PlaceholderVariantCaller(p_error, max_gq)
195-
gq, likelihoods = caller.reference_confidence(total_n - alt_n, total_n)
215+
gq, likelihoods = caller.reference_confidence(
216+
total_n - alt_n, total_n, is_haploid
217+
)
196218
npt.assert_allclose(expected_likelihoods, likelihoods, atol=1e-6)
197219
self.assertEqual(expected_gq, gq)
198220

scripts/run_deepvariant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,8 @@ def create_all_commands_and_logfiles(intermediate_results_dir):
742742
gvcf=nonvariant_site_tfrecord_path,
743743
regions=_REGIONS.value,
744744
sample_name=_SAMPLE_NAME.value,
745+
haploid_contigs=_HAPLOID_CONTIGS.value,
746+
par_regions_bed=_PAR_REGIONS.value,
745747
)
746748
)
747749

0 commit comments

Comments
 (0)