 from deepvariant.python import variant_calling_multisample
 from third_party.nucleus.protos import variants_pb2
 from third_party.nucleus.util import genomics_math
+from third_party.nucleus.util import ranges
 from third_party.nucleus.util import variantcall_utils
 from third_party.nucleus.util import vcf_constants
 LOG_10 = math.log(10.0)

+IMPOSSIBLE_PROBABILITY_LOG10 = 999.0
+

 def _rescale_read_counts_if_necessary(
     n_ref_reads, n_total_reads, max_allowed_reads
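
IMPOSSIBLE_PROBABILITY_LOG10 acts as a log10-scale stand-in for "probability zero": an entry of -999.0 keeps essentially no probability mass once the three genotype log10 values are renormalized. A minimal sketch of that effect, assuming normalize_log10_probs behaves like a stable base-10 log-sum-exp normalization (this sketch is illustrative, not the nucleus genomics_math implementation):

# Minimal sketch, not the nucleus genomics_math implementation: a stable
# base-10 log-sum-exp normalization shows why a -999.0 entry behaves as
# "probability zero" after renormalization.
import numpy as np


def normalize_log10_probs_sketch(log10_probs):
  log10_probs = np.asarray(log10_probs, dtype=float)
  m = log10_probs.max()
  # log10 of the summed linear-space probabilities, computed stably.
  log10_sum = m + np.log10(np.sum(10.0 ** (log10_probs - m)))
  return log10_probs - log10_sum


# Diploid no-coverage prior: equal mass on hom-ref, het, and hom-alt.
print(normalize_log10_probs_sketch([-1.0, -1.0, -1.0]))
# -> approx. [-0.477, -0.477, -0.477], i.e. 1/3 each.

# Haploid no-coverage prior: the het state is effectively impossible.
print(normalize_log10_probs_sketch([-1.0, -999.0, -1.0]))
# -> approx. [-0.301, -998.301, -0.301], i.e. 1/2, ~0, 1/2.
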
@@ -127,21 +130,30 @@ def __init__(self, options, use_cache_table, max_cache_coverage):
         self.options
     )

+    self.par_regions = None
+    if self.options.par_regions_bed:
+      self.par_regions = ranges.RangeSet.from_bed(
+          self.options.par_regions_bed, enable_logging=False
+      )
+
     self.max_cache_coverage = max_cache_coverage
     # pylint: disable=g-complex-comprehension
     if use_cache_table:
       self.table = [
           [
-              self._calc_reference_confidence(n_ref, n_total)
-              for n_ref in range(n_total + 1)
+              [
+                  self._calc_reference_confidence(n_ref, n_total, is_haploid)
+                  for n_ref in range(n_total + 1)
+              ]
+              for n_total in range(self.max_cache_coverage + 1)
           ]
-          for n_total in range(self.max_cache_coverage + 1)
+          for is_haploid in [False, True]
       ]
     else:
       self.table = None
     # pylint: enable=g-complex-comprehension

-  def reference_confidence(self, n_ref, n_total):
+  def reference_confidence(self, n_ref, n_total, is_haploid=False):
     """Computes the confidence that a site in the genome has no variation.

     Computes this confidence using only the counts of the number of reads
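
The cache above gains an outer dimension keyed by is_haploid, so lookups become self.table[is_haploid][n_total][n_ref]. Because Python bools are integers (False == 0, True == 1), the flag can index the two-element outer list directly. A small illustrative sketch of that layout, with placeholder tuples standing in for the precomputed confidences:

# Illustration of the new cache layout: table[is_haploid][n_total][n_ref].
# Placeholder tuples stand in for the precomputed (GQ, likelihoods) values.
max_cache_coverage = 2


def fake_confidence(n_ref, n_total, is_haploid):
  return ('haploid' if is_haploid else 'diploid', n_ref, n_total)


table = [
    [
        [fake_confidence(n_ref, n_total, is_haploid)
         for n_ref in range(n_total + 1)]
        for n_total in range(max_cache_coverage + 1)
    ]
    for is_haploid in [False, True]
]

# Python bools are ints, so is_haploid can index the outer list directly.
print(table[False][2][1])  # ('diploid', 1, 2)
print(table[True][2][1])   # ('haploid', 1, 2)
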
@@ -190,21 +202,22 @@ def reference_confidence(self, n_ref, n_total):
         reference allele.
       n_total: int >= 0 and >= n_ref: The number of reads supporting any allele
         at this site.
+      is_haploid: bool. If True, the position is treated as haploid.

     Returns:
       A tuple of two values. The first is an integer value for the GQ (genotype
       quality) and the second is an array-like of the log10 probabilities for
       each of the three genotype configurations.
     """
     if self.table is None:
-      return self._calc_reference_confidence(n_ref, n_total)
+      return self._calc_reference_confidence(n_ref, n_total, is_haploid)
     else:
       ref_index, total_index = _rescale_read_counts_if_necessary(
           n_ref, n_total, self.max_cache_coverage
       )
-      return self.table[total_index][ref_index]
+      return self.table[is_haploid][total_index][ref_index]

-  def _calc_reference_confidence(self, n_ref, n_total):
+  def _calc_reference_confidence(self, n_ref, n_total, is_haploid=False):
     """Performs the calculation described in reference_confidence()."""
     if n_ref < 0:
       raise ValueError('n_ref={} must be >= 0'.format(n_ref))
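
The cached path only precomputes confidences for total coverages up to max_cache_coverage, so _rescale_read_counts_if_necessary maps larger counts back into range before indexing the table. Its body is not part of this diff; the sketch below shows one plausible proportional rescaling and should be read as an assumption, not the project's actual helper:

# Sketch only: the helper's body is not shown in this diff, so this is an
# assumed proportional rescaling, not the actual implementation.
def rescale_read_counts_sketch(n_ref_reads, n_total_reads, max_allowed_reads):
  """Scales counts so that n_total_reads never exceeds max_allowed_reads."""
  if n_total_reads > max_allowed_reads:
    ratio = n_ref_reads / float(n_total_reads)
    n_ref_reads = int(round(ratio * max_allowed_reads))
    n_total_reads = max_allowed_reads
  return n_ref_reads, n_total_reads


print(rescale_read_counts_sketch(150, 200, 100))  # (75, 100)
print(rescale_read_counts_sketch(30, 40, 100))    # (30, 40): already in range
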
@@ -214,16 +227,23 @@ def _calc_reference_confidence(self, n_ref, n_total):
       raise ValueError(
           'ploidy={} but we only support ploidy=2'.format(self.options.ploidy)
       )
-
     if n_total == 0:
-      # No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
-      log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
+      if is_haploid:
+        # No coverage case - likelihoods are log10 of 1/2, 0, 1/2.
+        log10_probs = genomics_math.normalize_log10_probs(
+            [-1.0, -IMPOSSIBLE_PROBABILITY_LOG10, -1.0]
+        )
+      else:
+        # No coverage case - all likelihoods are log10 of 1/3, 1/3, 1/3.
+        log10_probs = genomics_math.normalize_log10_probs([-1.0, -1.0, -1.0])
     else:
       n_alts = n_total - n_ref
       logp = math.log(self.options.p_error) / LOG_10
       log1p = math.log1p(-self.options.p_error) / LOG_10
       log10_p_ref = n_ref * log1p + n_alts * logp
       log10_p_het = -n_total * math.log(self.options.ploidy) / LOG_10
+      if is_haploid:
+        log10_p_het = -IMPOSSIBLE_PROBABILITY_LOG10
       log10_p_hom_alt = n_ref * logp + n_alts * log1p
       log10_probs = genomics_math.normalize_log10_probs(
           [log10_p_ref, log10_p_het, log10_p_hom_alt]
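
The branch above is the read-count error model: each reference-supporting read contributes log10(1 - p_error) to the hom-ref likelihood and log10(p_error) to the hom-alt likelihood (and vice versa for alt reads), the het likelihood is (1/ploidy)^n_total, and the haploid flag forces the het state to the impossible value before normalization. A self-contained sketch of that arithmetic, with p_error=0.01 and ploidy=2 as assumed example values rather than values read from the options proto:

# Self-contained sketch of the likelihood arithmetic above. p_error=0.01 and
# ploidy=2 are assumed example values, not values read from the options proto.
import math

LOG_10 = math.log(10.0)
IMPOSSIBLE_PROBABILITY_LOG10 = 999.0


def ref_confidence_sketch(n_ref, n_total, p_error=0.01, ploidy=2,
                          is_haploid=False):
  n_alts = n_total - n_ref
  logp = math.log(p_error) / LOG_10      # log10(p_error)
  log1p = math.log1p(-p_error) / LOG_10  # log10(1 - p_error)
  # Each ref read supports hom-ref with prob 1 - p_error, hom-alt with p_error.
  log10_p_ref = n_ref * log1p + n_alts * logp
  # Under a het genotype every read is equally likely to come from either
  # allele, giving (1 / ploidy) ** n_total.
  log10_p_het = -n_total * math.log(ploidy) / LOG_10
  if is_haploid:
    log10_p_het = -IMPOSSIBLE_PROBABILITY_LOG10
  log10_p_hom_alt = n_ref * logp + n_alts * log1p
  return log10_p_ref, log10_p_het, log10_p_hom_alt


# Ten reads, all supporting the reference:
print(ref_confidence_sketch(10, 10))
# -> approx. (-0.044, -3.010, -20.0); hom-ref dominates before normalization.
print(ref_confidence_sketch(10, 10, is_haploid=True))
# -> approx. (-0.044, -999.0, -20.0); the het state is ruled out.
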
@@ -261,6 +281,12 @@ def make_gvcfs(self, allele_count_summaries, include_med_dp=False):
         coordinate-sorted order containing gVCF records.
     """

+    par_regions = None
+    if self.options.par_regions_bed:
+      par_regions = ranges.RangeSet.from_bed(
+          self.options.par_regions_bed, enable_logging=False
+      )
+
     def with_gq_and_likelihoods(summary_counts):
       """Returns summary_counts along with GQ and genotype likelihoods.
@@ -294,7 +320,18 @@ def with_gq_and_likelihoods(summary_counts):
       else:
         n_ref = summary_counts.ref_supporting_read_count
         n_total = summary_counts.total_read_count
-        raw_gq, likelihoods = self.reference_confidence(n_ref, n_total)
+        is_haploid = (
+            summary_counts.reference_name in self.options.haploid_contigs
+            and not (
+                par_regions
+                and par_regions.overlaps(
+                    summary_counts.reference_name, summary_counts.position
+                )
+            )
+        )
+        raw_gq, likelihoods = self.reference_confidence(
+            n_ref, n_total, is_haploid
+        )
       quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
       has_valid_gl = np.amax(likelihoods) == likelihoods[0]
       return _GVCF(
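
Taken together, a site is treated as haploid only when its contig appears in options.haploid_contigs and its position does not fall inside a pseudoautosomal (PAR) region loaded from par_regions_bed; inside a PAR, chrX/chrY positions keep the diploid model. A self-contained sketch of that decision using plain half-open intervals in place of ranges.RangeSet; the contig names and PAR coordinates below are illustrative assumptions, not values taken from this diff:

# Self-contained sketch of the is_haploid decision, using plain half-open
# (start, end) intervals in place of ranges.RangeSet. The contig names and
# PAR coordinates are illustrative assumptions, not values from this diff.
HAPLOID_CONTIGS = {'chrX', 'chrY'}
PAR_REGIONS = {
    'chrX': [(10000, 2781479), (155701382, 156030895)],
}


def is_haploid_site(reference_name, position):
  if reference_name not in HAPLOID_CONTIGS:
    return False
  in_par = any(
      start <= position < end
      for start, end in PAR_REGIONS.get(reference_name, [])
  )
  return not in_par


print(is_haploid_site('chrX', 1000000))    # False: inside PAR1, stays diploid
print(is_haploid_site('chrX', 50000000))   # True: outside the PARs
print(is_haploid_site('chr1', 123456))     # False: not a haploid contig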