Skip to content

Commit

Permalink
Merge branch 'gvcf' into 'dev'
Browse files Browse the repository at this point in the history
first go at ref-only alts

See merge request research/medaka!491
  • Loading branch information
cjw85 committed Apr 15, 2021
2 parents 8510396 + bf2dd2c commit 103ab47
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 68 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.3.1]
### Added
- Option to output VCF record for all reference positions from
`medaka variant`.

## [v1.3.0]
### Changed
- Haploid variant calling reverted to old-style methodology.
Expand Down
2 changes: 1 addition & 1 deletion medaka/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import subprocess

__version__ = '1.3.0'
__version__ = '1.3.1'


def check_minimap2_version():
Expand Down
172 changes: 106 additions & 66 deletions medaka/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,74 +831,114 @@ def _prob_to_snp(
genotype_data=genotype))
return results

def decode_variants(self, sample, ref_seq, ambig_ref=False):
def decode_variants(
self, sample, ref_seq, ambig_ref=False, return_all=False):
"""Convert network output in sample to a set of `medaka.vcf.Variant` s.
A consensus sequence is decoded and compared with a reference sequence.
Both substitution and indel variants that may be multi-base will be
reported in the output `medaka.vcf.Variant` s.
:param sample: `medaka.common.Sample`.
:param ref_seq: reference sequence, should be upper case.
:param ref_seq: reference sequence, should be upper case
:param ambig_ref: bool, if True, decode variants at ambiguous (N)
reference positions.
:param return_all: bool, emit VCF records with `'.'` ALT for reference
positions which contain no variant.
:returns: list of `medaka.vcf.Variant` objects.
"""
logger = medaka.common.get_named_logger("CallVars")
if sample.positions['minor'][0] != 0:
raise ValueError(
"The first position of a sample must not be an insertion.")

pos = sample.positions
probs = sample.label_probs

assert sample.positions['minor'][0] == 0
encoding = self._encoding

# array of symbols retaining gaps
predicted = np.array(list(
self.decode_consensus(sample, with_gaps=True)))

# get reference sequence with insertions marked as '*'

def get_symbol(p):
return ref_seq[p['major']] if p['minor'] == 0 else '*'

reference = np.fromiter((get_symbol(p) for p in pos),
dtype='|U1', count=len(pos))
reference = np.fromiter(
(ref_seq[p['major']] if p['minor'] == 0 else '*' for p in pos),
dtype='|U1', count=len(pos))

def make_ref_records(rstart, rend):
# for each minor == 0 position, create a <REF> variant
_pos = pos[rstart:rend]
is_ref = _pos['minor'] == 0
_pos = _pos[is_ref]['major']
_probs = probs[rstart:rend][is_ref]
_ref = reference[rstart:rend][is_ref]
_enc = [encoding[(s if s != 'N' else '*',)] for s in _ref]
_quals = self._phred(
1.0 - np.array(_probs[np.arange(_probs.shape[0]), _enc]))
_variants = list()
info = dict()
for p, base, q in zip(_pos, _ref, _quals):
logger.info("Reference allele {} at: {} with {}".format(
base, p, q))
genotype = {'GT': '0', 'GQ': self._pfmt(q, 0)}
_variants.append(
medaka.vcf.Variant(
sample.ref_name, p, base,
alt='.', filt='PASS', info=info, qual=self._pfmt(q),
genotype_data=genotype))
return _variants

# find variants by looking for runs of labels which differ.
# If both labels are gap, we don't want to consider this a
# match so as to avoid splitting up long insertions if
# there happens to be a gap label in ref and pred.
mismatch = predicted != reference
both_gap = np.logical_and(predicted == '*',
reference == '*')
both_gap = np.logical_and(
predicted == '*', reference == '*')
is_variant = np.logical_or(mismatch, both_gap)

# medaka.common.rle requires numeric input
runs = medaka.rle.rle(is_variant)
variant_runs = runs[np.where(runs['value'])]
if not return_all:
runs = runs[np.where(runs['value'])]
variants = []
encoding = self._encoding
for run in variant_runs:
start = run['start']
end = start + run['length']
for rlen, rstart, is_variant in runs:
rend = rstart + rlen
if return_all:
if not is_variant:
variants.extend(make_ref_records(rstart, rend))
continue
else:
# the last thing added to `variants` was a non-variant
# If the current variant starts on an insertion column
# we need to remove that last entry.
# TODO: it would be better to flag is_variant blocks
# such that all columns with common major coord
# are grouped together
if pos[rstart]['minor'] != 0:
try:
variants.pop()
except IndexError:
pass

# if call or ref starts with gap, pad left
# (or right if we are at start of genome).
pad_right = False
# if chunks start with a deletion, need to pad (see comment below).
pad_right = False
pad_del_at_start_of_chunk = False
while ((not pad_right and
(reference[start] == '*' or
predicted[start] == '*'))
or
(pad_right and
(reference[end] == '*' or
predicted[end] == '*'))):

if tuple(pos[start]) == (0, 0) or pad_right:
end += 1
while (
(not pad_right and
(reference[rstart] == '*' or predicted[rstart] == '*'))
or (pad_right and
(reference[rend] == '*' or predicted[rend] == '*'))):

if tuple(pos[rstart]) == (0, 0) or pad_right:
rend += 1
# avoid getting stuck in loop if there is a run
# of dels at start of ref
pad_right = True
elif (start == 0 and pos[start]['minor'] == 0
and predicted[start] == '*'):
elif (rstart == 0 and pos[rstart]['minor'] == 0
and predicted[rstart] == '*'):
# If variant calling is being performed on a region (rather
# than an entire chr), it is possible that a chunk will
# start with a deletion. In which case, the VCF spec says
Expand All @@ -908,39 +948,43 @@ def get_symbol(p):
pad_del_at_start_of_chunk = True
break
else:
assert start != 0
start -= 1
assert rstart != 0
rstart -= 1

var_ref_with_gaps = ''.join(s for s in reference[start:end])
# get the ref/predicted sequence with/without gaps
var_ref_with_gaps = ''.join(s for s in reference[rstart:rend])
var_pred_with_gaps = ''.join(s for s in predicted[rstart:rend])
var_ref = var_ref_with_gaps.replace('*', '')

var_pred_with_gaps = ''.join(s for s in predicted[start:end])
var_pred = var_pred_with_gaps.replace('*', '')

# its possible for things to reduce to not being a variant.
# also maybe skip things if reference contains ambiguous symbols
if var_ref == var_pred:
# not a variant
if return_all:
variants.extend(make_ref_records(rstart, rend))
continue
elif (not ambig_ref and not
set(var_ref).issubset(set(self.symbols))):
# don't call where reference is ambiguous
elif (not ambig_ref and
not set(var_ref).issubset(set(self.symbols))):
continue

var_probs = probs[start:end]

# As N is not in encoding, encode N as *
# This only affects calculation of quals.
var_ref_encoded = (encoding[(s if s != 'N' else '*',)]
for s in var_ref_with_gaps)
var_pred_encoded = (encoding[(s,)] for s in var_pred_with_gaps)

ref_probs = np.array([var_probs[i, j]
for i, j in enumerate(var_ref_encoded)])
pred_probs = np.array([var_probs[i, j]
for i, j in enumerate(var_pred_encoded)])

ref_quals = [self._phred(1 - p) for p in ref_probs]
pred_quals = [self._phred(1 - p) for p in pred_probs]

var_ref_encoded = (
encoding[(s if s != 'N' else '*',)]
for s in var_ref_with_gaps)
var_pred_encoded = (
encoding[(s,)] for s in var_pred_with_gaps)

# calculate probabilities
var_probs = probs[rstart:rend]
ref_probs = np.array(
[var_probs[i, j] for i, j in enumerate(var_ref_encoded)])
pred_probs = np.array(
[var_probs[i, j] for i, j in enumerate(var_pred_encoded)])
ref_quals = self._phred(1.0 - ref_probs)
pred_quals = self._phred(1.0 - pred_probs)

info = dict()
if self.verbose:
info = {
'ref_seq': var_ref_with_gaps,
Expand All @@ -949,26 +993,22 @@ def get_symbol(p):
'pred_qs': ','.join((self._pfmt(q) for q in pred_quals)),
'ref_q': self._pfmt(sum(ref_quals)),
'pred_q': self._pfmt(sum(pred_quals)),
'n_cols': len(pred_quals)
}
else:
info = {}
'n_cols': len(pred_quals)}

# log likelihood ratio
qual = sum(pred_quals) - sum(ref_quals)
genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)}

var_pos = pos['major'][start]

# position in reference sequence
var_pos = pos['major'][rstart]
if pad_del_at_start_of_chunk:
var_pos -= 1
var_ref = ref_seq[var_pos] + var_ref
var_pred = ref_seq[var_pos] + var_pred

variant = medaka.vcf.Variant(sample.ref_name, var_pos, var_ref,
alt=var_pred, filt='PASS', info=info,
qual=self._pfmt(qual),
genotype_data=genotype)
# create the variant record
variant = medaka.vcf.Variant(
sample.ref_name, var_pos, var_ref,
alt=var_pred, filt='PASS', info=info, qual=self._pfmt(qual),
genotype_data=genotype)
variant = variant.trim()
variants.append(variant)

Expand Down
2 changes: 2 additions & 0 deletions medaka/medaka.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,8 @@ def main():
help='Populate VCF info fields.')
var_parser.add_argument('--ambig_ref', action='store_true',
help='Decode variants at ambiguous reference positions.')
var_parser.add_argument('--gvcf', action='store_true',
help='Output VCF records for reference loci predicted to be non-variant.')

# TODO do we still need this?
snp_parser = subparsers.add_parser('snp',
Expand Down
3 changes: 2 additions & 1 deletion medaka/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ def variants_from_hdf(args):

for sample in joined_samples:
variants = label_scheme.decode_variants(
sample, ref_seq, ambig_ref=args.ambig_ref)
sample, ref_seq, ambig_ref=args.ambig_ref,
return_all=args.gvcf)
vcf_writer.write_variants(variants, sort=True)


Expand Down

0 comments on commit 103ab47

Please sign in to comment.