From bf2dd2c155535099dcbcbc85b45810404049ff1d Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 15 Apr 2021 11:25:57 +0100 Subject: [PATCH] first go at ref-only alts --- CHANGELOG.md | 5 ++ medaka/__init__.py | 2 +- medaka/labels.py | 172 ++++++++++++++++++++++++++++----------------- medaka/medaka.py | 2 + medaka/variant.py | 3 +- 5 files changed, 116 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fc963d6..bfea37dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.3.1] +### Added + - Option to output VCF record for all reference positions from + `medaka variant`. + ## [v1.3.0] ### Changed - Haploid variant calling reverted to old-style methodology. diff --git a/medaka/__init__.py b/medaka/__init__.py index 4afbc23e..86f90bc0 100644 --- a/medaka/__init__.py +++ b/medaka/__init__.py @@ -4,7 +4,7 @@ import os import subprocess -__version__ = '1.3.0' +__version__ = '1.3.1' def check_minimap2_version(): diff --git a/medaka/labels.py b/medaka/labels.py index b6762b7d..dc422bbe 100644 --- a/medaka/labels.py +++ b/medaka/labels.py @@ -831,7 +831,8 @@ def _prob_to_snp( genotype_data=genotype)) return results - def decode_variants(self, sample, ref_seq, ambig_ref=False): + def decode_variants( + self, sample, ref_seq, ambig_ref=False, return_all=False): """Convert network output in sample to a set of `medaka.vcf.Variant` s. A consensus sequence is decoded and compared with a reference sequence. @@ -839,66 +840,105 @@ def decode_variants(self, sample, ref_seq, ambig_ref=False): reported in the output `medaka.vcf.Variant` s. :param sample: `medaka.common.Sample`. - :param ref_seq: reference sequence, should be upper case. + :param ref_seq: reference sequence, should be upper case :param ambig_ref: bool, if True, decode variants at ambiguous (N) reference positions. + :param return_all: bool, emit VCF records with `'.'` ALT for reference + positions which contain no variant. :returns: list of `medaka.vcf.Variant` objects. """ + logger = medaka.common.get_named_logger("CallVars") + if sample.positions['minor'][0] != 0: + raise ValueError( + "The first position of a sample must not be an insertion.") + pos = sample.positions probs = sample.label_probs - - assert sample.positions['minor'][0] == 0 + encoding = self._encoding # array of symbols retaining gaps predicted = np.array(list( self.decode_consensus(sample, with_gaps=True))) # get reference sequence with insertions marked as '*' - - def get_symbol(p): - return ref_seq[p['major']] if p['minor'] == 0 else '*' - - reference = np.fromiter((get_symbol(p) for p in pos), - dtype='|U1', count=len(pos)) + reference = np.fromiter( + (ref_seq[p['major']] if p['minor'] == 0 else '*' for p in pos), + dtype='|U1', count=len(pos)) + + def make_ref_records(rstart, rend): + # for each minor == 0 position, create a variant + _pos = pos[rstart:rend] + is_ref = _pos['minor'] == 0 + _pos = _pos[is_ref]['major'] + _probs = probs[rstart:rend][is_ref] + _ref = reference[rstart:rend][is_ref] + _enc = [encoding[(s if s != 'N' else '*',)] for s in _ref] + _quals = self._phred( + 1.0 - np.array(_probs[np.arange(_probs.shape[0]), _enc])) + _variants = list() + info = dict() + for p, base, q in zip(_pos, _ref, _quals): + logger.info("Reference allele {} at: {} with {}".format( + base, p, q)) + genotype = {'GT': '0', 'GQ': self._pfmt(q, 0)} + _variants.append( + medaka.vcf.Variant( + sample.ref_name, p, base, + alt='.', filt='PASS', info=info, qual=self._pfmt(q), + genotype_data=genotype)) + return _variants # find variants by looking for runs of labels which differ. # If both labels are gap, we don't want to consider this a # match so as to avoid splitting up long insertions if # there happens to be a gap label in ref and pred. mismatch = predicted != reference - both_gap = np.logical_and(predicted == '*', - reference == '*') + both_gap = np.logical_and( + predicted == '*', reference == '*') is_variant = np.logical_or(mismatch, both_gap) - # medaka.common.rle requires numeric input runs = medaka.rle.rle(is_variant) - variant_runs = runs[np.where(runs['value'])] + if not return_all: + runs = runs[np.where(runs['value'])] variants = [] - encoding = self._encoding - for run in variant_runs: - start = run['start'] - end = start + run['length'] + for rlen, rstart, is_variant in runs: + rend = rstart + rlen + if return_all: + if not is_variant: + variants.extend(make_ref_records(rstart, rend)) + continue + else: + # the last thing added to `variants` was a non-variant + # If the current variant starts on an insertion column + # we need to remove that last entry. + # TODO: it would be better to flag is_variant blocks + # such that all columns with common major coord + # are grouped together + if pos[rstart]['minor'] != 0: + try: + variants.pop() + except IndexError: + pass + # if call or ref starts with gap, pad left # (or right if we are at start of genome). - pad_right = False # if chunks start with a deletion, need to pad (see comment below). + pad_right = False pad_del_at_start_of_chunk = False - while ((not pad_right and - (reference[start] == '*' or - predicted[start] == '*')) - or - (pad_right and - (reference[end] == '*' or - predicted[end] == '*'))): - - if tuple(pos[start]) == (0, 0) or pad_right: - end += 1 + while ( + (not pad_right and + (reference[rstart] == '*' or predicted[rstart] == '*')) + or (pad_right and + (reference[rend] == '*' or predicted[rend] == '*'))): + + if tuple(pos[rstart]) == (0, 0) or pad_right: + rend += 1 # avoid getting stuck in loop if there is a run # of dels at start of ref pad_right = True - elif (start == 0 and pos[start]['minor'] == 0 - and predicted[start] == '*'): + elif (rstart == 0 and pos[rstart]['minor'] == 0 + and predicted[rstart] == '*'): # If variant calling is being performed on a region (rather # than an entire chr), it is possible that a chunk will # start with a deletion. In which case, the VCF spec says @@ -908,39 +948,43 @@ def get_symbol(p): pad_del_at_start_of_chunk = True break else: - assert start != 0 - start -= 1 + assert rstart != 0 + rstart -= 1 - var_ref_with_gaps = ''.join(s for s in reference[start:end]) + # get the ref/predicted sequence with/without gaps + var_ref_with_gaps = ''.join(s for s in reference[rstart:rend]) + var_pred_with_gaps = ''.join(s for s in predicted[rstart:rend]) var_ref = var_ref_with_gaps.replace('*', '') - - var_pred_with_gaps = ''.join(s for s in predicted[start:end]) var_pred = var_pred_with_gaps.replace('*', '') + # its possible for things to reduce to not being a variant. + # also maybe skip things if reference contains ambiguous symbols if var_ref == var_pred: - # not a variant + if return_all: + variants.extend(make_ref_records(rstart, rend)) continue - elif (not ambig_ref and not - set(var_ref).issubset(set(self.symbols))): - # don't call where reference is ambiguous + elif (not ambig_ref and + not set(var_ref).issubset(set(self.symbols))): continue - var_probs = probs[start:end] - # As N is not in encoding, encode N as * # This only affects calculation of quals. - var_ref_encoded = (encoding[(s if s != 'N' else '*',)] - for s in var_ref_with_gaps) - var_pred_encoded = (encoding[(s,)] for s in var_pred_with_gaps) - - ref_probs = np.array([var_probs[i, j] - for i, j in enumerate(var_ref_encoded)]) - pred_probs = np.array([var_probs[i, j] - for i, j in enumerate(var_pred_encoded)]) - - ref_quals = [self._phred(1 - p) for p in ref_probs] - pred_quals = [self._phred(1 - p) for p in pred_probs] - + var_ref_encoded = ( + encoding[(s if s != 'N' else '*',)] + for s in var_ref_with_gaps) + var_pred_encoded = ( + encoding[(s,)] for s in var_pred_with_gaps) + + # calculate probabilities + var_probs = probs[rstart:rend] + ref_probs = np.array( + [var_probs[i, j] for i, j in enumerate(var_ref_encoded)]) + pred_probs = np.array( + [var_probs[i, j] for i, j in enumerate(var_pred_encoded)]) + ref_quals = self._phred(1.0 - ref_probs) + pred_quals = self._phred(1.0 - pred_probs) + + info = dict() if self.verbose: info = { 'ref_seq': var_ref_with_gaps, @@ -949,26 +993,22 @@ def get_symbol(p): 'pred_qs': ','.join((self._pfmt(q) for q in pred_quals)), 'ref_q': self._pfmt(sum(ref_quals)), 'pred_q': self._pfmt(sum(pred_quals)), - 'n_cols': len(pred_quals) - } - else: - info = {} + 'n_cols': len(pred_quals)} # log likelihood ratio qual = sum(pred_quals) - sum(ref_quals) genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)} - - var_pos = pos['major'][start] - + # position in reference sequence + var_pos = pos['major'][rstart] if pad_del_at_start_of_chunk: var_pos -= 1 var_ref = ref_seq[var_pos] + var_ref var_pred = ref_seq[var_pos] + var_pred - - variant = medaka.vcf.Variant(sample.ref_name, var_pos, var_ref, - alt=var_pred, filt='PASS', info=info, - qual=self._pfmt(qual), - genotype_data=genotype) + # create the variant record + variant = medaka.vcf.Variant( + sample.ref_name, var_pos, var_ref, + alt=var_pred, filt='PASS', info=info, qual=self._pfmt(qual), + genotype_data=genotype) variant = variant.trim() variants.append(variant) diff --git a/medaka/medaka.py b/medaka/medaka.py index 327f4fc3..4cf03989 100644 --- a/medaka/medaka.py +++ b/medaka/medaka.py @@ -497,6 +497,8 @@ def main(): help='Populate VCF info fields.') var_parser.add_argument('--ambig_ref', action='store_true', help='Decode variants at ambiguous reference positions.') + var_parser.add_argument('--gvcf', action='store_true', + help='Output VCF records for reference loci predicted to be non-variant.') # TODO do we still need this? snp_parser = subparsers.add_parser('snp', diff --git a/medaka/variant.py b/medaka/variant.py index 3cda9d6c..6a54de98 100644 --- a/medaka/variant.py +++ b/medaka/variant.py @@ -229,7 +229,8 @@ def variants_from_hdf(args): for sample in joined_samples: variants = label_scheme.decode_variants( - sample, ref_seq, ambig_ref=args.ambig_ref) + sample, ref_seq, ambig_ref=args.ambig_ref, + return_all=args.gvcf) vcf_writer.write_variants(variants, sort=True)