Skip to content

Commit

Permalink
[translate] require nodes to have sequences
Browse files Browse the repository at this point in the history
This is the implicit expectation, and is true for all of our pipelines.
In theory it's possible to translate without having the full sequence
attached to the node by reconstructing mutations on the root, but that
is not the approach taken by the current code.

As we explicitly check the tree nodes against the sequences in the
node-data JSON, we can skip the automatic tests optionally performed when
reading the node-data JSON.

Closes #1345
  • Loading branch information
jameshadfield committed Dec 4, 2023
1 parent 0174b67 commit cf242f3
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .utils import read_node_data, load_features, write_json, get_json_name
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent

class MissingNodeError(Exception):
pass
Expand Down Expand Up @@ -341,14 +342,22 @@ def sequences_json(node_data_json, tree):
Extract the full nuc sequence for each node in the provided node-data JSON.
Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
"""
node_data = read_node_data(node_data_json, tree)
node_data = read_node_data(node_data_json)
if node_data is None:
raise AugurError("could not read node data (incl sequences)")
# extract sequences from node meta data
sequences = {}
for k,v in node_data['nodes'].items():
if 'sequence' in v:
sequences[k] = v['sequence']
tree_nodes = {c.name for c in tree.find_clades()}
tree_nodes_missing_sequences = tree_nodes - set(sequences.keys())
if len(tree_nodes_missing_sequences):
raise AugurError(dedent(f"""\
{len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON.
These must be present under 'nodes' → <node_name> → 'sequence'.
This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
"""))
return sequences

def register_parser(parent_subparsers):
Expand Down Expand Up @@ -414,7 +423,7 @@ def run(args):
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
sequences = sequences_json(args.ancestral_sequences, args.tree)
sequences = sequences_json(args.ancestral_sequences, tree)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}

## glob the annotations for later auspice export
Expand Down

0 comments on commit cf242f3

Please sign in to comment.