From 0174b679294baf35fc2a760697bd9e4766ca26fa Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Fri, 1 Dec 2023 12:14:29 +1300
Subject: [PATCH] [translate] refactor & improve readability

There should be no functional changes. Co-locating the sequence reading
& feature extraction is easier to read, and were Python's scoping to be
different it'd be even nicer as we wouldn't leave around variables which
are never re-used.

As part of this `translate_vcf_feature` has changed from using an
undocumented `return None` to raising a (documented) error which (IMO)
is easier to reason with.
---
 augur/translate.py                            | 54 +++++++++----------
 tests/functional/translate/cram/genes.t       |  4 +-
 .../translate/cram/translate-with-genbank.t   |  2 +-
 .../cram/translate-with-gff-and-gene-name.t   |  2 +-
 .../cram/translate-with-gff-and-gene.t        |  2 +-
 5 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/augur/translate.py b/augur/translate.py
index 1fcfcc487..9358f0309 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -27,6 +27,9 @@ class MissingNodeError(Exception):
 class MismatchNodeError(Exception):
     pass
 
+class NoVariationError(Exception):
+    pass
+
 def safe_translate(sequence, report_exceptions=False):
     """Returns an amino acid translation of the given nucleotide sequence accounting
     for gaps in the given sequence.
@@ -145,6 +148,9 @@ def translate_vcf_feature(sequences, ref, feature):
         translated reference gene, positions of AA differences, and AA
         differences indexed by node name
 
+    Raises
+    ------
+    NoVariationError : if no variable sites within this feature (across all sequences)
     '''
     def str_reverse_comp(str_seq):
         #gets reverse-compliment of a string and returns it as a string
@@ -205,11 +211,10 @@ def str_reverse_comp(str_seq):
 
     prot['positions'].sort()
 
-    #if no variable sites, exclude this gene
+    # raise an error if no variable sites observed
     if len(prot['positions']) == 0:
-        return None
-    else:
-        return prot
+        raise NoVariationError()
+    return prot
 
 def construct_mut(start, pos, end):
     return str(start) + str(pos) + str(end)
@@ -380,6 +385,8 @@ def check_arg_combinations(args, is_vcf):
 def run(args):
     ## read tree and data, if reading data fails, return with error code
     tree = Phylo.read(args.tree, 'newick')
+    is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
+    check_arg_combinations(args, is_vcf)
 
     # If genes is a file, read in the genes to translate
     if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
@@ -387,16 +394,6 @@ def run(args):
     else:
         genes = args.genes
 
-    ## check file format and read in sequences
-    is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
-    check_arg_combinations(args, is_vcf)
-
-    if is_vcf:
-        (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
-    else:
-        sequences = sequences_json(args.ancestral_sequences, args.tree)
-
-
     ## load features; only requested features if genes given
     features = load_features(args.reference_sequence, genes)
     if features is None:
@@ -404,22 +401,21 @@ def run(args):
         return 1
     print("Read in {} features from reference sequence file".format(len(features)))
 
-    ### translate every feature - but not 'nuc'!
+    ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
     translations = {}
-    deleted = []
-    for fname, feat in features.items():
-        if is_vcf:
-            trans = translate_vcf_feature(sequences, ref, feat)
-            if trans:
-                translations[fname] = trans
-            else:
-                deleted.append(fname)
-        else:
-            if feat.type != 'source':
-                translations[fname] = translate_feature(sequences, feat)
-
-    if len(deleted) != 0:
-        print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))
+    if is_vcf:
+        (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
+        features_without_variation = []
+        for fname, feat in features.items():
+            try:
+                translations[fname] = translate_vcf_feature(sequences, ref, feat)
+            except NoVariationError:
+                features_without_variation.append(fname)
+        if len(features_without_variation):
+            print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
+    else:
+        sequences = sequences_json(args.ancestral_sequences, args.tree)
+        translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}
 
     ## glob the annotations for later auspice export
     #
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
index 2b629f9e5..c6289bd7a 100644
--- a/tests/functional/translate/cram/genes.t
+++ b/tests/functional/translate/cram/genes.t
@@ -14,9 +14,9 @@ Similar tests to those in `general.t` but here testing the --genes argument
   >  --reference-sequence $DATA/reference.gff \
   >  --genes gene2 gene3 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-args.json"
-  Validating schema of .+ (re)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 1 features from reference sequence file
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
@@ -36,9 +36,9 @@ Using a text file rather than command line arguments
   >  --genes "$CRAMTMP/$TESTFILE/genes.txt" \
   >  --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-txt.json"
   Read in 2 specified genes to translate.
-  Validating schema of .+ (re)
   Couldn't find gene gene3 in GFF or GenBank file
   Read in 1 features from reference sequence file
+  Validating schema of .+ (re)
   amino acid mutations written to .+ (re)
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
index d4dec07e7..5be3992c2 100644
--- a/tests/functional/translate/cram/translate-with-genbank.t
+++ b/tests/functional/translate/cram/translate-with-genbank.t
@@ -11,8 +11,8 @@ Translate amino acids for genes using a GenBank file.
   >   --reference-sequence translate/data/zika/zika_outgroup.gb \
   >   --genes CA PRO \
   >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
   Read in 3 features from reference sequence file
+  Validating schema of 'translate/data/zika/nt_muts.json'...
   amino acid mutations written to .* (re)
   $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json
   {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
index 8796d35c4..f3d19b50a 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --ancestral-sequences translate/data/zika/nt_muts.json \
   >   --reference-sequence "$TMP/genemap.gff" \
   >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
   Read in 2 features from reference sequence file
+  Validating schema of 'translate/data/zika/nt_muts.json'...
   amino acid mutations written to .* (re)
 
 Other than the sequence ids which will include a temporary path, the JSONs
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
index 703240e8c..84c4fa995 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-gene.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --ancestral-sequences translate/data/zika/nt_muts.json \
   >   --reference-sequence "$TMP/genemap.gff" \
   >   --output-node-data $TMP/aa_muts.json
-  Validating schema of 'translate/data/zika/nt_muts.json'...
   Read in 2 features from reference sequence file
+  Validating schema of 'translate/data/zika/nt_muts.json'...
   amino acid mutations written to .* (re)
   $ python3 "../../scripts/diff_jsons.py" \
   >  --exclude-regex-paths "['seqid']" -- \