Merge pull request #1351 from nextstrain/james/translate/nuc-annotation

Improve parsing of GenBank / GFF files
nextstrain · Dec 20, 2023 · e346693 · e346693
2 parents fc900aa + c91ca33
commit e346693
Show file tree

Hide file tree

Showing 16 changed files with 440 additions and 81 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,8 +2,18 @@
 
 ## __NEXT__
 
+### Major Changes
+
+* ancestral, translate: GenBank files now require the (GFF mandatory) source feature to be present. [#1351][] (@jameshadfield)
+* ancestral, translate: For GFF files, we extract the genome/sequence coordinates by inspecting the sequence-region pragma, region type and/or source type. This information is now required. [#1351][] (@jameshadfield)
+
 ### Features
 
+* ancestral, translate: A range of improvements to how we parse GFF and GenBank reference files. [#1351][] (@jameshadfield)
+    * translate will now always export a 'nuc' annotation in the output JSON, allowing it to pass validation.
+    * Gene/CDS names of 'nuc' are now forbidden.
+    * If a Gene/CDS in the GFF/GenBank file is unparsed we now print a warning.
+* utils::load_features: This function may now raise `AugurError`. [#1351][] (@jameshadfield)
 * ancestral: For VCF alignments, a VCF output file is now only created when requested via `--output-vcf`. [#1344][] (@jameshadfield)
 * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
@@ -16,9 +26,13 @@
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
     * separate VCF-only arguments into their own group
+* translate: Fixes a bug in the parsing behaviour of GFF files whereby the presence of the `--genes` command line argument would change how we read individual GFF lines. Issue [#1349][], PR [#1351][] (@jameshadfield)
+
 
 [#1344]: https://github.com/nextstrain/augur/pull/1344
 [#1348]: https://github.com/nextstrain/augur/pull/1348
+[#1351]: https://github.com/nextstrain/augur/pull/1351
+[#1349]: https://github.com/nextstrain/augur/issues/1349
 
 ## 23.1.1 (7 November 2023)
 

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -323,8 +323,12 @@ def run(args):
         from .utils import load_features
         ## load features; only requested features if genes given
         features = load_features(args.annotation, args.genes)
-        if features is None:
-            raise AugurError("could not read features of reference sequence file")
+        # Ensure the already-created nuc annotation coordinates match those parsed from the reference file
+        if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
+            features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
+            raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
+                f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")
+
         print("Read in {} features from reference sequence file".format(len(features)))
         for gene in args.genes:
             print(f"Processing gene: {gene}")

diff --git a/augur/translate.py b/augur/translate.py
@@ -403,19 +403,16 @@ def run(args):
 
     ## load features; only requested features if genes given
     features = load_features(args.reference_sequence, genes)
-    if features is None:
-        print("ERROR: could not read features of reference sequence file")
-        return 1
     print("Read in {} features from reference sequence file".format(len(features)))
 
-    ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
-    ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank.
+    ## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name
+    ## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files)
     translations = {}
     if is_vcf:
         (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
         features_without_variation = []
         for fname, feat in features.items():
-            if feat.type=='source':
+            if fname=='nuc':
                 continue
             try:
                 translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
@@ -425,26 +422,26 @@ def run(args):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
         sequences = sequences_json(args.ancestral_sequences, tree)
-        translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}
+        translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
 
     ## glob the annotations for later auspice export
     #
     # Note that BioPython FeatureLocations use
     # "Pythonic" coordinates: [zero-origin, half-open)
     # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
-    annotations = {}
+    annotations = {
+        'nuc': {'start': features['nuc'].location.start+1,
+                'end':   features['nuc'].location.end,
+                'strand': '+',
+                'type':  features['nuc'].type,     # (unused by auspice)
+                'seqid': args.reference_sequence}  # (unused by auspice)
+    }
     for fname, feat in features.items():
         annotations[fname] = {'seqid':args.reference_sequence,
                               'type':feat.type,
                               'start':int(feat.location.start)+1,
                               'end':int(feat.location.end),
                               'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
-    if is_vcf: #need to add our own nuc
-        annotations['nuc'] = {'seqid':args.reference_sequence,
-                              'type':feat.type,
-                              'start': 1,
-                              'end': len(ref),
-                              'strand': '+'}
 
     ## determine amino acid mutations for each node
     try: