diff --git a/CHANGES.md b/CHANGES.md index 02ad665f3..c3d2192dc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,8 +2,18 @@ ## __NEXT__ +### Major Changes + +* ancestral, translate: GenBank files now require the (GFF mandatory) source feature to be present.[#1351][] (@jameshadfield) +* ancestral, translate: For GFF files, we extract the genome/sequence coordinates by inspecting the sequence-region pragma, region type and/or source type. This information is now required. [#1351][] (@jameshadfield) + ### Features +* ancestral, translate: A range of improvements to how we parse GFF and GenBank reference files. [#1351][] (@jameshadfield) + * translate will now always export a 'nuc' annotation in the output JSON, allowing it to pass validation + * Gene/CDS names of 'nuc' are now forbidden. + * If a Gene/CDS in the GFF/GenBank file is unparsed we now print a warning. +* utils::load_features: This function may now raise `AugurError`. [#1351][] (@jameshadfield) * ancestral: For VCF alignments, a VCF output file is now only created when requested via `--output-vcf`. [#1344][] (@jameshadfield) * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield) * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. @@ -16,9 +26,13 @@ * translate: Improvements to command line arguments. [#1348][] (@jameshadfield) * `--tree` and `--ancestral-sequences` are now required arguments. * separate VCF-only arguments into their own group +* translate: Fixes a bug in the parsing behaviour of GFF files whereby the presence of the `--genes` command line argument would change how we read individual GFF lines. Issue [#1349][], PR [#1351][] (@jameshadfield) + [#1344]: https://github.com/nextstrain/augur/pull/1344 [#1348]: https://github.com/nextstrain/augur/pull/1348 +[#1351]: https://github.com/nextstrain/augur/pull/1351 +[#1349]: https://github.com/nextstrain/augur/issues/1349 ## 23.1.1 (7 November 2023) diff --git a/augur/ancestral.py b/augur/ancestral.py index a15d1ef93..621faeb6a 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -323,8 +323,12 @@ def run(args): from .utils import load_features ## load features; only requested features if genes given features = load_features(args.annotation, args.genes) - if features is None: - raise AugurError("could not read features of reference sequence file") + # Ensure the already-created nuc annotation coordinates match those parsed from the reference file + if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or + features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']): + raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})" + f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).") + print("Read in {} features from reference sequence file".format(len(features))) for gene in args.genes: print(f"Processing gene: {gene}") diff --git a/augur/translate.py b/augur/translate.py index 8c7253c39..bc4a1bc13 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -403,19 +403,16 @@ def run(args): ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) - if features is None: - print("ERROR: could not read features of reference sequence file") - return 1 print("Read in {} features from reference sequence file".format(len(features))) - ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature - ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank. + ## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name + ## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files) translations = {} if is_vcf: (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) features_without_variation = [] for fname, feat in features.items(): - if feat.type=='source': + if fname=='nuc': continue try: translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) @@ -425,26 +422,26 @@ def run(args): print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) else: sequences = sequences_json(args.ancestral_sequences, tree) - translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} + translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'} ## glob the annotations for later auspice export # # Note that BioPython FeatureLocations use # "Pythonic" coordinates: [zero-origin, half-open) # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive] - annotations = {} + annotations = { + 'nuc': {'start': features['nuc'].location.start+1, + 'end': features['nuc'].location.end, + 'strand': '+', + 'type': features['nuc'].type, # (unused by auspice) + 'seqid': args.reference_sequence} # (unused by auspice) + } for fname, feat in features.items(): annotations[fname] = {'seqid':args.reference_sequence, 'type':feat.type, 'start':int(feat.location.start)+1, 'end':int(feat.location.end), 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} - if is_vcf: #need to add our own nuc - annotations['nuc'] = {'seqid':args.reference_sequence, - 'type':feat.type, - 'start': 1, - 'end': len(ref), - 'strand': '+'} ## determine amino acid mutations for each node try: diff --git a/augur/utils.py b/augur/utils.py index d687e62a5..2a94cc0d7 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -11,6 +11,7 @@ from augur.io.file import open_file from augur.types import ValidationMode +from augur.errors import AugurError from augur.util_support.color_parser import ColorParser from augur.util_support.node_data_reader import NodeDataReader @@ -143,66 +144,267 @@ def default(self, obj): def load_features(reference, feature_names=None): - #read in appropriately whether GFF or Genbank + """ + Parse a GFF/GenBank reference file. See the docstrings for _read_gff and + _read_genbank for details. + + Parameters + ---------- + reference : str + File path to GFF or GenBank (.gb) reference + feature_names : None or set or list (optional) + Restrict the genes we read to those in the set/list + + Returns + ------- + features : dict + keys: feature names, values: Note + that feature names may not equivalent to GenBank feature keys + + Raises + ------ + AugurError + If the reference file doesn't exist, or is malformed / empty + """ #checks explicitly for GFF otherwise assumes Genbank if not os.path.isfile(reference): - print("ERROR: reference sequence not found. looking for", reference) - return None + raise AugurError(f"reference sequence file {reference!r} not found") - features = {} if '.gff' in reference.lower(): - #looks for 'gene' and 'gene' as best for TB - from BCBio import GFF - limit_info = dict( gff_type = ['gene', 'source'] ) - - with open(reference, encoding='utf-8') as in_handle: - for rec in GFF.parse(in_handle, limit_info=limit_info): - for feat in rec.features: - # Check for gene names stored in qualifiers commonly used by - # virus-specific gene maps first (e.g., 'gene', - # 'gene_name'). Then, check for qualifiers used by non-viral - # pathogens (e.g., 'locus_tag'). - if feature_names is not None: - if "gene" in feat.qualifiers and feat.qualifiers["gene"][0] in feature_names: - fname = feat.qualifiers["gene"][0] - elif "gene_name" in feat.qualifiers and feat.qualifiers["gene_name"][0] in feature_names: - fname = feat.qualifiers["gene_name"][0] - elif "locus_tag" in feat.qualifiers and feat.qualifiers["locus_tag"][0] in feature_names: - fname = feat.qualifiers["locus_tag"][0] - else: - fname = None - else: - if "gene" in feat.qualifiers: - fname = feat.qualifiers["gene"][0] - elif "gene_name" in feat.qualifiers: - fname = feat.qualifiers["gene_name"][0] - else: - fname = feat.qualifiers["locus_tag"][0] - if feat.type == "source": - fname = "nuc" - - if fname: - features[fname] = feat - - if feature_names is not None: - for fe in feature_names: - if fe not in features: - print("Couldn't find gene {} in GFF or GenBank file".format(fe)) + return _read_gff(reference, feature_names) + else: + return _read_genbank(reference, feature_names) + +def _read_nuc_annotation_from_gff(record, reference): + """ + Looks for the ##sequence-region pragma as well as 'region' & 'source' GFF + types. Note that 'source' isn't really a GFF feature type, but is used + widely in the Nextstrain ecosystem. If there are multiple we check that the + coordinates agree. + + Parameters + ---------- + record : + reference: string + File path to GFF reference + + Returns + ------- + + Raises + ------ + AugurError + If no information on the genome / seqid length is available or if the + information is contradictory + """ + nuc = {} + # Attempt to parse the sequence-region pragma to learn the genome + # length (in the absence of record/source we'll use this for 'nuc') + sequence_regions = record.annotations.get('sequence-region', []) + if len(sequence_regions)>1: + raise AugurError(f"Reference {reference!r} contains multiple ##sequence-region pragma lines. Augur can only handle GFF files with a single one.") + elif sequence_regions: + from Bio.SeqFeature import SeqFeature, FeatureLocation + (name, start, stop) = sequence_regions[0] + nuc['pragma'] = SeqFeature( + FeatureLocation(start, stop), + strand=1, + type='##sequence-region pragma', + id=name, + ) + for feat in record.features: + if feat.type == "region": + nuc['region'] = feat + elif feat.type == "source": + nuc['source'] = feat + + # ensure they all agree on coordinates, if there are multiple + if len(nuc.values())>1: + coords = [(name, int(feat.location.start), int(feat.location.end)) for name,feat in nuc.items()] + if not all(el[1]==coords[0][1] and el[2]==coords[0][2] for el in coords): + raise AugurError(f"Reference {reference!r} contained contradictory coordinates for the seqid/genome. We parsed the following coordinates: " + + ', '.join([f"{el[0]}: [{el[1]+1}, {el[2]}]" for el in coords]) # +1 on the first coord to shift to one-based GFF representation + ) + + if 'pragma' in nuc: ## the pragma is GFF's preferred way to define nuc coords + return nuc['pragma'] + elif 'region' in nuc: + return nuc['region'] + elif 'source' in nuc: + return nuc['source'] else: - from Bio import SeqIO - for feat in SeqIO.read(reference, 'genbank').features: - if feat.type=='CDS': - if "locus_tag" in feat.qualifiers: - fname = feat.qualifiers["locus_tag"][0] - if feature_names is None or fname in feature_names: - features[fname] = feat - elif "gene" in feat.qualifiers: + raise AugurError(f"Reference {reference!r} didn't define any information we can use to create the 'nuc' annotation. You can use a line with a 'record' or 'source' GFF type or a ##sequence-region pragma.") + + +def _read_gff(reference, feature_names): + """ + Read a GFF file. We only read GFF IDs 'gene' or 'source' (the latter may not technically + be a valid GFF field, but is used widely within the Nextstrain ecosystem). + Only the first entry in the GFF file is parsed. + We create a "feature name" via: + - for 'source' IDs use 'nuc' + - for 'gene' IDs use the 'gene', 'gene_name' or 'locus_tag'. + If none are specified, the intention is to silently ignore but there are bugs here. + + Parameters + ---------- + reference : string + File path to GFF reference + feature_names : None or set or list + Restrict the genes we read to those in the set/list + + Returns + ------- + features : dict + keys: feature names, values: + Note that feature names may not equivalent to GenBank feature keys + + Raises + ------ + AugurError + If the reference file contains no IDs or multiple different seqids + If a gene is found with the name 'nuc' + """ + from BCBio import GFF + valid_types = ['gene', 'source', 'region'] + features = {} + + with open(reference, encoding='utf-8') as in_handle: + # Note that `GFF.parse` doesn't always yield GFF records in the order + # one may expect, but since we raise AugurError if there are multiple + # this doesn't matter. + gff_entries = list(GFF.parse(in_handle, limit_info={'gff_type': valid_types})) + if len(gff_entries) == 0: + raise AugurError(f"Reference {reference!r} contains no valid data rows. Valid GFF types (3rd column) are {', '.join(valid_types)}.") + elif len(gff_entries) > 1: + raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.") + else: + rec = gff_entries[0] + + features['nuc'] = _read_nuc_annotation_from_gff(rec, reference) + features_skipped = 0 + + for feat in rec.features: + if feat.type == "gene": + # Check for gene names stored in qualifiers commonly used by + # virus-specific gene maps first (e.g., 'gene', + # 'gene_name'). Then, check for qualifiers used by non-viral + # pathogens (e.g., 'locus_tag'). + if "gene" in feat.qualifiers: fname = feat.qualifiers["gene"][0] - if feature_names is None or fname in feature_names: - features[fname] = feat - elif feat.type=='source': #read 'nuc' as well for annotations - need start/end of whole! - features['nuc'] = feat + elif "gene_name" in feat.qualifiers: + fname = feat.qualifiers["gene_name"][0] + elif "locus_tag" in feat.qualifiers: + fname = feat.qualifiers["locus_tag"][0] + else: + features_skipped+=1 + fname = None + + if fname == 'nuc': + raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.") + + if feature_names is not None and fname not in feature_names: + # Skip (don't store) this feature + continue + + if fname: + features[fname] = feat + + if feature_names is not None: + for fe in feature_names: + if fe not in features: + print("Couldn't find gene {} in GFF or GenBank file".format(fe)) + + if features_skipped: + print(f"WARNING: {features_skipped} GFF rows of type=gene skipped as they didn't have a gene, gene_name or locus_tag attribute.") + + return features + +def _read_nuc_annotation_from_genbank(record, reference): + """ + Extracts the mandatory 'source' feature. If the sequence is present we check + the length agrees with the source. (The 'ORIGIN' may be left blank, + according to .) + + See for more. + + Parameters + ---------- + record : reference: string + + Returns + ------- + + + Raises + ------ + AugurError + If 'source' not defined or if coords contradict. + """ + nuc = None + for feat in record.features: + if feat.type=='source': + nuc = feat + if not nuc: + raise AugurError(f"Reference {reference!r} did not define the mandatory source feature.") + if nuc.location.start!=0: # this is a '1' in the GenBank file + raise AugurError(f"Reference {reference!r} source feature did not start at 1.") + if record.seq and len(record.seq)!=nuc.location.end: + raise AugurError(f"Reference {reference!r} source feature was length {nuc.location.end} but the included sequence was length {len(record.seq)}.") + return nuc + +def _read_genbank(reference, feature_names): + """ + Read a GenBank file. We only read GenBank feature keys 'CDS' or 'source'. + We create a "feature name" via: + - for 'source' features use 'nuc' + - for 'CDS' features use the locus_tag or the gene. If neither, then silently ignore. + + Parameters + ---------- + reference : string + File path to GenBank reference + feature_names : None or set or list + Restrict the CDSs we read to those in the set/list + + Returns + ------- + features : dict + keys: feature names, values: + Note that feature names may not equivalent to GenBank feature keys + + Raises + ------ + AugurError + If 'nuc' annotation not parsed + If a CDS feature is given the name 'nuc' + """ + from Bio import SeqIO + gb = SeqIO.read(reference, 'genbank') + features = { + 'nuc': _read_nuc_annotation_from_genbank(gb, reference) + } + + features_skipped = 0 + for feat in gb.features: + if feat.type=='CDS': + fname = None + if "locus_tag" in feat.qualifiers: + fname = feat.qualifiers["locus_tag"][0] + elif "gene" in feat.qualifiers: + fname = feat.qualifiers["gene"][0] + else: + features_skipped+=1 + + if fname == 'nuc': + raise AugurError(f"Reference {reference!r} contains a CDS with the name 'nuc'. This is not allowed.") + + if fname and (feature_names is None or fname in feature_names): + features[fname] = feat + + if features_skipped: + print(f"WARNING: {features_skipped} CDS features skipped as they didn't have a locus_tag or gene qualifier.") return features diff --git a/tests/functional/translate/cram/basic-error-checking.t b/tests/functional/translate/cram/basic-error-checking.t new file mode 100644 index 000000000..789cf89c0 --- /dev/null +++ b/tests/functional/translate/cram/basic-error-checking.t @@ -0,0 +1,16 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +Missing reference file + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence $DATA/reference.doesnt-exist.gff \ + > --output-node-data "aa_muts.json" > /dev/null + ERROR: reference sequence file '.+/reference.doesnt-exist.gff' not found (re) + [2] diff --git a/tests/functional/translate/cram/genbank.t b/tests/functional/translate/cram/genbank.t new file mode 100644 index 000000000..120e32cd1 --- /dev/null +++ b/tests/functional/translate/cram/genbank.t @@ -0,0 +1,33 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +These tests are intended to test variants of GenBank reference file formatting + + +Remove the mandatory source feature from the file + $ sed '5,6d' "$DATA/reference.gb" > "reference.no-source.gb" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.no-source.gb" \ + > --output-node-data "aa_muts.json" + ERROR: Reference .+ did not define the mandatory source feature. (re) + [2] + +Remove a nucleotide from the ORIGIN sequence so the coordinates don't match the source + + $ sed 's/TGACCATAAA/TGACCATAA/' "$DATA/reference.gb" > "reference.short-origin.gb" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.short-origin.gb" \ + > --output-node-data "aa_muts.json" + .+ BiopythonParserWarning: .+ (re) + .+ (re) + ERROR: Reference .+ (re) + [2] diff --git a/tests/functional/translate/cram/general.t b/tests/functional/translate/cram/general.t index fa389dbcb..09ef166f7 100644 --- a/tests/functional/translate/cram/general.t +++ b/tests/functional/translate/cram/general.t @@ -13,7 +13,7 @@ which validate the output will fail as it's missing a 'nuc' annotation. $ ${AUGUR} translate \ > --tree "$ANC_DATA/tree.nwk" \ > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ - > --reference-sequence "$DATA/reference.source.gff" \ + > --reference-sequence "$DATA/reference.gff" \ > --output-node-data "aa_muts.json" > /dev/null $ python3 "$SCRIPTS/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t index 3a5b866bd..126b6f904 100644 --- a/tests/functional/translate/cram/genes.t +++ b/tests/functional/translate/cram/genes.t @@ -12,7 +12,7 @@ as a feature ('nuc' in this case) $ ${AUGUR} translate \ > --tree "$ANC_DATA/tree.nwk" \ > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ - > --reference-sequence "$DATA/reference.source.gff" \ + > --reference-sequence "$DATA/reference.gff" \ > --genes gene2 gene3 \ > --output-node-data "aa_muts.genes-args.json" Couldn't find gene gene3 in GFF or GenBank file @@ -33,7 +33,7 @@ Using a text file rather than command line arguments $ ${AUGUR} translate \ > --tree "$ANC_DATA/tree.nwk" \ > --ancestral-sequences "$ANC_DATA/nt_muts.ref-seq.json" \ - > --reference-sequence "$DATA/reference.source.gff" \ + > --reference-sequence "$DATA/reference.gff" \ > --genes "genes.txt" \ > --output-node-data "aa_muts.genes-txt.json" Read in 2 specified genes to translate. diff --git a/tests/functional/translate/cram/gff.t b/tests/functional/translate/cram/gff.t new file mode 100644 index 000000000..aec3eb6fd --- /dev/null +++ b/tests/functional/translate/cram/gff.t @@ -0,0 +1,93 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +These tests are intended to test variants of GFF formatting + + +GFF file with no valid rows + + $ head -n 3 $DATA/reference.gff > "reference.empty.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.empty.gff" \ + > --output-node-data "aa_muts.json" > /dev/null + ERROR: Reference 'reference.empty.gff' contains no valid data rows. .+ (re) + [2] + +GFF file with an extra record + + $ cp $DATA/reference.gff "reference.double.gff" + + $ echo -e "additional\tRefSeq\tsource\t1\t10\t.\t+\t.\tID=additional" >> "reference.double.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.double.gff" \ + > --output-node-data "aa_muts.json" + ERROR: Reference 'reference.double.gff' contains multiple seqids .+ (re) + [2] + + +GFF file with data row GFF type 'region' replaced by 'source' _and_ the +##sequence-region pragma removed. This essentially mimics the information +augur 23.1.1 and earlier would use, before augur started parsing region and/or +the ##sequence-region pragma. + $ grep -v '##sequence-region' "$DATA/reference.gff" | + > sed 's/\tregion\t/\tsource\t/' > "reference-only.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference-only.gff" \ + > --output-node-data "aa_muts-only.json" > /dev/null + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts-only.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {'values_changed': {"root['annotations']['nuc']['type']": {'new_value': 'source', 'old_value': '##sequence-region pragma'}}} + +GFF file with data row added with GFF type 'source' with coordinates which don't match + $ sed '5s/^/reference_name\tRefSeq\tsource\t1\t70\t.\t+\t.\tID=reference_name\n/' \ + > "$DATA/reference.gff" > "reference-contradicts.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference-contradicts.gff" \ + > --output-node-data "aa_muts.json" + ERROR: Reference .+ contained contradictory coordinates .+ (re) + [2] + +GFF file with 'region' removed, so the only genome information is the ##sequence-region pragma + $ egrep -v '\tregion\t' "$DATA/reference.gff" > "reference.pragma-only.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.pragma-only.gff" \ + > --output-node-data "aa_muts.pragma-only.json" > /dev/null + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.pragma-only.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {} + +GFF file with no genome coordinate information + $ egrep -v 'region' "$DATA/reference.gff" > "reference.no-nuc-info.gff" + + $ ${AUGUR} translate \ + > --tree $ANC_DATA/tree.nwk \ + > --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \ + > --reference-sequence "reference.no-nuc-info.gff" \ + > --output-node-data "aa_muts.json" + ERROR: Reference .+ didn't define any information we can use to create the 'nuc' annotation. .+ (re) + [2] \ No newline at end of file diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index b68b134f9..ed2fb54ad 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -12,6 +12,7 @@ Translate amino acids for genes using a GenBank file. > --reference-sequence "$DATA/zika/zika_outgroup.gb" \ > --genes CA PRO \ > --output-node-data aa_muts.json + WARNING: 1 CDS features skipped as they didn't have a locus_tag or gene qualifier. Read in 3 features from reference sequence file Validating schema of '.+nt_muts.json'... (re) amino acid mutations written to .* (re) diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 9cb273bc5..1534245aa 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -18,7 +18,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence "genemap.gff" \ > --output-node-data aa_muts.json - Read in 2 features from reference sequence file + Read in 3 features from reference sequence file Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index 2c7d1d016..ec0bee6d6 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -18,7 +18,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences "${DATA}/zika/nt_muts.json" \ > --reference-sequence genemap.gff \ > --output-node-data aa_muts.json - Read in 2 features from reference sequence file + Read in 3 features from reference sequence file Validating schema of '.+/nt_muts.json'... (re) amino acid mutations written to .* (re) diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t index e58ea8979..ee1c5975a 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t +++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t @@ -17,7 +17,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --vcf-reference-output translations_reference.fasta Gene length of 'rrs' is not a multiple of 3. will pad with N Read in 187 specified genes to translate. - Read in 187 features from reference sequence file + Read in 188 features from reference sequence file 162 genes had no mutations and so have been be excluded. amino acid mutations written to .* (re) diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json index aa3c82801..8fe9db91a 100644 --- a/tests/functional/translate/data/simple-genome/aa_muts.json +++ b/tests/functional/translate/data/simple-genome/aa_muts.json @@ -2,24 +2,24 @@ "annotations": { "gene1": { "end": 24, - "seqid": "data/reference.source.gff", + "seqid": "data/reference.gff", "start": 10, "strand": "+", "type": "gene" }, "gene2": { "end": 47, - "seqid": "data/reference.source.gff", + "seqid": "data/reference.gff", "start": 36, "strand": "-", "type": "gene" }, "nuc": { "end": 50, - "seqid": "data/reference.source.gff", + "seqid": "data/reference.gff", "start": 1, "strand": "+", - "type": "source" + "type": "##sequence-region pragma" } }, "generated_by": { diff --git a/tests/functional/translate/data/simple-genome/reference.source.gff b/tests/functional/translate/data/simple-genome/reference.gff similarity index 59% rename from tests/functional/translate/data/simple-genome/reference.source.gff rename to tests/functional/translate/data/simple-genome/reference.gff index bbe1084a7..c614d2dd6 100644 --- a/tests/functional/translate/data/simple-genome/reference.source.gff +++ b/tests/functional/translate/data/simple-genome/reference.gff @@ -2,6 +2,5 @@ ##created by james hadfield for testing NextStrain (December 2023) ##sequence-region reference_name 1 50 reference_name RefSeq region 1 50 . + . ID=reference_name -reference_name RefSeq source 1 50 . + . ID=reference_name;locus_tag="https://github.com/nextstrain/augur/issues/1349";Note1="Source isn't really a GFF ID, but is required for Nextstrain to function correctly" reference_name RefSeq gene 10 24 . + . Name=gene1;gene=gene1 reference_name RefSeq gene 36 47 . - . Name=gene2;gene=gene2 diff --git a/tests/functional/translate/data/tb/aa_muts.json b/tests/functional/translate/data/tb/aa_muts.json index 75a40fd01..cdd97e67b 100644 --- a/tests/functional/translate/data/tb/aa_muts.json +++ b/tests/functional/translate/data/tb/aa_muts.json @@ -1055,7 +1055,7 @@ "seqid": "translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff", "start": 1, "strand": "+", - "type": "gene" + "type": "##sequence-region pragma" }, "opcA": { "end": 1625365,