From 0174b679294baf35fc2a760697bd9e4766ca26fa Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Fri, 1 Dec 2023 12:14:29 +1300 Subject: [PATCH] [translate] refactor & improve readability There should be no functional changes. Co-locating the sequence reading & feature extraction is easier to read, and were Python's scoping to be different it'd be even nicer as we wouldn't leave around variables which are never re-used. As part of this `translate_vcf_feature` has changed from using an undocumented `return None` to raising a (documented) error which (IMO) is easier to reason with. --- augur/translate.py | 54 +++++++++---------- tests/functional/translate/cram/genes.t | 4 +- .../translate/cram/translate-with-genbank.t | 2 +- .../cram/translate-with-gff-and-gene-name.t | 2 +- .../cram/translate-with-gff-and-gene.t | 2 +- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/augur/translate.py b/augur/translate.py index 1fcfcc487..9358f0309 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -27,6 +27,9 @@ class MissingNodeError(Exception): class MismatchNodeError(Exception): pass +class NoVariationError(Exception): + pass + def safe_translate(sequence, report_exceptions=False): """Returns an amino acid translation of the given nucleotide sequence accounting for gaps in the given sequence. @@ -145,6 +148,9 @@ def translate_vcf_feature(sequences, ref, feature): translated reference gene, positions of AA differences, and AA differences indexed by node name + Raises + ------ + NoVariationError : if no variable sites within this feature (across all sequences) ''' def str_reverse_comp(str_seq): #gets reverse-compliment of a string and returns it as a string @@ -205,11 +211,10 @@ def str_reverse_comp(str_seq): prot['positions'].sort() - #if no variable sites, exclude this gene + # raise an error if no variable sites observed if len(prot['positions']) == 0: - return None - else: - return prot + raise NoVariationError() + return prot def construct_mut(start, pos, end): return str(start) + str(pos) + str(end) @@ -380,6 +385,8 @@ def check_arg_combinations(args, is_vcf): def run(args): ## read tree and data, if reading data fails, return with error code tree = Phylo.read(args.tree, 'newick') + is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) + check_arg_combinations(args, is_vcf) # If genes is a file, read in the genes to translate if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]): @@ -387,16 +394,6 @@ def run(args): else: genes = args.genes - ## check file format and read in sequences - is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) - check_arg_combinations(args, is_vcf) - - if is_vcf: - (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) - else: - sequences = sequences_json(args.ancestral_sequences, args.tree) - - ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) if features is None: @@ -404,22 +401,21 @@ def run(args): return 1 print("Read in {} features from reference sequence file".format(len(features))) - ### translate every feature - but not 'nuc'! + ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature translations = {} - deleted = [] - for fname, feat in features.items(): - if is_vcf: - trans = translate_vcf_feature(sequences, ref, feat) - if trans: - translations[fname] = trans - else: - deleted.append(fname) - else: - if feat.type != 'source': - translations[fname] = translate_feature(sequences, feat) - - if len(deleted) != 0: - print("{} genes had no mutations and so have been be excluded.".format(len(deleted))) + if is_vcf: + (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) + features_without_variation = [] + for fname, feat in features.items(): + try: + translations[fname] = translate_vcf_feature(sequences, ref, feat) + except NoVariationError: + features_without_variation.append(fname) + if len(features_without_variation): + print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) + else: + sequences = sequences_json(args.ancestral_sequences, args.tree) + translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'} ## glob the annotations for later auspice export # diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t index 2b629f9e5..c6289bd7a 100644 --- a/tests/functional/translate/cram/genes.t +++ b/tests/functional/translate/cram/genes.t @@ -14,9 +14,9 @@ Similar tests to those in `general.t` but here testing the --genes argument > --reference-sequence $DATA/reference.gff \ > --genes gene2 gene3 \ > --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-args.json" - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 1 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ @@ -36,9 +36,9 @@ Using a text file rather than command line arguments > --genes "$CRAMTMP/$TESTFILE/genes.txt" \ > --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-txt.json" Read in 2 specified genes to translate. - Validating schema of .+ (re) Couldn't find gene gene3 in GFF or GenBank file Read in 1 features from reference sequence file + Validating schema of .+ (re) amino acid mutations written to .+ (re) $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \ diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t index d4dec07e7..5be3992c2 100644 --- a/tests/functional/translate/cram/translate-with-genbank.t +++ b/tests/functional/translate/cram/translate-with-genbank.t @@ -11,8 +11,8 @@ Translate amino acids for genes using a GenBank file. > --reference-sequence translate/data/zika/zika_outgroup.gb \ > --genes CA PRO \ > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... Read in 3 features from reference sequence file + Validating schema of 'translate/data/zika/nt_muts.json'... amino acid mutations written to .* (re) $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json {} diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t index 8796d35c4..f3d19b50a 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t @@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences translate/data/zika/nt_muts.json \ > --reference-sequence "$TMP/genemap.gff" \ > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... Read in 2 features from reference sequence file + Validating schema of 'translate/data/zika/nt_muts.json'... amino acid mutations written to .* (re) Other than the sequence ids which will include a temporary path, the JSONs diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t index 703240e8c..84c4fa995 100644 --- a/tests/functional/translate/cram/translate-with-gff-and-gene.t +++ b/tests/functional/translate/cram/translate-with-gff-and-gene.t @@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store > --ancestral-sequences translate/data/zika/nt_muts.json \ > --reference-sequence "$TMP/genemap.gff" \ > --output-node-data $TMP/aa_muts.json - Validating schema of 'translate/data/zika/nt_muts.json'... Read in 2 features from reference sequence file + Validating schema of 'translate/data/zika/nt_muts.json'... amino acid mutations written to .* (re) $ python3 "../../scripts/diff_jsons.py" \ > --exclude-regex-paths "['seqid']" -- \