Skip to content

Commit

Permalink
[translate] refactor & improve readability
Browse files Browse the repository at this point in the history
There should be no functional changes. Co-locating the sequence reading
& feature extraction is easier to read, and were Python's scoping to be
different it'd be even nicer as we wouldn't leave around variables which
are never re-used.

As part of this change, `translate_vcf_feature` has gone from using an
undocumented `return None` to raising a (documented) error, which (IMO)
is easier to reason about.
  • Loading branch information
jameshadfield committed Dec 4, 2023
1 parent c88a124 commit 0174b67
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 34 deletions.
54 changes: 25 additions & 29 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class MissingNodeError(Exception):
class MismatchNodeError(Exception):
pass

class NoVariationError(Exception):
    """Signals that a feature has no variable sites across all sequences."""

def safe_translate(sequence, report_exceptions=False):
"""Returns an amino acid translation of the given nucleotide sequence accounting
for gaps in the given sequence.
Expand Down Expand Up @@ -145,6 +148,9 @@ def translate_vcf_feature(sequences, ref, feature):
translated reference gene, positions of AA differences, and AA
differences indexed by node name
Raises
------
NoVariationError : if no variable sites within this feature (across all sequences)
'''
def str_reverse_comp(str_seq):
#gets reverse-complement of a string and returns it as a string
Expand Down Expand Up @@ -205,11 +211,10 @@ def str_reverse_comp(str_seq):

prot['positions'].sort()

#if no variable sites, exclude this gene
# raise an error if no variable sites observed
if len(prot['positions']) == 0:
return None
else:
return prot
raise NoVariationError()
return prot

def construct_mut(start, pos, end):
    """Build a mutation label by joining the string forms of start, pos and end, in that order."""
    pieces = (start, pos, end)
    return "".join(map(str, pieces))
Expand Down Expand Up @@ -380,46 +385,37 @@ def check_arg_combinations(args, is_vcf):
def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

# If genes is a file, read in the genes to translate
if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
genes = get_genes_from_file(args.genes[0])
else:
genes = args.genes

## check file format and read in sequences
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
else:
sequences = sequences_json(args.ancestral_sequences, args.tree)


## load features; only requested features if genes given
features = load_features(args.reference_sequence, genes)
if features is None:
print("ERROR: could not read features of reference sequence file")
return 1
print("Read in {} features from reference sequence file".format(len(features)))

### translate every feature - but not 'nuc'!
## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
translations = {}
deleted = []
for fname, feat in features.items():
if is_vcf:
trans = translate_vcf_feature(sequences, ref, feat)
if trans:
translations[fname] = trans
else:
deleted.append(fname)
else:
if feat.type != 'source':
translations[fname] = translate_feature(sequences, feat)

if len(deleted) != 0:
print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))
if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
features_without_variation = []
for fname, feat in features.items():
try:
translations[fname] = translate_vcf_feature(sequences, ref, feat)
except NoVariationError:
features_without_variation.append(fname)
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
sequences = sequences_json(args.ancestral_sequences, args.tree)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}

## glob the annotations for later auspice export
#
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/translate/cram/genes.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ Similar tests to those in `general.t` but here testing the --genes argument
> --reference-sequence $DATA/reference.gff \
> --genes gene2 gene3 \
> --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-args.json"
Validating schema of .+ (re)
Couldn't find gene gene3 in GFF or GenBank file
Read in 1 features from reference sequence file
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand All @@ -36,9 +36,9 @@ Using a text file rather than command line arguments
> --genes "$CRAMTMP/$TESTFILE/genes.txt" \
> --output-node-data "$CRAMTMP/$TESTFILE/aa_muts.genes-txt.json"
Read in 2 specified genes to translate.
Validating schema of .+ (re)
Couldn't find gene gene3 in GFF or GenBank file
Read in 1 features from reference sequence file
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/translate/cram/translate-with-genbank.t
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ Translate amino acids for genes using a GenBank file.
> --reference-sequence translate/data/zika/zika_outgroup.gb \
> --genes CA PRO \
> --output-node-data $TMP/aa_muts.json
Validating schema of 'translate/data/zika/nt_muts.json'...
Read in 3 features from reference sequence file
Validating schema of 'translate/data/zika/nt_muts.json'...
amino acid mutations written to .* (re)
$ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json
{}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --ancestral-sequences translate/data/zika/nt_muts.json \
> --reference-sequence "$TMP/genemap.gff" \
> --output-node-data $TMP/aa_muts.json
Validating schema of 'translate/data/zika/nt_muts.json'...
Read in 2 features from reference sequence file
Validating schema of 'translate/data/zika/nt_muts.json'...
amino acid mutations written to .* (re)

Other than the sequence ids which will include a temporary path, the JSONs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --ancestral-sequences translate/data/zika/nt_muts.json \
> --reference-sequence "$TMP/genemap.gff" \
> --output-node-data $TMP/aa_muts.json
Validating schema of 'translate/data/zika/nt_muts.json'...
Read in 2 features from reference sequence file
Validating schema of 'translate/data/zika/nt_muts.json'...
amino acid mutations written to .* (re)
$ python3 "../../scripts/diff_jsons.py" \
> --exclude-regex-paths "['seqid']" -- \
Expand Down

0 comments on commit 0174b67

Please sign in to comment.