diff --git a/CHANGES.md b/CHANGES.md index 147292c51..2f7a8f867 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,11 +14,14 @@ ### Bug fixes +* Certain strain names would be silently renamed by `augur tree [--method iqtree]`. We now avoid such renaming wherever possible and in cases where there are backslashes or single quotes we now raise a fatal error. +Note that names with spaces in the FASTA header (description line) continue to be modified such that everything after the first space is not used in the resulting tree. [#1750][] (@jameshadfield) * Fixed the error that occurred when running `augur curate --help`. [#1755][] (@joverlee521) [#1744]: https://github.com/nextstrain/augur/pull/1744 [#1745]: https://github.com/nextstrain/augur/pull/1745 [#1755]: https://github.com/nextstrain/augur/pull/1755 +[#1750]: https://github.com/nextstrain/augur/pull/1750 ## 28.0.1 (10 February 2025) diff --git a/augur/tree.py b/augur/tree.py index cbef80840..b60f0a4fe 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -1,5 +1,7 @@ """ Build a tree using a variety of methods. + +IQ-TREE specific: Strain names with spaces are modified to remove all characters after (and including) the first space. """ import os @@ -14,13 +16,14 @@ import numpy as np from treetime.vcf_utils import read_vcf from pathlib import Path +from textwrap import dedent from .errors import AugurError from .io.file import open_file from .io.sequences import read_sequences from .io.shell_command_runner import run_shell_command from .io.vcf import shquote -from .utils import nthreads_value, load_mask_sites +from .utils import nthreads_value, load_mask_sites, read_tree DEFAULT_ARGS = { "fasttree": "-nt -nosupport", @@ -238,27 +241,52 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt ]) # create a dictionary for characters that IQ-tree changes. # we remove those prior to tree-building and reinstantiate later - def random_string(n): - from string import ascii_uppercase as letters - return "".join([letters[i] for i in np.random.randint(len(letters), size=n)]) + # Note: See for an alternate + # approach to escaping using a local-specific non-word regex. prefix = "DELIM" - escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'} + invalid_newick_chars = ["'", "\\"] # "\\" is a single backslash + invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>', + '/', '|', '*', ':', '%', '+', '!', ';', + '&', '@', ',', '$', '=', '^', '~', '?', '#', + '"', '`', + ] + escape_dict = {c:f'_{prefix}-{hex(ord(c))}_' for c in invalid_replaceable_chars} reverse_escape_dict = {v:k for k,v in escape_dict.items()} # IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later tmp_aln_file = str(Path(aln_file).with_name(Path(aln_file).stem + "-delim.fasta")) log_file = str(Path(tmp_aln_file).with_suffix(".iqtree.log")) - num_seqs = 0 + input_sequence_names = set() with open_file(tmp_aln_file, 'w') as ofile, open_file(aln_file) as ifile: for line in ifile: - tmp_line = line if line.startswith(">"): - num_seqs += 1 + strain_name = line.lstrip('>').rstrip('\n') + # Modify the strain name to remove anything after a space (0x20) + # This preserves the behaviour of augur v28 but makes it explicit. + strain_name = strain_name.split(' ')[0] + input_sequence_names.add(strain_name) for c,v in escape_dict.items(): - tmp_line = tmp_line.replace(c,v) + strain_name = strain_name.replace(c,v) + ofile.write(f">{strain_name}\n") + else: + ofile.write(line) + + invalid_newick_names = [strain_name for strain_name in input_sequence_names if any([char in strain_name for char in invalid_newick_chars])] + if len(invalid_newick_names): + if clean_up and os.path.isfile(tmp_aln_file): + os.remove(tmp_aln_file) + raise AugurError(dedent(f"""\ + Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - {f"{chr(10)}{' '*14}- ".join(['"' + str(name) + '"' for name in invalid_newick_names])} + + Invalid characters: {', '.join(['"' + str(char) + '"' for char in invalid_newick_chars])} + """)) - ofile.write(tmp_line) # Check tree builder arguments for conflicts with hardcoded defaults. check_conflicting_args(tree_builder_args, ( @@ -314,7 +342,8 @@ def random_string(n): if os.path.isfile(log_file): print("Please see the log file for more details: {}".format(log_file)) T=None - return T + + return (T, input_sequence_names) def write_out_informative_fasta(compress_seq, alignment, stripFile=None): @@ -455,6 +484,7 @@ def run(args): # check alignment type, set flags, read in if VCF is_vcf = False ref = None + input_sequence_names = None if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): # Prepare a multiple sequence alignment from the given variants VCF and # reference FASTA. @@ -502,7 +532,7 @@ def run(args): if args.method=='raxml': T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) elif args.method=='iqtree': - T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args) + (T, input_sequence_names) = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args) elif args.method=='fasttree': T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) except ConflictingArgumentsException as error: @@ -516,3 +546,30 @@ def run(args): tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f') else: return 1 + + + # Finally, double check that the strain names in the tree are exactly the same as the strain names + # in the input alignment. The tree builder may have changed them, or we may have changed them ourselves + # in anticipation of the tree builder & then attempted to change them back. + # NOTE: We read the tree file just written to cover the case where `Phylo.write` changes the name + # NOTE: This check is only implemented for certain code paths - feel free to expand its coverage! + output_tree_names = {str(tip.name) for tip in read_tree(tree_fname).get_terminals()} + if input_sequence_names and len(input_sequence_names ^ output_tree_names)!=0: + # sorting names means we'll (often) be able to visually match them up in the output as the order isn't preserved between + # alignment and tree. However if the name modification was in the first character(s) then this won't work. + # Note: we use `str()` not `repr()` to avoid escapes - we want to show the name exactly as it is. + aln_only = ['"'+str(name)+'"' for name in sorted(list(input_sequence_names - output_tree_names))] + tree_only = ['"'+str(name)+'"' for name in sorted(list(output_tree_names - input_sequence_names))] + raise AugurError(dedent(f"""\ + Different names in input alignment and output tree (newick). It is highly likely this is due + to character replacement by the underlying tree builder or within `augur tree` itself. In the short + term you can change the names of your input (alignment) data, however please also consider submitting + a bug report to . The following names are unescaped and + surrounded by double quotes. + + Names unique to alignment: + - {f"{chr(10)}{' '*12} - ".join(aln_only)} + + Names unique to tree: + - {f"{chr(10)}{' '*12} - ".join(tree_only)} + """)) diff --git a/tests/functional/tree/cram/generate-fasta.py b/tests/functional/tree/cram/generate-fasta.py new file mode 100644 index 000000000..8c3a3d055 --- /dev/null +++ b/tests/functional/tree/cram/generate-fasta.py @@ -0,0 +1,28 @@ +import random + +def get_random_unicode(length): + """Adapted from """ + + try: + get_char = unichr + except NameError: + get_char = chr + + # code point ranges to be sampled + include_ranges = [ + # ASCII non-control characters excluding single quote (0x27) and backslash (0x5c) + (0x20, 0x26), (0x28, 0x5b), (0x5d, 0x7e) + ] + + alphabet = [ + get_char(code_point) for current_range in include_ranges + for code_point in range(current_range[0], current_range[1] + 1) + ] + return ''.join(random.choice(alphabet) for _ in range(length)) + + +if __name__ == "__main__": + for i in range(10): + print('>' + get_random_unicode(5)) + print("ATGC") + diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t new file mode 100644 index 000000000..355403644 --- /dev/null +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -0,0 +1,118 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +BACKGROUND: IQ tree modifies strain names to remove certain characters. We +attempt to prevent this within augur tree (by replacing them on the way into +IQ-TREE and switching them back on the way out). Beyond this certain characters +can't be written by Bio.Phylo as newick strain names (without escaping). + +# Start with a trivial test + + $ echo -e ">AAA\nATGC\n>BBB\nATGC\n>CCC\nATGC\n" > trivial.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment trivial.mfa \ + > --output trivial.new 1>/dev/null + +Test single quotes, which were the errors behind , + + + $ echo -e ">Coted'Ivoire/IPCI-DVE-GR1901/2022\nATGC" > single-quotes.mfa + $ echo -e ">A/Geoffroy'sCat/WA/24-037410-004-original/2024\nATGC" >> single-quotes.mfa + $ echo -e ">need-three-taxa\nATGC" >> single-quotes.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment single-quotes.mfa \ + > --output single-quotes.new 1>/dev/null + ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - .+ (re) + - .+ (re) + + Invalid characters: .* (re) + + [2] + +Test some other observed strain names to ensure they're not modified + $ echo -e ">A/PETFOOD/USA:OR/24-037325-011/2024\nATGC" > reported.mfa + $ echo -e ">[name>$!%]:1234\nATGC" >> reported.mfa + $ echo -e ">simple\nATGC" >> reported.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment reported.mfa \ + > --output reported.new 1>/dev/null + +Strain names with spaces are modified + + $ echo -e ">space afterwards\nATGC" > spaces.mfa + $ echo -e ">name\nATGC" >> spaces.mfa + $ echo -e ">simple\nATGC" >> spaces.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment spaces.mfa \ + > --output spaces.new 1>/dev/null + + $ grep 'afterwards' spaces.new + [1] + + +Backslashes are problematic, but only some of the time. For instance (and assuming we replaced the +character for the IQ-TREE building) the first three of these strains work and the others are +escaped by `Bio.Phylo.write`. Erring on the side of caution we don't allow any backslashes, +similar to single quotes. + + $ echo '>sing\le' > backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>dou\\ble' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>three\\\ee' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>v~`,\' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>wb)\o' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment backslashes.mfa \ + > --output backslashes.new 1>/dev/null + ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - .+ (re) + - .+ (re) + - .+ (re) + - .+ (re) + - .+ (re) + + Invalid characters: .* (re) + + [2] + +This test generates random ASCII names. It's disabled for CI as it's both +stochastic and slow but you can easily toggle it back on (by uncommenting the +function call) if you want to better test strain name handling in `augur tree` + + $ random_ascii_names() { + > python3 "$TESTDIR"/generate-fasta.py > random_${1}.mfa + > + > ${AUGUR} tree \ + > --method iqtree \ + > --alignment random_${1}.mfa \ + > --output random_${1}.new > /dev/null + > } + + $ for iteration in $(seq 1 100); do + > # random_ascii_names $iteration + > : + > done \ No newline at end of file