From 127d644696d1721956990ee4a4cc6e98fd835d84 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 10 Feb 2025 12:44:08 +1300 Subject: [PATCH 1/7] [tree] Identify when strain names are mismatched Includes failing test --- augur/tree.py | 40 ++++++++++++++++--- .../tree/cram/iqtree-name-modifications.t | 40 +++++++++++++++++++ 2 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 tests/functional/tree/cram/iqtree-name-modifications.t diff --git a/augur/tree.py b/augur/tree.py index cbef80840..87b738d89 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -14,13 +14,14 @@ import numpy as np from treetime.vcf_utils import read_vcf from pathlib import Path +from textwrap import dedent from .errors import AugurError from .io.file import open_file from .io.sequences import read_sequences from .io.shell_command_runner import run_shell_command from .io.vcf import shquote -from .utils import nthreads_value, load_mask_sites +from .utils import nthreads_value, load_mask_sites, read_tree DEFAULT_ARGS = { "fasttree": "-nt -nosupport", @@ -249,12 +250,12 @@ def random_string(n): # IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later tmp_aln_file = str(Path(aln_file).with_name(Path(aln_file).stem + "-delim.fasta")) log_file = str(Path(tmp_aln_file).with_suffix(".iqtree.log")) - num_seqs = 0 + input_sequence_names = set() with open_file(tmp_aln_file, 'w') as ofile, open_file(aln_file) as ifile: for line in ifile: tmp_line = line if line.startswith(">"): - num_seqs += 1 + input_sequence_names.add(tmp_line.lstrip('>').rstrip('\n')) for c,v in escape_dict.items(): tmp_line = tmp_line.replace(c,v) @@ -314,7 +315,8 @@ def random_string(n): if os.path.isfile(log_file): print("Please see the log file for more details: {}".format(log_file)) T=None - return T + + return (T, input_sequence_names) def write_out_informative_fasta(compress_seq, alignment, stripFile=None): @@ -455,6 +457,7 @@ def run(args): # check alignment type, set flags, read in if VCF is_vcf = False ref = None + input_sequence_names = None if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): # Prepare a multiple sequence alignment from the given variants VCF and # reference FASTA. @@ -502,7 +505,7 @@ def run(args): if args.method=='raxml': T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) elif args.method=='iqtree': - T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args) + (T, input_sequence_names) = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args) elif args.method=='fasttree': T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) except ConflictingArgumentsException as error: @@ -516,3 +519,30 @@ def run(args): tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f') else: return 1 + + + # Finally, double check that the strain names in the tree are exactly the same as the strain names + # in the input alignment. The tree builder may have changed them, or we may have changed them ourselves + # in anticipation of the tree builder & then attempted to change them back. + # NOTE: We read the tree file just written to cover the case where `Phylo.write` changes the name + # NOTE: This check is only implemented for certain code paths - feel free to expand its coverage! + output_tree_names = {str(tip.name) for tip in read_tree(tree_fname).get_terminals()} + if input_sequence_names and len(input_sequence_names ^ output_tree_names)!=0: + # sorting names means we'll (often) be able to visually match them up in the output as the order isn't preserved between + # alignment and tree. However if the name modification was in the first character(s) then this won't work. + # Note: we use `str()` not `repr()` to avoid escapes - we want to show the name exactly as it is. + aln_only = ['"'+str(name)+'"' for name in sorted(list(input_sequence_names - output_tree_names))] + tree_only = ['"'+str(name)+'"' for name in sorted(list(output_tree_names - input_sequence_names))] + raise AugurError(dedent(f"""\ + Different names in input alignment and output tree (newick). It is highly likely this is due + to character replacement by the underlying tree builder or within `augur tree` itself. In the short + term you can change the names of your input (alignment) data, however please also consider submitting + a bug report to . The following names are unescaped and + surrounded by double quotes. + + Names unique to alignment: + - {f"{chr(10)}{' '*12} - ".join(aln_only)} + + Names unique to tree: + - {f"{chr(10)}{' '*12} - ".join(tree_only)} + """)) diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t new file mode 100644 index 000000000..87b7fbf2c --- /dev/null +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -0,0 +1,40 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +BACKGROUND: IQ tree modifies strain names to remove certain characters. We +attempt to prevent this within augur tree (by replacing them on the way into +IQ-TREE and switching them back on the way out). Beyond this certain characters +can't be written by Bio.Phylo as newick strain names (without escaping). + +# Start with a trivial test + + $ echo -e ">AAA\nATGC\n>BBB\nATGC\n>CCC\nATGC\n" > trivial.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment trivial.mfa \ + > --output trivial.new 1>/dev/null + +Test single quotes, which were the errors behind , + + + $ echo -e ">Coted'Ivoire/IPCI-DVE-GR1901/2022\nATGC" > single-quotes.mfa + $ echo -e ">A/Geoffroy'sCat/WA/24-037410-004-original/2024\nATGC" >> single-quotes.mfa + $ echo -e ">need-three-taxa\nATGC" >> single-quotes.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment single-quotes.mfa \ + > --output single-quotes.new 1>/dev/null + + +Test some other observed strain names to ensure they're not modified + $ echo -e ">A/PETFOOD/USA:OR/24-037325-011/2024\nATGC" > reported.mfa + $ echo -e ">[name>$!%]:1234\nATGC" >> reported.mfa + $ echo -e ">simple\nATGC" >> reported.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment reported.mfa \ + > --output reported.new 1>/dev/null From ed654ee1f577effc60de052318be1e19d0b96a55 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 10 Feb 2025 16:46:56 +1300 Subject: [PATCH 2/7] [tree] expand iq-tree escape chars We also replace invalid characters with their unicode hex representation (rather than a random string) which both guarantees uniqueness and makes debugging a whole lot easier. See for an alternate approach to escaping using a local-specific non-word regex. --- augur/tree.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/augur/tree.py b/augur/tree.py index 87b738d89..675cb15e6 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -239,11 +239,16 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt ]) # create a dictionary for characters that IQ-tree changes. # we remove those prior to tree-building and reinstantiate later - def random_string(n): - from string import ascii_uppercase as letters - return "".join([letters[i] for i in np.random.randint(len(letters), size=n)]) + # Note: See for an alternate + # approach to escaping using a local-specific non-word regex. prefix = "DELIM" - escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'} + + invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>', + '/', "\\", '|', '*', ':', '%', '+', '!', ';', # "\\" is a single backslash + '&', '@', ',', '$', '=', '^', '~', '?', '#', + '"', '`', ' ', + ] + escape_dict = {c:f'_{prefix}-{hex(ord(c))}_' for c in invalid_replaceable_chars} reverse_escape_dict = {v:k for k,v in escape_dict.items()} @@ -253,13 +258,14 @@ def random_string(n): input_sequence_names = set() with open_file(tmp_aln_file, 'w') as ofile, open_file(aln_file) as ifile: for line in ifile: - tmp_line = line if line.startswith(">"): - input_sequence_names.add(tmp_line.lstrip('>').rstrip('\n')) + strain_name = line.lstrip('>').rstrip('\n') + input_sequence_names.add(strain_name) for c,v in escape_dict.items(): - tmp_line = tmp_line.replace(c,v) - - ofile.write(tmp_line) + strain_name = strain_name.replace(c,v) + ofile.write(f">{strain_name}\n") + else: + ofile.write(line) # Check tree builder arguments for conflicts with hardcoded defaults. check_conflicting_args(tree_builder_args, ( From 778620d46387cf1c35686ac7a4c164f34f59258a Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 11 Feb 2025 09:46:28 +1300 Subject: [PATCH 3/7] [tree] disallow single quotes These are ok within Augur / Bio.Phylo code (assuming we replace them for the tree builder) but are escaped by `Bio.Phylo.Write`, so "'" becomes "\'" in the newick string. While there may be some way around this it's going to cause constant headaches for bioinformatics and so these characters should be replaced prior to running `augur tree` --- augur/tree.py | 18 +++++++++++++++++- .../tree/cram/iqtree-name-modifications.t | 12 +++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/augur/tree.py b/augur/tree.py index 675cb15e6..2e48ac212 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -242,7 +242,7 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt # Note: See for an alternate # approach to escaping using a local-specific non-word regex. prefix = "DELIM" - + invalid_newick_chars = ["'"] invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>', '/', "\\", '|', '*', ':', '%', '+', '!', ';', # "\\" is a single backslash '&', '@', ',', '$', '=', '^', '~', '?', '#', @@ -267,6 +267,22 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt else: ofile.write(line) + invalid_newick_names = [strain_name for strain_name in input_sequence_names if any([char in strain_name for char in invalid_newick_chars])] + if len(invalid_newick_names): + if clean_up and os.path.isfile(tmp_aln_file): + os.remove(tmp_aln_file) + raise AugurError(dedent(f"""\ + Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - {f"{chr(10)}{' '*14}- ".join(['"' + str(name) + '"' for name in invalid_newick_names])} + + Invalid characters: {', '.join(['"' + str(char) + '"' for char in invalid_newick_chars])} + """)) + + # Check tree builder arguments for conflicts with hardcoded defaults. check_conflicting_args(tree_builder_args, ( # -ntmax/--threads-max are synonyms diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t index 87b7fbf2c..a27dcd304 100644 --- a/tests/functional/tree/cram/iqtree-name-modifications.t +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -27,7 +27,17 @@ Test single quotes, which were the errors behind --method iqtree \ > --alignment single-quotes.mfa \ > --output single-quotes.new 1>/dev/null - + ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - .+ (re) + - .+ (re) + + Invalid characters: .* (re) + + [2] Test some other observed strain names to ensure they're not modified $ echo -e ">A/PETFOOD/USA:OR/24-037325-011/2024\nATGC" > reported.mfa From cdee135a7e92fe3006b1bc1c7c9383b2a9658015 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 11 Feb 2025 09:56:09 +1300 Subject: [PATCH 4/7] [tree] disallow backslashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depending on the strain name backslashes are fine some of the time and problematic other times. (Assuming we replace the backslash for the "-delim" alignment passed to IQ-TREE.) All examples here work fine within our code (and calls to `Bio.Phylo`) but some of them are escaped during the `Bio.Phylo.write` call¹. Examples which are written as-is: - "sing\le" - "dou\\ble" - "three\\\ee" Some examples which are escaped ("\" → "\\") - "v~`,\" - "wb)\o" - " Yb\y" - ";:R@\" - ";U,\~" - "\[4{h" - "\|$[8" - "\9]oC" - "\ZY)x" - "5\j (" - "9_n[\" - "a_\B;" - "CQ9\]" - "F-V:\" - "Dj \." - "F;2C\" - "h\)n|" - "j(A\v" - "l\+)*" - "v~`,\" ¹ Code is here, although nothing obvious sticks out to me --- augur/tree.py | 4 +-- .../tree/cram/iqtree-name-modifications.t | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/augur/tree.py b/augur/tree.py index 2e48ac212..835a1113b 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -242,9 +242,9 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt # Note: See for an alternate # approach to escaping using a local-specific non-word regex. prefix = "DELIM" - invalid_newick_chars = ["'"] + invalid_newick_chars = ["'", "\\"] # "\\" is a single backslash invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>', - '/', "\\", '|', '*', ':', '%', '+', '!', ';', # "\\" is a single backslash + '/', '|', '*', ':', '%', '+', '!', ';', '&', '@', ',', '$', '=', '^', '~', '?', '#', '"', '`', ' ', ] diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t index a27dcd304..e86c9d6ae 100644 --- a/tests/functional/tree/cram/iqtree-name-modifications.t +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -48,3 +48,39 @@ Test some other observed strain names to ensure they're not modified > --method iqtree \ > --alignment reported.mfa \ > --output reported.new 1>/dev/null + + +Backslashes are problematic, but only some of the time. For instance (and assuming we replaced the +character for the IQ-TREE building) the first three of these strains work and the others are +escaped by `Bio.Phylo.write`. Erring on the side of caution we don't allow any backslashes, +similar to single quotes. + + $ echo '>sing\le' > backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>dou\\ble' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>three\\\ee' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>v~`,\' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + $ echo '>wb)\o' >> backslashes.mfa + $ echo 'ATGC' >> backslashes.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment backslashes.mfa \ + > --output backslashes.new 1>/dev/null + ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least). + You should ensure these strain names are changed as early as possible in your analysis. The following + names are unescaped and surrounded by double quotes. + + Invalid strain names: + - .+ (re) + - .+ (re) + - .+ (re) + - .+ (re) + - .+ (re) + + Invalid characters: .* (re) + + [2] From c8611695c5266d620023f43eee749b2c46252ac5 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 11 Feb 2025 10:19:29 +1300 Subject: [PATCH 5/7] [tree] random ascii strain name testing This test generates random ASCII names. It's disabled for CI as it's both stochastic and slow but you can easily toggle it back on (by uncommenting the function call) if you want to better test strain name handling in `augur tree` --- tests/functional/tree/cram/generate-fasta.py | 28 +++++++++++++++++++ .../tree/cram/iqtree-name-modifications.t | 18 ++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 tests/functional/tree/cram/generate-fasta.py diff --git a/tests/functional/tree/cram/generate-fasta.py b/tests/functional/tree/cram/generate-fasta.py new file mode 100644 index 000000000..8c3a3d055 --- /dev/null +++ b/tests/functional/tree/cram/generate-fasta.py @@ -0,0 +1,28 @@ +import random + +def get_random_unicode(length): + """Adapted from """ + + try: + get_char = unichr + except NameError: + get_char = chr + + # code point ranges to be sampled + include_ranges = [ + # ASCII non-control characters excluding single quote (0x27) and backslash (0x5c) + (0x20, 0x26), (0x28, 0x5b), (0x5d, 0x7e) + ] + + alphabet = [ + get_char(code_point) for current_range in include_ranges + for code_point in range(current_range[0], current_range[1] + 1) + ] + return ''.join(random.choice(alphabet) for _ in range(length)) + + +if __name__ == "__main__": + for i in range(10): + print('>' + get_random_unicode(5)) + print("ATGC") + diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t index e86c9d6ae..41191caf6 100644 --- a/tests/functional/tree/cram/iqtree-name-modifications.t +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -84,3 +84,21 @@ similar to single quotes. Invalid characters: .* (re) [2] + +This test generates random ASCII names. It's disabled for CI as it's both +stochastic and slow but you can easily toggle it back on (by uncommenting the +function call) if you want to better test strain name handling in `augur tree` + + $ random_ascii_names() { + > python3 "$TESTDIR"/generate-fasta.py > random_${1}.mfa + > + > ${AUGUR} tree \ + > --method iqtree \ + > --alignment random_${1}.mfa \ + > --output random_${1}.new > /dev/null + > } + + $ for iteration in $(seq 1 100); do + > # random_ascii_names $iteration + > : + > done \ No newline at end of file From 2eecec4d72344a50a8acba39f4d687dce029a290 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 11 Feb 2025 11:35:22 +1300 Subject: [PATCH 6/7] [tree] modify strain names with spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Augur v28 (and earlier), strain names which included spaces such as the FASTA input ">KM597072.1 Mumps virus genotype G strain MuV/Gabon/13/2[G], complete genome" would be written to a delimited FASTA file by `augur tree` which didn't replace those spaces. IQ-TREE would then change those names by dropping all characters after (and including) the first space. After replacement of any delimeters the resulting `--output` newick would have "KM597072.1". The previous commits in this series changed the behaviour to preserve strain names entirely, which resulted in the full name in the output newick. This caused issues downstream because `augur refine` does the same name replacement when it parses the alignment file¹ and thus we have a mismatch of tree names and (parsed) alignment names. This commit restores the original behaviour with regards to names with spaces. ¹ --- augur/tree.py | 7 ++++++- .../tree/cram/iqtree-name-modifications.t | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/augur/tree.py b/augur/tree.py index 835a1113b..b60f0a4fe 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -1,5 +1,7 @@ """ Build a tree using a variety of methods. + +IQ-TREE specific: Strain names with spaces are modified to remove all characters after (and including) the first space. """ import os @@ -246,7 +248,7 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>', '/', '|', '*', ':', '%', '+', '!', ';', '&', '@', ',', '$', '=', '^', '~', '?', '#', - '"', '`', ' ', + '"', '`', ] escape_dict = {c:f'_{prefix}-{hex(ord(c))}_' for c in invalid_replaceable_chars} reverse_escape_dict = {v:k for k,v in escape_dict.items()} @@ -260,6 +262,9 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt for line in ifile: if line.startswith(">"): strain_name = line.lstrip('>').rstrip('\n') + # Modify the strain name to remove anything after a space (0x20) + # This preserves the behaviour of augur v28 but makes it explicit. + strain_name = strain_name.split(' ')[0] input_sequence_names.add(strain_name) for c,v in escape_dict.items(): strain_name = strain_name.replace(c,v) diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t index 41191caf6..355403644 100644 --- a/tests/functional/tree/cram/iqtree-name-modifications.t +++ b/tests/functional/tree/cram/iqtree-name-modifications.t @@ -49,6 +49,20 @@ Test some other observed strain names to ensure they're not modified > --alignment reported.mfa \ > --output reported.new 1>/dev/null +Strain names with spaces are modified + + $ echo -e ">space afterwards\nATGC" > spaces.mfa + $ echo -e ">name\nATGC" >> spaces.mfa + $ echo -e ">simple\nATGC" >> spaces.mfa + + $ ${AUGUR} tree \ + > --method iqtree \ + > --alignment spaces.mfa \ + > --output spaces.new 1>/dev/null + + $ grep 'afterwards' spaces.new + [1] + Backslashes are problematic, but only some of the time. For instance (and assuming we replaced the character for the IQ-TREE building) the first three of these strains work and the others are From 93d27409c23b2dd58303baa10b92d32a45a90d6c Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 13 Feb 2025 14:41:45 +1300 Subject: [PATCH 7/7] changelog --- CHANGES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 147292c51..2f7a8f867 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,11 +14,14 @@ ### Bug fixes +* Certain strain names would be silently renamed by `augur tree [--method iqtree]`. We now avoid such renaming wherever possible and in cases where there are backslashes or single quotes we now raise a fatal error. +Note that names with spaces in the FASTA header (description line) continue to be modified such that everything after the first space is not used in the resulting tree. [#1750][] (@jameshadfield) * Fixed the error that occurred when running `augur curate --help`. [#1755][] (@joverlee521) [#1744]: https://github.com/nextstrain/augur/pull/1744 [#1745]: https://github.com/nextstrain/augur/pull/1745 [#1755]: https://github.com/nextstrain/augur/pull/1755 +[#1750]: https://github.com/nextstrain/augur/pull/1750 ## 28.0.1 (10 February 2025)