nextstrain · jameshadfield · Feb 9, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 10, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -14,11 +14,14 @@
 
 ### Bug fixes
 
+* Certain strain names would be silently renamed by `augur tree [--method iqtree]`. We now avoid such renaming wherever possible and in cases where there are backslashes or single quotes we now raise a fatal error.
+Note that names with spaces in the FASTA header (description line) continue to be modified such that everything after the first space is not used in the resulting tree. [#1750][] (@jameshadfield)
 * Fixed the error that occurred when running `augur curate --help`. [#1755][] (@joverlee521)
 
 [#1744]: https://github.com/nextstrain/augur/pull/1744
 [#1745]: https://github.com/nextstrain/augur/pull/1745
 [#1755]: https://github.com/nextstrain/augur/pull/1755
+[#1750]: https://github.com/nextstrain/augur/pull/1750
 
 ## 28.0.1 (10 February 2025)
 

diff --git a/augur/tree.py b/augur/tree.py
@@ -1,5 +1,7 @@
 """
 Build a tree using a variety of methods.
+
+IQ-TREE specific: Strain names with spaces are modified to remove all characters after (and including) the first space.
 """
 
 import os
@@ -14,13 +16,14 @@
 import numpy as np
 from treetime.vcf_utils import read_vcf
 from pathlib import Path
+from textwrap import dedent
 
 from .errors import AugurError
 from .io.file import open_file
 from .io.sequences import read_sequences
 from .io.shell_command_runner import run_shell_command
 from .io.vcf import shquote
-from .utils import nthreads_value, load_mask_sites
+from .utils import nthreads_value, load_mask_sites, read_tree
 
 DEFAULT_ARGS = {
     "fasttree": "-nt -nosupport",
@@ -238,27 +241,52 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
     ])
     # create a dictionary for characters that IQ-tree changes.
     # we remove those prior to tree-building and reinstantiate later
-    def random_string(n):
-        from string import ascii_uppercase as letters
-        return "".join([letters[i] for i in np.random.randint(len(letters), size=n)])
+    # Note: See <https://github.com/nextstrain/augur/pull/1085> for an alternate
+    # approach to escaping using a local-specific non-word regex.
     prefix = "DELIM"
-    escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'}
+    invalid_newick_chars = ["'", "\\"] # "\\" is a single backslash
+    invalid_replaceable_chars = ['(', ')', '{', '}', '[', ']', '<', '>',
+                                 '/', '|', '*', ':', '%', '+', '!', ';',
+                                 '&', '@', ',', '$', '=', '^', '~', '?', '#',
+                                 '"', '`',
+                                ]
+    escape_dict = {c:f'_{prefix}-{hex(ord(c))}_' for c in invalid_replaceable_chars}
     reverse_escape_dict = {v:k for k,v in escape_dict.items()}
 
 
     # IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later
     tmp_aln_file = str(Path(aln_file).with_name(Path(aln_file).stem + "-delim.fasta"))
     log_file = str(Path(tmp_aln_file).with_suffix(".iqtree.log"))
-    num_seqs = 0
+    input_sequence_names = set()
     with open_file(tmp_aln_file, 'w') as ofile, open_file(aln_file) as ifile:
         for line in ifile:
-            tmp_line = line
             if line.startswith(">"):
-                num_seqs += 1
+                strain_name = line.lstrip('>').rstrip('\n')
+                # Modify the strain name to remove anything after a space (0x20)
+                # This preserves the behaviour of augur v28 but makes it explicit.
+                strain_name = strain_name.split(' ')[0]
+                input_sequence_names.add(strain_name)
                 for c,v in escape_dict.items():
-                    tmp_line = tmp_line.replace(c,v)
+                    strain_name = strain_name.replace(c,v)
+                ofile.write(f">{strain_name}\n")
+            else:
+                ofile.write(line)
+
+    invalid_newick_names = [strain_name for strain_name in input_sequence_names if any([char in strain_name for char in invalid_newick_chars])]
+    if len(invalid_newick_names):
+        if clean_up and os.path.isfile(tmp_aln_file):
+            os.remove(tmp_aln_file)
+        raise AugurError(dedent(f"""\
+            Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least).
+            You should ensure these strain names are changed as early as possible in your analysis. The following
+            names are unescaped and surrounded by double quotes.
+
+            Invalid strain names:
+              - {f"{chr(10)}{' '*14}- ".join(['"' + str(name) + '"' for name in invalid_newick_names])}
+
+            Invalid characters: {', '.join(['"' + str(char) + '"' for char in invalid_newick_chars])}
+            """))
 
-            ofile.write(tmp_line)
 
     # Check tree builder arguments for conflicts with hardcoded defaults.
     check_conflicting_args(tree_builder_args, (
@@ -314,7 +342,8 @@ def random_string(n):
         if os.path.isfile(log_file):
             print("Please see the log file for more details: {}".format(log_file))
         T=None
-    return T
+
+    return (T, input_sequence_names)
 
 
 def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
@@ -455,6 +484,7 @@ def run(args):
     # check alignment type, set flags, read in if VCF
     is_vcf = False
     ref = None
+    input_sequence_names = None
     if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         # Prepare a multiple sequence alignment from the given variants VCF and
         # reference FASTA.
@@ -502,7 +532,7 @@ def run(args):
         if args.method=='raxml':
             T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args)
         elif args.method=='iqtree':
-            T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args)
+            (T, input_sequence_names) = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args)
         elif args.method=='fasttree':
             T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args)
     except ConflictingArgumentsException as error:
@@ -516,3 +546,30 @@ def run(args):
         tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
     else:
         return 1
+
+
+    # Finally, double check that the strain names in the tree are exactly the same as the strain names
+    # in the input alignment. The tree builder may have changed them, or we may have changed them ourselves
+    # in anticipation of the tree builder & then attempted to change them back.
+    # NOTE: We read the tree file just written to cover the case where `Phylo.write` changes the name
+    # NOTE: This check is only implemented for certain code paths - feel free to expand its coverage!
+    output_tree_names = {str(tip.name) for tip in read_tree(tree_fname).get_terminals()}
+    if input_sequence_names and len(input_sequence_names ^ output_tree_names)!=0:
+        # sorting names means we'll (often) be able to visually match them up in the output as the order isn't preserved between
+        # alignment and tree. However if the name modification was in the first character(s) then this won't work.
+        # Note: we use `str()` not `repr()` to avoid escapes - we want to show the name exactly as it is.
+        aln_only = ['"'+str(name)+'"' for name in sorted(list(input_sequence_names - output_tree_names))]
+        tree_only = ['"'+str(name)+'"' for name in sorted(list(output_tree_names - input_sequence_names))]
+        raise AugurError(dedent(f"""\
+            Different names in input alignment and output tree (newick). It is highly likely this is due
+            to character replacement by the underlying tree builder or within `augur tree` itself. In the short
+            term you can change the names of your input (alignment) data, however please also consider submitting
+            a bug report to <https://github.com/nextstrain/augur/issues>. The following names are unescaped and
+            surrounded by double quotes.
+
+            Names unique to alignment:
+             - {f"{chr(10)}{' '*12} - ".join(aln_only)}
+
+            Names unique to tree:
+             - {f"{chr(10)}{' '*12} - ".join(tree_only)}
+            """))
diff --git a/tests/functional/tree/cram/generate-fasta.py b/tests/functional/tree/cram/generate-fasta.py
@@ -0,0 +1,28 @@
+import random
+
+def get_random_unicode(length):
+    """Adapted from <https://stackoverflow.com/a/21666621>"""
+
+    try:
+        get_char = unichr
+    except NameError:
+        get_char = chr
+
+    # code point ranges to be sampled
+    include_ranges = [
+        # ASCII non-control characters excluding single quote (0x27) and backslash (0x5c)
+        (0x20, 0x26), (0x28, 0x5b), (0x5d, 0x7e)
+    ]
+
+    alphabet = [
+        get_char(code_point) for current_range in include_ranges
+            for code_point in range(current_range[0], current_range[1] + 1)
+    ]
+    return ''.join(random.choice(alphabet) for _ in range(length))
+
+
+if __name__ == "__main__":
+    for i in range(10):
+        print('>' + get_random_unicode(5))
+        print("ATGC")
+
diff --git a/tests/functional/tree/cram/iqtree-name-modifications.t b/tests/functional/tree/cram/iqtree-name-modifications.t
@@ -0,0 +1,118 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+BACKGROUND: IQ tree modifies strain names to remove certain characters. We
+attempt to prevent this within augur tree (by replacing them on the way into
+IQ-TREE and switching them back on the way out). Beyond this certain characters
+can't be written by Bio.Phylo as newick strain names (without escaping).
+
+# Start with a trivial test
+
+  $ echo -e ">AAA\nATGC\n>BBB\nATGC\n>CCC\nATGC\n" > trivial.mfa
+
+  $ ${AUGUR} tree \
+  >  --method iqtree \
+  >  --alignment trivial.mfa \
+  >  --output trivial.new 1>/dev/null
+
+Test single quotes, which were the errors behind <https://github.com/nextstrain/augur/issues/1084>,
+<https://github.com/nextstrain/avian-flu/issues/122> 
+
+  $ echo -e ">Coted'Ivoire/IPCI-DVE-GR1901/2022\nATGC" > single-quotes.mfa
+  $ echo -e ">A/Geoffroy'sCat/WA/24-037410-004-original/2024\nATGC" >> single-quotes.mfa
+  $ echo -e ">need-three-taxa\nATGC" >> single-quotes.mfa
+
+  $ ${AUGUR} tree \
+  >  --method iqtree \
+  >  --alignment single-quotes.mfa \
+  >  --output single-quotes.new 1>/dev/null
+  ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least).
+  You should ensure these strain names are changed as early as possible in your analysis. The following
+  names are unescaped and surrounded by double quotes.
+
+  Invalid strain names:
+    - .+ (re)
+    - .+ (re)
+
+  Invalid characters: .* (re)
+
+  [2]
+
+Test some other observed strain names to ensure they're not modified
+  $ echo -e ">A/PETFOOD/USA:OR/24-037325-011/2024\nATGC" > reported.mfa
+  $ echo -e ">[name>$!%]:1234\nATGC" >> reported.mfa
+  $ echo -e ">simple\nATGC" >> reported.mfa
+
+  $ ${AUGUR} tree \
+  >  --method iqtree \
+  >  --alignment reported.mfa \
+  >  --output reported.new 1>/dev/null
+
+Strain names with spaces are modified
+
+  $ echo -e ">space afterwards\nATGC" > spaces.mfa
+  $ echo -e ">name\nATGC" >> spaces.mfa
+  $ echo -e ">simple\nATGC" >> spaces.mfa
+
+  $ ${AUGUR} tree \
+  >  --method iqtree \
+  >  --alignment spaces.mfa \
+  >  --output spaces.new 1>/dev/null
+
+  $ grep 'afterwards' spaces.new
+  [1]
+
+
+Backslashes are problematic, but only some of the time. For instance (and assuming we replaced the
+character for the IQ-TREE building) the first three of these strains work and the others are
+escaped by `Bio.Phylo.write`. Erring on the side of caution we don't allow any backslashes,
+similar to single quotes.
+
+  $ echo '>sing\le' > backslashes.mfa
+  $ echo 'ATGC' >> backslashes.mfa
+  $ echo '>dou\\ble' >> backslashes.mfa
+  $ echo 'ATGC' >> backslashes.mfa
+  $ echo '>three\\\ee' >> backslashes.mfa
+  $ echo 'ATGC' >> backslashes.mfa
+  $ echo '>v~`,\' >> backslashes.mfa
+  $ echo 'ATGC' >> backslashes.mfa
+  $ echo '>wb)\o' >> backslashes.mfa
+  $ echo 'ATGC' >> backslashes.mfa
+
+  $ ${AUGUR} tree \
+  >  --method iqtree \
+  >  --alignment backslashes.mfa \
+  >  --output backslashes.new 1>/dev/null
+  ERROR: Certain strain names have characters which cannot be written in newick format (by Bio.Python, at least).
+  You should ensure these strain names are changed as early as possible in your analysis. The following
+  names are unescaped and surrounded by double quotes.
+
+  Invalid strain names:
+    - .+ (re)
+    - .+ (re)
+    - .+ (re)
+    - .+ (re)
+    - .+ (re)
+
+  Invalid characters: .* (re)
+
+  [2]
+
+This test generates random ASCII names. It's disabled for CI as it's both
+stochastic and slow but you can easily toggle it back on (by uncommenting the
+function call) if you want to better test strain name handling in `augur tree`
+
+  $ random_ascii_names() {
+  >   python3 "$TESTDIR"/generate-fasta.py > random_${1}.mfa
+  >  
+  >   ${AUGUR} tree \
+  >    --method iqtree \
+  >    --alignment random_${1}.mfa \
+  >    --output random_${1}.new > /dev/null
+  > }
+
+  $ for iteration in $(seq 1 100); do
+  >    # random_ascii_names $iteration
+  >    :
+  > done