[genome builds] Implement masking

Masks out positions in the genome which have nucleotides called in <50% of samples. This is especially important in genome builds as the terminal ends of segments are no longer terminal and thus sparse sequence data was resulting in artefactual partitioning of the tree. I've left the cattle-flu builds unchanged (i.e. no masking) but we should revisit this.
nextstrain · Feb 23, 2025 · 898e581 · 898e581
1 parent 732e38e
commit 898e581
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 1 deletion.
diff --git a/config/h5n1-cattle-outbreak.yaml b/config/h5n1-cattle-outbreak.yaml
@@ -69,6 +69,9 @@ filter:
   exclude_where:
     FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
 
+mask:
+  min_support: 0 # 0 disables masking: every position is kept regardless of how many sequences have a base
+
 
 refine:
   coalescent: const

diff --git a/config/h5n1-d1.1.yaml b/config/h5n1-d1.1.yaml
@@ -61,6 +61,8 @@ filter:
   exclude_where:
     FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
 
+mask:
+  min_support: 50 # Percentage threshold: any position where <50% of sequences have a base is masked (replaced with N)
 
 refine:
   coalescent: const

diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
@@ -66,7 +66,7 @@ rule join_segments:
     input:
         alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS) 
     output:
-        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
+        alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
     wildcard_constraints:
         subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
@@ -78,6 +78,25 @@ rule join_segments:
             --output {output.alignment}
         """
 
+# Mask poorly covered columns of the joined genome alignment.
+# Reads aligned_unmasked.fasta and writes aligned.fasta, which is the path
+# that genome_metadata takes as its sequences input.
+rule mask_genome:
+    input:
+        alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
+    output:
+        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta",
+    params:
+        # Threshold handed to scripts/mask.py as --percentage: positions where the
+        # percentage of sequences with a base is below this value are masked.
+        percentage = config['mask']['min_support']
+    wildcard_constraints:
+        # Restrict this rule to the genome "segment" of the listed subtypes at the default time.
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
+        segment = 'genome',
+        time = 'default',
+    shell:
+        r"""
+        python scripts/mask.py \
+            --alignment {input.alignment} \
+            --percentage {params.percentage} \
+            --output {output.alignment}
+        """
+        """
+
 rule genome_metadata:
     input:
         sequences = "results/{subtype}/{segment}/{time}/aligned.fasta",

diff --git a/scripts/mask.py b/scripts/mask.py
@@ -0,0 +1,37 @@
+
+import argparse
+from augur.io.sequences import read_sequences, write_sequences
+from Bio.Seq import MutableSeq
+
+# Characters that count as a called nucleotide; gaps, N and other ambiguity
+# codes do not contribute to a position's coverage.
+VALID_BASES = frozenset("ACGTacgt")
+
+
+def find_low_coverage_sites(sequences, min_percentage):
+    """Return the zero-based alignment columns with too few called bases.
+
+    A sequence covers a column only when its character there is one of
+    A, C, G or T (either case).
+
+    Args:
+        sequences: aligned sequences, each an iterable of single characters
+            with a length (plain strings work). All must be the same length.
+        min_percentage: columns where the percentage of sequences with a
+            called base is strictly below this value are reported.
+
+    Returns:
+        List of zero-based column indices, in ascending order. Empty when
+        *sequences* is empty.
+
+    Raises:
+        ValueError: if the sequences do not all have the same length.
+    """
+    sequences = list(sequences)
+    if not sequences:
+        return []
+
+    genome_size = len(sequences[0])
+    counts = [0] * genome_size
+    for seq in sequences:
+        # Fail early with a clear message instead of an IndexError mid-loop.
+        if len(seq) != genome_size:
+            raise ValueError(
+                f"Sequences are not aligned: expected length {genome_size}, got {len(seq)}"
+            )
+        for idx, base in enumerate(seq):
+            if base in VALID_BASES:
+                counts[idx] += 1
+
+    n_genomes = len(sequences)
+    return [
+        idx for idx, count in enumerate(counts)
+        if count / n_genomes * 100 < min_percentage
+    ]
+
+
+def main():
+    """Read an alignment, replace low-coverage columns with N and write it out."""
+    parser = argparse.ArgumentParser(
+        description="Mask alignment positions where too few sequences have a called base",
+    )
+    parser.add_argument('--alignment', type=str, required=True, help='genome alignment')
+    # Parsed as float so a malformed value is reported by argparse up front.
+    parser.add_argument('--percentage', type=float, required=True, help='positions with less coverage than this will be masked')
+    parser.add_argument('--output', type=str, required=True, help='masked genome alignment')
+
+    args = parser.parse_args()
+
+    # Store everything in memory
+    alignment = list(read_sequences(args.alignment))
+    if not alignment:
+        # An empty alignment has no coverage to compute; stop with a clear message.
+        raise SystemExit(f"No sequences found in {args.alignment!r}; nothing to mask")
+
+    mask_sites = find_low_coverage_sites(
+        [sample.seq for sample in alignment], args.percentage
+    )
+
+    print("Masking sites (zero-based):", mask_sites)
+    print("Total number of sites to mask:", len(mask_sites))
+
+    for sample in alignment:
+        # Convert to a mutable sequence so individual positions can be overwritten.
+        sample.seq = MutableSeq(sample.seq)
+        for idx in mask_sites:
+            sample.seq[idx] = 'N'
+
+    write_sequences(alignment, args.output)
+
+
+if __name__ == "__main__":
+    main()