Skip to content

Commit

Permalink
[genome builds] Implement masking
Browse files Browse the repository at this point in the history
Masks out positions in the genome which have nucleotides called in <50%
of samples. This is especially important in genome builds as the
terminal ends of segments are no longer terminal and thus sparse
sequence data was resulting in artefactual partitioning of the tree.

I've left the cattle-flu builds unchanged (i.e. no masking) but we
should revisit this.
  • Loading branch information
jameshadfield committed Feb 23, 2025
1 parent 732e38e commit 898e581
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 1 deletion.
3 changes: 3 additions & 0 deletions config/h5n1-cattle-outbreak.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ filter:
exclude_where:
FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2

mask:
min_support: 0 # This lets all positions through regardless of how many sequences have a base


refine:
coalescent: const
Expand Down
2 changes: 2 additions & 0 deletions config/h5n1-d1.1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ filter:
exclude_where:
FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2

mask:
min_support: 50 # This masks any position where <50% of sequences have a base

refine:
coalescent: const
Expand Down
21 changes: 20 additions & 1 deletion rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ rule join_segments:
input:
alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS)
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
wildcard_constraints:
subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
segment = 'genome',
Expand All @@ -78,6 +78,25 @@ rule join_segments:
--output {output.alignment}
"""

# Mask low-coverage positions of the joined genome alignment.
# Reads the unmasked alignment (the file named as the join_segments output)
# and writes the "aligned.fasta" path that genome_metadata takes as input.
rule mask_genome:
input:
alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta",
params:
# Minimum percentage of sequences that must have a base at a position;
# positions below this are masked by scripts/mask.py. A value of 0 lets
# every position through.
# NOTE(review): this lookup requires the `mask.min_support` key to be
# present in the config — confirm every config that loads this rules
# file defines it.
percentage = config['mask']['min_support']
# Restrict this rule to whole-genome builds of the two listed subtypes.
wildcard_constraints:
subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
segment = 'genome',
time = 'default',
shell:
r"""
python scripts/mask.py \
--alignment {input.alignment} \
--percentage {params.percentage} \
--output {output.alignment}
"""

rule genome_metadata:
input:
sequences = "results/{subtype}/{segment}/{time}/aligned.fasta",
Expand Down
37 changes: 37 additions & 0 deletions scripts/mask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

import argparse
from augur.io.sequences import read_sequences, write_sequences
from Bio.Seq import MutableSeq

# Characters counted as a called nucleotide. Gaps, N and IUPAC ambiguity
# codes are deliberately not counted as coverage.
VALID_BASES = frozenset("ATGCatcg")


def compute_mask_sites(sequences, min_percentage, valid_bases=VALID_BASES):
    """Return the zero-based alignment columns with too little base coverage.

    A column is selected when the percentage of sequences carrying a
    character from *valid_bases* at that column is strictly less than
    *min_percentage*. A threshold of 0 therefore never selects anything.

    Args:
        sequences: Aligned sequences (strings or any sized iterable of
            single characters), all of the same length.
        min_percentage: Coverage threshold, expressed as a percentage.
        valid_bases: Characters that count as a called base.

    Returns:
        Sorted list of zero-based column indices to mask.

    Raises:
        ValueError: If there are no sequences, or if the sequences do not
            all have the same length (i.e. the input is not an alignment).
    """
    sequences = list(sequences)
    if not sequences:
        raise ValueError("Cannot compute mask sites: the alignment is empty")

    genome_size = len(sequences[0])
    counts = [0] * genome_size  # zero-based

    for seq in sequences:
        # Fail loudly here rather than with an opaque IndexError later on.
        if len(seq) != genome_size:
            raise ValueError(
                f"Sequences have differing lengths ({len(seq)} vs {genome_size}); "
                "input must be an alignment"
            )
        for idx, base in enumerate(seq):
            if base in valid_bases:
                counts[idx] += 1

    n_genomes = len(sequences)
    return [
        idx
        for idx, count in enumerate(counts)
        if count / n_genomes * 100 < min_percentage
    ]


def main(argv=None):
    """Mask low-coverage positions of an alignment and write the result.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Mask alignment positions where too few sequences have a called base",
    )
    parser.add_argument('--alignment', type=str, required=True, help='genome alignment')
    # Parsed as a float up-front so invalid values are rejected by argparse
    # with a usage message instead of failing midway through the run.
    parser.add_argument('--percentage', type=float, required=True, help='positions with less coverage than this will be masked')
    parser.add_argument('--output', type=str, required=True, help='masked genome alignment')

    args = parser.parse_args(argv)

    # Store everything in memory
    alignment = list(read_sequences(args.alignment))
    mask_sites = compute_mask_sites(
        [sample.seq for sample in alignment],
        args.percentage,
    )

    print("Masking sites (zero-based):", mask_sites)
    print("Total number of sites to mask:", len(mask_sites))

    for sample in alignment:
        # Convert to a mutable sequence so positions can be assigned in place.
        sample.seq = MutableSeq(sample.seq)
        for idx in mask_sites:
            sample.seq[idx] = 'N'

    write_sequences(alignment, args.output)


if __name__ == "__main__":
    main()

0 comments on commit 898e581

Please sign in to comment.