From 898e58127999240c0f520b3dc63d5cdd39da888c Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Fri, 14 Feb 2025 13:00:01 +1300
Subject: [PATCH] [genome builds] Implement masking

Masks out positions in the genome which have nucleotides called in <50%
of samples. This is especially important in genome builds as the
terminal ends of segments are no longer terminal and thus sparse
sequence data was resulting in artefactual partitioning of the tree.

I've left the cattle-flu builds unchanged (i.e. no masking) but we
should revisit this.
---
Review notes: scripts/mask.py was revised relative to commit 898e581
(float-typed threshold, explicit errors for empty or ragged alignments,
module docstring, trailing newline). The blob id on its "index" line has
not been regenerated.

 config/h5n1-cattle-outbreak.yaml |  3 ++
 config/h5n1-d1.1.yaml            |  2 +
 rules/cattle-flu.smk             | 21 +++++++++-
 scripts/mask.py                  | 71 ++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 scripts/mask.py

diff --git a/config/h5n1-cattle-outbreak.yaml b/config/h5n1-cattle-outbreak.yaml
index 13f6505..629f413 100644
--- a/config/h5n1-cattle-outbreak.yaml
+++ b/config/h5n1-cattle-outbreak.yaml
@@ -69,6 +69,9 @@ filter:
   exclude_where:
     FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
 
+mask:
+  min_support: 0 # This lets all positions through regardless of how many sequences have a base
+
 refine:
   coalescent: const
 
diff --git a/config/h5n1-d1.1.yaml b/config/h5n1-d1.1.yaml
index fd4fcd4..7be2a54 100644
--- a/config/h5n1-d1.1.yaml
+++ b/config/h5n1-d1.1.yaml
@@ -61,6 +61,8 @@ filter:
   exclude_where:
     FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
 
+mask:
+  min_support: 50 # This masks any position where <50% of sequences have a base
 refine:
   coalescent: const
 
diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
index 5553277..f737b90 100644
--- a/rules/cattle-flu.smk
+++ b/rules/cattle-flu.smk
@@ -66,7 +66,7 @@ rule join_segments:
     input:
         alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS)
     output:
-        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
+        alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
     wildcard_constraints:
         subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
@@ -78,6 +78,25 @@ rule join_segments:
             --output {output.alignment}
         """
 
+rule mask_genome:
+    input:
+        alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta"
+    output:
+        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta",
+    params:
+        percentage = config['mask']['min_support']
+    wildcard_constraints:
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
+        segment = 'genome',
+        time = 'default',
+    shell:
+        r"""
+        python scripts/mask.py \
+            --alignment {input.alignment} \
+            --percentage {params.percentage} \
+            --output {output.alignment}
+        """
+
 rule genome_metadata:
     input:
         sequences = "results/{subtype}/{segment}/{time}/aligned.fasta",
diff --git a/scripts/mask.py b/scripts/mask.py
new file mode 100644
index 0000000..d5df8c2
--- /dev/null
+++ b/scripts/mask.py
@@ -0,0 +1,71 @@
+"""
+Mask positions of a genome alignment that are poorly covered.
+
+A position is replaced by 'N' in every sequence when the percentage of
+sequences carrying an unambiguous nucleotide (A, C, G or T, in either case)
+at that position is below the --percentage threshold.
+"""
+
+import argparse
+import sys
+
+from augur.io.sequences import read_sequences, write_sequences
+from Bio.Seq import MutableSeq
+
+VALID_BASES = frozenset("ACGTacgt")
+
+
+def find_mask_sites(alignment, min_percentage):
+    """
+    Return the zero-based positions covered by too few sequences.
+
+    A position is reported when the percentage of records in *alignment* with
+    a base from VALID_BASES at that position is strictly below *min_percentage*.
+    Raises ValueError if the records are not all the same length.
+    """
+    genome_size = len(alignment[0].seq)
+    counts = [0] * genome_size  # zero-based
+    for sample in alignment:
+        if len(sample.seq) != genome_size:
+            raise ValueError(
+                f"{sample.id!r} has length {len(sample.seq)}, expected {genome_size}; "
+                "all sequences in the alignment must be the same length"
+            )
+        for idx, base in enumerate(str(sample.seq)):
+            if base in VALID_BASES:
+                counts[idx] += 1
+    n_genomes = len(alignment)
+    return [
+        idx for idx, count in enumerate(counts)
+        if count / n_genomes * 100 < min_percentage
+    ]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--alignment', type=str, required=True, help='genome alignment')
+    parser.add_argument('--percentage', type=float, required=True,
+                        help='positions with less coverage than this percentage (0-100) will be masked')
+    parser.add_argument('--output', type=str, required=True, help='masked genome alignment')
+
+    args = parser.parse_args()
+
+    # Store everything in memory: per-site coverage is needed before any record is written
+    alignment = list(read_sequences(args.alignment))
+    if not alignment:
+        sys.exit(f"ERROR: no sequences found in {args.alignment!r}")
+
+    try:
+        mask_sites = find_mask_sites(alignment, args.percentage)
+    except ValueError as err:
+        sys.exit(f"ERROR: {err}")
+
+    print("Masking sites (zero-based):", mask_sites)
+    print("Total number of sites to mask:", len(mask_sites))
+
+    for sample in alignment:
+        sample.seq = MutableSeq(sample.seq)
+        for idx in mask_sites:
+            sample.seq[idx] = 'N'
+
+    write_sequences(alignment, args.output)