diff --git a/config/h5n1-cattle-outbreak.yaml b/config/h5n1-cattle-outbreak.yaml index 13f6505..629f413 100644 --- a/config/h5n1-cattle-outbreak.yaml +++ b/config/h5n1-cattle-outbreak.yaml @@ -69,6 +69,9 @@ filter: exclude_where: FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2 +mask: + min_support: 0 # Percent of sequences that must have a base (A/C/G/T) at a position; 0 lets all positions through regardless of how many sequences have a base + refine: coalescent: const diff --git a/config/h5n1-d1.1.yaml b/config/h5n1-d1.1.yaml index fd4fcd4..7be2a54 100644 --- a/config/h5n1-d1.1.yaml +++ b/config/h5n1-d1.1.yaml @@ -61,6 +61,8 @@ filter: exclude_where: FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2 +mask: + min_support: 50 # Percent of sequences that must have a base (A/C/G/T) at a position; this masks any position where <50% of sequences have one refine: coalescent: const diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index 5553277..f737b90 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -66,7 +66,7 @@ rule join_segments: input: alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS) output: - alignment = "results/{subtype}/{segment}/{time}/aligned.fasta" + alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta" wildcard_constraints: subtype = 'h5n1-cattle-outbreak|h5n1-d1.1', segment = 'genome', @@ -78,6 +78,25 @@ rule join_segments: --output {output.alignment} """ +rule mask_genome: + input: + alignment = "results/{subtype}/{segment}/{time}/aligned_unmasked.fasta" + output: + alignment = "results/{subtype}/{segment}/{time}/aligned.fasta", + params: + percentage = config['mask']['min_support'] + wildcard_constraints: + subtype = 'h5n1-cattle-outbreak|h5n1-d1.1', + segment = 'genome', + time = 'default', + shell: + r""" + python scripts/mask.py \ + --alignment {input.alignment} \ + --percentage {params.percentage} \ + --output {output.alignment} + """ + rule genome_metadata: input: sequences = 
"""
Mask poorly supported columns of a genome alignment.

A column is masked (set to 'N' in every sequence) when the percentage of
sequences that carry an unambiguous base (A, C, G or T, either case) at that
position is below the ``--percentage`` threshold.
"""
import argparse
import sys
from typing import List, Optional, Sequence

# Characters that count as support for a column. Gaps, 'N' and every other
# ambiguity code are treated as "no base".
VALID_BASES = frozenset("ACGTacgt")


def count_base_support(sequences: Sequence[Sequence[str]]) -> List[int]:
    """Count, for every alignment column, the sequences with a valid base.

    Args:
        sequences: Aligned sequences (strings or string-like objects that
            support ``len()`` and iterate over single characters).

    Returns:
        One count per column, indexed by zero-based position.

    Raises:
        ValueError: If *sequences* is empty or the sequences do not all have
            the same length (i.e. the input is not an alignment).
    """
    if not sequences:
        raise ValueError("alignment contains no sequences")

    genome_size = len(sequences[0])
    counts = [0] * genome_size
    for number, seq in enumerate(sequences, start=1):
        # A ragged input would either overflow `counts` or silently skew the
        # per-column percentages, so reject it up front.
        if len(seq) != genome_size:
            raise ValueError(
                f"sequence {number} has length {len(seq)}, "
                f"expected {genome_size}; input is not aligned"
            )
        for idx, base in enumerate(seq):
            if base in VALID_BASES:
                counts[idx] += 1
    return counts


def find_mask_sites(counts: Sequence[int], n_genomes: int, percentage: float) -> List[int]:
    """Return the zero-based columns whose support is below *percentage*.

    Args:
        counts: Per-column number of sequences with a valid base.
        n_genomes: Total number of sequences in the alignment.
        percentage: Threshold in percent; a column is masked when strictly
            fewer than this percentage of sequences have a valid base, so a
            threshold of 0 never masks anything.

    Raises:
        ValueError: If *n_genomes* is not positive.
    """
    if n_genomes <= 0:
        raise ValueError("n_genomes must be positive")
    return [
        idx
        for idx, count in enumerate(counts)
        if count / n_genomes * 100 < percentage
    ]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments (*argv* defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--alignment', type=str, required=True, help='genome alignment')
    # Parsed as float so that a malformed threshold is reported by argparse
    # instead of surfacing as a traceback in the middle of the run.
    parser.add_argument('--percentage', type=float, required=True, help='positions with less coverage than this will be masked')
    parser.add_argument('--output', type=str, required=True, help='masked genome alignment')
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Read the alignment, mask low-support columns and write the result."""
    args = parse_args(argv)

    # Imported here rather than at module level so the pure helpers above can
    # be imported without augur / Biopython being installed.
    from augur.io.sequences import read_sequences, write_sequences
    from Bio.Seq import MutableSeq

    # The whole alignment is held in memory: the support of every column must
    # be known before any sequence can be masked.
    alignment = list(read_sequences(args.alignment))
    try:
        counts = count_base_support([sample.seq for sample in alignment])
    except ValueError as err:
        sys.exit(f"ERROR: {args.alignment}: {err}")
    mask_sites = find_mask_sites(counts, len(alignment), args.percentage)

    print("Masking sites (zero-based):", mask_sites)
    print("Total number of sites to mask:", len(mask_sites))

    for sample in alignment:
        masked = MutableSeq(sample.seq)
        for idx in mask_sites:
            masked[idx] = 'N'
        sample.seq = masked

    write_sequences(alignment, args.output)


if __name__ == "__main__":
    main()