diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index d1c4fc2..2ff3dfc 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -64,7 +64,8 @@ rule join_segments: # allow snakemake to choose the correct rule to run. Note that `wildcards.segment="genome"` # here, and for that we need alignments for 8 individual segments, which we refer to as `wildcards.genome_seg` input: - alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS) + alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS), + metadata = metadata_by_wildcards, output: alignment = "results/{subtype}/{segment}/{time}/aligned.fasta", node_data = "results/{subtype}/{segment}/{time}/aligned.json", @@ -77,7 +78,8 @@ rule join_segments: python scripts/join-segments.py \ --segments {input.alignment} \ --output {output.alignment} \ - --output-node-data {output.node_data} + --output-node-data {output.node_data} \ + --force-include $( cat {input.metadata} | csvtk filter2 -t -f '$host=="Human"' | csvtk cut -t -f strain | tail -n +2 | tr "\n" " " ) """ rule genome_metadata: diff --git a/scripts/join-segments.py b/scripts/join-segments.py index 2f072d9..9c54c27 100644 --- a/scripts/join-segments.py +++ b/scripts/join-segments.py @@ -8,6 +8,8 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--segments', type = str, required = True, nargs='+', help = "per-segment alignments") + parser.add_argument('--force-include', type=str, nargs="*", default=[], required=False, + help="Force include these strains regardless of how many segments have been sequenced") parser.add_argument('--output', type = str, required = True, help = "output whole genome alignment") parser.add_argument('--output-node-data', type = str, required = False, help = "output metadata in node-data JSON format") args = parser.parse_args() @@ -51,8 +53,11 @@ def atgc_perc(seq): print("writing genome to ", 
args.output) for name,count in strain_counts.items(): if count<7: - print(f"Excluding {name} as it only appears in {count} segments") - continue + if name in args.force_include: + print(f"Force including {name} which would otherwise be dropped as it only appears in {count} segments") + else: + print(f"Excluding {name} as it only appears in {count} segments") + continue genome = "".join([sequence(seg, name) for seg in args.segments]) node_data['nodes'][name] = { "ATGC_perc": int( len([nt for nt in genome if nt in atgc])/len(genome) * 100),