From b75b1462c9c52774019884e0989e6fa2ad8eb82a Mon Sep 17 00:00:00 2001
From: riasc
Date: Thu, 4 Jul 2024 20:14:40 -0500
Subject: [PATCH] distributed workload for hlatyping

---
 CHANGELOG.md                                   |   9 +
 workflow/rules/altsplicing.smk                 |   2 +-
 workflow/rules/common.smk                      | 156 ++++---------
 workflow/rules/hlatyping.smk                   | 210 +++++++-----------
 .../genotyping/combine_optitype_results.py     |   6 +-
 .../scripts/genotyping/optitype_wrapper.py     |   7 +-
 6 files changed, 148 insertions(+), 242 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ee574b..a4f93e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Prioritization of neoantigens is now done separately for each variant type (speeds up the process)
 - NMD information (e.g., escape rule,...) is now also calculated for all variants
 
+## [0.2.9] - 2024-07-04
+
+### Fix
+
+- Split the HLA typing rules to distribute the workload better
+- Changed the order of the HLA typing rules (BAM files are now handled as single-end)
+  - samtools fastq is only called for BAM files
+  - filtering now takes its input directly from the preprocessed/raw reads
+
 ## [0.2.8] - 2024-06-26
 
 ### Fix
diff --git a/workflow/rules/altsplicing.smk b/workflow/rules/altsplicing.smk
index eb13eeb..34fc688 100644
--- a/workflow/rules/altsplicing.smk
+++ b/workflow/rules/altsplicing.smk
@@ -12,7 +12,7 @@ rule spladder:
         confidence=f"""{config["altsplicing"]["confidence"]}""",
         iteration=f"""{config["altsplicing"]["iterations"]}""",
         edgelimit=f"""{config["altsplicing"]["edgelimit"]}"""
-    threads: config['threads']
+    threads: 20
     shell:
         """
         bash workflow/scripts/run_spladder.sh \
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 1b36cf6..4d82a50 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -161,123 +161,53 @@ def get_preproc_input(wildcards):
 
 ########### HLA GENOTYPING ##########
-def get_input_hlatyping_SE(wildcards):
-    # special case: filetype is BAM (single-end) - just return (raw) BAM file
+def get_input_reads_hlatyping_BAM(wildcards):
     seqtype = "dnaseq" if wildcards.nartype == "DNA" else "rnaseq"
-    if config["data"][f"{seqtype}_filetype"] == ".bam":
-        return config["data"][seqtype][wildcards.group]
+    return config["data"][seqtype][wildcards.group]
 
-    if config["preproc"]["activate"]:
-        return expand("results/{sample}/{seqtype}/reads/{group}_preproc.fq.gz",
-                sample = wildcards.sample,
-                seqtype = "dnaseq" if wildcards.nartype == "DNA" else "rnaseq",
-                group = wildcards.group)
+def get_input_filtering_hlatyping_SE(wildcards):
+    seqtype = "dnaseq" if wildcards.nartype == "DNA" else "rnaseq"
+    if config["data"][f"{seqtype}_filetype"] == ".bam":
+        return expand("results/{sample}/{seqtype}/reads/{group}_flt_BAM.fq",
+                      sample=wildcards.sample,
+                      seqtype=seqtype,
+                      group=wildcards.group)
     else:
-        return config["data"][f"{seqtype}"][wildcards.group]
+        if config["preproc"]["activate"]:
+            return expand("results/{sample}/{seqtype}/reads/{group}_preproc.fq.gz",
+                          sample=wildcards.sample,
+                          seqtype=seqtype,
+                          group=wildcards.group)
+        else:
+            return config["data"][seqtype][wildcards.group]
 
-def get_input_hlatyping_PE(wildcards):
+def get_input_filtering_hlatyping_PE(wildcards):
     if config["preproc"]["activate"]:
-        return dict(
-            zip(
-                ["fwd", "rev"],
-                expand("results/{sample}/{seqtype}/reads/{group}_{pair}_preproc.fq.gz",
-                    sample=wildcards.sample,
-                    seqtype = "dnaseq" if wildcards.nartype == "DNA" else "rnaseq",
-                    group=wildcards.group,
-                    pair=["R1","R2"])
-            )
-        )
+        
return expand("results/{sample}/{seqtype}/reads/{group}_{readpair}_preproc.fq.gz", + sample=wildcards.sample, + seqtype="dnaseq" if wildcards.nartype == "DNA" else "rnaseq", + group=wildcards.group, + nartype=wildcards.nartype, + readpair=wildcards.readpair) else: - return dict( - zip( - ["fwd", "rev"], - config["data"][f"{wildcards.seqtype}"][wildcards.group] - ) - ) - - -def get_filtered_reads_hlatyping_SE(wildcards): - bam = [] - idx = [] - - if wildcards.nartype == "DNA": - if len(config["data"]["dnaseq"]) != 0: - for key in config["data"]["dnaseq"].keys(): - bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam", - sample=wildcards.sample, - group=key) - idx += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam.bai", - sample=wildcards.sample, - group=key) - - - if wildcards.nartype == "RNA": - if len(config["data"]["rnaseq"]) != 0: - for key in config["data"]["rnaseq"].keys(): - bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam", - sample=wildcards.sample, - group=key) - idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam.bai", - sample=wildcards.sample, - group=key) - - return dict( - zip( - ["bam", "idx"], - [bam, idx] - ) - ) - - -def get_filtered_reads_hlatyping_PE(wildcards): - bam = [] - idx = [] - - if wildcards.nartype == "DNA": - if len(config["data"]["dnaseq"]) != 0: - for key in config["data"]["dnaseq"].keys(): - bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam", - sample=wildcards.sample, - group=key, - readpair=wildcards.readpair) - idx += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam.bai", - sample=wildcards.sample, - group=key, - readpair=wildcards.readpair) - - if wildcards.nartype == "RNA": - if len(config["data"]["rnaseq"]) != 0: - for key in config["data"]["rnaseq"].keys(): - bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam", - sample=wildcards.sample, - group=key, - readpair=wildcards.readpair) - - idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam.bai", - sample=wildcards.sample, - group=key, - readpair=wildcards.readpair) - - return dict( - zip( - ["bam", "idx"], - [bam, idx] - ) - ) + seqtype = "dnaseq" if wildcards.nartype == "DNA" else "rnaseq" + return config["data"][f"{wildcards.seqtype}"][wildcards.group] def aggregate_mhcI_SE(wildcards): checkpoint_output = checkpoints.split_reads_mhcI_SE.get(**wildcards).output[0] - return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_result.tsv", + return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_result.tsv", sample=wildcards.sample, + group=wildcards.group, nartype=wildcards.nartype, no=glob_wildcards(os.path.join(checkpoint_output, "R_{no}.bam")).no) def aggregate_mhcI_PE(wildcards): checkpoint_output = checkpoints.split_reads_mhcI_PE.get(**wildcards).output[0] - return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_result.tsv", + return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_result.tsv", sample=wildcards.sample, + group=wildcards.group, nartype=wildcards.nartype, no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no) @@ -287,21 +217,29 @@ def get_all_mhcI_alleles(wildcards): if "DNA" in config["hlatyping"]["MHC-I_mode"]: if len(config["data"]["dnaseq"]) != 0: - if config["data"]["dnaseq_readtype"] == "SE": - values += 
expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_SE.tsv", - sample=wildcards.sample) + if config["data"]["dnaseq_readtype"] == "SE" or config["data"]["dnaseq_filetype"] == ".bam": + for key in config["data"]["dnaseq"].keys(): + values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_DNA_flt_SE.tsv", + sample=wildcards.sample, + group=key) elif config["data"]["dnaseq_readtype"] == "PE": - values += expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_PE.tsv", - sample=wildcards.sample) + for key in config["data"]["dnaseq"].keys(): + values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_DNA_flt_PE.tsv", + sample=wildcards.sample, + group=key) if "RNA" in config["hlatyping"]["MHC-I_mode"]: if len(config["data"]["rnaseq"]) != 0: - if config["data"]["rnaseq_readtype"] == "SE": - values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_SE.tsv", - sample=wildcards.sample) + if config["data"]["rnaseq_readtype"] == "SE" or config["data"]["rnaseq_filetype"] == ".bam": + for key in config["data"]["rnaseq"].keys(): + values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_RNA_flt_SE.tsv", + sample=wildcards.sample, + group=key) elif config["data"]["rnaseq_readtype"] == "PE": - values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_PE.tsv", - sample=wildcards.sample) + for key in config["data"]["rnaseq"].keys(): + values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_RNA_flt_PE.tsv", + sample=wildcards.sample, + group=key) if "custom" in config["hlatyping"]["MHC-I_mode"]: values += [config["data"]["custom"]["hlatyping"]["MHC-I"]] diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk index 9f4717f..edbdb11 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -1,21 +1,21 @@ ####### CLASS I HLA GENOTYPING ########### -rule get_hla_panel: +rule get_mhcI_hla_panel: output: dna="resources/hla/hla_ref_DNA.fa", rna="resources/hla/hla_ref_RNA.fa" message: - "Downloading HLA reference panels" + "Downloading MHC-I HLA reference panels" conda: "../envs/basic.yml" log: - "logs/hla_panel.log" + "logs/hlatyping/get_mhcI_hla_panel.log" shell: """ curl -o {output.dna} https://raw.githubusercontent.com/FRED-2/OptiType/v1.3.5/data/hla_reference_dna.fasta curl -o {output.rna} https://raw.githubusercontent.com/FRED-2/OptiType/v1.3.5/data/hla_reference_rna.fasta """ -rule index_hla_panel: +rule index_mhcI_hla_panel: input: fa="resources/hla/hla_ref_{nartype}.fa", output: @@ -24,8 +24,10 @@ rule index_hla_panel: ".lf.pst", ".rid.concat", ".rid.limits", ".sa.ind", ".sa.len", ".sa.val", ".txt.concat", ".txt.limits", ".txt.size"), + message: + "Index MHC-I HLA reference panel ({wildcards.nartype})" log: - "logs/yara_indexer_{nartype}.log" + "logs/hlatyping/index_mhcI_hla_panel_{nartype}.log" conda: "../envs/yara.yml" shell: @@ -33,125 +35,91 @@ rule index_hla_panel: mkdir -p resources/hla/yara_index/ yara_indexer \ -o resources/hla/yara_index/{wildcards.nartype} \ - {input.fa} > {log} + {input.fa} > {log} 2>&1 """ -######### get input reads ######## -rule get_reads_hlatyping_SE: +######### single-end reads ######### +rule get_reads_hlatyping_BAM: input: - reads=get_input_hlatyping_SE, + reads=get_input_reads_hlatyping_BAM, tmp="tmp/" output: - "results/{sample}/hla/reads/{group}_{nartype}_SE.fq" + "results/{sample}/hla/reads/{group}_{nartype}_BAM.fq" message: "Retrieve reads for HLA genotyping by converting the alignments ({wildcards.nartype}) of group:{wildcards.group} from 
sample:{wildcards.sample} to reads in FASTQ format" log: - "logs/{sample}/hla/get_hla_input_reads_{group}_{nartype}.log" + "logs/{sample}/hla/get_read_hlatyping_BAM_{group}_{nartype}.log" conda: "../envs/samtools.yml" - shell: - """ - samtools sort -n {input.reads} -T tmp/ | samtools fastq > {output} - """ - -rule get_reads_hlatyping_PE: - input: - unpack(get_input_hlatyping_PE), - tmp="tmp/" - output: - fwd="results/{sample}/hla/reads/{group}_{nartype}_PE_R1.fq", - rev="results/{sample}/hla/reads/{group}_{nartype}_PE_R2.fq" - message: - "Retrieve paired-end reads ({wildcards.nartype}) for HLA genotyping - sample:{wildcards.sample} group:{wildcards.group}" - log: - "logs/{sample}/hla/get_reads_hlatyping_PE_{group}_{nartype}.log" threads: 4 - resources: - mem_mb=20000 - conda: - "../envs/samtools.yml" shell: """ - samtools sort -@4 -m4g -n {input.fwd} -T tmp/ | samtools fastq -@4 > {output.fwd} - samtools sort -@4 -m4g -n {input.rev} -T tmp/ | samtools fastq -@4 > {output.rev} + samtools sort -@ {threads} -n {input.reads} -T tmp/ \ + | samtools fastq -@ {threads} > {output} """ -######### single-end reads ######### rule filter_reads_mhcI_SE: input: - reads="results/{sample}/hla/reads/{group}_{nartype}_SE.fq", - panel=multiext("resources/hla/yara_index/{nartype}", - ".lf.drp", ".lf.drs", ".lf.drv", + reads=get_input_filtering_hlatyping_SE, + panel=multiext("resources/hla/yara_index/{nartype}", + ".lf.drp", ".lf.drs", ".lf.drv", ".lf.pst", ".rid.concat", ".rid.limits", ".sa.ind", ".sa.len", ".sa.val", ".txt.concat", ".txt.limits", ".txt.size"), output: - "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam" + reads="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam", message: "Filter the {wildcards.nartype} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" log: - "logs/{sample}/{group}_hla_reads_filtering_{nartype}" - threads: - config["threads"] + "logs/{sample}/hlatyping/filter_reads_mhcI_SE_{group}_{nartype}.log" conda: "../envs/yara.yml" + threads: config['threads'] shell: """ yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.nartype} \ - {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} + {input.reads} | samtools view -h -F 4 -b1 - -o {output.reads} > {log} 2>&1 """ -rule index_reads_mhcI_SE: - input: - "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam" - output: - "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam.bai" - log: - "logs/{sample}/{group}_hla_reads_filtering_{nartype}_index" - params: - extra="", # optional params string - threads: 4 # This value - 1 will be sent to -@ - wrapper: - "v2.3.0/bio/samtools/index" - -rule merge_reads_mhcI_SE: +rule sort_and_index_reads_mhcI_SE: input: - unpack(get_filtered_reads_hlatyping_SE), + "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam", output: - bam="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam", - idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam.bai" + bam="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE_sorted.bam", + idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE_sorted.bam.bai" message: - "Merge the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample}" + "Sort the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample}" + threads: 4 + resources: + mem_mb=20000 log: - "logs/{sample}/hla/merge_reads_mhcI_{nartype}_SE.log" + 
"logs/{sample}/hlatyping/sort_and_index_reads_mhcI_PE_{group}_{nartype}.log" conda: "../envs/samtools.yml" - threads: 4 shell: """ - samtools merge -@ {threads} {output.bam} {input.bam} > {log} 2>&1 + samtools sort -@ {threads} -m4g {input} -o {output.bam} > {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 """ - checkpoint split_reads_mhcI_SE: input: - reads="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam", - index="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam.bai" + fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE_sorted.bam", + fwd_idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE_sorted.bam.bai", output: - directory("results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE/") + directory("results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE/") message: - "Splitting filtered BAM files for HLA typing" + "Splitting filtered group: {wildcards.group} BAM files ({wildcards.nartype}seq reads) for HLA typing" log: - "logs/{sample}/hla/split_bam_{nartype}.log" + "logs/{sample}/hlatyping/split_bam_mhcI_SE_{group}_{nartype}.log" conda: "../envs/gatk.yml" threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.nartype}_flt_merged_SE/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.nartype}_flt_SE/ gatk SplitSamByNumberOfReads \ - -I {input.reads} \ + -I {input.fwd} \ --OUTPUT {output} \ --OUT_PREFIX R \ --SPLIT_TO_N_READS 100000 @@ -159,44 +127,47 @@ checkpoint split_reads_mhcI_SE: rule hlatyping_mhcI_SE: input: - reads="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE/R_{no}.bam", + fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE/R_{no}.bam", + rev="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE/R_{no}.bam", output: - pdf="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: - "logs/{sample}/optitype/merged_{nartype}_{no}_call.log" + "logs/{sample}/hlatyping/hlatyping_mhcI_SE_{group}_{nartype}_{no}.log" conda: "../envs/optitype.yml" threads: 64 shell: """ python3 workflow/scripts/genotyping/optitype_wrapper.py \ - '{input.reads}' {wildcards.nartype} {wildcards.no} \ - results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.nartype}_flt_merged_SE/ > {log} + '{input.fwd} {input.rev}' {wildcards.nartype} {wildcards.no} \ + results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_SE/ > {log} 2>&1 """ -rule combine_mhcI_SE: +rule combine_hlatyping_mhcI_SE: input: - aggregate_mhcI_SE + aggregate_mhcI_SE, output: - "results/{sample}/hla/mhc-I/genotyping/{nartype}_SE.tsv" + "results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE.tsv", + message: + "Combining HLA alleles from predicted optitype results from {wildcards.nartype}seq reads in group: {wildcards.group}" log: - "logs/{sample}/genotyping/combine_optitype_mhcI_{nartype}_call.log" + "logs/{sample}/hlatyping/combine_optitype_mhcI_PE_{group}_{nartype}.log" conda: "../envs/basic.yml" threads: 1 shell: """ python3 workflow/scripts/genotyping/combine_optitype_results.py \ - '{input}' {output} + '{input}' {wildcards.group} {output} """ ############# paired-end reads ########### rule 
filter_reads_mhcI_PE: input: - reads="results/{sample}/hla/reads/{group}_{nartype}_PE_{readpair}.fq", + reads=get_input_filtering_hlatyping_PE, panel=multiext("resources/hla/yara_index/{nartype}", ".lf.drp", ".lf.drs", ".lf.drv", ".lf.pst", ".rid.concat", ".rid.limits", @@ -207,66 +178,55 @@ rule filter_reads_mhcI_PE: message: "Filter the {wildcards.nartype} reads of group:{wildcards.group} of sample:{wildcards.sample} against the HLA panel" log: - "logs/{sample}/{group}_hla_reads_filtering_{nartype}_{readpair}.log" + "logs/{sample}/hlatyping/filter_reads_mhcI_PE_{group}_{nartype}_{readpair}.log" conda: "../envs/yara.yml" threads: config['threads'] shell: """ yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.nartype} \ - {input.reads} | samtools view -h -F 4 -b1 - | samtools sort -O bam - -o {output.reads} > {log} + {input.reads} | samtools view -h -F 4 -b1 - -o {output.reads} > {log} 2>&1 """ -rule index_reads_mhcI_PE: +rule sort_and_index_reads_mhcI_PE: input: - "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_{readpair}.bam" + "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_{readpair}.bam", output: - "results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_{readpair}.bam.bai" - log: - "logs/{sample}/{group}_hla_reads_filtering_{nartype}_index_{readpair}.log" - params: - extra="", # optional params string - threads: 4 # This value - 1 will be sent to -@ - wrapper: - "v2.3.0/bio/samtools/index" - -rule merge_reads_mhcI_PE: - input: - unpack(get_filtered_reads_hlatyping_PE), - output: - bam="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_{readpair}.bam", - idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_{readpair}.bam.bai" + bam="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_{readpair}_sorted.bam", + idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_{readpair}_sorted.bam.bai" message: - "Merge the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample} with readpair: {wildcards.readpair}" + "Sort the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample} with readpair: {wildcards.readpair}" + threads: 4 + resources: + mem_mb=20000 log: - "logs/{sample}/hla/merge_reads_mhcI_{nartype}_PE_{readpair}.log", + "logs/{sample}/hlatyping/sort_and_index_reads_mhcI_PE_{group}_{nartype}_{readpair}.log" conda: "../envs/samtools.yml" - threads: 4 shell: """ - samtools merge -@ {threads} {output.bam} {input.bam} > {log} 2>&1 + samtools sort -@ {threads} -m4g {input} -o {output.bam} > {log} 2>&1 samtools index {output.bam} >> {log} 2>&1 """ checkpoint split_reads_mhcI_PE: input: - fwd="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R1.bam", - fwd_idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R1.bam.bai", - rev="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R2.bam", - rev_idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R2.bam.bai" + fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R1_sorted.bam", + fwd_idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R1_sorted.bam.bai", + rev="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R2_sorted.bam", + rev_idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R2_sorted.bam.bai" output: - directory("results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/") + directory("results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/") message: - "Splitting filtered BAM files for HLA typing" + "Splitting filtered group: 
{wildcards.group} BAM files ({wildcards.nartype}seq reads) for HLA typing" log: - "logs/{sample}/hla/split_bam_{nartype}.log" + "logs/{sample}/hlatyping/split_bam_mhcI_PE_{group}_{nartype}.log" conda: "../envs/gatk.yml" threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.nartype}_flt_merged_PE/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.nartype}_flt_PE/ gatk SplitSamByNumberOfReads \ -I {input.fwd} \ --OUTPUT {output} \ @@ -282,15 +242,15 @@ checkpoint split_reads_mhcI_PE: rule hlatyping_mhcI_PE: input: - fwd="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/R1_{no}.bam", - rev="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/R2_{no}.bam", + fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/R1_{no}.bam", + rev="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/R2_{no}.bam", output: - pdf="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: - "logs/{sample}/optitype/merged_{nartype}_{no}_call.log" + "logs/{sample}/hlatyping/hlatyping_mhcI_PE_{group}_{nartype}_{no}.log" conda: "../envs/optitype.yml" threads: 64 @@ -298,26 +258,25 @@ rule hlatyping_mhcI_PE: """ python3 workflow/scripts/genotyping/optitype_wrapper.py \ '{input.fwd} {input.rev}' {wildcards.nartype} {wildcards.no} \ - results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.nartype}_flt_merged_PE/ > {log} - + results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_PE/ > {log} 2>&1 """ rule combine_hlatyping_mhcI_PE: input: aggregate_mhcI_PE, output: - "results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE.tsv", + "results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE.tsv", message: - "Combining HLA alleles from predicted optitype results" + "Combining HLA alleles from predicted optitype results from {wildcards.nartype}seq reads in group: {wildcards.group}" log: - "logs/{sample}/genotyping/combine_optitype_mhcI_{nartype}_PE.log" + "logs/{sample}/hlatyping/combine_optitype_mhcI_PE_{group}_{nartype}.log" conda: "../envs/basic.yml" threads: 1 shell: """ python3 workflow/scripts/genotyping/combine_optitype_results.py \ - '{input}' {output} + '{input}' {wildcards.group} {output} """ rule combine_all_mhcI_alleles: @@ -338,7 +297,6 @@ rule combine_all_mhcI_alleles: '{input}' mhc-I {output} """ - ######### MHC-II HLA GENOTYPING ########### rule filter_reads_mhcII_SE: input: diff --git a/workflow/scripts/genotyping/combine_optitype_results.py b/workflow/scripts/genotyping/combine_optitype_results.py index 12617dc..3b18bef 100644 --- a/workflow/scripts/genotyping/combine_optitype_results.py +++ b/workflow/scripts/genotyping/combine_optitype_results.py @@ -1,7 +1,7 @@ import sys """ - python3 merge_alleles.py '' + python3 merge_alleles.py '' """ def main(): @@ -24,9 +24,9 @@ def main(): # remove duplicates alleles = list(set(alleles)) - output = open(sys.argv[2], 'w') + output = open(sys.argv[3], 'w') for i in alleles: - output.write(i + '\n') + output.write(sys.argv[2] + "\t" + i + '\n') output.close() def load_alleles(alleles_file): diff --git a/workflow/scripts/genotyping/optitype_wrapper.py 
b/workflow/scripts/genotyping/optitype_wrapper.py index b26cf85..0bd6034 100644 --- a/workflow/scripts/genotyping/optitype_wrapper.py +++ b/workflow/scripts/genotyping/optitype_wrapper.py @@ -10,7 +10,6 @@ def main(): inbams = sys.argv[1] nartype = "dna" if sys.argv[2] == "DNA" else "rna" - nartype = sys.argv[2] prefix = sys.argv[3] outpath = sys.argv[4] @@ -32,8 +31,10 @@ def main(): tsv = subprocess.Popen("touch " + outpath + prefix + "_result.tsv", shell=True) else: # call optitype - optitype = subprocess.Popen("OptiTypePipeline.py --input " + inbams + - " --prefix " + prefix + " --" + nartype + "-v", shell=True) + optitype_call = "OptiTypePipeline.py --input " + inbams + " --outdir " + outpath + " --prefix " + prefix + " --" + nartype + " -v" + print(optitype_call) + os.system(optitype_call) +# optitype = subprocess.Popen(optitype_call, shell=True) except subprocess.CalledProcessError as e:
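
Note (editorial sketch, not part of the patch): after this change, combine_optitype_results.py is called with three arguments instead of two,

    python3 combine_optitype_results.py '<result_1.tsv> <result_2.tsv> ...' <group> <output.tsv>

and writes one "<group>\t<allele>" line per unique allele, so the per-group tables can later be merged by combine_all_mhcI_alleles. A minimal outline of the updated main() is sketched below; load_alleles() already exists in the script (its body is not shown in the hunk), and the stub here only stands in for it and assumes the standard OptiType result columns A1..C2.

import sys

def load_alleles(path):
    # placeholder for the script's existing helper: parse one OptiType result
    # TSV and return the predicted alleles (assumed columns A1, A2, B1, B2, C1, C2)
    with open(path) as fh:
        header = fh.readline().rstrip("\n").split("\t")
        row = fh.readline().rstrip("\n").split("\t")
    wanted = {"A1", "A2", "B1", "B2", "C1", "C2"}
    return [value for key, value in zip(header, row) if key in wanted and value]

def main():
    result_files = sys.argv[1].split()   # quoted, space-separated OptiType result TSVs
    group = sys.argv[2]                  # sample group (new second argument)
    out_path = sys.argv[3]               # combined per-group allele table

    alleles = []
    for tsv in result_files:
        alleles += load_alleles(tsv)

    # remove duplicates and tag every allele with its group, one line each
    with open(out_path, "w") as out:
        for allele in set(alleles):
            out.write(group + "\t" + allele + "\n")

if __name__ == "__main__":
    main()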