From f9f1da503855a2b2f48421b2c11ee9d493ee263d Mon Sep 17 00:00:00 2001 From: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com> Date: Wed, 7 Feb 2024 17:42:06 +0000 Subject: [PATCH 1/2] fix: chip input issue (#136) * Fix slurm preset (#118) * require snakemake<8 * fix escape character and whitespace errors * Delete setup.cfg file * fix: split peak call rules * tests: added all peak call methods to atac test * Update output file paths in alignment_post_processing.smk * Update file paths in hub.smk * fix: updated wildcards for bigBed files * Refactor test_seqnado_config_creation function and add missing options to config_atac.yml * Update config file for chip sequencing * fix: seqnado-design * chore: removed commented code * Fix file path in lanceotron_no_input rule * Fix metadata and experiment creation in DesignIP class * Fix symlink_files function to handle both paired and single-end assays * Add log and wrapper for fastqc_raw_single rule * update config if ucsc is null * Fix config (#119) * remove split fastq from config and all rules * clean up config and fix spelling of indices * remove test config files * update default heatmap options * return to config but with small changes * refactor config.py * fix typo in config.py * use indices consistently for genome indices * update config process in docs * add entrypoint and chmod profile (#122) * Feature add config rerun (#126) * add option to rerun config * update config docs with rerun * move singularity fix to faq in docs (#127) * feat(pipeline): handle failed peak calls (#131) * fix: add validate peaks rule * Add get_peak_files function to retrieve peak files based on assay type * fix(pipeline): inputs not used for peak call (#132) * Develop (#128) * Fix slurm preset (#118) * require snakemake<8 * fix escape character and whitespace errors * Delete setup.cfg file * fix: split peak call rules * tests: added all peak call methods to atac test * Update output file paths in alignment_post_processing.smk 
* Update file paths in hub.smk * fix: updated wildcards for bigBed files * Refactor test_seqnado_config_creation function and add missing options to config_atac.yml * Update config file for chip sequencing * fix: seqnado-design * chore: removed commented code * Fix file path in lanceotron_no_input rule * Fix metadata and experiment creation in DesignIP class * Fix symlink_files function to handle both paired and single-end assays * Add log and wrapper for fastqc_raw_single rule * update config if ucsc is null * Fix config (#119) * remove split fastq from config and all rules * clean up config and fix spelling of indices * remove test config files * update default heatmap options * return to config but with small changes * refactor config.py * fix typo in config.py * use indices consistently for genome indices * update config process in docs * add entrypoint and chmod profile (#122) * Feature add config rerun (#126) * add option to rerun config * update config docs with rerun * move singularity fix to faq in docs (#127) --------- Co-authored-by: alsmith * fix: "f" missing for all get_control_X functions. Whoops... 
* fix: multiple errors with get_control_X files * fix: removed touch sentinel * fix: && removed at end of lines * fix: make blank file if peak calls fail * fix: validate peaks wrong param supplied --------- Co-authored-by: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com> * Fix: chip input issue (#135) * fix symlinking issue for input fastq files --------- Co-authored-by: alsmith Co-authored-by: Alastair Smith <49727900+alsmith151@users.noreply.github.com> --- seqnado/utils.py | 72 ++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 49 deletions(-) diff --git a/seqnado/utils.py b/seqnado/utils.py index 88acabee..5ae4e194 100644 --- a/seqnado/utils.py +++ b/seqnado/utils.py @@ -683,71 +683,45 @@ def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs): return cls(assays=experiments, **kwargs) - -def symlink_files_paired( - output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str -): - r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}_1.fastq.gz") - r2_path_new = pathlib.Path(f"{output_dir}/{assay_name}_2.fastq.gz") - - if not r1_path_new.exists(): - try: - r1_path_new.symlink_to(assay.r1.path.resolve()) - except FileExistsError: - logger.warning(f"Symlink for {r1_path_new} already exists.") - - if assay.r2 and not r2_path_new.exists(): - try: - r2_path_new.symlink_to(assay.r2.path.resolve()) - except FileExistsError: - logger.warning(f"Symlink for {r2_path_new} already exists.") - - -def symlink_files_single( - output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str -): - r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}.fastq.gz") - - if not r1_path_new.exists(): +def symlink_file(output_dir: pathlib.Path, source_path: pathlib.Path, new_file_name: str): + """ + Create a symlink in the output directory with the new file name. 
+ """ + new_path = output_dir / new_file_name + if not new_path.exists(): try: - r1_path_new.symlink_to(assay.r1.path.resolve()) + new_path.symlink_to(source_path.resolve()) except FileExistsError: - logger.warning(f"Symlink for {r1_path_new} already exists.") - + logger.warning(f"Symlink for {new_path} already exists.") -def symlink_fastq_files( - design: Union[Design, DesignIP], output_dir: str = "seqnado_output/fastqs/" -) -> None: +def symlink_fastq_files(design: Union[Design, DesignIP], output_dir: str = "seqnado_output/fastqs/") -> None: """ Symlink the fastq files to the output directory. """ output_dir = pathlib.Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - + if isinstance(design, Design): for assay_name, assay in design.assays.items(): + symlink_file(output_dir, assay.r1.path, f"{assay_name}_1.fastq.gz") if assay.is_paired: - symlink_files_paired(output_dir, assay, assay_name) - else: - symlink_files_single(output_dir, assay, assay_name) + symlink_file(output_dir, assay.r2.path, f"{assay_name}_2.fastq.gz") elif isinstance(design, DesignIP): for experiment_name, experiment in design.assays.items(): - assay = experiment.ip_files - assay_name = assay.name - - if assay.is_paired: - symlink_files_paired(output_dir, assay, assay_name) - else: - symlink_files_single(output_dir, assay, assay_name) + # IP files + ip_assay = experiment.ip_files + symlink_file(output_dir, ip_assay.r1.path, f"{ip_assay.name}_1.fastq.gz") + if ip_assay.is_paired: + symlink_file(output_dir, ip_assay.r2.path, f"{ip_assay.name}_2.fastq.gz") if experiment.control_files: - assay = experiment.control_files - assay_name = assay.name - if assay.is_paired: - symlink_files_paired(output_dir, assay, assay_name) - else: - symlink_files_single(output_dir, assay, assay_name) + control_assay = experiment.control_files + control_r1_name = control_assay.r1.path.name + symlink_file(output_dir, control_assay.r1.path, control_r1_name) + if control_assay.is_paired: + control_r2_name = 
control_assay.r2.path.name + symlink_file(output_dir, control_assay.r2.path, control_r2_name) def define_output_files( From 07f84208d5e96785af86f0b4892c2b2a7f890659 Mon Sep 17 00:00:00 2001 From: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:18:27 +0000 Subject: [PATCH 2/2] bug fixes for conda release (#139) --- seqnado/config.py | 211 +++++++++++++++------- seqnado/workflow/rules/peak_call_chip.smk | 21 ++- seqnado/workflow/scripts/split_bam.py | 109 ----------- seqnado/workflow/scripts/split_bam2.py | 88 --------- 4 files changed, 155 insertions(+), 274 deletions(-) delete mode 100644 seqnado/workflow/scripts/split_bam.py delete mode 100644 seqnado/workflow/scripts/split_bam2.py diff --git a/seqnado/config.py b/seqnado/config.py index a36d0d6a..0093d11b 100644 --- a/seqnado/config.py +++ b/seqnado/config.py @@ -6,12 +6,16 @@ package_dir = os.path.dirname(os.path.abspath(__file__)) template_dir = os.path.join(package_dir, "workflow/config") + # Helper Functions def get_user_input(prompt, default=None, is_boolean=False, choices=None): while True: - user_input = input(f"{prompt} [{'/'.join(choices) if choices else default}]: ") or default + user_input = ( + input(f"{prompt} [{'/'.join(choices) if choices else default}]: ") + or default + ) if is_boolean: - return user_input.lower() == 'yes' + return user_input.lower() == "yes" if choices and user_input not in choices: print(f"Invalid choice. 
Please choose from {', '.join(choices)}.") continue @@ -19,20 +23,23 @@ def get_user_input(prompt, default=None, is_boolean=False, choices=None): def setup_configuration(assay, genome, template_data): - username = os.getenv('USER', 'unknown_user') - today = datetime.datetime.now().strftime('%Y-%m-%d') - project_name = get_user_input("What is your project name?", default=f"{username}_project") + username = os.getenv("USER", "unknown_user") + today = datetime.datetime.now().strftime("%Y-%m-%d") + project_name = get_user_input( + "What is your project name?", default=f"{username}_project" + ) + project_name = project_name.replace(" ", "_") common_config = { - 'username': username, - 'project_date': today, - 'project_name': project_name, - 'genome': genome + "username": username, + "project_date": today, + "project_name": project_name, + "genome": genome, } - + template_data.update(common_config) - with open(os.path.join(template_dir, 'preset_genomes.json'), 'r') as f: + with open(os.path.join(template_dir, "preset_genomes.json"), "r") as f: genome_values = json.load(f) genome_dict = {} @@ -41,72 +48,135 @@ def setup_configuration(assay, genome, template_data): genome = get_user_input("What is your genome name?", default="other") genome_dict = { genome: { - "indices": get_user_input("Path to Bowtie2 genome indices:") if assay in ["chip", "atac"] else get_user_input("Path to STAR v2.7.10b genome indices:"), + "indices": ( + get_user_input("Path to Bowtie2 genome indices:") + if assay in ["chip", "atac"] + else get_user_input("Path to STAR v2.7.10b genome indices:") + ), "chromosome_sizes": get_user_input("Path to chromosome sizes file:"), "gtf": get_user_input("Path to GTF file:"), - "blacklist": get_user_input("Path to blacklist bed file:") + "blacklist": get_user_input("Path to blacklist bed file:"), } } else: if genome in genome_values: genome_dict[genome] = { - "indices": genome_values[genome].get('bt2_indices' if assay in ["chip", "atac"] else 'star_indices', ''), 
- "chromosome_sizes": genome_values[genome].get('chromosome_sizes', ''), - "gtf": genome_values[genome].get('gtf', ''), - "blacklist": genome_values[genome].get('blacklist', '') + "indices": genome_values[genome].get( + "bt2_indices" if assay in ["chip", "atac"] else "star_indices", "" + ), + "chromosome_sizes": genome_values[genome].get("chromosome_sizes", ""), + "gtf": genome_values[genome].get("gtf", ""), + "blacklist": genome_values[genome].get("blacklist", ""), } - genome_config = { - 'genome': genome, - 'indices': genome_dict[genome]['indices'], - 'chromosome_sizes': genome_dict[genome]['chromosome_sizes'], - 'gtf': genome_dict[genome]['gtf'], + "genome": genome, + "indices": genome_dict[genome]["indices"], + "chromosome_sizes": genome_dict[genome]["chromosome_sizes"], + "gtf": genome_dict[genome]["gtf"], } template_data.update(genome_config) - - template_data['remove_blacklist'] = get_user_input("Do you want to remove blacklist regions? (yes/no)", default="yes", is_boolean=True) - if template_data['remove_blacklist']: - template_data['blacklist'] = genome_dict[genome]['blacklist'] - - template_data['remove_pcr_duplicates'] = get_user_input("Remove PCR duplicates? (yes/no)", default= "yes" if assay in ["chip", "atac"] else "no", is_boolean=True) - if template_data['remove_pcr_duplicates']: - template_data['remove_pcr_duplicates_method'] = get_user_input("Remove PCR duplicates method:", default="picard", choices=["picard"]) + template_data["remove_blacklist"] = get_user_input( + "Do you want to remove blacklist regions? (yes/no)", + default="yes", + is_boolean=True, + ) + if template_data["remove_blacklist"]: + template_data["blacklist"] = genome_dict[genome]["blacklist"] + + template_data["remove_pcr_duplicates"] = get_user_input( + "Remove PCR duplicates? 
(yes/no)", + default="yes" if assay in ["chip", "atac"] else "no", + is_boolean=True, + ) + if template_data["remove_pcr_duplicates"]: + template_data["remove_pcr_duplicates_method"] = get_user_input( + "Remove PCR duplicates method:", default="picard", choices=["picard"] + ) else: - template_data['remove_pcr_duplicates_method'] = "False" - + template_data["remove_pcr_duplicates_method"] = "False" + if assay == "atac": - template_data['shift_atac_reads'] = get_user_input("Shift ATAC-seq reads? (yes/no)", default="yes", is_boolean=True) if assay == "atac" else "False" + template_data["shift_atac_reads"] = ( + get_user_input( + "Shift ATAC-seq reads? (yes/no)", default="yes", is_boolean=True + ) + if assay == "atac" + else "False" + ) if assay == "chip": - template_data['spikein'] = get_user_input("Do you have spikein? (yes/no)", default="no", is_boolean=True) - if template_data['spikein']: - template_data['normalisation_method'] = get_user_input("Normalisation method:", default="orlando", choices=["orlando", "with_input"]) - template_data['reference_genome'] = get_user_input("Reference genome:", default="hg38") - template_data['spikein_genome'] = get_user_input("Spikein genome:", default="dm6") - template_data['fastq_screen_config'] = get_user_input("Path to fastqscreen config:", default="/ceph/project/milne_group/shared/seqnado_reference/fastqscreen_reference/fastq_screen.conf") - - template_data['make_bigwigs'] = get_user_input("Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True) - if template_data['make_bigwigs']: - template_data['pileup_method'] = get_user_input("Pileup method:", default="deeptools", choices=["deeptools", "homer"]) - template_data['make_heatmaps'] = get_user_input("Do you want to make heatmaps? (yes/no)", default="no", is_boolean=True) - - if assay in ["chip", "atac"]: - template_data['call_peaks'] = get_user_input("Do you want to call peaks? 
(yes/no)", default="no", is_boolean=True) - if template_data['call_peaks']: - template_data['peak_calling_method'] = get_user_input("Peak caller:", default="lanceotron", choices=["lanceotron", "macs", "homer"]) - - template_data['run_deseq2'] = get_user_input("Run DESeq2? (yes/no)", default="no", is_boolean=True) if assay == "rna" else "False" + template_data["spikein"] = get_user_input( + "Do you have spikein? (yes/no)", default="no", is_boolean=True + ) + if template_data["spikein"]: + template_data["normalisation_method"] = get_user_input( + "Normalisation method:", + default="orlando", + choices=["orlando", "with_input"], + ) + template_data["reference_genome"] = get_user_input( + "Reference genome:", default="hg38" + ) + template_data["spikein_genome"] = get_user_input( + "Spikein genome:", default="dm6" + ) + template_data["fastq_screen_config"] = get_user_input( + "Path to fastqscreen config:", + default="/ceph/project/milne_group/shared/seqnado_reference/fastqscreen_reference/fastq_screen.conf", + ) + + template_data["make_bigwigs"] = get_user_input( + "Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True + ) + if template_data["make_bigwigs"]: + template_data["pileup_method"] = get_user_input( + "Pileup method:", default="deeptools", choices=["deeptools", "homer"] + ) + template_data["make_heatmaps"] = get_user_input( + "Do you want to make heatmaps? (yes/no)", default="no", is_boolean=True + ) - template_data['make_ucsc_hub'] = get_user_input("Do you want to make a UCSC hub? (yes/no)", default="no", is_boolean=True) - - template_data['UCSC_hub_directory'] = get_user_input("UCSC hub directory:", default="/path/to/ucsc_hub/") if template_data['make_ucsc_hub'] else "." 
- template_data['email'] = get_user_input("What is your email address?", default=f"{username}@example.com") if template_data['make_ucsc_hub'] else f"{username}@example.com" - template_data['color_by'] = get_user_input("Color by (for UCSC hub):", default="samplename") if template_data['make_ucsc_hub'] else "samplename" - - template_data['options'] = TOOL_OPTIONS_RNA if assay == "rna" else TOOL_OPTIONS + if assay in ["chip", "atac"]: + template_data["call_peaks"] = get_user_input( + "Do you want to call peaks? (yes/no)", default="no", is_boolean=True + ) + if template_data["call_peaks"]: + template_data["peak_calling_method"] = get_user_input( + "Peak caller:", + default="lanceotron", + choices=["lanceotron", "macs", "homer"], + ) + + template_data["run_deseq2"] = ( + get_user_input("Run DESeq2? (yes/no)", default="no", is_boolean=True) + if assay == "rna" + else "False" + ) + + template_data["make_ucsc_hub"] = get_user_input( + "Do you want to make a UCSC hub? (yes/no)", default="no", is_boolean=True + ) + + template_data["UCSC_hub_directory"] = ( + get_user_input("UCSC hub directory:", default="/path/to/ucsc_hub/") + if template_data["make_ucsc_hub"] + else "." 
+ ) + template_data["email"] = ( + get_user_input("What is your email address?", default=f"{username}@example.com") + if template_data["make_ucsc_hub"] + else f"{username}@example.com" + ) + template_data["color_by"] = ( + get_user_input("Color by (for UCSC hub):", default="samplename") + if template_data["make_ucsc_hub"] + else "samplename" + ) + + template_data["options"] = TOOL_OPTIONS_RNA if assay == "rna" else TOOL_OPTIONS # Tool Specific Options @@ -178,35 +248,40 @@ def setup_configuration(assay, genome, template_data): colormap: RdYlBu_r """ + def create_config(assay, genome, rerun): env = Environment(loader=FileSystemLoader(template_dir), auto_reload=False) - template = env.get_template("config.yaml.jinja") + template = env.get_template("config.yaml.jinja") template_deseq2 = env.get_template("deseq2.qmd.jinja") - + # Initialize template data - template_data = {'assay': assay, 'genome': genome} + template_data = {"assay": assay, "genome": genome} # Setup configuration setup_configuration(assay, genome, template_data) - + # Create directory and render template if rerun: dir_name = os.getcwd() - with open(os.path.join(dir_name, f"config_{assay}.yml"), 'w') as file: + with open(os.path.join(dir_name, f"config_{assay}.yml"), "w") as file: file.write(template.render(template_data)) else: dir_name = f"{template_data['project_date']}_{template_data['assay']}_{template_data['project_name']}" os.makedirs(dir_name, exist_ok=True) fastq_dir = os.path.join(dir_name, "fastq") os.makedirs(fastq_dir, exist_ok=True) - - with open(os.path.join(dir_name, f"config_{assay}.yml"), 'w') as file: + + with open(os.path.join(dir_name, f"config_{assay}.yml"), "w") as file: file.write(template.render(template_data)) # add deseq2 qmd file if rna if assay == "rna": - with open(os.path.join(dir_name, f"deseq2_{template_data['project_name']}.qmd"), 'w') as file: + with open( + os.path.join(dir_name, f"deseq2_{template_data['project_name']}.qmd"), "w" + ) as file: 
file.write(template_deseq2.render(template_data)) - - print(f"Directory '{dir_name}' has been created with the 'config_{assay}.yml' file.") + + print( + f"Directory '{dir_name}' has been created with the 'config_{assay}.yml' file." + ) diff --git a/seqnado/workflow/rules/peak_call_chip.smk b/seqnado/workflow/rules/peak_call_chip.smk index 88a7313d..32c75d2e 100644 --- a/seqnado/workflow/rules/peak_call_chip.smk +++ b/seqnado/workflow/rules/peak_call_chip.smk @@ -43,16 +43,17 @@ rule macs2_with_input: params: options=seqnado.utils.check_options(config["macs"]["callpeak"]), narrow=lambda wc, output: output.peaks.replace(".bed", "_peaks.narrowPeak"), + basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 resources: mem_mb=2000, time="0-02:00:00", log: - "seqnado_output/logs/macs/{sample}_{treatment}.bed", + "seqnado_output/logs/macs/{sample}_{treatment}.log", shell: """ - macs2 callpeak -t {input.treatment} -c {input.control} -n seqnado_output/peaks/macs/{wildcards.treatment} -f BAMPE {params.options} > {log} 2>&1 && - cat {params.narrow} | cut -f 1-3 > {output.peaks} || touch {output.peaks} + macs2 callpeak -t {input.treatment} -c {input.control} -n {params.basename} -f BAMPE {params.options} > {log} 2>&1 && + cat {params.narrow} | cut -f 1-3 > {output.peaks} """ @@ -70,7 +71,7 @@ rule macs2_no_input: mem_mb=2000, time="0-02:00:00", log: - "seqnado_output/logs/macs/{sample}_{treatment}.bed", + "seqnado_output/logs/macs/{sample}_{treatment}.log", shell: """ macs2 callpeak -t {input.treatment} -n {params.basename} -f BAMPE {params.options} > {log} 2>&1 && @@ -85,7 +86,7 @@ rule homer_with_input: output: peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: - "seqnado_output/logs/homer/{sample}_{treatment}.bed", + "seqnado_output/logs/homer/{sample}_{treatment}.log", params: options=seqnado.utils.check_options(config["homer"]["findpeaks"]), threads: 1 @@ -106,7 +107,7 @@ rule homer_no_input: output: 
peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: - "seqnado_output/logs/homer/{sample}_{treatment}.bed", + "seqnado_output/logs/homer/{sample}_{treatment}.log", params: options=seqnado.utils.check_options(config["homer"]["findpeaks"]), threads: 1 @@ -128,10 +129,11 @@ rule lanceotron_with_input: output: peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: - "seqnado_output/logs/lanceotron/{sample}_{treatment}.bed", + "seqnado_output/logs/lanceotron/{sample}_{treatment}.log", params: threshold=get_lanceotron_threshold, outdir=lambda wc, output: os.path.dirname(output.peaks), + basename=lambda wc, output: output.peaks.replace(".bed", ""), container: "library://asmith151/seqnado/seqnado_extra:latest" threads: 1 @@ -141,7 +143,7 @@ rule lanceotron_with_input: shell: """ lanceotron callPeaksInput {input.treatment} -i {input.control} -f {params.outdir} --skipheader > {log} 2>&1 && - cat {params.outdir}/{wildcards.treatment}_L-tron.bed | awk 'BEGIN{{OFS="\\t"}} $4 >= {params.threshold} {{print $1, $2, $3}}' > {output.peaks} || touch {output.peaks} + cat {params.basename}_L-tron.bed | awk 'BEGIN{{OFS="\\t"}} $4 >= {params.threshold} {{print $1, $2, $3}}' > {output.peaks} """ @@ -155,6 +157,7 @@ rule lanceotron_no_input: params: options=seqnado.utils.check_options(config["lanceotron"]["callpeak"]), outdir=lambda wc, output: os.path.dirname(output.peaks), + basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 container: "library://asmith151/seqnado/seqnado_extra:latest" @@ -164,7 +167,7 @@ rule lanceotron_no_input: shell: """ lanceotron callPeaks {input.treatment} -f {params.outdir} --skipheader {params.options} > {log} 2>&1 && - cat {params.outdir}/{wildcards.sample}_{wildcards.treatment}_L-tron.bed | cut -f 1-3 > {output.peaks} + cat {params.basename}_L-tron.bed | cut -f 1-3 > {output.peaks} """ diff --git a/seqnado/workflow/scripts/split_bam.py b/seqnado/workflow/scripts/split_bam.py deleted file mode 100644 index 
23a7f686..00000000 --- a/seqnado/workflow/scripts/split_bam.py +++ /dev/null @@ -1,109 +0,0 @@ -import logging -import os -import pysam -import shutil -import subprocess -import sys -from optparse import OptionParser -from loguru import logger - -# Set up logging -logger.add(snakemake.log[0], level="INFO") - - __version__ = "1.0.5" - - def create_headers(bamfile, ex_chr_prefix): - """Create BAM headers for sample and exogenous genomes.""" - bam_header = bamfile.header - sample_header, exo_header = {}, {} - sample_header.update(bam_header) - exo_header.update(bam_header) - - sample_header["SQ"] = [sq for sq in bam_header["SQ"] if sq["SN"].startswith("chr")] - exo_header["SQ"] = [sq for sq in bam_header["SQ"] if sq["SN"].startswith(ex_chr_prefix)] - - for header in [sample_header, exo_header]: - header.setdefault("CO", []).extend([]) - - return sample_header, exo_header - - - def create_report(output_prefix, stats): - """ - Create a report file with statistics from the BAM processing in TSV format. - - Parameters: - output_prefix (str): Prefix used for output files. - stats (dict): A dictionary containing the statistics to report. 
- """ - report_file = output_prefix + "_report.tsv" - with open(report_file, "w") as report: - # Writing the headers - headers = stats.keys() - report.write("\t".join(headers) + "\n") - - # Writing the values - values = [str(stats[key]) for key in headers] - report.write("\t".join(values) + "\n") - - def process_bam(bam_file, output_prefix, ex_chr_prefix, sample_genome, map_qual_threshold): - """Process the BAM file and collect statistics.""" - stats = { - "bam_file": os.path.basename(bam_file), - sample_genome + "_reads": 0, - ex_chr_prefix + "_reads": 0, - "unmapped_reads": 0, - "qcfail_reads": 0, - "duplicate_reads": 0, - "secondary_reads": 0, - "low_mapq_reads": 0, - } - - samfile = pysam.AlignmentFile(bam_file, "rb") - sample_header, ex_header = create_headers(samfile, ex_chr_prefix) - - with pysam.AlignmentFile(output_prefix + "_" + sample_genome +".bam", "wb", header=sample_header) as sample_out, \ - pysam.AlignmentFile(output_prefix + "_" + ex_chr_prefix + ".bam", "wb", header=ex_header) as exo_out: - - for read in samfile: - if read.is_unmapped: - stats["unmapped_reads"] += 1 - elif read.is_qcfail: - stats["qcfail_reads"] += 1 - elif read.is_duplicate: - stats["duplicate_reads"] += 1 - elif read.is_secondary: - stats["secondary_reads"] += 1 - elif read.mapq < map_qual_threshold: - stats["low_mapq_reads"] += 1 - elif read.reference_name.startswith(ex_chr_prefix): - stats[ex_chr_prefix + "_reads"] += 1 - exo_out.write(read) - else: - stats[sample_genome + "_reads"] += 1 - sample_out.write(read) - - return stats - - - def main(): - parser = OptionParser(usage="%prog [options]", version="%prog " + __version__) - parser.add_option("-i", dest="bam_file", help="BAM file of the composite genome") - parser.add_option("-o", "--output", dest="out_prefix", help="Output prefix") - parser.add_option("-g", "--sample-prefix", dest="sample_prefix", default="hg38", help="Prefix for exogenous chromosome IDs") - parser.add_option("-p", "--exo-prefix", dest="chr_prefix", 
default="dm6", help="Prefix for exogenous chromosome IDs") - parser.add_option("-q", "--mapq", dest="map_qual", type="int", default=30, help="Mapping quality threshold") - - - (options, args) = parser.parse_args() - if not (options.bam_file and options.out_prefix): - parser.print_help() - sys.exit() - - process_bam(options.bam_file, options.out_prefix, options.chr_prefix, options.sample_prefix, options.map_qual) - stats = process_bam(options.bam_file, options.out_prefix, options.chr_prefix, options.sample_prefix, options.map_qual) - create_report(options.out_prefix, stats) - -with logger.catch(): - if __name__ == "__main__": - main() diff --git a/seqnado/workflow/scripts/split_bam2.py b/seqnado/workflow/scripts/split_bam2.py deleted file mode 100644 index d2d1afcc..00000000 --- a/seqnado/workflow/scripts/split_bam2.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -import pathlib -import pysam -import shutil -import subprocess -from optparse import OptionParser -from loguru import logger - -# Set up logging -logger.add(snakemake.log[0], level="INFO") - -def create_headers(bamfile, ex_chr_prefix): - """Create BAM headers for sample and exogenous genomes.""" - bam_header = bamfile.header - sample_header, exo_header = {}, {} - sample_header.update(bam_header) - exo_header.update(bam_header) - - sample_header["SQ"] = [sq for sq in bam_header["SQ"] if sq["SN"].startswith("chr")] - exo_header["SQ"] = [sq for sq in bam_header["SQ"] if sq["SN"].startswith(ex_chr_prefix)] - - for header in [sample_header, exo_header]: - header.setdefault("CO", []).extend([]) - - return sample_header, exo_header - - - -def process_bam(bam_file, output_prefix, ex_chr_prefix, sample_genome, map_qual_threshold): - """Process the BAM file and collect statistics.""" - stats = { - "bam_file": os.path.basename(bam_file), - sample_genome + "_reads": 0, - ex_chr_prefix + "_reads": 0, - "unmapped_reads": 0, - "qcfail_reads": 0, - "duplicate_reads": 0, - "secondary_reads": 0, - "low_mapq_reads": 0, - } - - 
samfile = pysam.AlignmentFile(bam_file, "rb") - sample_header, ex_header = create_headers(samfile, ex_chr_prefix) - - with pysam.AlignmentFile(output_prefix + "_" + sample_genome +".bam", "wb", header=sample_header) as sample_out, \ - pysam.AlignmentFile(output_prefix + "_" + ex_chr_prefix + ".bam", "wb", header=ex_header) as exo_out: - - for read in samfile: - if read.is_unmapped: - stats["unmapped_reads"] += 1 - elif read.is_qcfail: - stats["qcfail_reads"] += 1 - elif read.is_duplicate: - stats["duplicate_reads"] += 1 - elif read.is_secondary: - stats["secondary_reads"] += 1 - elif read.mapq < map_qual_threshold: - stats["low_mapq_reads"] += 1 - elif read.reference_name.startswith(ex_chr_prefix): - stats[ex_chr_prefix + "_reads"] += 1 - exo_out.write(read) - else: - stats[sample_genome + "_reads"] += 1 - sample_out.write(read) - - return stats - - report_file = output_prefix + "_report.tsv" - with open(report_file, "w") as report: - # Writing the headers - headers = stats.keys() - report.write("\t".join(headers) + "\n") - - # Writing the values - values = [str(stats[key]) for key in headers] - report.write("\t".join(values) + "\n") - - -with logger.catch(): - logger.info("Split bam files") - - bam_file = snakemake.input.bam - out_prefix = snakemake.params.prefix - sample_prefix = snakemake.params.genome_prefix - chr_prefix = snakemake.params.exo_prefix - map_qual = snakemake.params.map_qual - - process_bam(bam_file, out_prefix, chr_prefix, sample_prefix, map_qual)