diff --git a/README.md b/README.md
index d5cacdc0..9fc26ff4 100644
--- a/README.md
+++ b/README.md
@@ -4,4 +4,4 @@
 
 Pipeline based on snakemake to process ChIP-seq, ATAC-seq, RNA-seq and short read WGS data for SNP calling.
 
-See the [SeqNado documentation](alsmith151.github.io/SeqNado/) for more information.
+See the [SeqNado documentation](https://alsmith151.github.io/SeqNado/) for more information.
diff --git a/docs/faq.md b/docs/faq.md
index dc2bee65..db82e587 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -5,3 +5,21 @@
 ### Workflow defines configfile config_chip.yml but it is not present or accessible.
 
 This error occurs when the pipeline is run without a config file present in the working directory. Follow the [Pipeline Setup](pipeline.md#create-a-design-file) instructions to create a config file.
+
+
+## Singularity configuration
+
+### Workflow Error
+
+```
+Failed to pull singularity image from library://asmith151/seqnado/seqnado_pipeline:latest:
+FATAL: Unable to get library client configuration:
+remote has no library client (see https://apptainer.org/docs/user/latest/endpoint.html#no-default-remote)
+```
+
+Fix:
+
+```bash
+apptainer remote add --no-login SylabsCloud cloud.sylabs.io
+apptainer remote use SylabsCloud
+```
diff --git a/docs/pipeline.md b/docs/pipeline.md
index d5fdebcb..1e887fd3 100644
--- a/docs/pipeline.md
+++ b/docs/pipeline.md
@@ -10,6 +10,11 @@ The following command will generate the working directory and configuration file
 
 ```bash
 seqnado-config chip
+
+# options
+-r, --rerun # Re-create config_chip.yml in the current directory instead of creating a new project directory
+-g, --genome [dm6|hg19|hg38|hg38_dm6|hg38_mm39|hg38_spikein|mm10|mm39|other] # Genome to use if genome preset is configured
+
 ```
 
 You should get somthing like this:
diff --git a/seqnado/cli.py b/seqnado/cli.py
index d8e5ee7b..bc76e8fb 100644
--- a/seqnado/cli.py
+++ b/seqnado/cli.py
@@ -9,6 +9,7 @@
 
 @click.command(context_settings=dict(ignore_unknown_options=True))
 @click.argument("method", type=click.Choice(["atac", "chip", "rna", "snp"]))
+@click.option("-r", "--rerun", is_flag=True, help="Re-run the config")
 @click.option(
     "-g",
     "--genome",
@@ -28,13 +29,13 @@
         ]
     ),
 )
-def cli_config(method, help=False, genome="other"):
+def cli_config(method, help=False, genome="other", rerun=False):
     """
     Runs the config for the data processing pipeline.
     """
     import seqnado.config as config
 
-    config.create_config(method, genome)
+    config.create_config(method, genome, rerun=rerun)
 
 
 @click.command()
diff --git a/seqnado/config.py b/seqnado/config.py
index ee1cc3f8..a36d0d6a 100644
--- a/seqnado/config.py
+++ b/seqnado/config.py
@@ -178,7 +178,14 @@ def setup_configuration(assay, genome, template_data):
     colormap: RdYlBu_r
 """
 
-def create_config(assay, genome):
+def create_config(assay, genome, rerun=False):
+    """
+    Render ``config_{assay}.yml`` from the ``config.yaml.jinja`` template.
+
+    With ``rerun`` the file is written into the current working directory;
+    otherwise a new ``{project_date}_{assay}_{project_name}`` directory with
+    a ``fastq`` subdirectory is created first and the file is written there.
+    """
     env = Environment(loader=FileSystemLoader(template_dir), auto_reload=False)
     template = env.get_template("config.yaml.jinja")
 
@@ -189,15 +196,20 @@ def create_config(assay, genome):
 
     # Setup configuration
     setup_configuration(assay, genome, template_data)
-
-    # Create directory and render template
-    dir_name = f"{template_data['project_date']}_{template_data['assay']}_{template_data['project_name']}"
-    os.makedirs(dir_name, exist_ok=True)
-    fastq_dir = os.path.join(dir_name, "fastq")
-    os.makedirs(fastq_dir, exist_ok=True)
 
-    with open(os.path.join(dir_name, f"config_{assay}.yml"), 'w') as file:
-        file.write(template.render(template_data))
+    # Create directory and render template
+    if rerun:
+        # Re-run: regenerate the config in the current working directory.
+        dir_name = os.getcwd()
+    else:
+        dir_name = f"{template_data['project_date']}_{template_data['assay']}_{template_data['project_name']}"
+        os.makedirs(dir_name, exist_ok=True)
+        fastq_dir = os.path.join(dir_name, "fastq")
+        os.makedirs(fastq_dir, exist_ok=True)
+
+    # One write shared by both branches; only the target directory differs.
+    with open(os.path.join(dir_name, f"config_{assay}.yml"), 'w') as file:
+        file.write(template.render(template_data))
 
     # add deseq2 qmd file if rna
     if assay == "rna":
diff
--git a/seqnado/utils.py b/seqnado/utils.py
index 88eb5a07..5ae4e194 100644
--- a/seqnado/utils.py
+++ b/seqnado/utils.py
@@ -624,6 +624,7 @@ def to_dataframe(self, simplify: bool = True):
 
     @classmethod
     def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs):
+
         experiments = {}
         for experiment_name, row in df.iterrows():
             if simplified:
@@ -682,71 +683,64 @@ def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs):
 
         return cls(assays=experiments, **kwargs)
 
 
-def symlink_files_paired(
-    output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str
-):
-    r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}_1.fastq.gz")
-    r2_path_new = pathlib.Path(f"{output_dir}/{assay_name}_2.fastq.gz")
-
-    if not r1_path_new.exists():
-        try:
-            r1_path_new.symlink_to(assay.r1.path.resolve())
-        except FileExistsError:
-            logger.warning(f"Symlink for {r1_path_new} already exists.")
-
-    if assay.r2 and not r2_path_new.exists():
-        try:
-            r2_path_new.symlink_to(assay.r2.path.resolve())
-        except FileExistsError:
-            logger.warning(f"Symlink for {r2_path_new} already exists.")
-
-
-def symlink_files_single(
-    output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str
-):
-    r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}.fastq.gz")
-
-    if not r1_path_new.exists():
+def symlink_file(output_dir: pathlib.Path, source_path: pathlib.Path, new_file_name: str):
+    """
+    Create a symlink in the output directory with the new file name.
+
+    The link points at the resolved ``source_path``. Nothing is done when
+    ``output_dir / new_file_name`` already exists.
+    """
+    new_path = output_dir / new_file_name
+    if not new_path.exists():
         try:
-            r1_path_new.symlink_to(assay.r1.path.resolve())
+            new_path.symlink_to(source_path.resolve())
         except FileExistsError:
-            logger.warning(f"Symlink for {r1_path_new} already exists.")
+            # exists() follows symlinks, so a dangling link gets this far.
+            logger.warning(f"Symlink for {new_path} already exists.")
 
 
 def symlink_fastq_files(
     design: Union[Design, DesignIP], output_dir: str = "seqnado_output/fastqs/"
 ) -> None:
     """
     Symlink the fastq files to the output directory.
+
+    Paired-end reads are linked as ``{name}_1.fastq.gz`` / ``{name}_2.fastq.gz``
+    and single-end reads as ``{name}.fastq.gz``; control files of a
+    ``DesignIP`` are linked under their source file names.
     """
     output_dir = pathlib.Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
     if isinstance(design, Design):
         for assay_name, assay in design.assays.items():
             if assay.is_paired:
-                symlink_files_paired(output_dir, assay, assay_name)
+                symlink_file(output_dir, assay.r1.path, f"{assay_name}_1.fastq.gz")
+                symlink_file(output_dir, assay.r2.path, f"{assay_name}_2.fastq.gz")
             else:
-                symlink_files_single(output_dir, assay, assay_name)
+                # Single-end reads get no "_1" suffix.
+                symlink_file(output_dir, assay.r1.path, f"{assay_name}.fastq.gz")
 
     elif isinstance(design, DesignIP):
         for experiment_name, experiment in design.assays.items():
-            assay = experiment.ip_files
-            assay_name = assay.name
-
-            if assay.is_paired:
-                symlink_files_paired(output_dir, assay, assay_name)
-            else:
-                symlink_files_single(output_dir, assay, assay_name)
+            # IP files
+            ip_assay = experiment.ip_files
+            if ip_assay.is_paired:
+                symlink_file(output_dir, ip_assay.r1.path, f"{ip_assay.name}_1.fastq.gz")
+                symlink_file(output_dir, ip_assay.r2.path, f"{ip_assay.name}_2.fastq.gz")
+            else:
+                symlink_file(output_dir, ip_assay.r1.path, f"{ip_assay.name}.fastq.gz")
 
             if experiment.control_files:
-                assay = experiment.control_files
-                assay_name = assay.name
-                if assay.is_paired:
-                    symlink_files_paired(output_dir, assay, assay_name)
-                else:
-                    symlink_files_single(output_dir, assay, assay_name)
+                # NOTE(review): controls are linked under their source file
+                # names, not "{name}_N.fastq.gz" like the IP files - confirm.
+                control_assay = experiment.control_files
+                control_r1_name = control_assay.r1.path.name
+                symlink_file(output_dir, control_assay.r1.path, control_r1_name)
+                if control_assay.is_paired:
+                    control_r2_name = control_assay.r2.path.name
+                    symlink_file(output_dir, control_assay.r2.path, control_r2_name)
 
 
 def define_output_files(
diff --git a/seqnado/workflow/rules/hub.smk b/seqnado/workflow/rules/hub.smk
index 707920be..b913bb44 100644
--- a/seqnado/workflow/rules/hub.smk
+++ b/seqnado/workflow/rules/hub.smk
@@ -93,6 +93,30 @@ def get_hub_input(wildcards):
     return input_files
 
 
+def get_peak_files(wildcards):
+    peak_files = []
+
+    if config["call_peaks"]:
+        if ASSAY == "ChIP":
+            peak_files.extend(
+                expand(
+                    "seqnado_output/peaks/{method}/{sample}.bed",
+                    method=config["peak_calling_method"],
+                    sample=SAMPLE_NAMES_IP,
+                )
+            )
+        elif ASSAY == "ATAC":
+            peak_files.extend(
+                expand(
+                    "seqnado_output/peaks/{method}/{sample}.bed",
+                    method=config["peak_calling_method"],
+                    sample=SAMPLE_NAMES,
+                )
+            )
+
+    return peak_files
+
+
 rule save_design:
     output:
         "seqnado_output/design.csv",
@@ -102,9 +126,33 @@ rule save_design:
         DESIGN.to_dataframe().to_csv("seqnado_output/design.csv", index=False)
 
 
+rule validate_peaks:
+    input:
+        peaks=get_peak_files,
+    output:
+        sentinel="seqnado_output/peaks/.validated",
+    container:
+        None
+    log:
+        "seqnado_output/logs/validate_peaks.log",
+    run:
+        from loguru import logger
+
+        with logger.catch():
+            for peak_file in input.peaks:
+                with open(peak_file, "r+") as p:
+                    peak_entries = p.readlines()
+                    if len(peak_entries) < 1:
+                        p.write("chr21\t1\t2\n")
+
+        with open(output.sentinel, "w") as s:
+            s.write("validated")
+
+
 rule bed_to_bigbed:
     input:
         bed="seqnado_output/peaks/{directory}/{sample}.bed",
+        sentinel="seqnado_output/peaks/.validated",
     output:
         bigbed="seqnado_output/peaks/{directory}/{sample}.bigBed",
     params:
@@ -149,3 +197,4 @@ rule generate_hub:
 
 localrules:
     generate_hub,
+    validate_peaks,
diff --git a/seqnado/workflow/rules/peak_call_chip.smk b/seqnado/workflow/rules/peak_call_chip.smk
index 15c14bf3..88a7313d 100644
--- a/seqnado/workflow/rules/peak_call_chip.smk
+++ b/seqnado/workflow/rules/peak_call_chip.smk
@@ -12,17 +12,26 @@ def
get_lanceotron_threshold(wildcards):
 
 def get_control_bam(wildcards):
+    """Build the aligned-BAM path of the control for this sample/IP pair."""
     exp = DESIGN.query(sample_name=wildcards.sample, ip=wildcards.treatment)
-    return "seqnado_output/aligned/{sample}_{exp.control}.bam"
+    bam_path = f"seqnado_output/aligned/{wildcards.sample}_{exp.control}.bam"
+    # Remove every space from the assembled path.
+    return bam_path.replace(" ", "")
 
 
 def get_control_tag(wildcards):
+    """Build the tag-directory path of the control for this sample/IP pair."""
     exp = DESIGN.query(sample_name=wildcards.sample, ip=wildcards.treatment)
-    return "seqnado_output/tag_dirs/{sample}_{exp.control}"
+    tag_dir = f"seqnado_output/tag_dirs/{wildcards.sample}_{exp.control}"
+    # Remove every space from the assembled path.
+    return tag_dir.replace(" ", "")
 
 
 def get_control_bigwig(wildcards):
+    """Build the deeptools bigWig path of the control for this sample/IP pair."""
     exp = DESIGN.query(sample_name=wildcards.sample, ip=wildcards.treatment)
-    return "seqnado_output/bigwigs/deeptools/{sample}_{exp.control}.bigWig"
+    bigwig_path = f"seqnado_output/bigwigs/deeptools/{wildcards.sample}_{exp.control}.bigWig"
+    # Remove every space from the assembled path.
+    return bigwig_path.replace(" ", "")
 
 
 rule macs2_with_input:
@@ -43,7 +52,7 @@ rule macs2_with_input:
     shell:
         """
         macs2 callpeak -t {input.treatment} -c {input.control} -n seqnado_output/peaks/macs/{wildcards.treatment} -f BAMPE {params.options} > {log} 2>&1 &&
-        cat {params.narrow} | cut -f 1-3 > {output.peaks}
+        cat {params.narrow} | cut -f 1-3 > {output.peaks} || touch {output.peaks}
         """
 
 
@@ -132,7 +141,7 @@ rule lanceotron_with_input:
     shell:
         """
         lanceotron callPeaksInput {input.treatment} -i {input.control} -f {params.outdir} --skipheader > {log} 2>&1 &&
-        cat {params.outdir}/{wildcards.treatment}_L-tron.bed | awk 'BEGIN{{OFS="\\t"}} $4 >= {params.threshold} {{print $1, $2, $3}}' > {output.peaks}
+        cat {params.outdir}/{wildcards.treatment}_L-tron.bed | awk 'BEGIN{{OFS="\\t"}} $4 >= {params.threshold} {{print $1, $2, $3}}' > {output.peaks} || touch {output.peaks}
         """
 
 
@@ -159,4 +168,6 @@ rule lanceotron_no_input:
         """
 
 
-ruleorder: lanceotron_with_input > lanceotron_no_input > homer_with_input > homer_no_input > macs2_with_input > macs2_no_input
+ruleorder: lanceotron_with_input > lanceotron_no_input
+ruleorder: homer_with_input > homer_no_input
+ruleorder: macs2_with_input > macs2_no_input