From 610722db777f2fc2e4e525e71f1aed9fd74628cd Mon Sep 17 00:00:00 2001 From: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:09:26 +0000 Subject: [PATCH] Develop (#120) * Fix slurm preset (#118) * require snakemake<8 * fix escape character and whitespace errors * Delete setup.cfg file * fix: split peak call rules * tests: added all peak call methods to atac test * Update output file paths in alignment_post_processing.smk * Update file paths in hub.smk * fix: updated wildcards for bigBed files * Refactor test_seqnado_config_creation function and add missing options to config_atac.yml * Update config file for chip sequencing * fix: seqnado-design * chore: removed commented code * Fix file path in lanceotron_no_input rule * Fix metadata and experiment creation in DesignIP class * Fix symlink_files function to handle both paired and single-end assays * Add log and wrapper for fastqc_raw_single rule * update config if ucsc is null * Fix config (#119) * remove split fastq from config and all rules * clean up config and fix spelling of indices * remove test config files * update default heatmap options * return to config but with small changes * refactor config.py * fix typo in config.py * use indices consistently for genome indices * update config process in docs --------- Co-authored-by: alsmith --- docs/pipeline.md | 98 ++++-------- pyproject.toml | 2 +- seqnado/cli.py | 4 +- seqnado/config.py | 108 ++++--------- seqnado/utils.py | 136 +++++++++++++--- seqnado/workflow/config/config.yaml.jinja | 7 +- seqnado/workflow/config/preset_genomes.json | 32 ++-- seqnado/workflow/rules/align.smk | 31 +--- seqnado/workflow/rules/align_rna.smk | 2 +- .../rules/alignment_post_processing.smk | 110 ++++++++----- seqnado/workflow/rules/chip_refnorm.smk | 2 +- seqnado/workflow/rules/fastq_split.smk | 72 --------- seqnado/workflow/rules/hub.smk | 4 +- .../{peak_call.smk => peak_call_chip.smk} | 36 ++--- 
seqnado/workflow/rules/peak_call_other.smk | 76 +++++++++ seqnado/workflow/rules/qc.smk | 98 ++++++++---- seqnado/workflow/snakefile_atac | 9 +- seqnado/workflow/snakefile_chip | 10 +- seqnado/workflow/snakefile_rna | 2 +- seqnado/workflow/snakefile_snp | 8 +- setup.cfg | 35 ----- tests/data/config/config_atac.yml | 136 ---------------- tests/data/config/config_chip.yml | 145 ------------------ tests/data/config/config_rna.yml | 129 ---------------- tests/test_atac.py | 51 +++--- tests/test_chip.py | 60 ++++---- tests/test_rna.py | 26 ++-- 27 files changed, 525 insertions(+), 904 deletions(-) delete mode 100644 seqnado/workflow/rules/fastq_split.smk rename seqnado/workflow/rules/{peak_call.smk => peak_call_chip.smk} (77%) create mode 100644 seqnado/workflow/rules/peak_call_other.smk delete mode 100644 setup.cfg delete mode 100644 tests/data/config/config_atac.yml delete mode 100644 tests/data/config/config_chip.yml delete mode 100644 tests/data/config/config_rna.yml diff --git a/docs/pipeline.md b/docs/pipeline.md index deabd4ce..d5fdebcb 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -9,82 +9,45 @@ The pipeline is configured using a YAML file: e.g. 
`config_atac.yml`, `config_ch The following command will generate the working directory and configuration file for the ATAC-seq pipeline: ```bash -seqnado-config atac +seqnado-config chip ``` You should get somthing like this: ```bash $ seqnado-config chip - [1/23] user_name (Your name): asmith - [2/23] Select date - 1 - 2024-01-13 - Choose from [1] (1): - [3/23] project_name (Project name): TEST - [4/23] Select project_id - 1 - test - Choose from [1] (1): 1 - [5/23] genome (hg38): - [6/23] chromosome_sizes (/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/sequence/hg38.chrom.sizes): - [7/23] indicies (/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/bt2_index/hg38): - [8/23] gtf (/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/genes/hg38.ncbiRefSeq.gtf): - [9/23] Select read_type - 1 - paired - 2 - single - Choose from [1/2] (1): 1 - [10/23] Select split_fastq - 1 - True - 2 - False - Choose from [1/2] (1): 2 - [11/23] split_fastq_parts (int): - [12/23] Select remove_pcr_duplicates_method - 1 - picard - 2 - deeptools - Choose from [1/2] (1): 1 - [13/23] Select remove_blacklist - 1 - yes - 2 - no - Choose from [1/2] (1): 1 - [14/23] blacklist (/ceph/project/milne_group/shared/seqnado_reference/hg38/hg38-blacklist.v2.bed.gz): - [15/23] Select make_bigwigs - 1 - yes - 2 - no - Choose from [1/2] (1): 1 - [16/23] Select pileup_method - 1 - deeptools - 2 - homer - Choose from [1/2] (1): 1 - [17/23] Select make_heatmaps - 1 - yes - 2 - no - Choose from [1/2] (1): 1 - [18/23] Select call_peaks - 1 - yes - 2 - no - Choose from [1/2] (1): 1 - [19/23] Select peak_calling_method - 1 - macs - 2 - lanceotron - 3 - homer - Choose from [1/2/3] (1): 2 - [20/23] Select make_ucsc_hub - 1 - yes - 2 - no - Choose from [1/2] (1): 1 - [21/23] UCSC_hub_directory (path/to/ publically accessible location on the server): /project/milne_group/datashare/asmith/chipseq/TEST_HUB - [22/23] email (Email address (UCSC required)): alastair.smith@ndcls.ox.ac.uk - 
[23/23] Select color_by - 1 - samplename - 2 - method - Choose from [1/2] (1): 1 + What is your project name? [cchahrou_project]: TEST + What is your genome name? [other]: hg38 + Path to Bowtie2 genome indices: [None]: /ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/bt2_index/hg38 + Path to chromosome sizes file: [None]: /ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/sequence/hg38.chrom.sizes + Path to GTF file: [None]: /ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/genes/hg38.ncbiRefSeq.gtf + Path to blacklist bed file: [None]: /ceph/project/milne_group/shared/seqnado_reference/hg38/hg38-blacklist.v2.bed.gz + Do you want to remove blacklist regions? (yes/no) [yes]: yes + Remove PCR duplicates? (yes/no) [yes]: yes + Remove PCR duplicates method: [picard]: picard + Do you have spikein? (yes/no) [no]: yes + Normalisation method: [orlando/with_input]: orlando + Reference genome: [hg38]: hg38 + Spikein genome: [dm6]: dm6 + Path to fastqscreen config: [/ceph/project/milne_group/shared/seqnado_reference/fastqscreen_reference/fastq_screen.conf]: /ceph/project/milne_group/shared/seqnado_reference/fastqscreen_reference/fastq_screen.conf + Do you want to make bigwigs? (yes/no) [no]: yes + Pileup method: [deeptools/homer]: deeptools + Do you want to make heatmaps? (yes/no) [no]: yes + Do you want to call peaks? (yes/no) [no]: yes + Peak caller: [lanceotron/macs/homer]: lanceotron + Do you want to make a UCSC hub? (yes/no) [no]: yes + UCSC hub directory: [/path/to/ucsc_hub/]: /project/milne_group/datashare/etc + What is your email address? [cchahrou@example.com]: email for UCSC + Color by (for UCSC hub): [samplename]: samplename + Directory '2024-01-26_chip_TEST' has been created with the 'config_chip.yml' file. 
``` This will generate the following files: ```bash -$ tree 2024-01-13_test/ +$ tree 2024-01-13_chip_test/ -2024-01-13_test/ +2024-01-13_chip_test/ ├── config_chip.yml └── readme_test.md @@ -230,6 +193,13 @@ $ ls -l ```bash tmux new -s NAME_OF_SESSION + +# or + +screen -S NAME_OF_SESSION + +# to exit screen session + ctrl+a d ``` diff --git a/pyproject.toml b/pyproject.toml index 33f84728..90efb950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "seaborn", "setuptools_scm", "snakemake-wrapper-utils", - "snakemake<=9.0.0", + "snakemake<8.0.0", "tracknado", "wget", ] diff --git a/seqnado/cli.py b/seqnado/cli.py index ec344d56..d8e5ee7b 100644 --- a/seqnado/cli.py +++ b/seqnado/cli.py @@ -62,7 +62,9 @@ def cli_design(method, files, output="design.csv"): design = DesignIP.from_fastq_files([FastqFileIP(path=fq) for fq in files]) - design.to_dataframe().to_csv(output) + design.to_dataframe().reset_index().rename(columns={"index": "sample"}).to_csv( + output, index=False + ) @click.command(context_settings=dict(ignore_unknown_options=True)) diff --git a/seqnado/config.py b/seqnado/config.py index e3a80c36..ee1cc3f8 100644 --- a/seqnado/config.py +++ b/seqnado/config.py @@ -18,7 +18,6 @@ def get_user_input(prompt, default=None, is_boolean=False, choices=None): return user_input - def setup_configuration(assay, genome, template_data): username = os.getenv('USER', 'unknown_user') today = datetime.datetime.now().strftime('%Y-%m-%d') @@ -40,60 +39,38 @@ def setup_configuration(assay, genome, template_data): if genome == "other": genome = get_user_input("What is your genome name?", default="other") - if assay in ["chip", "atac"]: - genome_dict = { - genome: { - "index": get_user_input("Path to Bowtie2 genome index:"), - "chromosome_sizes": get_user_input("Path to chromosome sizes file:"), - "gtf": get_user_input("Path to GTF file:"), - "blacklist": get_user_input("Path to blacklist bed file:") - } + genome_dict = { + genome: { + "indices": 
get_user_input("Path to Bowtie2 genome indices:") if assay in ["chip", "atac"] else get_user_input("Path to STAR v2.7.10b genome indices:"), + "chromosome_sizes": get_user_input("Path to chromosome sizes file:"), + "gtf": get_user_input("Path to GTF file:"), + "blacklist": get_user_input("Path to blacklist bed file:") } - elif assay == "rna": - genome_dict = { - genome: { - "index": get_user_input("Path to STAR v2.7.10b genome index:"), - "chromosome_sizes": get_user_input("Path to chromosome sizes file:"), - "gtf": get_user_input("Path to GTF file:"), - "blacklist": get_user_input("Path to blacklist bed file:") - } + } + else: + if genome in genome_values: + genome_dict[genome] = { + "indices": genome_values[genome].get('bt2_indices' if assay in ["chip", "atac"] else 'star_indices', ''), + "chromosome_sizes": genome_values[genome].get('chromosome_sizes', ''), + "gtf": genome_values[genome].get('gtf', ''), + "blacklist": genome_values[genome].get('blacklist', '') } - elif genome in genome_values: - if assay in ["chip", "atac"]: - genome_dict = { - genome: { - "index": genome_values[genome]['bt2_index'], - "chromosome_sizes": genome_values[genome]['chromosome_sizes'], - "gtf": genome_values[genome]['gtf'], - "blacklist": genome_values[genome]['blacklist'] - } - } - elif assay == "rna": - genome_dict = { - genome: { - "index": genome_values[genome]['star_index'], - "chromosome_sizes": genome_values[genome]['chromosome_sizes'], - "gtf": genome_values[genome]['gtf'], - "blacklist": genome_values[genome]['blacklist'] - } - } + + genome_config = { + 'genome': genome, + 'indices': genome_dict[genome]['indices'], + 'chromosome_sizes': genome_dict[genome]['chromosome_sizes'], + 'gtf': genome_dict[genome]['gtf'], + } + template_data.update(genome_config) - template_data['genome'] = genome - template_data['indicies'] = genome_dict[genome]['index'] - template_data['chromosome_sizes'] = genome_dict[genome]['chromosome_sizes'] - template_data['gtf'] = genome_dict[genome]['gtf'] 
- template_data['read_type'] = get_user_input("What is your read type?", default="paired", choices=["paired", "single"]) template_data['remove_blacklist'] = get_user_input("Do you want to remove blacklist regions? (yes/no)", default="yes", is_boolean=True) if template_data['remove_blacklist']: template_data['blacklist'] = genome_dict[genome]['blacklist'] - if assay in ["chip", "atac"]: - template_data['remove_pcr_duplicates'] = get_user_input("Remove PCR duplicates? (yes/no)", default="yes", is_boolean=True) - elif assay == "rna": - template_data['remove_pcr_duplicates'] = get_user_input("Remove PCR duplicates? (yes/no)", default="no", is_boolean=True) - + template_data['remove_pcr_duplicates'] = get_user_input("Remove PCR duplicates? (yes/no)", default= "yes" if assay in ["chip", "atac"] else "no", is_boolean=True) if template_data['remove_pcr_duplicates']: template_data['remove_pcr_duplicates_method'] = get_user_input("Remove PCR duplicates method:", default="picard", choices=["picard"]) @@ -101,25 +78,16 @@ def setup_configuration(assay, genome, template_data): template_data['remove_pcr_duplicates_method'] = "False" if assay == "atac": - template_data['shift_atac_reads'] = get_user_input("Shift ATAC-seq reads? (yes/no)", default="yes", is_boolean=True) - elif assay in ["chip", "rna"]: - template_data['shift_atac_reads'] = "False" + template_data['shift_atac_reads'] = get_user_input("Shift ATAC-seq reads? (yes/no)", default="yes", is_boolean=True) if assay == "atac" else "False" if assay == "chip": - template_data['spikein'] = get_user_input("Do you have spikein? (yes/no)", default="no", is_boolean=True) + template_data['spikein'] = get_user_input("Do you have spikein? 
(yes/no)", default="no", is_boolean=True) if template_data['spikein']: template_data['normalisation_method'] = get_user_input("Normalisation method:", default="orlando", choices=["orlando", "with_input"]) template_data['reference_genome'] = get_user_input("Reference genome:", default="hg38") template_data['spikein_genome'] = get_user_input("Spikein genome:", default="dm6") template_data['fastq_screen_config'] = get_user_input("Path to fastqscreen config:", default="/ceph/project/milne_group/shared/seqnado_reference/fastqscreen_reference/fastq_screen.conf") - elif assay in ["atac", "rna"]: - template_data['normalisation_method'] = "False" - - template_data['split_fastq'] = get_user_input("Do you want to split FASTQ files? (yes/no)", default="no", is_boolean=True) - if template_data['split_fastq']: - template_data.update['split_fastq_parts'] = get_user_input("How many parts do you want to split the FASTQ files into?", default="4") - - + template_data['make_bigwigs'] = get_user_input("Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True) if template_data['make_bigwigs']: template_data['pileup_method'] = get_user_input("Pileup method:", default="deeptools", choices=["deeptools", "homer"]) @@ -129,25 +97,16 @@ def setup_configuration(assay, genome, template_data): template_data['call_peaks'] = get_user_input("Do you want to call peaks? (yes/no)", default="no", is_boolean=True) if template_data['call_peaks']: template_data['peak_calling_method'] = get_user_input("Peak caller:", default="lanceotron", choices=["lanceotron", "macs", "homer"]) - - elif assay == "rna": - template_data['call_peaks'] = "False" - if assay == "rna": - template_data['run_deseq2'] = get_user_input("Run DESeq2? (yes/no)", default="no", is_boolean=True) - elif assay in ["chip", "atac"]: - template_data['run_deseq2'] = "False" + template_data['run_deseq2'] = get_user_input("Run DESeq2? 
(yes/no)", default="no", is_boolean=True) if assay == "rna" else "False" template_data['make_ucsc_hub'] = get_user_input("Do you want to make a UCSC hub? (yes/no)", default="no", is_boolean=True) - if template_data['make_ucsc_hub']: - template_data['UCSC_hub_directory'] = get_user_input("UCSC hub directory:", default="/path/to/ucsc_hub/") - template_data['email'] = get_user_input("What is your email address?", default=f"{username}@example.com") - template_data['color_by'] = get_user_input("Color by (for UCSC hub):", default="samplename") - - if assay in ["chip", "atac"]: - template_data['options'] = TOOL_OPTIONS - elif assay == "rna": - template_data['options'] = TOOL_OPTIONS_RNA + + template_data['UCSC_hub_directory'] = get_user_input("UCSC hub directory:", default="/path/to/ucsc_hub/") if template_data['make_ucsc_hub'] else "." + template_data['email'] = get_user_input("What is your email address?", default=f"{username}@example.com") if template_data['make_ucsc_hub'] else f"{username}@example.com" + template_data['color_by'] = get_user_input("Color by (for UCSC hub):", default="samplename") if template_data['make_ucsc_hub'] else "samplename" + + template_data['options'] = TOOL_OPTIONS_RNA if assay == "rna" else TOOL_OPTIONS # Tool Specific Options @@ -246,4 +205,3 @@ def create_config(assay, genome): file.write(template_deseq2.render(template_data)) print(f"Directory '{dir_name}' has been created with the 'config_{assay}.yml' file.") - diff --git a/seqnado/utils.py b/seqnado/utils.py index eb6f7960..b3e40f68 100644 --- a/seqnado/utils.py +++ b/seqnado/utils.py @@ -109,9 +109,9 @@ def has_bowtie2_index(prefix: str) -> bool: path_dir = path_prefix.parent path_prefix_stem = path_prefix.stem - bowtie2_indicies = list(path_dir.glob(f"{path_prefix_stem}*.bt2")) + bowtie2_indices = list(path_dir.glob(f"{path_prefix_stem}*.bt2")) - if len(bowtie2_indicies) > 0: + if len(bowtie2_indices) > 0: return True @@ -297,6 +297,10 @@ class AssayIP(AssayNonIP): r2: 
Optional[FastqFileIP] = None metadata: Optional[dict] = None + @property + def is_control(self) -> bool: + return self.r1.is_control + class ExperimentIP(BaseModel): ip_files: AssayIP @@ -323,7 +327,7 @@ def model_post_init(self, *args) -> None: @classmethod def from_fastq_files(cls, fq: List[FastqFileIP], **kwargs): """ - Create a SampleInfo object from a list of FastqFiles. + Create a Experiment object from a list of FastqFiles. """ @@ -466,7 +470,10 @@ def sample_names_control(self) -> List[str]: if experiment.control is not None: sample_names.add(experiment.control_files.name) - return list(sample_names) + if all([s is None for s in sample_names]): + return [] + else: + return list(sample_names) @property def sample_names(self) -> List[str]: @@ -478,7 +485,11 @@ def ip_names(self) -> List[str]: @property def control_names(self) -> List[str]: - return list(set([experiment.control for experiment in self.assays.values()])) + names = list(set([experiment.control for experiment in self.assays.values()])) + if all([s is None for s in names]): + return [] + else: + return names @computed_field @property @@ -516,15 +527,59 @@ def from_fastq_files(cls, fq: List[FastqFileIP], **kwargs): """ - samples = defaultdict(list) + ## Run through the list and pair upt the read1 and read2 files + ## If there is only one file, then it is the read1 file + import itertools + + # Collate the fastq files by sample name + fq = sorted(fq) + fastq_collated = dict() for f in fq: - samples[f.sample_base_without_ip].append(f) + if f.sample_base not in fastq_collated: + fastq_collated[f.sample_base] = dict() + fastq_collated[f.sample_base][f.read_number or 1] = f + else: + fastq_collated[f.sample_base][f.read_number] = f + # Create the assays assays = {} - for sample_name, sample in samples.items(): - assays[sample_name] = ExperimentIP.from_fastq_files(sample, **kwargs) + for sample_name, fastq_files in fastq_collated.items(): + assays[sample_name] = AssayIP( + name=sample_name, 
r1=fastq_files[1], r2=fastq_files.get(2), **kwargs + ) - return cls(assays=assays, **kwargs) + # Create the experiments + experiments = {} + + for base, assay in itertools.groupby( + assays.values(), lambda x: x.r1.sample_base_without_ip + ): + assay = list(assay) + + if len(assay) == 1: + experiments[assay[0].name] = ExperimentIP(ip_files=assay[0], **kwargs) + elif len(assay) == 2 and any([a.is_control for a in assay]): + ip = [a for a in assay if not a.is_control][0] + control = [a for a in assay if a.is_control][0] + experiments[ip.name] = ExperimentIP( + ip_files=ip, control_files=control, **kwargs + ) + elif len(assay) >= 2 and not any([a.is_control for a in assay]): + for a in assay: + experiments[a.name] = ExperimentIP(ip_files=a, **kwargs) + + elif len(assay) >= 2 and any([a.is_control for a in assay]): + logger.warning(f"Multiple controls for {assay[0].name}") + logger.warning("Will generate all possible combinations") + ip = [a for a in assay if not a.is_control] + control = [a for a in assay if a.is_control] + + for combination in itertools.product(ip, control): + experiments[combination[0].name] = ExperimentIP( + ip_files=combination[0], control_files=combination[1], **kwargs + ) + + return cls(assays=experiments, **kwargs) @classmethod def from_directory( @@ -572,9 +627,11 @@ def to_dataframe(self, simplify: bool = True): @classmethod def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs): + experiments = {} for experiment_name, row in df.iterrows(): if simplified: + # Add the metadata metadata = {} for k, v in row.items(): if k not in [ @@ -587,14 +644,18 @@ def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs): ]: metadata[k] = v + # Add the experiment ip = row["ip"] control = row["control"] if "control_r1" not in row: experiments[experiment_name] = ExperimentIP( ip_files=AssayIP( + name=experiment_name, r1=FastqFileIP(path=row["ip_r1"]), - r2=FastqFileIP(path=row["ip_r2"]), + 
r2=FastqFileIP(path=row["ip_r2"]) + if "ip_r2" in row + else None, ), ip=ip, control=None, @@ -603,12 +664,18 @@ def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs): else: experiments[experiment_name] = ExperimentIP( ip_files=AssayIP( + name=experiment_name, r1=FastqFileIP(path=row["ip_r1"]), - r2=FastqFileIP(path=row["ip_r2"]), + r2=FastqFileIP(path=row["ip_r2"]) + if "ip_r2" in row + else None, ), control_files=AssayIP( + name=experiment_name, r1=FastqFileIP(path=row["control_r1"]), - r2=FastqFileIP(path=row["control_r2"]), + r2=FastqFileIP(path=row["control_r2"]) + if "control_r2" in row + else None, ), ip=ip, control=control, @@ -620,7 +687,7 @@ def from_dataframe(cls, df: pd.DataFrame, simplified: bool = True, **kwargs): return cls(assays=experiments, **kwargs) -def symlink_files( +def symlink_files_paired( output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str ): r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}_1.fastq.gz") @@ -639,6 +706,18 @@ def symlink_files( logger.warning(f"Symlink for {r2_path_new} already exists.") +def symlink_files_single( + output_dir: pathlib.Path, assay: Union[AssayNonIP, AssayIP], assay_name: str +): + r1_path_new = pathlib.Path(f"{output_dir}/{assay_name}.fastq.gz") + + if not r1_path_new.exists(): + try: + r1_path_new.symlink_to(assay.r1.path.resolve()) + except FileExistsError: + logger.warning(f"Symlink for {r1_path_new} already exists.") + + def symlink_fastq_files( design: Union[Design, DesignIP], output_dir: str = "seqnado_output/fastqs/" ) -> None: @@ -650,18 +729,28 @@ def symlink_fastq_files( if isinstance(design, Design): for assay_name, assay in design.assays.items(): - symlink_files(output_dir, assay, assay_name) + if assay.is_paired: + symlink_files_paired(output_dir, assay, assay_name) + else: + symlink_files_single(output_dir, assay, assay_name) elif isinstance(design, DesignIP): for experiment_name, experiment in design.assays.items(): assay = 
experiment.ip_files assay_name = assay.name - symlink_files(output_dir, assay, assay_name) + + if assay.is_paired: + symlink_files_paired(output_dir, assay, assay_name) + else: + symlink_files_single(output_dir, assay, assay_name) if experiment.control_files: assay = experiment.control_files assay_name = assay.name - symlink_files(output_dir, assay, assay_name) + if assay.is_paired: + symlink_files_paired(output_dir, assay, assay_name) + else: + symlink_files_single(output_dir, assay, assay_name) def define_output_files( @@ -693,12 +782,12 @@ def define_output_files( analysis_output.append("seqnado_output/qc/library_complexity_qc.html") if make_heatmaps: - assay_output.extend( - [ - "seqnado_output/heatmap/heatmap.pdf", - "seqnado_output/heatmap/metaplot.pdf", - ] - ) + assay_output.extend( + [ + "seqnado_output/heatmap/heatmap.pdf", + "seqnado_output/heatmap/metaplot.pdf", + ] + ) if make_ucsc_hub: hub_dir = pathlib.Path(kwargs["ucsc_hub_details"]["directory"]) @@ -772,7 +861,6 @@ def define_output_files( "Not running DESeq2 as no 'deseq2' column in design file." 
) - elif assay == "SNP": if call_snps: assay_output.expand( diff --git a/seqnado/workflow/config/config.yaml.jinja b/seqnado/workflow/config/config.yaml.jinja index 0cbd8045..d30370c0 100755 --- a/seqnado/workflow/config/config.yaml.jinja +++ b/seqnado/workflow/config/config.yaml.jinja @@ -10,12 +10,10 @@ design: "design.csv" genome: name: "{{genome}}" - indicies: "{{indicies}}" + indices: "{{indices}}" chromosome_sizes: "{{chromosome_sizes}}" gtf: "{{gtf}}" -read_type: "{{read_type}}" - remove_blacklist: "{{remove_blacklist}}" blacklist: "{{blacklist}}" @@ -30,9 +28,6 @@ spikein_options: spikein_genome: "{{spikein_genome}}" fastq_screen_config: "{{fastq_screen_config}}" -split_fastq: "{{split_fastq}}" -split_fastq_parts: "{{split_fastq_parts}}" - make_bigwigs: "{{make_bigwigs}}" pileup_method: "{{pileup_method}}" make_heatmaps: "{{make_heatmaps}}" diff --git a/seqnado/workflow/config/preset_genomes.json b/seqnado/workflow/config/preset_genomes.json index f9339602..805bb14a 100755 --- a/seqnado/workflow/config/preset_genomes.json +++ b/seqnado/workflow/config/preset_genomes.json @@ -1,56 +1,56 @@ { "dm6": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/bt2_index/dm6", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/STAR_2.7.10b", + "bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/bt2_index/dm6", + "star_indices": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/sequence/dm6.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/dm6/UCSC/genes/dm6.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/dm6/dm6-blacklist.v2.bed.gz" }, "hg19": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/bt2_index/hg19", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/STAR_2.7.10b", + 
"bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/bt2_index/hg19", + "star_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/sequence/hg19.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/hg19/UCSC/genes/hg19.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/hg19/hg19-blacklist.v2.bed.gz " }, "hg38": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/bt2_index/hg38", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/STAR_2.7.10b", + "bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/bt2_index/hg38", + "star_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/sequence/hg38.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/hg38/UCSC/genes/hg38.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/hg38/hg38-blacklist.v2.bed.gz" }, "hg38_dm6": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/hg38_dm6/UCSC/bt2_index/hg38_dm6", - "star_index": "NA", + "bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg38_dm6/UCSC/bt2_index/hg38_dm6", + "star_indices": "NA", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/hg38_dm6/UCSC/sequence/hg38_dm6.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/hg38_dm6/UCSC/genes/hg38_dm6.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/hg38_dm6/hg38_dm6-blacklist.v2.bed.gz" }, "hg38_mm39": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/hg38_mm39/bt2_index/hg38_mm39", - "star_index": "NA", + "bt2_indices": 
"/ceph/project/milne_group/shared/seqnado_reference/hg38_mm39/bt2_index/hg38_mm39", + "star_indices": "NA", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/hg38_mm39/sequence/hg38_mm39.fa.fai", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/hg38_mm39/genes/hg38_mm39.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/hg38_mm39/hg38_mm39-blacklist.bed.gz" }, "hg38_spikein": { - "bt2_index": "NA", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/hg38_spikein/UCSC/STAR_2.7.10b", + "bt2_indices": "NA", + "star_indices": "/ceph/project/milne_group/shared/seqnado_reference/hg38_spikein/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/hg38_spikein/hg38_spikein.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/hg38_spikein/UCSC/genes/hg38_spikein_transcripts.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/hg38/hg38-blacklist.v2.bed.gz" }, "mm10": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/bt2_index/mm10", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/STAR_2.7.10b", + "bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/bt2_index/mm10", + "star_indices": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/sequence/mm10.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/mm10/UCSC/genes/mm10.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/mm10/mm10-blacklist.v2.bed.gz" }, "mm39": { - "bt2_index": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/bt2_index/mm39", - "star_index": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/STAR_2.7.10b", + "bt2_indices": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/bt2_index/mm39", + 
"star_indices": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/STAR_2.7.10b", "chromosome_sizes": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/sequence/mm39.chrom.sizes", "gtf": "/ceph/project/milne_group/shared/seqnado_reference/mm39/UCSC/genes/mm39.ncbiRefSeq.gtf", "blacklist": "/ceph/project/milne_group/shared/seqnado_reference/mm39/mm10-blacklist.v2.Liftover.mm39.bed.gz" diff --git a/seqnado/workflow/rules/align.smk b/seqnado/workflow/rules/align.smk index 9c00d802..94b9fdcc 100644 --- a/seqnado/workflow/rules/align.smk +++ b/seqnado/workflow/rules/align.smk @@ -1,41 +1,18 @@ import seqnado.utils as utils -if config["split_fastq"] == "False": - rule align_paired: - input: - fq1="seqnado_output/trimmed/{sample}_1.fastq.gz", - fq2="seqnado_output/trimmed/{sample}_2.fastq.gz", - params: - index=config["genome"]["indicies"], - options=utils.check_options(config["bowtie2"]["options"]), - output: - bam=temp("seqnado_output/aligned/raw/{sample}.bam"), - threads: config["bowtie2"]["threads"] - resources: - mem_mb=4000, - log: - "seqnado_output/logs/align/{sample}.log", - shell: - """bowtie2 -p {threads} -x {params.index} -1 {input.fq1} -2 {input.fq2} {params.options} 2> {log} | - samtools view -bS - > {output.bam} && - samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} >> {log} 2>&1 && - mv {output.bam}_sorted {output.bam} - """ - - rule align_paired: input: fq1="seqnado_output/trimmed/{sample}_1.fastq.gz", fq2="seqnado_output/trimmed/{sample}_2.fastq.gz", params: - index=config["genome"]["indicies"], + index=config["genome"]["indices"], options=utils.check_options(config["bowtie2"]["options"]), output: bam=temp("seqnado_output/aligned/raw/{sample}.bam"), threads: config["bowtie2"]["threads"] resources: + time=lambda wildcards, attempt: "0-{hours}:00:00".format(hours=4 * 2**(attempt-1)), mem_mb=4000, - time="0-04:00:00", log: "seqnado_output/logs/align/{sample}.log", shell: @@ -45,16 +22,16 @@ rule align_paired: mv 
{output.bam}_sorted {output.bam} """ - rule align_single: input: fq1="seqnado_output/trimmed/{sample}.fastq.gz", params: - index=config["genome"]["indicies"], + index=config["genome"]["indices"], options=utils.check_options(config["bowtie2"]["options"]), output: bam=temp("seqnado_output/aligned/raw/{sample}.bam"), resources: + time=lambda wildcards, attempt: "0-{hours}:00:00".format(hours=4 * 2**(attempt-1)), mem_mb=4000, threads: config["bowtie2"]["threads"] log: diff --git a/seqnado/workflow/rules/align_rna.smk b/seqnado/workflow/rules/align_rna.smk index f599f0c1..2d1f3990 100644 --- a/seqnado/workflow/rules/align_rna.smk +++ b/seqnado/workflow/rules/align_rna.smk @@ -6,7 +6,7 @@ rule align_paired: fq1="seqnado_output/trimmed/{sample}_1.fastq.gz", fq2="seqnado_output/trimmed/{sample}_2.fastq.gz", params: - index=config["genome"]["indicies"], + index=config["genome"]["indices"], options=utils.check_options(config["star"]["options"]), prefix="seqnado_output/aligned/star/{sample}_" output: diff --git a/seqnado/workflow/rules/alignment_post_processing.smk b/seqnado/workflow/rules/alignment_post_processing.smk index 09097ce8..60638cdf 100644 --- a/seqnado/workflow/rules/alignment_post_processing.smk +++ b/seqnado/workflow/rules/alignment_post_processing.smk @@ -11,12 +11,14 @@ rule sort_bam: threads: 8 log: "seqnado_output/logs/sorted/{sample}.log", - shell:""" + shell: + """ samtools sort {input.bam} -@ {threads} -o {output.bam} -m 900M && echo 'Sorted bam number of mapped reads:' > {log} 2>&1 && samtools view -f 2 -c {output.bam} >> {log} 2>&1 """ + rule index_bam: input: bam="seqnado_output/aligned/sorted/{sample}.bam", @@ -24,17 +26,22 @@ rule index_bam: bai=temp("seqnado_output/aligned/sorted/{sample}.bam.bai"), threads: 1 resources: - mem_mb=1000 - shell:"samtools index -@ {threads} -b {input.bam}" + mem_mb=1000, + shell: + "samtools index -@ {threads} -b {input.bam}" + if config["remove_blacklist"] and os.path.exists(config.get("blacklist", "")): + rule 
remove_blacklisted_regions: input: bam="seqnado_output/aligned/sorted/{sample}.bam", bai=rules.index_bam.output.bai, output: bam=temp("seqnado_output/aligned/blacklist_regions_removed/{sample}.bam"), - bai=temp("seqnado_output/aligned/blacklist_regions_removed/{sample}.bam.bai"), + bai=temp( + "seqnado_output/aligned/blacklist_regions_removed/{sample}.bam.bai" + ), threads: 1 params: blacklist=utils.check_options(config["blacklist"]), @@ -43,7 +50,8 @@ if config["remove_blacklist"] and os.path.exists(config.get("blacklist", "")): time="24:00:00", log: "seqnado_output/logs/blacklist/{sample}.log", - shell:""" + shell: + """ bedtools intersect -v -b {params.blacklist} -a {input.bam} > {output.bam} && samtools index -b {output.bam} -o {output.bai} && echo "Removed blacklisted regions" > {log} && @@ -52,19 +60,23 @@ if config["remove_blacklist"] and os.path.exists(config.get("blacklist", "")): """ else: + rule ignore_blacklisted_regions: - input: - bam="seqnado_output/aligned/sorted/{sample}.bam", - bai=rules.index_bam.output.bai, - output: - bam=temp("seqnado_output/aligned/blacklist_regions_removed/{sample}.bam"), - bai=temp("seqnado_output/aligned/blacklist_regions_removed/{sample}.bam.bai"), - threads: 1 - resources: - mem_mb=3000 - log: - "seqnado_output/logs/blacklist/{sample}.log", - shell:""" + input: + bam="seqnado_output/aligned/sorted/{sample}.bam", + bai=rules.index_bam.output.bai, + output: + bam=temp("seqnado_output/aligned/blacklist_regions_removed/{sample}.bam"), + bai=temp( + "seqnado_output/aligned/blacklist_regions_removed/{sample}.bam.bai" + ), + threads: 1 + resources: + mem_mb=3000, + log: + "seqnado_output/logs/blacklist/{sample}.log", + shell: + """ mv {input.bam} {output.bam} && mv {input.bai} {output.bai} && echo "No blacklisted regions specified" > {log} && @@ -74,6 +86,7 @@ else: if config["remove_pcr_duplicates_method"] == "picard": + rule remove_duplicates_using_picard: input: 
bam="seqnado_output/aligned/blacklist_regions_removed/{sample}.bam", @@ -84,20 +97,22 @@ if config["remove_pcr_duplicates_method"] == "picard": metrics=temp("seqnado_output/aligned/duplicates_removed/{sample}.metrics"), threads: 8 params: - options=utils.check_options(config['picard']['options']), + options=utils.check_options(config["picard"]["options"]), resources: mem_mb=5000, time="24:00:00", log: "seqnado_output/logs/duplicates/{sample}.log", - shell: """ - picard MarkDuplicates -I {input.bam} -O {output.bam} -M {output.metrics} --REMOVE_DUPLICATES true --CREATE_INDEX true {params.options} > {log} 2>&1 && + shell: + """ + picard MarkDuplicates -I {input.bam} -O {output.bam} -M {output.metrics} --REMOVE_DUPLICATES true --CREATE_INDEX true {params.options} > {log} 2>&1 && mv seqnado_output/aligned/duplicates_removed/{wildcards.sample}.bai {output.bai} && echo 'duplicates_removed bam number of mapped reads:' >> {log} 2>&1 && samtools view -f 2 -c {output.bam} >> {log} 2>&1 """ else: + rule handle_duplicates: input: bam="seqnado_output/aligned/blacklist_regions_removed/{sample}.bam", @@ -107,68 +122,83 @@ else: bai=temp("seqnado_output/aligned/duplicates_removed/{sample}.bam.bai"), threads: 8 resources: - mem_mb=500 + mem_mb=500, log: "seqnado_output/logs/duplicates/{sample}.log", script: "../scripts/remove_duplicates.py" + if config["shift_atac_reads"]: + rule shift_atac_alignments: input: bam="seqnado_output/aligned/duplicates_removed/{sample}.bam", bai="seqnado_output/aligned/duplicates_removed/{sample}.bam.bai", output: bam=temp("seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam"), - bai=temp("seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.bai"), - tmp=temp("seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.tmp"), + bai=temp( + "seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.bai" + ), + tmp=temp( + "seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.tmp" + ), resources: - mem_mb=2500 + 
mem_mb=2500, threads: 1 log: "seqnado_output/logs/atac_shift/{sample}.log", - shell: """ - rsbamtk shift -b {input.bam} -o {output.tmp} && - samtools sort {output.tmp} -@ {threads} -o {output.bam} && - samtools index {output.bam} && + shell: + """ + rsbamtk shift -b {input.bam} -o {output.tmp} && + samtools sort {output.tmp} -@ {threads} -o {output.bam} && + samtools index {output.bam} && echo 'Shifted reads' > {log} 2>&1 && samtools view -f 2 -c {output.bam} >> {log} 2>&1 """ - + else: + rule move_bam_to_temp_location: input: bam="seqnado_output/aligned/duplicates_removed/{sample}.bam", bai="seqnado_output/aligned/duplicates_removed/{sample}.bam.bai", output: bam=temp("seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam"), - bai=temp("seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.bai"), + bai=temp( + "seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.bai" + ), threads: 1 log: "seqnado_output/logs/atac_shift/{sample}.log", - shell: """ - echo 'Will not shift reads' > {log} && - mv {input.bam} {output.bam} && - mv {input.bam}.bai {output.bai} && + shell: + """ + echo 'Will not shift reads' > {log} && + mv {input.bam} {output.bam} && + mv {input.bam}.bai {output.bai} && echo 'Number of reads' >> {log} 2>&1 && samtools view -f 2 -c {output.bam} >> {log} 2>&1 """ + rule move_bam_to_final_location: input: bam="seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam", bai="seqnado_output/aligned/shifted_for_tn5_insertion/{sample}.bam.bai", output: - bam="seqnado_output/aligned/{sample,[A-Za-z0-9_\-]+}.bam", - bai="seqnado_output/aligned/{sample,[A-Za-z0-9_\-]+}.bam.bai", + bam="seqnado_output/aligned/{sample,[A-Za-z\\d\\-_]+}.bam", + bai="seqnado_output/aligned/{sample,[A-Za-z\\d\\-_]+}.bam.bai", log: "seqnado_output/logs/move_bam/{sample}.log", - shell:""" + shell: + """ mv {input.bam} {output.bam} && - mv {input.bai} {output.bai} && - echo "BAM moved to final location" > {log} && + mv {input.bai} {output.bai} && + echo "BAM 
moved to final location" > {log} && echo 'Number of reads' > {log} 2>&1 && samtools view -f 2 -c {output.bam} >> {log} 2>&1 """ -localrules: move_bam_to_final_location + +localrules: + move_bam_to_final_location, diff --git a/seqnado/workflow/rules/chip_refnorm.smk b/seqnado/workflow/rules/chip_refnorm.smk index 7ec79dc0..2d0f7045 100644 --- a/seqnado/workflow/rules/chip_refnorm.smk +++ b/seqnado/workflow/rules/chip_refnorm.smk @@ -50,7 +50,7 @@ rule align_paired_spikein: fq1="seqnado_output/trimmed/{sample}_1.fastq.gz", fq2="seqnado_output/trimmed/{sample}_2.fastq.gz", params: - index=config["genome"]["indicies"], + index=config["genome"]["indices"], options="--no-mixed --no-discordant", output: bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"), diff --git a/seqnado/workflow/rules/fastq_split.smk b/seqnado/workflow/rules/fastq_split.smk deleted file mode 100644 index 3b5d2b67..00000000 --- a/seqnado/workflow/rules/fastq_split.smk +++ /dev/null @@ -1,72 +0,0 @@ -import seqnado.utils as utils -PARTS=[str (x) for x in range(int(config["split_fastq_parts"]))] -if config["split_fastq"]: - if config["read_type"] == "paired": - rule split_fq: - input: - unpack(lambda wc: seqnado.utils.translate_fq_files(wc, samples=FASTQ_SAMPLES, paired=True)), - output: - expand("seqnado_output/fastq_split/{{sample}}_{part}_{read}.fastq.gz", part=PARTS, read=["1", "2"]), - params: - split1=expand("-o seqnado_output/fastq_split/{{sample}}_{part}_1.fastq.gz", part=PARTS), - split2=expand("-o seqnado_output/fastq_split/{{sample}}_{part}_2.fastq.gz", part=PARTS), - resources: - mem_mb=750, - shell:""" - fastqsplitter -i {input.fq1} {params.split1} && - fastqsplitter -i {input.fq2} {params.split2} - """ - - rule trimgalore_paired: - input: - split1="seqnado_output/fastq_split/{sample}_{part}_1.fastq.gz", - split2="seqnado_output/fastq_split/{sample}_{part}_2.fastq.gz", - output: - trimmed1=temp("seqnado_output/trimmed/{sample}_{part}_1_trimmed.fq.gz"), - 
trimmed2=temp("seqnado_output/trimmed/{sample}_{part}_2_trimmed.fq.gz"), - threads: 16 - resources: - mem_mb=8000, - time="24:00:00", - params: - options=utils.check_options(config['trim_galore']['options']), - trim_dir="seqnado_output/trimmed" - log:"seqnado_output/logs/trimming/{sample}_{part}.log", - shell:""" - trim_galore --cores {threads} {params.options} --basename {wildcards.sample}_{wildcards.part} --paired --output_dir {params.trim_dir} {input.split1} {input.split2} >> {log} 2>&1 && - mv {params.trim_dir}/{wildcards.sample}_{wildcards.part}_val_1.fq.gz {output.trimmed1} && - mv {params.trim_dir}/{wildcards.sample}_{wildcards.part}_val_2.fq.gz {output.trimmed2} - """ - - rule align_split: - input: - fq1="seqnado_output/trimmed/{sample}_{part}_1_trimmed.fq.gz", - fq2="seqnado_output/trimmed/{sample}_{part}_2_trimmed.fq.gz", - output: - bam=temp("seqnado_output/aligned/split/{sample}_{part}.bam"), - params: - index=config["genome"]["indicies"], - options=utils.check_options(config["bowtie2"]["options"]), - threads: config["bowtie2"]["threads"] - resources: - mem_mb=4000 * int(config["bowtie2"]["threads"]), - time="24:00:00", - log:"seqnado_output/logs/aligned/split/{sample}_part{part}.log", - shell:""" - bowtie2 -p {threads} -x {params.index} -1 {input.fq1} -2 {input.fq2} {params.options} 2> {log} | - samtools view -bS - > {output.bam} && - samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} >> {log} 2>&1 && - mv {output.bam}_sorted {output.bam} - """ - - - rule merge_bams: - input: - expand("seqnado_output/aligned/split/{{sample}}_{part}.bam", part=PARTS), - output: - bam=temp("seqnado_output/aligned/raw/{sample}.bam"), - threads: 8 - log:"seqnado_output/logs/merge/{sample}.log", - shell:""" - samtools merge -o {output.bam} -@ {threads} -h {input} >> {log} 2>&1 - """ diff --git a/seqnado/workflow/rules/hub.smk b/seqnado/workflow/rules/hub.smk index fbd87868..707920be 100644 --- a/seqnado/workflow/rules/hub.smk +++ b/seqnado/workflow/rules/hub.smk 
@@ -112,7 +112,7 @@ rule bed_to_bigbed: resources: mem_mb=500, log: - "seqnado_output/logs/bed_to_bigbed/{directory}_{sample}.log", + "seqnado_output/logs/bed_to_bigbed/{directory}/{sample}.log", shell: """ sort -k1,1 -k2,2n {input.bed} | grep '#' -v > {input.bed}.tmp && @@ -137,7 +137,7 @@ rule generate_hub: output: hub=get_hub_txt_path(), log: - log=f"seqnado_output/logs/{config['ucsc_hub_details']['name']}.hub.log", + log=f"seqnado_output/logs/{config['ucsc_hub_details']['name']}.hub.log".strip(), container: None params: diff --git a/seqnado/workflow/rules/peak_call.smk b/seqnado/workflow/rules/peak_call_chip.smk similarity index 77% rename from seqnado/workflow/rules/peak_call.smk rename to seqnado/workflow/rules/peak_call_chip.smk index e88200c3..15c14bf3 100644 --- a/seqnado/workflow/rules/peak_call.smk +++ b/seqnado/workflow/rules/peak_call_chip.smk @@ -12,7 +12,7 @@ def get_lanceotron_threshold(wildcards): def get_control_bam(wildcards): exp = DESIGN.query(sample_name=wildcards.sample, ip=wildcards.treatment) - return "seqnado_output/alignments/{sample}_{exp.control}.bam" + return f"seqnado_output/aligned/{wildcards.sample}_{exp.control}.bam" def get_control_tag(wildcards): @@ -27,10 +27,10 @@ def get_control_bigwig(wildcards): rule macs2_with_input: input: - treatment="seqnado_output/alignments/{sample}_{treatment}.bam", + treatment="seqnado_output/aligned/{sample}_{treatment}.bam", control=get_control_bam, output: - peaks="seqnado_output/peaks/macs/{wildcards.treatment}.bed", + peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=seqnado.utils.check_options(config["macs"]["callpeak"]), narrow=lambda wc, output: output.peaks.replace(".bed", "_peaks.narrowPeak"), @@ -39,7 +39,7 @@ rule macs2_with_input: mem_mb=2000, time="0-02:00:00", log: - "seqnado_output/logs/macs/{wildcards.treatment}.bed", + "seqnado_output/logs/macs/{sample}_{treatment}.bed", shell: """ macs2 callpeak -t {input.treatment} -c {input.control} -n 
seqnado_output/peaks/macs/{wildcards.treatment} -f BAMPE {params.options} > {log} 2>&1 && @@ -49,9 +49,9 @@ rule macs2_with_input: rule macs2_no_input: input: - treatment="seqnado_output/alignments/{sample}_{treatment}.bam", + treatment="seqnado_output/aligned/{sample}_{treatment}.bam", output: - peaks="seqnado_output/peaks/macs/{treatment}.bed", + peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=seqnado.utils.check_options(config["macs"]["callpeak"]), narrow=lambda wc, output: output.peaks.replace(".bed", "_peaks.narrowPeak"), @@ -61,7 +61,7 @@ rule macs2_no_input: mem_mb=2000, time="0-02:00:00", log: - "seqnado_output/logs/macs/{treatment}.bed", + "seqnado_output/logs/macs/{sample}_{treatment}.bed", shell: """ macs2 callpeak -t {input.treatment} -n {params.basename} -f BAMPE {params.options} > {log} 2>&1 && @@ -74,9 +74,9 @@ rule homer_with_input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", control=get_control_tag, output: - peaks="seqnado_output/peaks/homer/{treatment}.bed", + peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: - "seqnado_output/logs/homer/{treatment}.bed", + "seqnado_output/logs/homer/{sample}_{treatment}.bed", params: options=seqnado.utils.check_options(config["homer"]["findpeaks"]), threads: 1 @@ -95,9 +95,9 @@ rule homer_no_input: input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", output: - peaks="seqnado_output/peaks/homer/{treatment}.bed", + peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: - "seqnado_output/logs/homer/{treatment}.bed", + "seqnado_output/logs/homer/{sample}_{treatment}.bed", params: options=seqnado.utils.check_options(config["homer"]["findpeaks"]), threads: 1 @@ -114,12 +114,12 @@ rule homer_no_input: rule lanceotron_with_input: input: - treatment="seqnado_output/bigwigs/deeptools/{treatment}.bigWig", + treatment="seqnado_output/bigwigs/deeptools/{sample}_{treatment}.bigWig", control=get_control_bigwig, output: - 
peaks="seqnado_output/peaks/lanceotron/{treatment}.bed", + peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: - "seqnado_output/logs/lanceotron/{treatment}.bed", + "seqnado_output/logs/lanceotron/{sample}_{treatment}.bed", params: threshold=get_lanceotron_threshold, outdir=lambda wc, output: os.path.dirname(output.peaks), @@ -138,11 +138,11 @@ rule lanceotron_no_input: input: - treatment="seqnado_output/bigwigs/deeptools/{treatment}.bigWig", + treatment="seqnado_output/bigwigs/deeptools/{sample}_{treatment}.bigWig", output: - peaks="seqnado_output/peaks/lanceotron/{treatment}.bed", + peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: - "seqnado_output/logs/lanceotron/{treatment}.bed", + "seqnado_output/logs/lanceotron/{sample}_{treatment}.bed", params: options=seqnado.utils.check_options(config["lanceotron"]["callpeak"]), outdir=lambda wc, output: os.path.dirname(output.peaks), @@ -155,7 +155,7 @@ rule lanceotron_no_input: shell: """ lanceotron callPeaks {input.treatment} -f {params.outdir} --skipheader {params.options} > {log} 2>&1 && - cat {params.outdir}/{wildcards.treatment}_L-tron.bed | cut -f 1-3 > {output.peaks} + cat {params.outdir}/{wildcards.sample}_{wildcards.treatment}_L-tron.bed | cut -f 1-3 > {output.peaks} """ diff --git a/seqnado/workflow/rules/peak_call_other.smk b/seqnado/workflow/rules/peak_call_other.smk new file mode 100644 index 00000000..81a2a4e1 --- /dev/null +++ b/seqnado/workflow/rules/peak_call_other.smk @@ -0,0 +1,76 @@ +from typing import Literal +import seqnado.utils +import re + + +def get_lanceotron_threshold(wildcards): + options = config["lanceotron"]["callpeak"] + threshold_pattern = re.compile(r"\-c\s+(\d+\.?\d*)") + threshold = threshold_pattern.search(options).group(1) + return threshold + + +rule macs2_no_input: + input: + treatment="seqnado_output/aligned/{sample}.bam", + output: + peaks="seqnado_output/peaks/macs/{sample}.bed", + params: + 
options=seqnado.utils.check_options(config["macs"]["callpeak"]), + narrow=lambda wc, output: output.peaks.replace(".bed", "_peaks.narrowPeak"), + basename=lambda wc, output: output.peaks.replace(".bed", ""), + threads: 1 + resources: + mem_mb=2000, + time="0-02:00:00", + log: + "seqnado_output/logs/macs/{sample}.bed", + shell: + """ + macs2 callpeak -t {input.treatment} -n {params.basename} -f BAMPE {params.options} > {log} 2>&1 && + cat {params.narrow} | cut -f 1-3 > {output.peaks} + """ + + +rule homer_no_input: + input: + treatment="seqnado_output/tag_dirs/{sample}", + output: + peaks="seqnado_output/peaks/homer/{sample}.bed", + log: + "seqnado_output/logs/homer/{sample}.bed", + params: + options=seqnado.utils.check_options(config["homer"]["findpeaks"]), + threads: 1 + resources: + mem_mb=4000, + time="0-02:00:00", + shell: + """ + findPeaks {input.treatment} {params.options} -o {output.peaks}.tmp > {log} 2>&1 && + pos2bed.pl {output.peaks}.tmp -o {output.peaks} >> {log} 2>&1 && + rm {output.peaks}.tmp + """ + + +rule lanceotron_no_input: + input: + treatment="seqnado_output/bigwigs/deeptools/{sample}.bigWig", + output: + peaks="seqnado_output/peaks/lanceotron/{sample}.bed", + log: + "seqnado_output/logs/lanceotron/{sample}.bed", + params: + options=seqnado.utils.check_options(config["lanceotron"]["callpeak"]), + outdir=lambda wc, output: os.path.dirname(output.peaks), + threads: 1 + container: + "library://asmith151/seqnado/seqnado_extra:latest" + resources: + mem_mb=10_000, + time="0-06:00:00", + shell: + """ + lanceotron callPeaks {input.treatment} -f {params.outdir} --skipheader {params.options} > {log} 2>&1 && + cat {params.outdir}/{wildcards.sample}_L-tron.bed | cut -f 1-3 > {output.peaks} + """ diff --git a/seqnado/workflow/rules/qc.smk b/seqnado/workflow/rules/qc.smk index 73eb1b7c..b2164846 100644 --- a/seqnado/workflow/rules/qc.smk +++ b/seqnado/workflow/rules/qc.smk @@ -19,6 +19,18 @@ rule fastqc_raw_paired: "v3.0.1/bio/fastqc" +rule
fastqc_raw_single: + input: + "seqnado_output/fastqs/{sample}.fastq.gz", + output: + html="seqnado_output/qc/fastqc_raw/{sample}.html", + zip="seqnado_output/qc/fastqc_raw/{sample}_fastqc.zip", # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + log: + "seqnado_output/logs/fastqc_raw/{sample}.log", + wrapper: + "v3.0.1/bio/fastqc" + + use rule fastqc_raw_paired as fastqc_trimmed_paired with: input: "seqnado_output/trimmed/{sample}_{read}.fastq.gz", @@ -29,6 +41,16 @@ use rule fastqc_raw_paired as fastqc_trimmed_paired with: "seqnado_output/logs/fastqc_trimmed/{sample}_{read}.log", +use rule fastqc_raw_single as fastqc_trimmed_single with: + input: + "seqnado_output/trimmed/{sample}.fastq.gz", + output: + html="seqnado_output/qc/fastqc_trimmed/{sample}.html", + zip="seqnado_output/qc/fastqc_trimmed/{sample}_fastqc.zip", # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + log: + "seqnado_output/logs/fastqc_trimmed/{sample}.log", + + rule samtools_stats: input: bam="seqnado_output/aligned/raw/{sample}.bam", @@ -47,29 +69,35 @@ use rule samtools_stats as samtools_stats_filtered with: output: stats="seqnado_output/qc/alignment_filtered/{sample}.txt", -if config["split_fastq"] == "False": - rule multiqc: - input: - expand( - "seqnado_output/qc/fastqc_raw/{sample}_{read}_fastqc.html", - sample=SAMPLE_NAMES, - read=[1, 2], - ), - expand( - "seqnado_output/qc/fastqc_trimmed/{sample}_{read}_fastqc.html", - sample=SAMPLE_NAMES, - read=[1, 2], - ), - expand("seqnado_output/qc/alignment_raw/{sample}.txt", sample=SAMPLE_NAMES), - expand("seqnado_output/qc/alignment_filtered/{sample}.txt", sample=SAMPLE_NAMES), - output: - "seqnado_output/qc/full_qc_report.html", - log: - "seqnado_output/logs/multiqc.log", - resources: - mem_mb=lambda wildcards, attempt: 2000 * 2**attempt, - shell: - "multiqc -o seqnado_output/qc 
seqnado_output/qc -n full_qc_report.html --force > {log} 2>&1" + + + +rule multiqc: + input: + expand( + "seqnado_output/qc/fastqc_raw/{sample}_{read}_fastqc.html", + sample=SAMPLE_NAMES, + read=[1, 2], + ), + expand( + "seqnado_output/qc/fastqc_trimmed/{sample}_{read}_fastqc.html", + sample=SAMPLE_NAMES, + read=[1, 2], + ), + expand("seqnado_output/qc/alignment_raw/{sample}.txt", sample=SAMPLE_NAMES), + expand( + "seqnado_output/qc/alignment_filtered/{sample}.txt", + sample=SAMPLE_NAMES, + ), + output: + "seqnado_output/qc/full_qc_report.html", + log: + "seqnado_output/logs/multiqc.log", + resources: + mem_mb=lambda wildcards, attempt: 2000 * 2**attempt, + shell: + "multiqc -o seqnado_output/qc seqnado_output/qc -n full_qc_report.html --force > {log} 2>&1" + def get_fastqc_files(*args, **kwargs): """Return a list of fastq files for a given sample name.""" @@ -99,13 +127,24 @@ rule multiqc_raw: "multiqc -o seqnado_output/qc seqnado_output/qc/fastqc_raw -n fastq_raw_qc.html --force > {log} 2>&1" +def get_trimmed_files(wc): + """Return a list of fastq files for a given sample name.""" + import pathlib + + fastqc_dir = pathlib.Path("seqnado_output/qc/fastqc_trimmed/") + + fastqc_files = [] + fq_files = pathlib.Path("seqnado_output/fastqs").glob("*.fastq.gz") + for fq_file in fq_files: + fastqc_file = fastqc_dir / (fq_file.stem.replace(".fastq", "") + ".html") + fastqc_files.append(str(fastqc_file)) + + return fastqc_files + + rule multiqc_trimmed: input: - expand( - "seqnado_output/qc/fastqc_trimmed/{sample}_{read}.html", - sample=SAMPLE_NAMES, - read=[1, 2], - ), + get_trimmed_files, output: "seqnado_output/qc/fastq_trimmed_qc.html", log: @@ -162,3 +201,6 @@ rule multiqc_library_complexity: mem_mb=lambda wildcards, attempt: 2000 * 2**attempt, shell: "multiqc -o seqnado_output/qc seqnado_output/aligned/duplicates_removed -n library_complexity_qc.html --force > {log} 2>&1" + + +ruleorder: fastqc_raw_paired > fastqc_raw_single > fastqc_trimmed_paired > 
fastqc_trimmed_single > samtools_stats > samtools_stats_filtered > multiqc_raw > multiqc_trimmed > multiqc_alignment_raw > multiqc_alignment_filtered > multiqc_library_complexity diff --git a/seqnado/workflow/snakefile_atac b/seqnado/workflow/snakefile_atac index f3ce29ef..048b2327 100644 --- a/seqnado/workflow/snakefile_atac +++ b/seqnado/workflow/snakefile_atac @@ -25,7 +25,7 @@ utils.format_config_dict(config) # Generate design if os.path.exists(config["design"]): - df = pd.read_csv(config["design"], sep="[\s+,\t]", engine="python") + df = pd.read_csv(config["design"], sep=r"\s+|,|\t", engine="python", index_col=0) DESIGN = Design.from_dataframe(df) else: DESIGN = Design.from_directory(".") @@ -41,7 +41,6 @@ ANALYSIS_OUTPUT = utils.define_output_files(sample_names=SAMPLE_NAMES, assay=ASSAY, **config) - ################### # Pipeline config # ################### @@ -50,12 +49,16 @@ include: "rules/fastq_trim.smk" include: "rules/qc.smk" include: "rules/align.smk" include: "rules/alignment_post_processing.smk" -include: "rules/peak_call.smk" +include: "rules/peak_call_other.smk" include: "rules/pileup.smk" include: "rules/heatmap.smk" include: "rules/hub.smk" + + + + rule all: input: ANALYSIS_OUTPUT diff --git a/seqnado/workflow/snakefile_chip b/seqnado/workflow/snakefile_chip index 8f48e9a3..db15922a 100644 --- a/seqnado/workflow/snakefile_chip +++ b/seqnado/workflow/snakefile_chip @@ -24,7 +24,7 @@ utils.format_config_dict(config) # Generate design if os.path.exists(config["design"]): - df = pd.read_csv(config["design"], sep="[\s+,\t]", engine="python") + df = pd.read_csv(config["design"], sep=r"\s+|,|\t", engine="python", index_col=0) DESIGN = DesignIP.from_dataframe(df) else: DESIGN = DesignIP.from_directory(".") @@ -48,7 +48,7 @@ include: "rules/qc.smk" include: "rules/fastq_trim.smk" include: "rules/align.smk" include: "rules/alignment_post_processing.smk" -include: "rules/peak_call.smk" +include: "rules/peak_call_chip.smk" include: "rules/pileup.smk" 
include: "rules/heatmap.smk" include: "rules/hub.smk" @@ -62,6 +62,7 @@ ANALYSIS_OUTPUT = seqnado.utils.define_output_files( **config ) + if config["spikein"]: include: "rules/chip_refnorm.smk" include: "rules/normalisation.smk" @@ -74,10 +75,7 @@ if config["spikein"]: # Define wildcard constraints wildcard_constraints: - read = r"[12]", - sample = "|".join(SAMPLE_NAMES), - treatment= "|".join(SAMPLE_NAMES_IP), - control = "|".join(SAMPLE_NAMES_CONTROL), + treatment= "|".join(IP), diff --git a/seqnado/workflow/snakefile_rna b/seqnado/workflow/snakefile_rna index dd3d6849..6cafa0c3 100644 --- a/seqnado/workflow/snakefile_rna +++ b/seqnado/workflow/snakefile_rna @@ -29,7 +29,7 @@ utils.format_config_dict(config) # Generate design if os.path.exists(config["design"]): - df = pd.read_csv(config["design"], sep="[\s+,\t]", engine="python", index_col=0) + df = pd.read_csv(config["design"], sep=r"\s+|,|\t", engine="python", index_col=0) DESIGN = Design.from_dataframe(df) else: DESIGN = Design.from_directory(".") diff --git a/seqnado/workflow/snakefile_snp b/seqnado/workflow/snakefile_snp index 936635b7..16a26bfd 100755 --- a/seqnado/workflow/snakefile_snp +++ b/seqnado/workflow/snakefile_snp @@ -34,12 +34,8 @@ else: DESIGN = FASTQ_SAMPLES.design SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all -if config["split_fastq"]: - include: "rules/fastq_split.smk" -else: - include: "rules/fastq_trim.smk" - include: "rules/align.smk" - +include: "rules/fastq_trim.smk" +include: "rules/align.smk" include: "rules/alignment_post_processing.smk" include: "rules/hub.smk" include: "rules/qc.smk" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index bd476084..00000000 --- a/setup.cfg +++ /dev/null @@ -1,35 +0,0 @@ -[metadata] -name = seqnado -author = asmith, cchahrour -author_email = alastair.smith@ndcls.ox.ac.uk -description = Pipelines for ATAC-seq, ChIP-seq and RNA-seq analysis -readme = "README.md" -license = GNU GENERAL PUBLIC LICENSE Version 3 -license_file = LICENSE -url = 
https://github.com/alsmith151/seqnado - - -[options] -zip_safe = False -include_package_data = True -packages=find: -install_requires= - click - cookiecutter - seaborn - wget - pyyaml - pandas - snakemake<=7.32.3 - tracknado - - -[options.entry_points] -console_scripts = - seqnado-config = seqnado.cli:cli_config - seqnado-design = seqnado.cli:cli_design - seqnado = seqnado.cli:cli_pipeline - -[options.extras_require] -atac = - # rsbamtk @ git+https://github.com/alsmith151/rsbamtk.git diff --git a/tests/data/config/config_atac.yml b/tests/data/config/config_atac.yml deleted file mode 100644 index b2cd5bca..00000000 --- a/tests/data/config/config_atac.yml +++ /dev/null @@ -1,136 +0,0 @@ -################################# -# ATAC-seq Pipeline Config file # -################################# - -# This file contains the configuration for the ATAC-seq pipeline. -# The pipeline is run by the following command: -# seqnado atac -c N_CORES -# -# To use the singularity container (allows for running the pipeline with a minimal conda environment), -# you will also need to 'bind' paths to the container (this allows for folders outside the current directory to be used i.e. /t1-data). -# -# seqnado atac -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " -# -# To run all jobs on the cluster (highly recommended; these options are for slurm i.e. cbrg cluster): -# -# seqnado atac -c N_CORES --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=24:00:00 " -# -# Combining both singularity and slurm options: -# -# seqnado atac -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem} --time 24:00:00 " -# -# The keys marked as essential are required for the pipeline to run. -# The keys marked as optional can either be left as the default or adjusted if required. 
- -version: 1.0 - -################################### -# Essential configuration options # -################################### - -genome: - name: - GENOME_NAME - - chromosome_sizes: - CHROMOSOME_SIZES_FILE - - indicies: - INDICES_DIRECTORY - -################################### -# Optional configuration # -################################### - -design: - DESIGN - -# Allows for removal of specific regions from the bam file. Must supply a bed file. -blacklist: - BLACKLIST_FILE - -# Method(s) for bigWig generation -pileup_method: - - deeptools - -# Method(s) for peak calling -peak_calling_method: - - lanceotron - -# Tool to remove duplicates: None - do not remove, picard - picard markDuplicates, deeptools - deeptools alignmentSieve -remove_pcr_duplicates_method: - picard - -# Shift ATAC-seq reads to account for the transposon insertion site. -shift_atac_reads: - True - -ucsc_hub_details: - - # Location of publically accessible location on the server - directory: - HUB_DIRECTORY_PATH - - # Name for the hub (UCSC required) - name: - HUB_NAME - - # Short hub name (UCSC required) - short: - - - # Long hub name (UCSC required) - long: - - # Email address (UCSC required) - email: - alastair.smith@ndcls.ox.ac.uk - - color_by: - # Options: 'samplename', 'antibody', 'method' (i.e. pileup/peakcall method), - - samplename - -################################# -# Tool specific options # -################################# - -trim_galore: - threads: - 4 - options: - - -bowtie2: - threads: - 4 - options: - -X 2000 - -homer: - maketagdirectory: - - makebigwig: - - findpeaks: - -deeptools: - threads: - 8 - alignmentsieve: - --minMappingQuality 30 - - # Options passed to deeptools BamCoverage - # These need to be replaced - # e.g. 
--extendReads -bs 1 --normalizeUsing RPKM - bamcoverage: - --extendReads -bs 1 --normalizeUsing RPKM - -macs: - version: - 2 - callpeak: - -lanceotron: - # Options passed to callPeaks[Input] command - callpeak: - -c 0.5 diff --git a/tests/data/config/config_chip.yml b/tests/data/config/config_chip.yml deleted file mode 100644 index 003c3898..00000000 --- a/tests/data/config/config_chip.yml +++ /dev/null @@ -1,145 +0,0 @@ -################################# -# ChIP-seq Pipeline Config file # -################################# - -# This file contains the configuration for the ChIP-seq pipeline. -# The pipeline is run by the following command: -# seqnado chip -c N_CORES -# -# To use the singularity container (allows for running the pipeline with a minimal conda environment), -# you will also need to 'bind' paths to the container (this allows for folders outside the current directory to be used i.e. /t1-data). -# -# seqnado chip -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " -# -# To run all jobs on the cluster (highly recommended; these options are for slurm i.e. cbrg cluster): -# -# seqnado chip -c N_CORES --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=24:00:00 " -# -# Combining both singularity and slurm options: -# -# seqnado chip -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem} --time 24:00:00 " -# -# The keys marked as essential are required for the pipeline to run. -# The keys marked as optional can either be left as the default or adjusted if required. 
- -version: 1.0 - -################################### -# Essential configuration options # -################################### - -genome: - name: - GENOME_NAME - - chromosome_sizes: - CHROMOSOME_SIZES_FILE - - indicies: - INDICES_DIRECTORY - - -################################### -# Optional configuration # -################################### - -# Allows for specification of an experimental design. Expecting columns (comma, tab or space sep): sample, fq1, fq2, control -design: - DESIGN_FILE - - -# Allows for removal of specific regions from the bam file. Must supply a bed file. -blacklist: - BLACKLIST_FILE - -# Method(s) for bigWig generation -pileup_method: - - deeptools - -# Method(s) for peak calling -peak_calling_method: - - lanceotron - -# Tool to remove duplicates: None - do not remove, picard - picard markDuplicates, deeptools - deeptools alignmentSieve -remove_pcr_duplicates_method: - deeptools - -ucsc_hub_details: - - # Location of publically accessible location on the server - directory: - HUB_DIRECTORY_PATH - - # Name for the hub (UCSC required) - name: - HUB_NAME - - # Short hub name (UCSC required) - short: - - - # Long hub name (UCSC required) - long: - - # Email address (UCSC required) - email: - alastair.smith@ndcls.ox.ac.uk - - color_by: - # Options: 'samplename', 'antibody', 'method' (i.e. pileup/peakcall method), - - samplename - - antibody - -################################# -# Tool specific options # -################################# - -trim_galore: - threads: - 4 - options: - - -bowtie2: - threads: - 4 - options: - - -homer: - use_input: - true - - maketagdirectory: - - makebigwig: - - findpeaks: - -deeptools: - threads: - 8 - alignmentsieve: - --minMappingQuality 30 - - - # Options passed to deeptools BamCoverage - # These need to be replaced - # e.g. 
--extendReads -bs 1 --normalizeUsing RPKM - bamcoverage: - --extendReads -bs 1 --normalizeUsing RPKM - -macs: - version: - 2 - callpeak: - -lanceotron: - - # Instructs lanceotron to use the matched input file for peak calling - # No effect if input file is not matched - use_input: - True - # Options passed to callPeaks[Input] command - callpeak: - -c 0.5 diff --git a/tests/data/config/config_rna.yml b/tests/data/config/config_rna.yml deleted file mode 100644 index 8394599e..00000000 --- a/tests/data/config/config_rna.yml +++ /dev/null @@ -1,129 +0,0 @@ -################################# -# RNA-seq Pipeline Config file # -################################# - -# This file contains the configuration for the ATAC-seq pipeline. -# The pipeline is run by the following command: -# seqnado rna -c N_CORES -# -# To use the singularity container (allows for running the pipeline with a minimal conda environment), -# you will also need to 'bind' paths to the container (this allows for folders outside the current directory to be used i.e. /t1-data). -# -# seqnado rna -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " -# -# To run all jobs on the cluster (highly recommended; these options are for slurm i.e. cbrg cluster): -# -# seqnado rna -c N_CORES --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=24:00:00 " -# -# Combining both singularity and slurm options: -# -# seqnado rna -c N_CORES --use-singularity --singularity-args "--bind /t1-data --bind /databank " --drmaa "--cpus-per-task={threads} --mem-per-cpu={resources.mem} --time 24:00:00 " -# -# The keys marked as essential are required for the pipeline to run. -# The keys marked as optional can either be left as the default or adjusted if required. 
- -version: 1.0 - -################################### -# Essential configuration options # -################################### - -genome: - name: - GENOME_NAME - - chromosome_sizes: - CHROMOSOME_SIZES_FILE - - indicies: - INDICES_DIRECTORY - - annotation: - GTF - - -################################### -# Optional configuration # -################################### - -design: - DESIGN - -# Allows for removal of specific regions from the bam file. Must supply a bed file. -blacklist: - BLACKLIST_FILE - -# Method(s) for bigWig generation -pileup_method: - - deeptools - -# Method(s) for peak calling -peak_calling_method: - - lanceotron - -# Tool to remove duplicates: None - do not remove, picard - picard markDuplicates, deeptools - deeptools alignmentSieve -remove_pcr_duplicates_method: - - picard - -ucsc_hub_details: - - # Location of publically accessible location on the server - directory: - HUB_DIRECTORY_PATH - - # Name for the hub (UCSC required) - name: - HUB_NAME - - # Short hub name (UCSC required) - short: - - - # Long hub name (UCSC required) - long: - - # Email address (UCSC required) - email: - alastair.smith@ndcls.ox.ac.uk - - color_by: - # Options: 'samplename', 'antibody', 'method' (i.e. pileup/peakcall method), - - samplename - -################################# -# Tool specific options # -################################# - -trim_galore: - threads: - 4 - options: - - -star: - threads: - 4 - options: - -featurecounts: - threads: - 4 - options: - -homer: - maketagdirectory: - - makebigwig: - - findpeaks: - -deeptools: - threads: - 8 - alignmentsieve: - --minMappingQuality 30 - - # Options passed to deeptools BamCoverage - # These need to be replaced - # e.g. 
-bs 1 --normalizeUsing RPKM - bamcoverage: diff --git a/tests/test_atac.py b/tests/test_atac.py index fdbe7e12..945b533a 100644 --- a/tests/test_atac.py +++ b/tests/test_atac.py @@ -57,10 +57,10 @@ def config_path(data_path): @pytest.fixture(scope="module") -def genome_indicies(genome_path): - indicies = os.path.join(genome_path, "bt2") +def genome_indices(genome_path): + indices = os.path.join(genome_path, "bt2") - if not os.path.exists(indicies): + if not os.path.exists(indices): try: import requests import tarfile @@ -75,16 +75,16 @@ def genome_indicies(genome_path): tar.extractall(path=genome_path) tar.close() os.remove(output) - os.rename(genome_path + "/bt2", indicies) + os.rename(genome_path + "/bt2", indices) except Exception as e: print(e) - print("Could not download indicies so generating them") - os.mkdir(indicies) - cmd = f"bowtie2-build {os.path.join(genome_path,'chr21_rename.fa')} {indicies}/bt2 --threads 8" + print("Could not download indices so generating them") + os.mkdir(indices) + cmd = f"bowtie2-build {os.path.join(genome_path,'chr21_rename.fa')} {indices}/bt2 --threads 8" subprocess.run(cmd.split()) - return os.path.join(indicies, "chr21") + return os.path.join(indices, "chr21") @pytest.fixture(scope="module") @@ -96,22 +96,20 @@ def run_directory(tmpdir_factory): @pytest.fixture(scope="module") def user_inputs( data_path, - genome_indicies, + genome_indices, chromsizes, ): return { "project_name": "test", "genome_name": "hg19", - "index": genome_indicies, + "indices": genome_indices, "chromsizes": chromsizes, "gtf": f"{data_path}/genome/chr21.gtf", "blacklist": f"{data_path}/genome/hg19-blacklist.v2.chr21.bed.gz", - "read_type": "paired", "remove_blacklist": "yes", "remove_pcr_duplicates": "yes", "remove_pcr_duplicates_method": "picard", "shift_atac_reads": "yes", - "split_fastq": "no", "make_bigwigs": "yes", "pileup_method": "deeptools", "make_heatmaps": "yes", @@ -123,20 +121,15 @@ def user_inputs( "color_by": "samplename", } + 
@pytest.fixture(scope="module") -def test_seqnado_config_creation( - run_directory, - user_inputs - ): +def test_seqnado_config_creation(run_directory, user_inputs): temp_dir = pathlib.Path(run_directory) date = datetime.now().strftime("%Y-%m-%d") config_file_path = temp_dir / f"{date}_atac_test/config_atac.yml" user_inputs = "\n".join(user_inputs.values()) - cmd = [ - "seqnado-config", - "atac" - ] + cmd = ["seqnado-config", "atac"] # Run the script with subprocess process = subprocess.Popen( @@ -145,7 +138,7 @@ def test_seqnado_config_creation( stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - cwd=temp_dir + cwd=temp_dir, ) stdout, stderr = process.communicate(input=user_inputs) @@ -153,6 +146,7 @@ def test_seqnado_config_creation( # Assert that the config file was created assert os.path.exists(config_file_path), "Config file not created." + @pytest.fixture(scope="module", autouse=True) def set_up( run_directory, @@ -169,13 +163,24 @@ def set_up( for fq in fastqs: shutil.copy(fq, ".") + # Add missing options to config file + import yaml + + with open("config_atac.yml", "r") as stream: + config = yaml.safe_load(stream) + config["peak_calling_method"] = ["lanceotron", "homer", "macs"] + config["pileup_method"] = ["deeptools", "homer"] + + with open("config_atac.yml", "w") as stream: + yaml.dump(config, stream) + yield os.chdir(cwd) def test_pipeline_singularity(genome_path, cores): - indicies_dir = os.path.join(genome_path, "bt2") + indices_dir = os.path.join(genome_path, "bt2") cmd = [ "seqnado", @@ -186,7 +191,7 @@ def test_pipeline_singularity(genome_path, cores): "config_atac.yml", "--use-singularity", "--singularity-args", - f'" -B {indicies_dir} -B {genome_path}"', + f'" -B {indices_dir} -B {genome_path}"', ] completed = subprocess.run(" ".join(cmd), shell=True) assert completed.returncode == 0 diff --git a/tests/test_chip.py b/tests/test_chip.py index 6c5837b8..4c1684ae 100644 --- a/tests/test_chip.py +++ b/tests/test_chip.py @@ -57,10 +57,10 @@ 
def config_path(data_path): @pytest.fixture(scope="module") -def genome_indicies(genome_path): - indicies = os.path.join(genome_path, "bt2") +def genome_indices(genome_path): + indices = os.path.join(genome_path, "bt2") - if not os.path.exists(indicies): + if not os.path.exists(indices): try: import requests import tarfile @@ -75,16 +75,16 @@ def genome_indicies(genome_path): tar.extractall(path=genome_path) tar.close() os.remove(output) - os.rename(genome_path + "/bt2", indicies) + os.rename(genome_path + "/bt2", indices) except Exception as e: print(e) - print("Could not download indicies so generating them") - os.mkdir(indicies) - cmd = f"bowtie2-build {os.path.join(genome_path,'chr21_rename.fa')} {indicies}/bt2 --threads 8" + print("Could not download indices so generating them") + os.mkdir(indices) + cmd = f"bowtie2-build {os.path.join(genome_path,'chr21_rename.fa')} {indices}/bt2 --threads 8" subprocess.run(cmd.split()) - return os.path.join(indicies, "chr21") + return os.path.join(indices, "chr21") @pytest.fixture(scope="module") @@ -96,22 +96,20 @@ def run_directory(tmpdir_factory): @pytest.fixture(scope="module") def user_inputs( data_path, - genome_indicies, + genome_indices, chromsizes, ): return { "project_name": "test", "genome_name": "hg19", - "index": genome_indicies, + "indices": genome_indices, "chromsizes": chromsizes, "gtf": f"{data_path}/genome/chr21.gtf", "blacklist": f"{data_path}/genome/hg19-blacklist.v2.chr21.bed.gz", - "read_type": "paired", "remove_blacklist": "yes", "remove_pcr_duplicates": "yes", "remove_pcr_duplicates_method": "picard", "spikein": "no", - "split_fastq": "no", "make_bigwigs": "yes", "pileup_method": "deeptools", "make_heatmaps": "yes", @@ -123,20 +121,15 @@ def user_inputs( "color_by": "samplename", } + @pytest.fixture(scope="module") -def test_seqnado_config_creation( - run_directory, - user_inputs - ): +def test_seqnado_config_creation(run_directory, user_inputs): temp_dir = pathlib.Path(run_directory) date = 
datetime.now().strftime("%Y-%m-%d") config_file_path = temp_dir / f"{date}_chip_test/config_chip.yml" user_inputs = "\n".join(user_inputs.values()) - cmd = [ - "seqnado-config", - "chip" - ] + cmd = ["seqnado-config", "chip"] # Run the script with subprocess process = subprocess.Popen( @@ -145,21 +138,17 @@ def test_seqnado_config_creation( stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - cwd=temp_dir + cwd=temp_dir, ) stdout, stderr = process.communicate(input=user_inputs) # Assert that the config file was created assert os.path.exists(config_file_path), "Config file not created." - + + @pytest.fixture(scope="module", autouse=True) -def set_up( - run_directory, - fastqs, - user_inputs, - test_seqnado_config_creation -): +def set_up(run_directory, fastqs, user_inputs, test_seqnado_config_creation): cwd = os.getcwd() os.chdir(run_directory) @@ -169,13 +158,24 @@ def set_up( for fq in fastqs: shutil.copy(fq, ".") + # Add missing options to config file + import yaml + + with open("config_chip.yml", "r") as stream: + config = yaml.safe_load(stream) + config["peak_calling_method"] = ["lanceotron", "homer", "macs"] + config["pileup_method"] = ["deeptools", "homer"] + + with open("config_chip.yml", "w") as stream: + yaml.dump(config, stream) + yield os.chdir(cwd) def test_pipeline_singularity(genome_path, cores): - indicies_dir = os.path.join(genome_path, "bt2") + indices_dir = os.path.join(genome_path, "bt2") cmd = [ "seqnado", @@ -186,7 +186,7 @@ def test_pipeline_singularity(genome_path, cores): "config_chip.yml", "--use-singularity", "--singularity-args", - f'" -B {indicies_dir} -B {genome_path}"', + f'" -B {indices_dir} -B {genome_path}"', ] completed = subprocess.run(" ".join(cmd), shell=True) assert completed.returncode == 0 diff --git a/tests/test_rna.py b/tests/test_rna.py index dddb7f56..04fe8837 100644 --- a/tests/test_rna.py +++ b/tests/test_rna.py @@ -57,12 +57,12 @@ def config_path(data_path): @pytest.fixture(scope="module") -def 
genome_indicies(genome_path): - indicies = os.path.join(genome_path, "GenomeDir") +def genome_indices(genome_path): + indices = os.path.join(genome_path, "GenomeDir") gtf = os.path.join(genome_path, "chr21.gtf") fasta = os.path.join(genome_path, "chr21_rename.fa") - if not os.path.exists(indicies): + if not os.path.exists(indices): try: import requests import tarfile @@ -77,16 +77,16 @@ def genome_indicies(genome_path): tar.extractall(path=genome_path) tar.close() os.remove(output) - os.rename(os.path.join(genome_path, "GenomeDir"), indicies) + os.rename(os.path.join(genome_path, "GenomeDir"), indices) except Exception as e: print(e) - print("Could not download indicies so generating them") - os.mkdir(indicies) + print("Could not download indices so generating them") + os.mkdir(indices) cmd = f"""STAR --runMode genomeGenerate --runThreadN 4 - --genomeDir {indicies} + --genomeDir {indices} --genomeFastaFiles {fasta} --sjdbGTFfile {gtf} --sjdbOverhang 100 @@ -94,7 +94,7 @@ def genome_indicies(genome_path): """ subprocess.run(cmd.split()) - return indicies + return indices @pytest.fixture(scope="module") @@ -107,20 +107,18 @@ def run_directory(tmpdir_factory): @pytest.fixture(scope="module") def user_inputs( data_path, - genome_indicies, + genome_indices, chromsizes, ): return { "project_name": "test", "genome_name": "hg19", - "index": genome_indicies, + "indices": genome_indices, "chromsizes": chromsizes, "gtf": f"{data_path}/genome/chr21.gtf", "blacklist": f"{data_path}/genome/hg19-blacklist.v2.chr21.bed.gz", - "read_type": "paired", "remove_blacklist": "yes", "remove_pcr_duplicates": "no", - "split_fastq": "no", "make_bigwigs": "yes", "pileup_method": "deeptools", "make_heatmaps": "yes", @@ -182,7 +180,7 @@ def set_up( os.chdir(cwd) -def test_pipeline_singularity(genome_path, genome_indicies, chromsizes, cores): +def test_pipeline_singularity(genome_path, genome_indices, chromsizes, cores): cmd = [ "seqnado", "rna", @@ -192,7 +190,7 @@ def 
test_pipeline_singularity(genome_path, genome_indicies, chromsizes, cores): "config_rna.yml", "--use-singularity", "--singularity-args", - f'" -B {genome_indicies} -B {genome_path} -B {chromsizes} "', + f'" -B {genome_indices} -B {genome_path} -B {chromsizes} "', ] completed = subprocess.run(" ".join(cmd), shell=True) assert completed.returncode == 0