diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6f67f9b..1ff5bf5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,7 @@ jobs: snakefile: workflow/Snakefile args: "--lint" + Testing: runs-on: ubuntu-latest needs: @@ -42,13 +43,12 @@ jobs: - name: Test workflow uses: snakemake/snakemake-github-action@v1.24.0 with: - directory: . - snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp" - + directory: '.' + snakefile: 'workflow/Snakefile' + args: "--dry-run --configfile .test/config/config.yml --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp" - name: Test report uses: snakemake/snakemake-github-action@v1.24.0 with: - directory: . - snakefile: workflow/Snakefile + directory: '.' + snakefile: 'workflow/Snakefile' - args: "--report report.zip" \ No newline at end of file + args: "--report report.zip --configfile .test/config/config.yml" \ No newline at end of file diff --git a/.test/config/config.yml b/.test/config/config.yml new file mode 100644 index 0000000..f5d25af --- /dev/null +++ b/.test/config/config.yml @@ -0,0 +1,6 @@ +data: + folder: ".test/data" + +reference: + folder: ".test/data" + genome: "placeholder.fa" \ No newline at end of file diff --git a/.test/data/placeholder.bam b/.test/data/placeholder.bam new file mode 100644 index 0000000..e69de29 diff --git a/.test/data/placeholder.fa b/.test/data/placeholder.fa new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index aab998b..0b708fe 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -# Snakemake workflow: `` +# Snakemake workflow: SNPs from RNA [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com///workflows/Tests/badge.svg?branch=main)](https://github.com///actions?query=branch%3Amain+workflow%3ATests) -A Snakemake workflow for `` +A Snakemake workflow for the call of Single Nucleotide Polymorphisms (SNPs) from RNA-seq data. 
+The workflow starts from aligned .bam files and outputs variants called by FreeBayes (or GATK's HaplotypeCaller) and annotated by VEP. ## Usage @@ -13,9 +14,3 @@ The usage of this workflow is described in the [Snakemake Workflow Catalog](http If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) sitory and its DOI (see above). -# TODO - -* Replace `` and `` everywhere in the template (also under .github/workflows) with the correct `` name and owning user or organization. -* Replace `` with the workflow name (can be the same as ``). -* Replace `` with a description of what the workflow does. -* The workflow will occur in the snakemake-workflow-catalog once it has been made public. Then the link under "Usage" will point to the usage instructions if `` and `` were correctly set. \ No newline at end of file diff --git a/config/config.yml b/config/config.yml index d01cdd2..63af5bb 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,2 +1,6 @@ data: - folder: "data_path" \ No newline at end of file + folder: "data_path" + +reference: + folder: "genome_folder" + genome: "genome.fa" \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 2de55c9..9bcfb63 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,15 +2,20 @@ import glob import os configfile: "config/config.yml" # Access the data folder from the config data_folder = config["data"]["folder"] - +reference_folder = config["reference"]["folder"] +genome_name = config["reference"]["genome"] +reference = os.path.join(reference_folder, genome_name) +reference_idx = f"{reference}.fai" +reference_dict = f"{reference}.dict" sample_files = glob.glob(os.path.join(data_folder, "*.bam")) samples = [os.path.basename(f).replace(".bam", "") for f in sample_files] + read_groups = [f"results/grouped/{sample}.bam" for sample in samples] deduped_files = [f"results/dedup/{sample}.bam" 
for sample in samples] @@ -18,7 +23,10 @@ deduped_files = [f"results/dedup/{sample}.bam" for sample in samples] rule all: input: deduped_files, + reference_idx, + reference_dict, include: "rules/add_or_replace_rg.smk" include: "rules/mark_duplicates.smk" +include: "rules/index_genome.smk" diff --git a/workflow/rules/index_genome.smk b/workflow/rules/index_genome.smk new file mode 100644 index 0000000..157d559 --- /dev/null +++ b/workflow/rules/index_genome.smk @@ -0,0 +1,28 @@ +# Rule to create the FASTA index using samtools (log is kept under logs/, not beside the reference genome) +rule samtools_faidx: + input: + reference, + output: + reference_idx, + log: + "logs/samtools/faidx.log", + params: + extra="", + wrapper: + "v3.12.1/bio/samtools/faidx" + + +# Rule to create the sequence dictionary using Picard +rule create_dict: + input: + reference, + output: + reference_dict, + log: + "logs/picard/create_dict.log", + params: + extra="", # Optional: extra arguments for picard. + resources: + mem_mb=1024, + wrapper: + "v3.12.1/bio/picard/createsequencedictionary"