Merge pull request #3 from fcaretti/Add-rules

feat(rule): add index and dict to reference genome
fcaretti · Jun 14, 2024 · 5b71699 · 5b71699
2 parents 4437030 + 8cf3c10
commit 5b71699
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 17 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -31,6 +31,7 @@ jobs:
         snakefile: workflow/Snakefile
         args: "--lint"
 
+
   Testing:
     runs-on: ubuntu-latest
     needs: 
@@ -42,13 +43,12 @@ jobs:
     - name: Test workflow
       uses: snakemake/[email protected]
       with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
-
+        directory: '.'
+        snakefile: 'workflow/Snakefile'
+        args: "--dry-run --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
     - name: Test report
       uses: snakemake/[email protected]
       with:
-        directory: .
-        snakefile: workflow/Snakefile
+        directory: '.'
+        snakefile: 'workflow/Snakefile'
         args: "--report report.zip"
diff --git a/.test/config/config.yml b/.test/config/config.yml
@@ -0,0 +1,6 @@
+data:
+  folder: ".test/data"
+
+reference:
+  folder: ".test/data"
+  genome: "placeholder.fa"
diff --git a/.test/data/placeholder.bam b/.test/data/placeholder.bam
diff --git a/.test/data/placeholder.fa b/.test/data/placeholder.fa
diff --git a/README.md b/README.md
@@ -1,10 +1,11 @@
-# Snakemake workflow: `<name>`
+# Snakemake workflow: SNPs from RNA
 
 [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io)
 [![GitHub actions status](https://github.com/<owner>/<repo>/workflows/Tests/badge.svg?branch=main)](https://github.com/<owner>/<repo>/actions?query=branch%3Amain+workflow%3ATests)
 
 
-A Snakemake workflow for `<description>`
+A Snakemake workflow for calling single nucleotide polymorphisms (SNPs) from RNA-seq data.
+The workflow starts from aligned .bam files and outputs variants called by FreeBayes (or GATK's HaplotypeCaller) and annotated by VEP.
 
 
 ## Usage
@@ -13,9 +14,3 @@ The usage of this workflow is described in the [Snakemake Workflow Catalog](http
 
 If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this (original) repository and its DOI (see above).
 
-# TODO
-
-* Replace `<owner>` and `<repo>` everywhere in the template (also under .github/workflows) with the correct `<repo>` name and owning user or organization.
-* Replace `<name>` with the workflow name (can be the same as `<repo>`).
-* Replace `<description>` with a description of what the workflow does.
-* The workflow will occur in the snakemake-workflow-catalog once it has been made public. Then the link under "Usage" will point to the usage instructions if `<owner>` and `<repo>` were correctly set.
diff --git a/config/config.yml b/config/config.yml
@@ -1,2 +1,6 @@
 data:
-  folder: "data_path"
+  folder: "data_path"
+
+reference:
+  folder: "genome_folder"
+  genome: "genome.fa"
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -2,23 +2,31 @@ import glob
 import os
 
 
-configfile: "config/config.yml"
+configfile: ".test/config/config.yml"
 
 
 # Access the data folder from the config
 data_folder = config["data"]["folder"]
-
+# Reference genome location, assembled from the "reference" config section.
+reference_folder = config["reference"]["folder"]
+genome_name = config["reference"]["genome"]
+reference = os.path.join(reference_folder, genome_name)
+# The FASTA index sits next to the genome as "<fasta>.fai".
+reference_idx = f"{reference}.fai"
+# Picard/GATK look up the sequence dictionary with the FASTA extension
+# replaced (genome.fa -> genome.dict), not appended (genome.fa.dict).
+reference_dict = f"{os.path.splitext(reference)[0]}.dict"
 
 sample_files = glob.glob(os.path.join(data_folder, "*.bam"))
 samples = [os.path.basename(f).replace(".bam", "") for f in sample_files]
+
 read_groups = [f"results/grouped/{sample}.bam" for sample in samples]
 deduped_files = [f"results/dedup/{sample}.bam" for sample in samples]
 
 
 rule all:
     input:
         deduped_files,
+        reference_idx,
+        reference_dict,
 
 
 include: "rules/add_or_replace_rg.smk"
 include: "rules/mark_duplicates.smk"
+include: "rules/index_genome.smk"
diff --git a/workflow/rules/index_genome.smk b/workflow/rules/index_genome.smk
@@ -0,0 +1,28 @@
+# Build the FASTA index for the reference genome with the samtools faidx wrapper.
+rule samtools_faidx:
+    input:
+        reference,
+    output:
+        reference_idx,
+    log:
+        # Keep the log under logs/ (like create_dict) instead of next to the
+        # reference FASTA, whose folder may be shared or read-only.
+        "logs/samtools/faidx.log",
+    params:
+        extra="",  # Optional: extra arguments for samtools faidx.
+    wrapper:
+        "v3.12.1/bio/samtools/faidx"
+
+
+# Generate the reference sequence dictionary through the Picard wrapper.
+rule create_dict:
+    input:
+        reference,
+    output:
+        reference_dict,
+    resources:
+        mem_mb=1024,
+    params:
+        extra="",  # Additional Picard command-line arguments, if any.
+    log:
+        "logs/picard/create_dict.log",
+    wrapper:
+        "v3.12.1/bio/picard/createsequencedictionary"