feat(rule): add Split N CIGAR rule #4

Merged: 17 commits, Jun 18, 2024
16 changes: 14 additions & 2 deletions .github/workflows/main.yml
@@ -31,6 +31,7 @@ jobs:
snakefile: workflow/Snakefile
args: "--lint"



Testing:
runs-on: ubuntu-latest
@@ -40,15 +41,26 @@
steps:
- uses: actions/checkout@v4


- name: Test workflow
uses: snakemake/[email protected]
with:
directory: '.'
directory: '.test'
snakefile: 'workflow/Snakefile'
args: "--dry-run --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
stagein: |
echo "Current working directory: $(pwd)"
echo "Listing contents of current directory:"
ls -R
echo "Listing contents of .test directory:"
ls -R .test
echo "Listing contents of .test/config directory:"
ls -R .test/config
echo "Listing contents of .test/data directory:"
ls -R .test/data
- name: Test report
uses: snakemake/[email protected]
with:
directory: '.'
directory: '.test'
snakefile: 'workflow/Snakefile'
args: "--report report.zip"
2 changes: 1 addition & 1 deletion config/config.yml
@@ -1,5 +1,5 @@
data:
folder: "data_path"
folder: "data_folder"

reference:
folder: "genome_folder"
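For context, a sketch of how the Snakefile presumably reads these keys; only the data access is implied by the diff below, and the reference access is an assumption.

# Snakefile-style sketch; `config` is provided by Snakemake via `configfile:`.
data_folder = config["data"]["folder"]            # now the directory named "data_folder"
reference_folder = config["reference"]["folder"]  # "genome_folder"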
8 changes: 4 additions & 4 deletions workflow/Snakefile
@@ -2,7 +2,7 @@ import glob
import os


configfile: ".test/config/config.yml"
configfile: "config/config.yml"


# Access the data folder from the config
@@ -18,15 +18,15 @@ samples = [os.path.basename(f).replace(".bam", "") for f in sample_files]

read_groups = [f"results/grouped/{sample}.bam" for sample in samples]
deduped_files = [f"results/dedup/{sample}.bam" for sample in samples]
splitted_files = [f"results/split/{sample}.bam" for sample in samples]


rule all:
input:
deduped_files,
reference_idx,
reference_dict,
splitted_files,


include: "rules/add_or_replace_rg.smk"
include: "rules/mark_duplicates.smk"
include: "rules/index_genome.smk"
include: "rules/split_n_cigar_reads.smk"
4 changes: 2 additions & 2 deletions workflow/rules/index_genome.smk
@@ -5,7 +5,7 @@ rule samtools_faidx:
output:
reference_idx,
log:
f"{reference}.log",
f"logs/create_idx.log",
params:
extra="",
wrapper:
@@ -19,7 +19,7 @@ rule create_dict:
output:
reference_dict,
log:
"logs/picard/create_dict.log",
"logs/create_dict.log",
params:
extra="", # Optional: extra arguments for picard.
resources:
5 changes: 1 addition & 4 deletions workflow/rules/mark_duplicates.smk
@@ -1,11 +1,8 @@
rule markduplicates_bam:
input:
bams="results/grouped/{sample}.bam",
# optional to specify a list of BAMs; this has the same effect
# of marking duplicates on separate read groups for a sample
# and then merging
output:
bam="results/dedup/{sample}.bam",
bam=temp("results/dedup/{sample}.bam"),
metrics="results/dedup/{sample}.metrics.txt",
log:
"logs/dedup_bam/{sample}.log",
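The substantive change in this file is wrapping the deduplicated BAM in temp(); as a standalone illustration of that directive (not code from this PR):

# Illustrative only: Snakemake deletes a temp() output once every job that
# consumes it (here, splitncigarreads in the next file) has finished.
rule temp_demo:
    output:
        temp("results/demo/{sample}.marker")
    shell:
        "touch {output}"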
17 changes: 17 additions & 0 deletions workflow/rules/split_n_cigar_reads.smk
@@ -0,0 +1,17 @@
rule splitncigarreads:
    input:
        bam="results/dedup/{sample}.bam",
        ref=reference,
        idx=reference_idx,
        dict=reference_dict,
    output:
        temp("results/split/{sample}.bam"),
    log:
        "logs/splitNCIGARreads/{sample}.log",
    params:
        extra="",  # optional
        java_mem_overhead_mb=512,  # Specify overhead for non-heap memory
    resources:
        mem_mb=4096,  # Total memory available for the rule
    wrapper:
        "v3.12.1/bio/gatk/splitncigarreads"