-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSNAKEFILE
137 lines (122 loc) · 4.3 KB
/
SNAKEFILE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Snakefile
# Read-processing pipeline: filter/trim Read1 (extracting UMIs and PF
# indices), align Read1 and Read2 with Bowtie2, then merge alignments
# into per-sample position count files.
import os  # NOTE(review): 'os' is unused in the visible file — confirm it is needed before removing
# Note: this script assumes that you have cloned the repository and
# are running it from the base path in the repository
# =============================================================================
# Configuration
# =============================================================================
# Specify the config file (see example in repository for structure)
# Note that path is interpreted relative to working directory
# Not the path of the SNAKEFILE
configfile: "pipeline_config.yaml"
# Define paths to scripts (relative to the working directory)
FILTER_TRIM_SCRIPT = "scripts/filter_trim.py"
BOWTIE_SCRIPT = "scripts/run_bowtie.sh"
PROCESS_SAM_SCRIPT = "scripts/process_sam.py"
# Define Bowtie2 reference genome index prefix (read from the config file)
REFERENCE_GENOME = config["reference"]
# Define number of threads for Bowtie2 (read from the config file)
THREADS = config["threads"]
# =============================================================================
# Identify Samples
# =============================================================================
# Identify all samples based on Read1 FastQ files in the raw directory.
# glob_wildcards returns a namedtuple; the trailing comma unpacks its
# single 'sample' field into the SAMPLES list.
SAMPLES, = glob_wildcards("data/raw/{sample}_R1.fastq")
# =============================================================================
# Rules
# =============================================================================
rule all:
    """
    Default target of the workflow: request the merged position count file
    for every sample discovered under data/raw, which drives the whole
    filter -> align -> process chain.
    """
    input:
        expand("results/{sample}_merged.pos", sample=SAMPLES)
rule filter_trim_read1:
    """
    Rule: filter_trim_read1
    Description: Filters and trims Read1 FastQ files, extracting UMIs and PF
    indices. Produces the filtered FastQ plus the UMI list and PF index
    side files consumed later by process_sam.
    """
    input:
        read1="data/raw/{sample}_R1.fastq"
    output:
        filtered_fastq="data/filtered/filtered_{sample}_R1.fastq",
        umi="data/filtered/UMI_{sample}_R1.txt",
        indices_pf="data/filtered/PF_{sample}_R1.index"
    params:
        filter_trim_script=FILTER_TRIM_SCRIPT
    # If filter_trim.py needs extra options (e.g. --trim_length 50), append
    # them to the command below. Do NOT put a '#' comment inside the shell
    # string after a trailing backslash: the continuation splices the comment
    # onto the command line, and anything added there is silently ignored.
    shell:
        """
        python {params.filter_trim_script} \
            --input {input.read1} \
            --output_dir data/filtered
        """
rule run_bowtie_read1:
    """
    Rule: run_bowtie_read1
    Description: Aligns filtered Read1 FastQ files to the reference genome
    using Bowtie2. Outputs a SAM file for Read1.
    """
    input:
        read1_filtered="data/filtered/filtered_{sample}_R1.fastq"
    output:
        sam_r1="data/sam/filtered_{sample}_R1.sam"
    params:
        bowtie_script=BOWTIE_SCRIPT,
        reference_genome=REFERENCE_GENOME
    # Declare threads as a directive so the Snakemake scheduler reserves the
    # cores Bowtie2 actually uses; previously the count was only passed as a
    # shell flag, so concurrent jobs could oversubscribe the machine.
    threads: THREADS
    shell:
        """
        bash {params.bowtie_script} \
            --input {input.read1_filtered} \
            --output_dir data/sam \
            --reference {params.reference_genome} \
            --threads {threads}
        """
rule run_bowtie_read2:
    """
    Rule: run_bowtie_read2
    Description: Aligns raw Read2 FastQ files to the reference genome using
    Bowtie2. Outputs a SAM file for Read2.
    """
    input:
        read2="data/raw/{sample}_R2.fastq"
    output:
        sam_r2="data/sam/{sample}_R2.sam"
    params:
        bowtie_script=BOWTIE_SCRIPT,
        reference_genome=REFERENCE_GENOME
    # Declare threads so the scheduler reserves the cores Bowtie2 uses.
    threads: THREADS
    # BUG FIX: the shell command referenced {input.read1_filtered}, which is
    # not an input of this rule (its only input is 'read2'), so Snakemake
    # failed when formatting the command. The stray --input2 flag is dropped:
    # this rule aligns Read2 on its own, mirroring run_bowtie_read1.
    shell:
        """
        bash {params.bowtie_script} \
            --input {input.read2} \
            --output_dir data/sam \
            --reference {params.reference_genome} \
            --threads {threads}
        """
rule process_sam:
    """
    Rule: process_sam
    Description: Merges the paired SAM alignments for one sample, uses the
    UMI list and PF indices to discard PCR duplicates, and counts reads per
    TA site. Writes the per-sample merged position count file.
    """
    input:
        sam_r1="data/sam/filtered_{sample}_R1.sam",
        sam_r2="data/sam/{sample}_R2.sam",
        umi="data/filtered/UMI_{sample}_R1.txt",
        indices_pf="data/filtered/PF_{sample}_R1.index"
    output:
        merged_pos="results/{sample}_merged.pos"
    params:
        process_sam_script=PROCESS_SAM_SCRIPT
    # Command assembled via adjacent-string concatenation; the argument list
    # is identical to the triple-quoted form.
    shell:
        "python {params.process_sam_script}"
        " --sam_r1 {input.sam_r1}"
        " --sam_r2 {input.sam_r2}"
        " --umi_list {input.umi}"
        " --indices_pf {input.indices_pf}"
        " --output_dir results"