#######################################################################
## MeBaPiNa - Meta Barcoding Analysis Pipeline for Nanopore Datasets ##
#######################################################################
## author: Marc Ruebsam ([email protected])
## description: Pipeline for automated analysis of 16S metabarcoding samples.
## SNAKEMAKE CONFIGURATIONS ##
##############################
## import packages
from snakemake.utils import min_version
import pandas as pd
## set minimum snakemake version
min_version("5.4")
## location of configuration and report specifications
report: "report/workflow.rst" #!# not yet implemented
## set working directory to specified data location
workdir: config["experiments"]["project"]
## prevent unwanted extension of wildcards
wildcard_constraints:
    tmp="[a-zA-Z0-9_]+/",
    barc_dir="[a-zA-Z0-9_]+/",
    barc="[a-zA-Z0-9]+",
    run=r"\w+", ## equal to [a-zA-Z0-9_]+
    reftype="[a-zA-Z0-9]+",
    reference="[a-zA-Z0-9]+"
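## illustrative only (hypothetical values): with these constraints a path like
## "tmp/00_raw_data/run01/" resolves to tmp="tmp/" and run="run01", while
## run="run/01" would be rejected because "/" does not match \w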
## METADATA AND THRESHOLDS ##
#############################
## load run information
METADATA = pd.read_excel(config["experiments"]["tmp"] + config["experiments"]["meta"], header=1)
## restrict the metadata to the requested samples
METADATA = METADATA.loc[ METADATA['Sample name'].isin(config["experiments"]["samples"]), : ]
## runs used to produce the samples
RUNS = [METADATA['Run ID'].value_counts().sort_values(ascending=False).keys()[0]] #!# only one run id can be used at a time
METADATA = METADATA.loc[ METADATA['Run ID'].isin(RUNS), : ]
## sample barcode information
SAMPLES = pd.Series(METADATA['Sample name'].values,index=METADATA['Barcode']).to_dict()
TIMEPOINTS = pd.Series(METADATA['Zeitpunkt'].values,index=METADATA['Barcode']).to_dict()
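## e.g. (hypothetical values): SAMPLES = {"barcode01": "PROM001"} and
## TIMEPOINTS = {"barcode01": "T1"}; both are keyed by barcode, with values
## taken from the 'Sample name' and 'Zeitpunkt' (German for "timepoint") columns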
## get run information
FLOWCELL = METADATA['Flow cell product'].unique()[0]
SEQ_KIT = METADATA['Sequencing kit'].unique()[0]
BAC_KIT = METADATA['Barcoding kit'].unique()[0]
LAM_DCS = METADATA['Lambda DCS'].unique()[0]
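## sanity-check sketch (an assumption, not original pipeline logic): the
## .unique()[0] calls above silently pick the first value if the selected run
## mixes kits or flow cells; uncomment to fail early instead
# for col in ['Flow cell product', 'Sequencing kit', 'Barcoding kit', 'Lambda DCS']:
#     assert METADATA[col].nunique(dropna=False) == 1, f"inconsistent metadata column: {col}"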
## path to the csv file logs are written to
LOGS = config["experiments"]["tmp"] + config["experiments"]["log"]
## sample depth for downsampling in some plots
PLOT_SMPL = config["filtering"]["plot_sample"]
## max read lengths in some plots
PLOT_MAXLEN = config["filtering"]["len_max"]
## PIPELINE RULES ##
###################
## load rule set
include: "rules/basecall.smk"
include: "rules/readqc.smk"
include: "rules/align.smk"
include: "rules/kmer.smk"
include: "rules/otu.smk"
include: "rules/misc.smk"
include: "rules/stats.smk"
include: "rules/plot.smk"
include: "rules/report.smk"
## TARGET RULE ##
#################
## input function collecting all requested output files for the target rule
def input_all_target(wildcards):
    from os import listdir
    ## get "pass" directory
    basecall_dir = checkpoints.basecall_raw.get(tmp=config["experiments"]["tmp"], run=RUNS[0]).output[0]
    ## get barcode directory names within "pass" directory (excludes any barcodes without assigned reads)
    all_barc = listdir(basecall_dir)
    ## retain only barcodes that match one of the barcodes selected in the metadata (drops unwanted barcodes)
    all_barc = [barc for barc in all_barc if barc in SAMPLES.keys()]
    all_barc.sort()
    ## report directories...
    all_barc_dir = (
        ## ...per PROMISE timepoint and sample as specified in the METADATA
        ["03_report/" + TPs + "/" + IDs + "/" + RUNs + "-" + barc + "/"
         for TPs, IDs, barc, RUNs in zip(TIMEPOINTS.values(), SAMPLES.values(), SAMPLES.keys(), METADATA['Run ID'])
         if ("PROM" in IDs) & (barc in all_barc)] +
        ## ...for all other samples specified in the METADATA
        ["03_report/" + "non-PROMISE_samples" + "/" + IDs + "/" + RUNs + "-" + barc + "/"
         for TPs, IDs, barc, RUNs in zip(TIMEPOINTS.values(), SAMPLES.values(), SAMPLES.keys(), METADATA['Run ID'])
         if ("PROM" not in IDs) & (barc in all_barc)])
    ## requested files...
    input_list = (
        ## LOGS ##
        ["{tmp}METADATA/ANALYSIS_PROGRESS_MANAGEMENT.csv"] +
        ## REFERENCE ##
        ["{tmp}03_report/Reference_Sequences/{reference}/reference_lengthdist.tsv", ## reference length distribution
         "{tmp}03_report/Reference_Sequences/{reference}/reference_lengthdist.pdf", ## reference length distribution
         "{tmp}03_report/Reference_Sequences/{reference}_{reftype}/reference_taxaranks.tsv"] + ## reference taxa distribution
        ## RAW READS ##
        ["{tmp}{barc_dir}read_base_counts.tsv"] + ## raw read statistics
        ## BASECALL ##
        ["{tmp}{barc_dir}01_basecalling-nanoplot-NanoPlot-report.html", ## general QC: all reads, including calibration strands, intentional downsampling
         "{tmp}{barc_dir}01_basecalling-nanoplot-NanoStats.txt", ## general QC: all reads, including calibration strands, intentional downsampling
         "{tmp}{barc_dir}01_basecalling-pycoqc-pycoQC_report.html", ## general QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-pycoqc-pycoQC_report.json", ## general QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-nanoqc-nanoQC.html", ## per base QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-fastqc-stdin_fastqc.html"] + ## read QC: all passed reads
        [x for x in
         ["{tmp}{barc_dir}01_basecalling-nanocomp-NanoComp-report.html", ## barcode QC: per barcode
          "{tmp}{barc_dir}01_basecalling-nanocomp-NanoStats.txt"] ## barcode QC: per barcode
         if BAC_KIT] + ## only if BAC_KIT is not ""
        ## TRIM AND FILTER ##
        ["{tmp}{barc_dir}02_trimming_filtering-nanoplot-NanoPlot-report.html", ## general QC: trimmed and filtered barcoded reads, intentional downsampling
         "{tmp}{barc_dir}02_trimming_filtering-nanoplot-NanoStats.txt", ## general QC: trimmed and filtered barcoded reads, intentional downsampling
         "{tmp}{barc_dir}02_trimming_filtering-nanoqc-nanoQC.html", ## per base QC: trimmed and filtered barcoded reads, forced downsampling
         "{tmp}{barc_dir}02_trimming_filtering-fastqc-stdin_fastqc.html"] + ## read QC: trimmed and filtered barcoded reads
        [x for x in
         ["{tmp}{barc_dir}02_trimming_filtering-nanocomp-NanoComp-report.html", ## barcode QC: trimmed and filtered barcoded reads
          "{tmp}{barc_dir}02_trimming_filtering-nanocomp-NanoStats.txt"] ## barcode QC: trimmed and filtered barcoded reads
         if BAC_KIT] + ## only if BAC_KIT is not ""
        ## OTU ##
        [x for x in
         ["{tmp}{barc_dir}03_otu_picking-{reference}-q2otupick-index.html", ## clustered reads
          "{tmp}{barc_dir}03_otu_picking-{reference}-q2filter-index.html", ## filtered reads
          "{tmp}{barc_dir}03_otu_picking-{reference}_{reftype}-krona.html", ## classified taxa
          "{tmp}{barc_dir}03_otu_picking-{reference}_{reftype}-kmer.counttaxlist", ## taxonomic classifications
          "{tmp}{barc_dir}03_otu_picking-{reference}_{reftype}-taxa_covdist.pdf", ## distribution of taxa abundance
          "{tmp}{barc_dir}03_otu_picking-{reference}-feature_counts.tsv", ## feature statistics
          "{tmp}{barc_dir}03_otu_picking-{reference}_{reftype}-taxa_counts.tsv", ## taxa statistics
          "{tmp}{barc_dir}03_otu_picking-{reference}_{reftype}-taxa_diversity.tsv"] ## diversity and richness measures
         if "otu" in config["methodologie"]] + ## only if "otu" is selected
        ## ALIGNMENT ##
        [x for x in
         ["{tmp}{barc_dir}03_alignment-{reference}-pycoqc.html", ## per barcode, intentional downsampling
          "{tmp}{barc_dir}03_alignment-{reference}-pycoqc.json", ## per barcode, intentional downsampling
          "{tmp}{barc_dir}03_alignment-{reference}-covdist.pdf", ## per barcode, reads per reference sequence histogram
          "{tmp}{barc_dir}03_alignment-{reference}-covpos.pdf", ## per barcode, coverage over reference sequence positions
          "{tmp}{barc_dir}03_alignment-{reference}_{reftype}-krona.html", ## taxonomic classification
          "{tmp}{barc_dir}03_alignment-{reference}_{reftype}-aligned.counttaxlist", ## taxonomic classification
          "{tmp}{barc_dir}03_alignment-{reference}_{reftype}-taxa_covdist.pdf", ## distribution of taxa abundance
          "{tmp}{barc_dir}03_alignment-{reference}-alignment_rates.tsv", ## alignment statistics
          "{tmp}{barc_dir}03_alignment-{reference}_{reftype}-taxa_counts.tsv", ## taxa statistics
          "{tmp}{barc_dir}03_alignment-{reference}_{reftype}-taxa_diversity.tsv"] ## diversity and richness measures
         if "align" in config["methodologie"]] + ## only if "align" is selected
        ## K-MER MAPPING ##
        [x for x in
         ["{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-krona.html", ## taxonomic composition
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-krona_bracken.html", ## taxonomic composition after reestimation
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-kmer.counttaxlist", ## taxonomic classification
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-taxa_covdist.pdf", ## distribution of taxa abundance
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-taxa_counts.tsv", ## taxa statistics
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-retaxa_counts.tsv", ## taxa statistics after abundance reestimation
          "{tmp}{barc_dir}03_kmer_mapping-{reference}_{reftype}-taxa_diversity.tsv"] ## diversity and richness measures
         if "kmer" in config["methodologie"]]) ## only if "kmer" is selected
    ## expand for all barcodes
    input_list = expand(input_list,
        tmp=config["experiments"]["tmp"],
        barc_dir=all_barc_dir,
        reference=config['reference']['source'],
        reftype=config['reference']['rank'])
    ## return
    return input_list
rule all_target:
    input:
        input_all_target
    shell:
        "echo \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" > \"{input[0]}.temp\"; " ## write the header to a temporary file
        "grep -v \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" \"{input[0]}\" | sort -n | uniq >> \"{input[0]}.temp\"; " ## append the sorted, deduplicated log lines
        "cat \"{input[0]}.temp\" > \"{input[0]}\"; rm \"{input[0]}.temp\"" ## write the result back and remove the temporary file
## ONLY BASECALL ##
###################
## input function for the basecalling-only target rule
def input_only_basecall(wildcards):
    from os import listdir
    ## get "pass" directory
    basecall_dir = checkpoints.basecall_raw.get(tmp=config["experiments"]["tmp"], run=RUNS[0]).output[0]
    ## get barcode directory names within "pass" directory (excludes any barcodes without assigned reads)
    all_barc = listdir(basecall_dir)
    ## retain only barcodes that match one of the barcodes selected in the metadata (drops unwanted barcodes)
    all_barc = [barc for barc in all_barc if barc in SAMPLES.keys()]
    all_barc.sort()
    ## report directories...
    all_barc_dir = (
        ## ...per PROMISE timepoint and sample as specified in the METADATA
        ["03_report/" + TPs + "/" + IDs + "/" + RUNs + "-" + barc + "/"
         for TPs, IDs, barc, RUNs in zip(TIMEPOINTS.values(), SAMPLES.values(), SAMPLES.keys(), METADATA['Run ID'])
         if ("PROM" in IDs) & (barc in all_barc)] +
        ## ...for all other samples specified in the METADATA
        ["03_report/" + "non-PROMISE_samples" + "/" + IDs + "/" + RUNs + "-" + barc + "/"
         for TPs, IDs, barc, RUNs in zip(TIMEPOINTS.values(), SAMPLES.values(), SAMPLES.keys(), METADATA['Run ID'])
         if ("PROM" not in IDs) & (barc in all_barc)])
    ## requested files...
    input_list = (
        ## LOGS ##
        ["{tmp}METADATA/ANALYSIS_PROGRESS_MANAGEMENT.csv"] +
        ## RAW READS ##
        ["{tmp}{barc_dir}read_base_counts.tsv"] + ## raw read statistics
        ## BASECALL ##
        ["{tmp}{barc_dir}01_basecalling-nanoplot-NanoPlot-report.html", ## general QC: all reads, including calibration strands, intentional downsampling
         "{tmp}{barc_dir}01_basecalling-nanoplot-NanoStats.txt", ## general QC: all reads, including calibration strands, intentional downsampling
         "{tmp}{barc_dir}01_basecalling-pycoqc-pycoQC_report.html", ## general QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-pycoqc-pycoQC_report.json", ## general QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-nanoqc-nanoQC.html", ## per base QC: all reads, forced downsampling
         "{tmp}{barc_dir}01_basecalling-fastqc-stdin_fastqc.html"] + ## read QC: all passed reads
        [x for x in
         ["{tmp}{barc_dir}01_basecalling-nanocomp-NanoComp-report.html", ## barcode QC: per barcode
          "{tmp}{barc_dir}01_basecalling-nanocomp-NanoStats.txt"] ## barcode QC: per barcode
         if BAC_KIT]) ## only if BAC_KIT is not ""
    ## expand for all barcodes
    input_list = expand(input_list,
        tmp=config["experiments"]["tmp"],
        barc_dir=all_barc_dir)
    ## return
    return input_list
rule only_basecall:
    input:
        input_only_basecall
    shell:
        "echo \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" > \"{input[0]}.temp\"; " ## write the header to a temporary file
        "grep -v \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" \"{input[0]}\" | sort -n | uniq >> \"{input[0]}.temp\"; " ## append the sorted, deduplicated log lines
        "cat \"{input[0]}.temp\" > \"{input[0]}\"; rm \"{input[0]}.temp\"" ## write the result back and remove the temporary file
## UPDATE REPORT ##
###################
## input function for updating all report lines in the log file
def input_update_report(wildcards):
    from os import listdir
    ## get "pass" directory
    basecall_dir = checkpoints.basecall_raw.get(tmp=config["experiments"]["tmp"], run=RUNS[0]).output[0]
    ## get barcode directory names within "pass" directory (excludes any barcodes without assigned reads)
    all_barc = listdir(basecall_dir)
    ## retain only barcodes that match one of the barcodes selected in the metadata (drops unwanted barcodes)
    all_barc = [barc for barc in all_barc if barc in SAMPLES.keys()]
    all_barc.sort()
    ## requested files...
    input_list = (
        ## LOGS ##
        ["{tmp}METADATA/ANALYSIS_PROGRESS_MANAGEMENT.csv"] +
        ## RAW READS ##
        ["{tmp}00_raw_data/{run}/MeBaPiNa_move_raw.report"] + ## REPORT
        ## BASECALL ##
        ["{tmp}00_raw_data/{run}/MeBaPiNa_basecall_raw_seqsum.report", ## REPORT
         "{tmp}00_raw_data/{run}/MeBaPiNa_basecall_raw_pass.report"] + ## REPORT
        ## TRIM AND FILTER ##
        ["{tmp}01_processed_data/02_trimming_filtering/{run}/{barc}/MeBaPiNa_trim_basecalled.report", ## REPORT
         "{tmp}01_processed_data/02_trimming_filtering/{run}/{barc}/MeBaPiNa_filter_trimmed.report"] + ## REPORT
        ## OTU ##
        [x for x in
         ["{tmp}01_processed_data/03_otu_picking/{run}/{barc}/{reference}/MeBaPiNa_q2filter_uchime_ftable.report", ## REPORT
          "{tmp}01_processed_data/03_otu_picking/{run}/{barc}/{reference}/MeBaPiNa_q2filter_uchime_centseq.report", ## REPORT
          "{tmp}02_analysis_results/03_otu_picking/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_kmermap_q2rereplicate.report", ## REPORT
          "{tmp}02_analysis_results/03_otu_picking/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_counttax_q2kmermap.report"] ## REPORT
         if "otu" in config["methodologie"]] + ## only if "otu" is selected
        ## ALIGNMENT ##
        [x for x in
         ["{tmp}01_processed_data/03_alignment/{run}/{barc}/{reference}/MeBaPiNa_filter_aligned.report",
          "{tmp}02_analysis_results/03_alignment/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_counttax_aligned.report"]
         if "align" in config["methodologie"]] + ## only if "align" is selected
        ## K-MER MAPPING ##
        [x for x in
         ["{tmp}01_processed_data/03_kmer_mapping/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_kmermap_filtered.report", ## REPORT
          "{tmp}02_analysis_results/03_kmer_mapping/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_retax_kmermap.report", ## REPORT
          "{tmp}02_analysis_results/03_kmer_mapping/{run}/{barc}/{reference}_{reftype}/MeBaPiNa_counttax_kmermap.report"] ## REPORT
         if "kmer" in config["methodologie"]]) ## only if "kmer" is selected
    ## expand for all barcodes
    input_list = expand(input_list,
        tmp=config["experiments"]["tmp"],
        run=RUNS,
        barc=all_barc,
        reference=config['reference']['source'],
        reftype=config['reference']['rank'])
    ## return
    return input_list
rule update_report:
    input:
        input_update_report
    shell:
        "echo \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" > \"{input[0]}.temp\"; " ## write the header to a temporary file
        "grep -v \"Sample name;File/Directory;Action;Date;Checksum;Who;Description\" \"{input[0]}\" | sort -n | uniq >> \"{input[0]}.temp\"; " ## append the sorted, deduplicated log lines
        "cat \"{input[0]}.temp\" > \"{input[0]}\"; rm \"{input[0]}.temp\"" ## write the result back and remove the temporary file