-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile
156 lines (131 loc) · 7 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
AMR_annotation pipeline
Authors: Alejandra Hernandez-Segura
Organization: Rijksinstituut voor Volksgezondheid en Milieu (RIVM)
Department: Infektieziekteonderzoek, Diagnostiek en Laboratorium Surveillance (IDS), Bacteriologie (BPD)
Date: 03-11-2020
Documentation: https://github.com/AleSR13/AMR_annotation.git
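Snakemake rules (in order of execution):
1 Circlator # Correct the start of the chromosomes/plasmids if necessary.
2 PGAP # Annotation using the NCBI tool, PGAP (note: no PGAP rule file is currently included by this Snakefile).
3 Prokka # Annotation using Prokka and the PLSD database.
The helper rule files filter_contigs and update_tbl2asn are also included.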
"""
#################################################################################
##### DEPENDENCIES #####
#################################################################################
import os
import sys

import yaml
#################################################################################
##### Load samplesheet, load species file and define output directory #####
#################################################################################
# Config file
configfile: "config/pipeline_parameters.yaml"

# SAMPLES maps every sample name to a dict with its metadata.
# E.g.: SAMPLES["sample_1"]["file"] = "sample_1.fasta"
with open(config["sample_sheet"]) as sample_sheet_file:
    SAMPLES = yaml.safe_load(sample_sheet_file)

# yaml.safe_load returns None for an empty document; without this check the
# loop below would fail with an unhelpful "NoneType is not iterable" error.
if not isinstance(SAMPLES, dict):
    raise ValueError(
        "Sample sheet '%s' is empty or is not a mapping of sample names "
        "to sample metadata" % config["sample_sheet"]
    )

# OUT defines output directory for most rules.
OUT = os.path.abspath(config["out"])
GENUS_ALL = config["genus"]
SPECIES_ALL = config["species"]
PROTEIN_DB = config["protein_db"]

# Samples whose sheet entry has no "genus"/"species" key get the
# pipeline-wide defaults from the config; existing values are left untouched.
for sample in SAMPLES:
    SAMPLES[sample].setdefault("genus", GENUS_ALL)
    SAMPLES[sample].setdefault("species", SPECIES_ALL)
#@################################################################################
#@#### Processes #####
#@################################################################################
#############################################################################
##### Fix starting point of chromosome and plasmids #####
#############################################################################
include: "bin/rules/filter_contigs.smk"
include: "bin/rules/circlator.smk"
#############################################################################
##### Annotation #####
#############################################################################
include: "bin/rules/update_tbl2asn.smk"
include: "bin/rules/prokka.smk"
#############################################################################
##### Fix GenBank file and fine-tune results #####
#############################################################################
#include: "bin/rules/fix_genbank.smk"
#@################################################################################
#@#### The `onstart` checker codeblock #####
#@################################################################################
onstart:
try:
print("Checking if all specified files are accessible...")
important_files = [ config["sample_sheet"] ]
for filename in important_files:
if not os.path.exists(filename):
raise FileNotFoundError(filename)
except FileNotFoundError as e:
print("This file is not available or accessible: %s" % e)
sys.exit(1)
else:
print("\tAll specified files are present!")
shell("""
mkdir -p {OUT}/results
echo -e "\nLogging pipeline settings..."
echo -e "\tGenerating methodological hash (fingerprint)..."
echo -e "This is the link to the code used for this analysis:\thttps://github.com/AleSR13/AMR_annotation/tree/$(git log -n 1 --pretty=format:"%H")" > '{OUT}/results/log_git.txt'
echo -e "This code with unique fingerprint $(git log -n1 --pretty=format:"%H") was committed by $(git log -n1 --pretty=format:"%an <%ae>") at $(git log -n1 --pretty=format:"%ad")" >> '{OUT}/results/log_git.txt'
echo -e "\tGenerating full software list of current Conda environment (\"amr_master\")..."
conda list > '{OUT}/results/log_conda.txt'
echo -e "\tGenerating config file log..."
rm -f '{OUT}/results/log_config.txt'
cat config/amr_annotation_call.txt > '{OUT}/results/log_conda.txt'
for file in config/*.yaml
do
echo -e "\n==> Contents of file \"${{file}}\": <==" >> '{OUT}/results/log_config.txt'
cat ${{file}} >> '{OUT}/results/log_config.txt'
echo -e "\n\n" >> '{OUT}/results/log_config.txt'
done
echo -e "\n==> Contents of sample sheet: <==" >> '{OUT}/results/log_config.txt'
cat sample_sheet.yaml >> '{OUT}/results/log_config.txt'
echo -e "\n\n" >> '{OUT}/results/log_config.txt'
echo -e "\n==> Extra parameters given while calling the pipeline (they overwrite any defaults): \n \
Output directory: {OUT} \n \
Genus used as default (if not provided in metadata): {GENUS_ALL} \n \
Species used as default (if not provided in metadata): {SPECIES_ALL} \n \
Protein database: {PROTEIN_DB}\n\n" >> '{OUT}/results/log_config.txt'
""")
#@################################################################################
#@#### These are the conditional cleanup rules #####
#@################################################################################
onerror:
print("An error occurred")
#shell("rm -r {OUT}/pgap_1")
onsuccess:
shell("""
echo -e "Removing temporary files..."
rm -f tbl2asn
find {OUT}/circlator/ -name "*prodigal.for_prodigal.fa" -delete
find {OUT}/circlator/ -name "*prodigal.prodigal.gff" -delete
find {OUT} -type d -empty -delete
echo -e "\tGenerating HTML index of log files..."
echo -e "\tGenerating Snakemake report..."
snakemake --profile config --config out="{OUT}" genus="{GENUS_ALL}" species={SPECIES_ALL} protein_db={PROTEIN_DB} --unlock
snakemake --profile config --config out="{OUT}" genus="{GENUS_ALL}" species={SPECIES_ALL} protein_db={PROTEIN_DB} --report '{OUT}/results/snakemake_report.html'
echo -e "Finished"
""")
#################################################################################
##### Specify final output: #####
#################################################################################
localrules:
all,
filter_contigs,
update_tbl2asn
rule all:
input:
expand(OUT + "/circlator/{sample}/{sample}.fasta", sample = SAMPLES),
expand(OUT + "/prokka/{sample}/{sample}.gbk", sample = SAMPLES)