forked from tom-van-wijk/metagenome_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmetagenome_pipeline.py
executable file
·231 lines (205 loc) · 8.8 KB
/
metagenome_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/env python
# Name: metagenome_pipeline.py
# Author: Tom van Wijk - RIVM Bilthoven
# Date: 13-10-2019
# Licence: GNU General Public License v3.0 (copy provided in directory)
# For detailed information and instruction on how to install and use this software
# please view the included "README.md" file in this repository
# import python libraries
from argparse import ArgumentParser
import logging
import logging.handlers
import os
import sys
import random
# Function to parse the command-line arguments
# Returns a namespace with argument keys and values
def parse_arguments(args, log):
    """Parse the command-line arguments of the pipeline.

    Args:
        args: Argument vector in ``sys.argv`` layout: the program name
            followed by the options. When ``None``, argparse reads
            ``sys.argv`` itself.
        log: Logger used to announce that parsing has started.

    Returns:
        The argparse namespace with the attributes ``input_dir``,
        ``output_dir``, ``threads``, ``groot_db`` and ``length``.

    Raises:
        SystemExit: Raised by argparse when the required ``-i`` option is
            missing or an option value is invalid.
    """
    log.info("Parsing command line arguments...")
    parser = ArgumentParser(prog="metagenome_pipeline.py")
    parser.add_argument("-i", "--indir", dest="input_dir",
                        action="store", default=None, type=str,
                        help="Location of input directory (required)",
                        required=True)
    # "inputdir" is a sentinel value: main() replaces it with a
    # sub-directory of the input directory.
    parser.add_argument("-o", "--outdir", dest="output_dir",
                        action="store", default="inputdir", type=str,
                        help="Location of output directory (default=inputdir)")
    parser.add_argument("-t", "--threads", dest="threads",
                        action="store", default=6, type=int,
                        help="Number of threads to be used (default=6)")
    # "default" is a sentinel value: main() then profiles against both
    # resfinder and card.
    parser.add_argument("-d", "--database", dest="groot_db",
                        action="store", default="default", type=str,
                        help="Database for AMR profiling. supported: arg-annot, resfinder, card, groot_db, groot_core_db. Default: resfinder/card")
    parser.add_argument("-l", "--length", dest="length",
                        action="store", default=150, type=int,
                        help="Length of query reads (default=150)")
    # Parse the vector that was handed in instead of silently re-reading
    # sys.argv; element 0 is the program name and is skipped.
    argv = None if args is None else list(args)[1:]
    return parser.parse_args(argv)
# Function creates logger with handlers for both logfile and console output
# Returns logger
def create_logger(logid):
    """Set up the root logger with a logfile handler and a console handler.

    The logfile is written to ``<logid>_metagenome.log`` in the current
    working directory. Both handlers emit the bare message text. The
    logger itself is set to INFO; the file handler is set to DEBUG and
    the console handler to INFO.

    Args:
        logid: Identifier used as prefix of the logfile name.

    Returns:
        The configured root logger.
    """
    root_log = logging.getLogger()
    root_log.setLevel(logging.INFO)
    logfile_name = "%s_metagenome.log" % (str(logid),)
    # (handler, level) pairs: logfile first, console second
    handler_setup = (
        (logging.FileHandler(logfile_name), logging.DEBUG),
        (logging.StreamHandler(), logging.INFO),
    )
    for handler, level in handler_setup:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter('%(message)s'))
        root_log.addHandler(handler)
    return root_log
# Takes the location of the input data
# Returns a dictionary with sample names as keys and lists of the R1 and R2 file names as values,
# and a list of all sample files
def find_files(input_dir):
    """Collect the paired-end read files located directly in *input_dir*.

    A file counts as the R1 half of a pair when its name contains ``_R1``
    and a file with every ``_R1`` replaced by ``_R2`` exists in the same
    directory.

    Args:
        input_dir: Directory that holds the read files.

    Returns:
        A tuple ``(sample_dict, sample_file_list)``. ``sample_dict`` maps
        the sample name (the part of the file name before ``_R1``, with
        dots and spaces replaced by underscores) to ``[R1_path, R2_path]``.
        ``sample_file_list`` is a flat list of all R1 and R2 paths. Both
        are empty when no pairs are found or the directory is missing.
    """
    sample_dict = {}
    sample_file_list = []
    # list_directory may hand back nothing usable for a missing directory;
    # treat that as "no files" instead of failing on iteration.
    filenames = list_directory(input_dir, "files", 1) or []
    # set for constant-time lookup of the R2 mate
    known_names = set(filenames)
    for filename in filenames:
        if '_R1' not in filename:
            continue
        mate_name = filename.replace('_R1', '_R2')
        if mate_name not in known_names:
            continue
        R1_file = input_dir+"/"+filename
        R2_file = input_dir+"/"+mate_name
        prefix = filename.split("_R1")[0].replace(".", "_").replace(" ", "_")
        # adding sample to dictionary
        sample_dict[prefix] = [R1_file, R2_file]
        # adding sample files to list
        sample_file_list.append(R1_file)
        sample_file_list.append(R2_file)
    return sample_dict, sample_file_list
# Function creates a list of files or directories in <inputdir>
# on the specified directory depth
def list_directory(input_dir, obj_type, depth):
    """List the file or directory names found at a given step of os.walk.

    Args:
        input_dir: Directory at which the walk starts.
        obj_type: ``'files'`` to get file names, ``'dirs'`` to get
            sub-directory names.
        depth: 1-based index of the ``os.walk`` iteration to report.
            ``1`` is *input_dir* itself.
            NOTE(review): for values above 1 this is the n-th directory
            visited by ``os.walk``, which is not necessarily a directory
            nested n levels deep -- confirm before relying on it.

    Returns:
        A list of names (without directory part). An empty list when
        *input_dir* does not exist, *depth* is never reached or *obj_type*
        is not recognised.
    """
    for level, (_root, dirs, files) in enumerate(os.walk(input_dir), 1):
        if level == depth:
            if obj_type == 'files':
                return files
            if obj_type == 'dirs':
                return dirs
        if level >= depth:
            # nothing beyond the requested step can match any more
            break
    # always hand back an iterable so callers can loop over the result
    return []
# Function to generate a FastQC report of each file pair and a MultiQC report for all data
# Takes a dictionary containing the locations of the .fastq files and an output_dir for storing the output
def quality_control(sample_dict, output_dir):
    """Run FastQC on every read pair and summarise the reports with MultiQC.

    FastQC output goes to ``<output_dir>/quality_control/fastqc``; each
    ``.zip`` report found there is unzipped into a directory of the same
    name without the ``.zip`` part. The MultiQC report is written to
    ``<output_dir>/quality_control``.

    Args:
        sample_dict: Maps sample name to ``[R1_path, R2_path]``.
        output_dir: Base output directory of the pipeline.

    NOTE(review): paths are interpolated into the shell commands unquoted,
    so whitespace or shell metacharacters in a path break the commands.
    """
    fastqc_dir = output_dir+"/quality_control/fastqc"
    os.system("mkdir -p "+fastqc_dir)
    for key in sorted(sample_dict):
        R1_file = sample_dict[key][0]
        R2_file = sample_dict[key][1]
        # single-argument print call: same output on Python 2 and 3
        print("%s %s" % (R1_file, R2_file))
        # create a FastQC quality report
        os.system("fastqc "+R1_file+" "+R2_file+" -o "+fastqc_dir)
    # unzipping fastqc reports; an unreadable directory counts as empty
    for report in list_directory(fastqc_dir, "files", 1) or []:
        if report.endswith(".zip"):
            os.system("unzip "+fastqc_dir+"/"+report
                      + " -d "+fastqc_dir+"/"+report.replace('.zip', ''))
    # create multiqc report
    os.system("multiqc "+fastqc_dir+"/ -o "+output_dir+"/quality_control")
# Function that generates SeqKit statistics of all files in a given list
def data_statistics(filelist, threads, output_dir):
    """Write SeqKit statistics for all files in *filelist*.

    The statistics are written to
    ``<output_dir>/data_statistics/seqkit_data_statistics.txt``.

    Args:
        filelist: Paths of the sequence files; processed in sorted order.
        threads: Number of threads passed to ``seqkit`` via ``-j``.
        output_dir: Base output directory of the pipeline.

    NOTE(review): paths are interpolated into the shell command unquoted,
    so whitespace or shell metacharacters in a path break the command.
    """
    os.system("mkdir -p "+output_dir+"/data_statistics")
    if not filelist:
        # Without file arguments seqkit falls back to reading standard
        # input, which would stall the pipeline.
        print("\nNo input files given, skipping seqkit statistics\n")
        return
    command = "seqkit stats --all"
    command += "".join(" %s" % path for path in sorted(filelist))
    command += " -e -j %s -o %s/data_statistics/seqkit_data_statistics.txt" % (str(threads), output_dir)
    # single-argument print call: same output on Python 2 and 3
    print("\n"+command+"\n")
    os.system(command)
# Function that classifies the raw reads using metaphlan2
# Takes a dictionary with file locations as input
def classify_reads(sample_dict, threads, output_dir):
    """Classify the raw reads of every sample with metaphlan2.

    For each sample, ``metaphlan2.py`` is run on the R1/R2 pair; its
    bowtie2 output and its profile are written to
    ``<output_dir>/metaphlan2``. Afterwards all ``*_metagenome_profile.txt``
    files in that directory are merged into ``merged_profiles_table.txt``
    with ``merge_metaphlan_tables.py``.

    Args:
        sample_dict: Maps sample name to ``[R1_path, R2_path]``.
        threads: Number of processes passed to metaphlan2 via ``--nproc``.
        output_dir: Base output directory of the pipeline.

    NOTE(review): paths are interpolated into the shell commands unquoted,
    so whitespace or shell metacharacters in a path break the commands.
    """
    metaphlan_dir = output_dir+"/metaphlan2"
    os.system("mkdir -p "+metaphlan_dir)
    # classify reads
    for key in sorted(sample_dict):
        R1_file = sample_dict[key][0]
        R2_file = sample_dict[key][1]
        os.system("metaphlan2.py %s,%s --bowtie2out %s --nproc %s --input_type fastq > %s" % (
            R1_file,
            R2_file,
            metaphlan_dir+"/"+key+"_metagenome.bowtie2.bz2",
            str(threads),
            metaphlan_dir+"/"+key+"_metagenome_profile.txt"))
    # merge profiles
    os.system("merge_metaphlan_tables.py %s/metaphlan2/*_metagenome_profile.txt > %s/metaphlan2/merged_profiles_table.txt" % (
        output_dir,
        output_dir))
# Function to profile AMR genes with groot
# Takes a dictionary with file locations, no. of threads, groot database, read lengths and output dir as input
# Writes output to /outdir/AMR_profiles
def profile_amr(sample_dict, threads, groot_db, length, outdir, log):
    """Profile AMR genes of every sample with groot.

    The indexed database is looked up among the directories directly
    inside the directory named by the ``META_DB`` environment variable: a
    directory qualifies when its name starts with *groot_db* and ends with
    ``l-<length>``. When several directories qualify, the last one listed
    is used. Alignments and reports are written to
    ``<outdir>/AMR_profiles``, groot logfiles to
    ``<outdir>/logfiles/groot`` (that directory is not created here).

    Args:
        sample_dict: Maps sample name to ``[R1_path, R2_path]``.
        threads: Number of processes passed to ``groot align`` via ``-p``.
        groot_db: Name prefix of the groot database to use.
        length: Read length the database index was built for.
        outdir: Base output directory of the pipeline.
        log: Logger for progress and problems.

    NOTE(review): paths are interpolated into the shell commands unquoted,
    so whitespace or shell metacharacters in a path break the commands.
    """
    # An unset (or empty) META_DB gets a clear message instead of a KeyError.
    db_root = os.environ.get('META_DB')
    if not db_root:
        log.error("Environment variable META_DB is not set, skipping AMR profiling with database:\t"+groot_db)
        return
    os.system("mkdir -p "+outdir+"/AMR_profiles/temp")
    # validate database and index existence
    index_suffix = 'l-'+str(length)
    valid_db = None
    log.info("checking database:\t"+groot_db)
    # a missing database directory counts as "no databases"
    for db in list_directory(db_root, "dirs", 1) or []:
        if db.startswith(groot_db) and db.endswith(index_suffix):
            log.info("database found:\t"+db)
            valid_db = db
    if valid_db is None:
        log.warning("Indexed database:\t%s_index_l-%s not found, please consult manual on how to download and index the databases." % (
            groot_db,
            str(length)))
        return
    # iterate over filepairs in dict to process them with groot
    for key in sorted(sample_dict):
        run_tag = key+"_"+groot_db+"_l-"+str(length)
        bam_file = outdir+"/AMR_profiles/"+run_tag+"_ARG-reads.bam"
        # command to unzip the filepairs and pipe into groot align
        command = "gunzip -c %s | groot align -i %s -p %s -y %s > %s" % (
            sample_dict[key][0]+" "+sample_dict[key][1],
            db_root+"/"+valid_db,
            str(threads),
            outdir+"/logfiles/groot/"+run_tag+"_groot-align.log",
            bam_file)
        # single-argument print call: same output on Python 2 and 3
        print("\n"+command)
        os.system(command)
        # groot drops its graph files in the working directory; park them
        os.system("mv groot-graphs-* "+outdir+"/AMR_profiles/temp/")
        # command to generate groot report from alignment files
        command2 = "groot report -i %s -y %s > %s" % (
            bam_file,
            outdir+"/logfiles/groot/"+run_tag+"_groot-report.log",
            outdir+"/AMR_profiles/"+run_tag+"_AMR-profile.txt")
        print("\n"+command2)
        os.system(command2)
# Function closes logger handlers
def close_logger(log):
    """Close every handler attached to *log* and detach it from the logger.

    Args:
        log: Logger whose handlers are to be closed and removed.
    """
    # Iterate over a copy: removeHandler mutates log.handlers.
    for handler in list(log.handlers):
        handler.close()
        # removeHandler, not removeFilter: the latter leaves handlers attached
        log.removeHandler(handler)
# MAIN function
def main():
    """Run the complete metagenome pipeline for one input directory.

    Steps: set up logging, parse the command line, create the output
    directory, collect the paired read files, run quality control, data
    statistics, read classification and AMR profiling, then close the
    logger and move the logfile into ``<outdir>/logfiles``.
    """
    # create logger; the random id keeps logfiles of parallel runs apart
    logid = random.randint(99999, 9999999)
    log = create_logger(logid)
    # parse command line arguments
    args = parse_arguments(sys.argv, log)
    # determine output directory ('inputdir' is the parser's default)
    if args.output_dir == 'inputdir':
        outdir = os.path.abspath(args.input_dir+"/metagenome_pipeline_output")
    else:
        outdir = os.path.abspath(args.output_dir)
    log.info("output directory: "+outdir)
    # create output dir
    os.system("mkdir -p "+outdir+"/logfiles/groot")
    # Create dictionary of input files
    sample_dict, sample_file_list = find_files(args.input_dir)
    # single-argument print calls: same output on Python 2 and 3
    print("\n")
    for key in sorted(sample_dict):
        print("%s:\n%s" % (key, sample_dict[key]))
    print("\n"+str(sorted(sample_file_list))+"\n")
    if not sample_dict:
        # Nothing to analyse: running the external tools without input
        # only produces errors (or waits on standard input).
        log.warning("No paired _R1/_R2 files found in: "+args.input_dir+", skipping analysis")
    else:
        # Generate FastQC and MultiQC Quality reports
        quality_control(sample_dict, outdir)
        # Generate Seqkit statistics
        data_statistics(sample_file_list, args.threads, outdir)
        # Classify reads
        classify_reads(sample_dict, args.threads, outdir)
        # Generate AMR profiles
        if args.groot_db == "default":
            profile_amr(sample_dict, args.threads, "resfinder", args.length, outdir, log)
            profile_amr(sample_dict, args.threads, "card", args.length, outdir, log)
        else:
            profile_amr(sample_dict, args.threads, args.groot_db, args.length, outdir, log)
    # close logger handlers
    log.info("\nClosing logger and finalising metagenome_pipeline.py")
    close_logger(log)
    # move logfile to output directory
    os.system("mv "+str(logid)+"_metagenome.log "+outdir+"/logfiles/")
# Run the pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()