#!/usr/bin/env python
import os
import sys
import psycopg2
import argparse
import subprocess
import shutil
parser = argparse.ArgumentParser(description= """
DESCRIPTION
Convert the FastQC report (file: fastqc_data.txt) to a single table row, one
column per metric. The row is uploaded to the postgres database using the
connection string in ~/.psycopgpass. The input is the directory or zip archive produced by fastqc
(actually, any dir containing the file fastqc_data.txt with the actual stats).
EXAMPLE
fastqc_to_pgtable.py -i fastqc/
## Include file stats:
fastqc_to_pgtable.py -i test.fq_fastqc --md5sum <md5sum> --fsize <int> --mtime <datetime>
## Or
fastqc_to_pgtable.py -i test.fq_fastqc --filestats "{'md5sum':'abcd', 'fsize':1234, 'mtime':'25/01/1977'}"
## Or
fastqc_to_pgtable.py -i test.fq_fastqc --filestats "`get_file_stats2.py -i test.fq.gz --md5sum`" ## Note the quoting!
## Or:
get_file_stats2.py -i test.fq.gz --md5sum > out.stats
fastqc_to_pgtable.py -i test.fq_fastqc --filestats "`cat out.stats`"
## Process all fastqc_data.txt files in all subdirs in current dir:
for fastqc in */
do
fastqc_to_pgtable.py -i $fastqc
done
DEPENDS-ON:
- Table schema fastqc
- psycopg2 and file ~/.psycopgpass for connection to sblab.
DEPENDS-ON-ME:
update_fastqc.py
TODO:
- Add argument --filestats which takes as input the dictionary-like string from
get_file_stats2.py. md5sum, mtime and fsize are then taken from this string.
""", formatter_class= argparse.RawTextHelpFormatter)
parser.add_argument('--infile', '-i',
required= True,
help='''Fastqc directory or zip archive where to look for file fastqc_data.txt.
''')
parser.add_argument('--djangobase',
required= False,
default= 'sblab-srv001:/nas/sblab_data1/group_folders/berald01/django/sblabsite',
help='''Hostname (ip address) and directory hosting sblab.
Default is $mac_office:$django_sblab
The Fastqc directory will be scp'd to this address, and put in the subdir given in
--djangolink.
''')
parser.add_argument('--djangolink',
required= False,
default= 'uploads/fastqc',
help='''Upload directory where django will look for fastqc_report.html and
the associated files produced by fastqc. The string "--djangolink/--infile/fastqc_report.html"
will be the value in column fastqc_file.
Default is 'uploads/fastqc'. For example, if infile is directory 'db001_fastqc',
fastqc_to_pgtable.py will upload: "uploads/fastqc/db001_fastqc/fastqc_report.html"
''')
parser.add_argument('--nocommit',
action= 'store_true',
help='''Do not commit changes to database. Use this option
to test whether the data can be uploaded. The data that would be uploaded is printed
to stdout.
''')
parser.add_argument('--nosend',
action= 'store_true',
help='''Do not send the fastqc directory to django.
''')
#parser.add_argument('--md5sum',
# required= False,
# default= None,
# help='''Optional md5sum of the fastq/bam file passed to fastqc
#default is None
# ''')
#parser.add_argument('--fsize',
# required= False,
# default= None,
# type= int,
# help='''Optional file size (as int) of the fastq/bam file passed to fastqc
#default is None
# ''')
#parser.add_argument('--mtime',
# required= False,
# default= None,
# help='''Optional modification time of the fastq/bam file passed to fastqc
#default is None. Typically this is returned by os.path.getmtime().
# ''')
#parser.add_argument('--filestats',
# required= False,
# default= None,
# help='''A string formatted like a dictionary which contains
#the file stats md5sum, mtime, fsize. Typically this is the output of get_file_stats2.py.
#If this string is given, it has precedence over the individual args --md5sum, --mtime
#--fsize. MEMO: get_file_stats2.py does not return md5sum unless --md5sum flag is used.
# ''')
args = parser.parse_args()
# -----------------------------------------------------------------------------
def get_psycopgpass():
    """
    Read file ~/.psycopgpass to get the connection string to pass to
    psycopg2.connect()
    """
    with open(os.path.join(os.getenv("HOME"), '.psycopgpass')) as fin:
        lines= fin.readlines()
    ## Keep the first non-blank, non-comment line:
    conn_args= [x.strip() for x in lines if x.strip() != '' and not x.strip().startswith('#')][0]
    return(conn_args)
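## Illustrative only: ~/.psycopgpass is expected to contain a single libpq-style
## connection string for psycopg2.connect(); the values below are made-up
## placeholders, not the real sblab settings:
##   # comment and blank lines are skipped
##   host=dbhost dbname=sblab user=myuser password=mypass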
def parse_module(fastqc_module):
    """
    Parse a fastqc module from the table format to a line format (list).
    Input is a list containing the module, one list-item per line. E.g.:
    fastqc_module= [
        '>>Per base sequence quality pass',
        '#Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile',
        '1 36.34 38.0 35.0 39.0 31.0 40.0',
        '2 35.64 38.0 35.0 39.0 28.0 40.0',
        '3 35.50 38.0 34.0 39.0 28.0 40.0',
        ...
        ]
    Return a list where each sublist after the 1st is a column:
    ['pass', ['1', '2', '3', ...], ['36.34', '35.64', '35.50', ...], ['40.0', '40.0', '40.0', ...], ...]
    """
    row_list= []
    module_header= fastqc_module[0]
    module_name= module_header.split('\t')[0]
    row_list.append(module_header.split('\t')[1]) ## Module status from header line ">>Per base ...": pass/warn/fail
    # Handle odd cases:
    # Where no table is returned:
    if len(fastqc_module) == 1 and module_name == '>>Overrepresented sequences':
        return(row_list + [[]]*4)
    if len(fastqc_module) == 1 and module_name == '>>Kmer Content':
        return(row_list + [[]]*5)
    # Table is not the second row:
    if module_name == '>>Sequence Duplication Levels':
        tot_dupl= fastqc_module[1].split('\t')[1]
        row_list.append(tot_dupl)
        del(fastqc_module[1])
    ## Convert table to list of lists:
    tbl= []
    for line in fastqc_module[2:]:
        tbl.append(line.split('\t'))
    ## Put each column in a list:
    nrows= len(tbl)
    ncols= len(tbl[0])
    for i in range(0, ncols):
        col= []
        for j in range(0, nrows):
            col.append(tbl[j][i])
        row_list.append(col)
    return(row_list)
def list_to_pgcolumns(lst):
    """
    Convert the nested lists in lst to lists of int or float where possible
    (falling back to the original strings) so that psycopg2 can pass them to
    postgres as array types. E.g.
    ['a', ['1', '2', '3']]  -> ['a', [1, 2, 3]]
    ['x', ['a', 'b', 'c']]  -> ['x', ['a', 'b', 'c']]
    """
    pgrow= []
    for x in lst:
        if type(x) == list:
            try:
                x= [int(z) for z in x]
                pgrow.append(x)
            except ValueError:
                try:
                    x= [float(z) for z in x]
                    pgrow.append(x)
                except ValueError:
                    pgrow.append(x)
        else:
            pgrow.append(x)
    return(pgrow)
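## Note on the target schema (assumption): the array-typed columns of table 'fastqc'
## are filled by letting psycopg2 adapt these Python lists to postgres arrays
## when the INSERT below is executed, e.g.
##   list_to_pgcolumns(['pass', ['1', '2'], ['36.34', '35.64']])
##   -> ['pass', [1, 2], [36.34, 35.64]]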
" ---------------------------------------------------------------------------- "
rmunzip= False
if args.infile.endswith('.zip'):
    fastqcdir= args.infile[:-len('.zip')] ## Strip the .zip extension (rstrip('.zip') would eat trailing z/i/p chars)
    archivedir= os.path.split(args.infile)[0] ## Directory where archive lives
    if archivedir == '':
        archivedir= '.'
    cmd= 'unzip -q -o -d %s %s' %(archivedir, args.infile)
    p= subprocess.Popen(cmd, shell= True)
    p.wait()
    rmunzip= True
else:
    fastqcdir= args.infile
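## E.g. (hypothetical name) --infile runs/db001_fastqc.zip gives fastqcdir
## 'runs/db001_fastqc' and runs: unzip -q -o -d runs runs/db001_fastqc.zip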
fq= open(os.path.join(fastqcdir, 'fastqc_data.txt')).readlines()
fq= [x.rstrip('\n\r') for x in fq]
fastqc_line= [] ## The entire report will be in this list. Each item is a column in the database table
fastqc_line.append(fq[0].split('\t')[1]) ## FastQC version
fastqc_line.append(fq[1].split('\t')[1]) ## Basic Statistics module status (pass/warn/fail)
## Get start and end position of all modules:
mod_start= []
mod_end= []
for i in range(0, len(fq)):
    line= fq[i]
    if line == '>>END_MODULE':
        mod_end.append(i)
    elif line.startswith('>>'):
        mod_start.append(i)
    else:
        pass
## Process first module (Basic Statistics):
module= fq[mod_start[0]:mod_end[0]]
if not module[-1].startswith('md5sum'):
    ## Patch to make fastqc_to_pgtable compatible with the output of fastqc v0.10.0 and with fastqc_md5.py:
    ## fastqc_md5.py puts md5sum as last line of the basic stats module. Add a dummy line if this output comes from fastqc.
    module.append('md5sum\tNULL')
for line in module[2:]:
    line= line.split('\t')
    if line[0] == 'Filename':
        filename= line[1]
        fastqc_line.append(line[1])
## Start processing the remaining modules. The first one (Basic Statistics) is handled apart:
for s, e in zip(mod_start[1:], mod_end[1:]):
    module= fq[s:e]
    module_name= module[0].split('\t')[0]
    row= parse_module(module)
    # For these modules remove the column with base position (1st column, 2nd item in list):
    if module_name in ['>>Per base sequence quality', '>>Per base sequence content', '>>Per base GC content', '>>Per base N content']:
        del(row[1])
    if module_name == '>>Sequence Duplication Levels':
        row[2][9]= '10' ## Replace 10++ with 10
    row= list_to_pgcolumns(row)
    fastqc_line= fastqc_line + row
## Get md5sum, fsize and mtime of fastq/bam file
#if args.filestats is not None:
# fstats= eval(args.filestats)
# fastqc_line.append(fstats['md5sum'])
# fastqc_line.append(fstats['fsize'])
# fastqc_line.append(fstats['mtime'])
#else:
# fastqc_line.append(args.md5sum)
# fastqc_line.append(args.fsize)
# fastqc_line.append(args.mtime)
fastqc_line.append(os.path.join(args.djangolink, os.path.split(fastqcdir)[1], 'fastqc_report.html'))
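## E.g. with the default --djangolink and infile 'db001_fastqc' (example name taken
## from the help text above) the stored value is 'uploads/fastqc/db001_fastqc/fastqc_report.html'.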
# --------------------------[Send to django]-----------------------------------
if not args.nosend:
    cmd= 'scp -q -r %s %s/%s' %(fastqcdir, args.djangobase, args.djangolink)
    p= subprocess.Popen(cmd, shell= True)
    p.wait()
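## With the default --djangobase and --djangolink the command takes the form (illustrative):
##   scp -q -r db001_fastqc sblab-srv001:/nas/sblab_data1/group_folders/berald01/django/sblabsite/uploads/fastqc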
# ---------------------[ Send to postgres ]-------------------------------------
conn= psycopg2.connect(get_psycopgpass())
cur= conn.cursor()
## Memo: get columns with query like this:
## select * from information_schema.columns where table_name = 'fastqc' order by ordinal_position;
cur.execute("CREATE TEMP TABLE fastqc_tmp AS (SELECT * FROM fastqc WHERE 1=2)") ## Where to put results
sql= "INSERT INTO fastqc_tmp VALUES (%s" %('%s, ' * len(fastqc_line))
sql= sql.rstrip(', ') + ')'
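## E.g. if fastqc_line held 3 items the statement would read:
##   INSERT INTO fastqc_tmp VALUES (%s, %s, %s)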
cur.execute(sql, fastqc_line)
cur.execute("INSERT INTO fastqc SELECT DISTINCT * FROM fastqc_tmp EXCEPT SELECT * FROM fastqc") ## Import only rows not already present
cur.execute("DROP TABLE fastqc_tmp")
cur.close()
if args.nocommit is False:
    conn.commit()
else:
    print(fastqc_line)
conn.close()
if rmunzip:
    ## This means that the input was a zipped directory unzipped by fastqc_to_pgtable
    ## so the unzipped directory is removed to return to the original state.
    shutil.rmtree(fastqcdir)
sys.exit()