forked from zstephens/neat-genreads
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mergeJobs.py
121 lines (104 loc) · 3 KB
/
mergeJobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
import argparse
import os
import shutil
import sys

try:
    from shlex import quote as shell_quote   # Python 3.3+
except ImportError:
    from pipes import quote as shell_quote   # Python 2 fallback
def getListOfFiles(inDir,pattern):
    """Return the paths of non-empty entries of ``inDir`` whose name contains ``pattern``.

    inDir   -- directory to scan; a trailing '/' is optional
    pattern -- plain substring (not a glob or regex) looked for in each entry name

    Entries are returned in ``os.listdir`` order, each joined onto ``inDir``.
    Entries whose size is 0 are skipped. Errors from ``os.listdir`` or
    ``os.path.getsize`` propagate to the caller.
    """
    matches = []
    for name in os.listdir(inDir):
        if pattern not in name:
            continue
        # os.path.join only inserts a separator when inDir lacks one, so a
        # slash-terminated inDir yields exactly the same paths as plain
        # concatenation while 'some/dir' (no slash) now works as well.
        path = os.path.join(inDir, name)
        if os.path.getsize(path):
            matches.append(path)
    return matches
# Running counter appended to temp file names; bumped once for every input
# file that turns out to have a header.
TEMP_IND = 0


def stripVCF_header(fn):
    """Write a copy of ``fn`` without its '#' lines and return that copy's path.

    The copy is named ``fn + '_temp' + str(TEMP_IND)``. If the first line of
    ``fn`` does not start with '#', the file is treated as header-less: the
    (empty) temp file is deleted again and ``fn`` itself is returned.

    Once a header has been seen, every later line starting with '#' is dropped
    too, not just the leading run.

    Side effect: increments the module-level ``TEMP_IND`` when a header is found.
    """
    global TEMP_IND
    ftn = fn+'_temp'+str(TEMP_IND)
    hasHeader = False
    # 'with' guarantees both handles are closed even if reading/writing fails.
    with open(fn,'r') as f, open(ftn,'w') as f_t:
        for line in f:
            if line.startswith('#'):
                if not hasHeader:
                    TEMP_IND += 1
                    hasHeader = True
            elif hasHeader:
                f_t.write(line)
            else:
                # first line is not a header line: nothing to strip
                break
    if hasHeader:
        return ftn
    # os.remove instead of shelling out to 'rm': works for paths containing
    # spaces or shell metacharacters and does not depend on a POSIX shell.
    os.remove(ftn)
    return fn
def catListOfFiles(l,outName,gzipped=False):
    """Concatenate the files in ``l`` (in sorted-name order) into one output file.

    l       -- list of input file paths
    outName -- output path; '.gz' is appended when the inputs are compressed
    gzipped -- force compressed handling; it is switched on automatically as
               soon as any input name ends in '.gz' or '.gzip'

    In compressed mode every input must carry a '.gz'/'.gzip' suffix; if not,
    an error and the offending list are printed and the process exits with
    status 1. Inputs are copied byte-for-byte, i.e. the result is what
    ``cat <sorted inputs> > <output>`` would produce.
    """
    compressed = [n.endswith(('.gz', '.gzip')) for n in l]
    if any(compressed):
        gzipped = True

    if gzipped:
        if not all(compressed):
            print('\nError: Found a mixture of compressed and decompressed files with the specified prefix. Abandoning ship...\n')
            for m in l:
                print(m)
            print('')
            sys.exit(1)
        outName = outName+'.gz'

    ordered = sorted(l)
    # Log the equivalent shell command, as earlier versions of this script did.
    print('cat '+' '.join(ordered)+' > '+outName)

    # Copy in-process rather than through a shell: paths containing spaces or
    # shell metacharacters are handled correctly, and an empty input list
    # yields an empty output file instead of 'cat' waiting on stdin.
    with open(outName,'wb') as f_out:
        for n in ordered:
            with open(n,'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
def catBams(l,outName,samtools_exe):
    """Concatenate the BAM files in ``l`` into ``outName`` using samtools.

    l            -- list of BAM paths; must be non-empty (the first path in
                    sorted order supplies the header)
    outName      -- path of the merged BAM to write
    samtools_exe -- samtools command, inserted into the shell command line
                    verbatim

    Runs ``<samtools> view -H`` on the first input to write a temporary header
    file ``outName + '.tempHeader.sam'``, then ``<samtools> cat -h`` over all
    inputs in sorted-name order, and finally removes the temporary header.
    Exit statuses of the samtools calls are not checked.
    """
    # Quote every file path so names with spaces or shell metacharacters reach
    # samtools intact; ordinary paths are left unchanged by the quoting.
    l_sort = [shell_quote(n) for n in sorted(l)]
    tmp = outName+'.tempHeader.sam'
    tmp_q = shell_quote(tmp)
    os.system(samtools_exe+' view -H '+l_sort[0]+' > '+tmp_q)
    cmd = samtools_exe+' cat -h '+tmp_q+' '+' '.join(l_sort)+' > '+shell_quote(outName)
    print(cmd)
    os.system(cmd)
    # Best-effort cleanup without a shell; the header file may be missing if
    # the redirect above could not create it.
    if os.path.exists(tmp):
        os.remove(tmp)
#####################################
# main() #
#####################################
def main():
    """Merge the per-job outputs found for one or more input prefixes.

    Command line:
        -i  one or more input prefixes
        -o  output prefix
        -s  path to samtools

    For every prefix the files containing '<prefix>_read1.fq.job',
    '<prefix>_read2.fq.job', '<prefix>_golden.bam.job' and
    '<prefix>_golden.vcf.job' are collected and merged into
    '<output>_read1.fq', '<output>_read2.fq', '<output>_golden.bam' and
    '<output>_golden.vcf' respectively; a category with no files is skipped.

    NOTE: all prefixes are looked up in the directory of the FIRST prefix;
    only the final path component of the other prefixes is used.
    """
    parser = argparse.ArgumentParser(description='mergeJobs.py')
    parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* input prefix: [prefix_1] [prefix_2] ...")
    parser.add_argument('-o', type=str, required=True, metavar='<str>',            help="* output prefix")
    parser.add_argument('-s', type=str, required=True, metavar='<str>',            help="* /path/to/samtools")
    args = parser.parse_args()
    (INP,OUP,SAMTOOLS) = (args.i,args.o,args.s)

    # Directory part of the first prefix. rpartition distinguishes "no slash
    # at all" (-> current directory) from a prefix located directly under the
    # filesystem root (-> '/'), which must not be rewritten to './'.
    head, sep, _ = INP[0].rpartition('/')
    inDir = head+'/' if sep else './'

    # Reduce every prefix to its final path component, ignoring one trailing '/'.
    INP_LIST = []
    for prefix in INP:
        if prefix.endswith('/'):
            prefix = prefix[:-1]
        INP_LIST.append(prefix.rpartition('/')[2])

    listing_r1 = []
    listing_r2 = []
    listing_b  = []
    listing_v  = []
    for prefix in INP_LIST:
        listing_r1 += getListOfFiles(inDir,prefix+'_read1.fq.job')
        listing_r2 += getListOfFiles(inDir,prefix+'_read2.fq.job')
        listing_b  += getListOfFiles(inDir,prefix+'_golden.bam.job')
        vcfs = getListOfFiles(inDir,prefix+'_golden.vcf.job')
        if listing_v:
            # VCFs were already collected for an earlier prefix: strip the
            # headers from this prefix's files so the merged output does not
            # repeat them. The stripped copies are temp files that this
            # function does not delete afterwards.
            vcfs = [stripVCF_header(fn) for fn in vcfs]
        listing_v += vcfs

    #
    #	merge fq files
    #
    if listing_r1:
        catListOfFiles(listing_r1,OUP+'_read1.fq')
    if listing_r2:
        catListOfFiles(listing_r2,OUP+'_read2.fq')

    #
    #	merge golden alignments, if present
    #
    if listing_b:
        catBams(listing_b,OUP+'_golden.bam',SAMTOOLS)

    #
    #	merge golden vcfs, if present
    #
    if listing_v:
        catListOfFiles(listing_v,OUP+'_golden.vcf')


if __name__ == "__main__":
    main()