forked from roryk/junkdrawer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgtf2fasta.py
executable file
·107 lines (85 loc) · 2.87 KB
/
gtf2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/local/bin/python
from optparse import OptionParser
from BCBio import GFF
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.SeqRecord import SeqRecord
import subprocess
import os
import sys
def lookupSequences(files):
    """
    write one FASTA record per top-level feature of the GTF file to stdout

    files is a dict; files['gtf_file'] is opened and parsed here and the
    whole dict is handed on to lookup_sequence for every region lookup.

    A feature without sub-features is looked up directly and named after
    its first 'transcript_id' qualifier.  A feature with sub-features is
    built by concatenating the sequences of its sub-features, in the order
    they are listed, and is named after the feature id; the strand of the
    last sub-feature is the one that is used.  Sequences on strand -1 are
    reverse complemented before the record is stored.
    """
    records = []
    # keep the handle open for the whole loop (the parser is iterated while
    # reading) and make sure it is closed afterwards
    with open(files['gtf_file']) as gtf_file:
        for rec in GFF.parse(gtf_file):
            chrom = rec.id
            for feature in rec.features:
                if not feature.sub_features:
                    seq = lookup_sequence(files, feature, chrom)
                    record_id = feature.qualifiers['transcript_id'][0]
                    strand = feature.strand
                else:
                    seq = Seq("", generic_dna)
                    record_id = feature.id
                    for subf in feature.sub_features:
                        seq = seq + lookup_sequence(files, subf, chrom)
                        strand = subf.strand
                # compare by value: "is -1" only worked by accident of
                # small-integer caching in CPython
                if strand == -1:
                    seq = seq.reverse_complement()
                records.append(SeqRecord(seq, id=record_id))
    SeqIO.write(records, sys.stdout, "fasta")
def lookup_sequence(files, feature, chrom):
    """
    use samtools to look up the sequence

    Runs "<files['samtools']> faidx <files['seq_file']> <region>" where
    region is "<chrom>:<start + 1>-<end>", taken from feature.location.
    Every output line that is not blank and does not start with ">" is
    stripped and concatenated; the result is returned as a Seq with the
    generic_dna alphabet.  The exit status of samtools is not checked, so
    a failed lookup yields an empty sequence.
    """
    # start is incremented by one and end is used as is -- presumably the
    # location start is 0-based while samtools regions are 1-based; confirm
    region = (str(chrom) + ":" + str(int(str(feature.location.start)) + 1) +
              "-" + str(feature.location.end))
    args = [files['samtools'], "faidx", files['seq_file'], region]
    # universal_newlines gives text (not bytes) on Python 3 as well
    child = subprocess.Popen(args, stdout=subprocess.PIPE,
                             universal_newlines=True)
    # communicate() drains the pipe, closes it and waits for the child, so
    # no zombie process or open descriptor is left behind per lookup
    output, _ = child.communicate()
    bases = []
    for line in output.splitlines():
        line = line.strip()
        # skip blank lines (indexing them used to raise) and FASTA headers
        if not line or line.startswith(">"):
            continue
        bases.append(line)
    return Seq("".join(bases), generic_dna)
def which(program):
    """
    locate an executable, in the manner of the shell's "which"

    If program contains a directory component it is checked as given.
    Otherwise every directory of the PATH environment variable is tried in
    order.  Returns the path of the first executable regular file found,
    or None if there is none (also when PATH is not set at all).
    """
    def is_exe(fpath):
        # directories usually carry the execute bit too, so insist on a
        # regular file rather than mere existence
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if os.path.dirname(program):
        return program if is_exe(program) else None

    search_path = os.environ.get("PATH")
    if search_path is None:
        # nothing to search; previously this raised KeyError
        return None
    for path in search_path.split(os.pathsep):
        exe_file = os.path.join(path, program)
        if is_exe(exe_file):
            return exe_file
    return None
def main():
    """
    command line entry point: gtf2fasta seq_file gtf_file

    Checks that a samtools executable can be found, that exactly two
    positional arguments were given and that both files exist, then calls
    lookupSequences with a dict holding the keys 'samtools', 'seq_file'
    and 'gtf_file'.  On any failed check a message is written to stderr
    and the process exits with status -1.
    """
    usage = "usage: gtf2fasta seq_file gtf_file"

    def fail(*lines):
        # stdout carries the FASTA output, so diagnostics go to stderr and
        # cannot end up inside a redirected result file
        for line in lines:
            sys.stderr.write(line + "\n")
        sys.exit(-1)

    # hand the usage text to the parser so that -h/--help shows it
    parser = OptionParser(usage=usage)
    (_options, args) = parser.parse_args()

    samtools = which("samtools")
    if samtools is None:
        fail("samtools must be executable, add it to your path or "
             "download it from http://samtools.sourceforge.net/")

    if len(args) != 2:
        fail(usage)

    files = {
        'samtools': samtools,
        'seq_file': args[0],
        'gtf_file': args[1],
    }

    if not os.path.exists(files['seq_file']):
        fail("seq_file does not exist", usage)

    if not os.path.exists(files['gtf_file']):
        fail("gtf_file does not exist", usage)

    lookupSequences(files)
# run the command line entry point only when executed as a script,
# not when the module is imported
if __name__ == "__main__":
    main()