-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathget_intergene.py
65 lines (61 loc) · 2.78 KB
/
get_intergene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
#Got this script from http://biopython.org/wiki/Intergenic_regions
import sys
import Bio
from Bio import SeqIO, SeqFeature
from Bio.SeqRecord import SeqRecord
import os
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
# Do not remove this comment
def get_interregions(genbank_path,intergene_length=1):
seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
cds_list_plus = []
cds_list_minus = []
intergenic_records = []
# Loop over the genome file, get the CDS features on each of the strands
for feature in seq_record.features:
if feature.type == 'CDS':
mystart = feature.location._start.position
myend = feature.location._end.position
if feature.strand == -1:
cds_list_minus.append((mystart,myend,-1))
elif feature.strand == 1:
cds_list_plus.append((mystart,myend,1))
else:
sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
(mystart, myend))
cds_list_plus.append((mystart,myend,1))
for i,pospair in enumerate(cds_list_plus[1:]):
# Compare current start position to previous end position
last_end = cds_list_plus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "+"
intergenic_records.append(
SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
description="%s %d-%d %s" % (seq_record.name, last_end+1,
this_start,strand_string)))
for i,pospair in enumerate(cds_list_minus[1:]):
last_end = cds_list_minus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "-"
intergenic_records.append(
SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
description="%s %d-%d %s" % (seq_record.name, last_end+1,
this_start,strand_string)))
outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta"
SeqIO.write(intergenic_records, open(outpath,"w"), "fasta")
if __name__ == '__main__':
if len(sys.argv) == 2:
get_interregions(sys.argv[1])
elif len(sys.argv) == 3:
get_interregions(sys.argv[1],int(sys.argv[2]))
else:
print("Usage: get_intergenic.py gb_file [intergenic_length]")
sys.exit(0)