-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcircRNA.py
145 lines (136 loc) · 6.06 KB
/
circRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import json
import os
import subprocess
import sys
import tempfile
import uuid

import requests
# Default `species` argument of fetch_ensembl(); placed in the Ensembl REST URL path.
SPECIES = 'homo_sapiens'
# Default `assembly` argument of fetch_ensembl(); sent as `coord_system_version`.
ASSEMBLY = 'GRCh38'
# Default `offset` (in bp) of get_circ_coordinates(): width of the window taken
# at each end of the input region.
OFFSET = 200
def get_input():
    """Read and validate the three command-line arguments.

    The script is invoked as ``python circRNA.py coordinates strand outfile``.

    Returns:
        tuple: ``(chrom, start, end, strand, outfile)``. ``chrom`` is
        upper-cased with every ``CHR`` occurrence removed; ``start`` and
        ``end`` are passed through exactly as ``parse_coords`` returns them.

    Exits:
        Prints a message and exits with status 1 when an argument is missing,
        the coordinates cannot be parsed, or the strand is not '+' or '-'.
    """
    if len(sys.argv) < 4:
        print("USAGE:\npython circRNA.py coordinates strand outfile\n")
        print("\t'coordinates': chrom:start-end format (exp: 10:25000000-25001000)")
        print("\t'strand' : can be '+' or '-'")
        print("\t'outfile' : Give a name where the output will be saved in CSV format")
        sys.exit(1)
    # Drop stray trailing CR/LF characters from every argument, including the
    # output name (a leftover '\r' would otherwise end up in the file name).
    coord, strand, outfile = (arg.rstrip('\r\n') for arg in sys.argv[1:4])
    try:
        chrom, start, end = parse_coords(coord)
    except (IndexError, ValueError):
        # Only parsing failures are reported as bad input; KeyboardInterrupt
        # and SystemExit are no longer swallowed by a bare except.
        print("ERROR: Please provide coordinate in chrom:start-end format (exp: 10:25000000-25001000)")
        sys.exit(1)
    if strand not in ('+', '-'):
        print("ERROR: Strand should be either '+' or '-'")
        sys.exit(1)
    return chrom.upper().replace('CHR', ''), start, end, strand, outfile
def parse_coords(coord):
    """Split a ``chrom:start-end`` string into its three components.

    Args:
        coord: Region string such as ``10:25000000-25001000``.

    Returns:
        tuple: ``(chrom, start, end)``, all as strings (callers convert the
        positions with ``int`` when they need numbers).

    Raises:
        ValueError: If the chromosome is empty, the ``:`` or ``-`` separator
            is missing, or a position is not made of decimal digits only.
    """
    chrom, colon, span = coord.partition(":")
    start, dash, end = span.partition("-")
    if not (chrom and colon and dash):
        raise ValueError("expected chrom:start-end, got %r" % coord)
    # Reject non-numeric positions here so the failure is reported as bad
    # input instead of surfacing later as an int() traceback.
    for label, value in (("start", start), ("end", end)):
        if not value or any(ch not in "0123456789" for ch in value):
            raise ValueError("%s position must be a whole number, got %r" % (label, value))
    print("INFO: Input coordinates successfully parsed as following:")
    print("INFO: Chrom:%s Start:%s End:%s" % (chrom, start, end))
    return chrom, start, end
def get_circ_coordinates(start, end, strand, offset=OFFSET):
    """Return the two windows whose sequences are joined across the back-splice.

    Args:
        start: Region start (anything ``int`` accepts).
        end: Region end (anything ``int`` accepts).
        strand: ``'+'`` or ``'-'``.
        offset: Width of each window.

    Returns:
        ``((start1, end1), (start2, end2))`` in the order the sequences are
        later concatenated, or ``False`` when ``strand`` is neither ``'+'``
        nor ``'-'``.
    """
    if strand == "+":
        # Plus strand: the window ending at `end` comes first, followed by
        # the window beginning at `start`.
        tail = int(end)
        first_window = (tail - offset, tail)
        head = int(start)
        second_window = (head, head + offset)
    elif strand == "-":
        # Minus strand: the same two windows in the opposite order.
        head = int(start)
        first_window = (head, head + offset)
        tail = int(end)
        second_window = (tail - offset, tail)
    else:
        return False
    print("INFO: Calculation of backspliced coordinates with offset of %dbp complete" % offset)
    return (first_window, second_window)
def fetch_ensembl(chrom, start, end, strand, species=SPECIES, assembly=ASSEMBLY, timeout=30):
    """Fetch the sequence of a genomic region from the Ensembl REST API.

    Args:
        chrom: Region name as used in the Ensembl URL (e.g. ``10``).
        start: Region start.
        end: Region end.
        strand: ``'+'`` is sent as ``1``; any other value is sent as ``-1``.
        species: Species path component of the request.
        assembly: Value of the ``coord_system_version`` query parameter.
        timeout: Seconds to wait for Ensembl before giving up.

    Returns:
        str: The response body with surrounding whitespace removed, so that
        two fetched sequences can be concatenated directly.

    Exits:
        Prints an error and exits with status 1 when the request fails, times
        out, or Ensembl answers with a non-success HTTP status.
    """
    print("INFO: Will connect to ENSEMBL for species: %s and assembly:%s" % (species, assembly))
    server = "https://rest.ensembl.org"
    strand_code = '1' if strand == '+' else '-1'
    ext = "/sequence/region/%s/%s:%s..%s:%s?coord_system_version=%s" % (
        species, chrom, start, end, strand_code, assembly)
    print("INFO:", server+ext)
    try:
        # Without a timeout a stalled connection would hang the script forever.
        r = requests.get(server+ext, headers={"Content-Type": "text/plain"}, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print("ERROR: Unable to contact Ensembl to fetch sequences. Exiting now..")
        print("ERROR: %s" % err)
        sys.exit(1)
    if not r.ok:
        print("ERROR: Unable to contact Ensembl to fetch sequences. Exiting now..")
        print("ERROR: Ensembl responded with HTTP status %s" % r.status_code)
        sys.exit(1)
    print("INFO: Ensembl query successful")
    return r.text.strip()
def get_backspliced_seq(circ_cords, chrom=None, strand=None):
    """Fetch both junction windows from Ensembl and join them into one template.

    Args:
        circ_cords: ``((start1, end1), (start2, end2))`` as returned by
            ``get_circ_coordinates``.
        chrom: Chromosome name. When omitted, the module-level ``chrom`` set
            by the script entry point is used (legacy behaviour).
        strand: ``'+'`` or ``'-'``. When omitted, the module-level ``strand``
            is used (legacy behaviour).

    Returns:
        str: Sequence of the first window followed by that of the second.

    Raises:
        ValueError: If ``circ_cords`` is empty/``False`` or if ``chrom`` or
            ``strand`` is neither passed in nor defined at module level.
    """
    if not circ_cords:
        raise ValueError("No back-splice coordinates supplied (was the strand valid?)")
    module_scope = globals()
    if chrom is None:
        chrom = module_scope.get('chrom')
    if strand is None:
        strand = module_scope.get('strand')
    if chrom is None or strand is None:
        raise ValueError("chrom and strand must be passed in or defined at module level")
    (first_start, first_end), (second_start, second_end) = circ_cords
    print("INFO: Connecting to Ensembl for fetching 5' sequence")
    seq1 = fetch_ensembl(chrom, first_start, first_end, strand)
    print("INFO: Connecting to Ensembl for fetching 3' sequence")
    seq2 = fetch_ensembl(chrom, second_start, second_end, strand)
    # Strand info is already implicit in seq1 and seq2
    return seq1 + seq2
def run_primer3(seq):
    """Run the bundled ``primer3_core`` binary on ``seq`` and return its output.

    A one-record Primer3 input file is written to a temporary file, Primer3
    is run with the settings file shipped in ``primer3_bin`` next to this
    script, and the temporary file is removed afterwards, even on failure.

    Args:
        seq: Template sequence; any whitespace inside it is removed first
            because a line break would split the ``SEQUENCE_TEMPLATE`` record.

    Returns:
        str: Everything Primer3 wrote to standard output.

    Exits:
        Prints an error and exits with status 1 when the Primer3 executable
        cannot be started.
    """
    seq = "".join(seq.split())
    print("INFO: Creating template file for Primer3 with template length of: %d" % len(seq))
    script_dir = os.path.dirname(os.path.realpath(__file__))
    exe = os.path.join(script_dir, "primer3_bin", "primer3_core")
    settings_fn = os.path.join(script_dir, "primer3_bin", "circrna_primers_settings.p3")
    # delete=False: the file must outlive the handle so Primer3 can read it.
    handle = tempfile.NamedTemporaryFile(mode='w', suffix='.p3in', delete=False)
    temp_fn = handle.name
    try:
        with handle:
            handle.write("SEQUENCE_ID=sequence\nSEQUENCE_TEMPLATE=%s\n=\n" % seq)
        print("INFO: Using Primer3 settings file: %s" % settings_fn)
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched.
        cmd = [exe, "-p3_settings_file=%s" % settings_fn, "-echo_settings_file", temp_fn]
        print("INFO: Running Primer3...")
        print("INFO: ", " ".join(cmd))
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
        except OSError as err:
            print("ERROR: Unable to run Primer3 (%s): %s" % (exe, err))
            sys.exit(1)
        output, _ = proc.communicate()
        if proc.returncode != 0:
            print("WARNING: Primer3 exited with status %d" % proc.returncode)
    finally:
        os.remove(temp_fn)
        print("INFO: Removed temporary input file for Primer3")
    return output
def prep_output(raw_output, junction_window=(195, 205)):
    """Turn raw Primer3 ``KEY=VALUE`` output into a CSV table of primer pairs.

    Only pairs that span the junction are kept: the left primer position must
    be below ``junction_window[0]`` and the right primer position above
    ``junction_window[1]``. The defaults bracket position 200, which matches
    the default ``OFFSET`` used to build the template.

    Args:
        raw_output: Text printed by Primer3.
        junction_window: ``(max_left, min_right)`` position thresholds.

    Returns:
        str: A header line plus one comma-separated line per retained pair,
        or ``"No primers found"`` when Primer3 returned no pairs (including
        when it reported an error or produced no output at all).
    """
    output_dict = {}
    for line in raw_output.splitlines():
        # partition keeps any further '=' characters inside the value.
        key, sep, value = line.partition('=')
        if sep:
            output_dict[key] = value

    try:
        num_primers = int(output_dict['PRIMER_PAIR_NUM_RETURNED'])
    except (KeyError, ValueError):
        # Primer3 failed or printed nothing usable; report why instead of
        # crashing with a KeyError.
        print("ERROR: Primer3 output has no usable PRIMER_PAIR_NUM_RETURNED (%s)"
              % output_dict.get('PRIMER_ERROR', 'no PRIMER_ERROR reported'))
        num_primers = 0

    if num_primers <= 0:
        print("WARNING: No primers found")
        return "No primers found"

    max_left, min_right = junction_window
    parsed = [','.join([
        "Primer ID", "Product size", "Left primer", "Right primer",
        "Left GC", "Right GC", "Left TM", "Right TM",
        "Left pos", "Right pos", "Left size", "Right size",
    ])]
    valid_primers = 0
    for i in range(num_primers):
        # PRIMER_LEFT_<i> / PRIMER_RIGHT_<i> hold "position,size".
        left_pos, left_size = output_dict['PRIMER_LEFT_%d' % i].split(',')[:2]
        right_pos, right_size = output_dict['PRIMER_RIGHT_%d' % i].split(',')[:2]
        if int(left_pos) < max_left and int(right_pos) > min_right:
            valid_primers += 1
            parsed.append(','.join([
                str(i), output_dict['PRIMER_PAIR_%d_PRODUCT_SIZE' % i],
                output_dict['PRIMER_LEFT_%d_SEQUENCE' % i], output_dict['PRIMER_RIGHT_%d_SEQUENCE' % i],
                output_dict['PRIMER_LEFT_%d_GC_PERCENT' % i], output_dict['PRIMER_RIGHT_%d_GC_PERCENT' % i],
                output_dict['PRIMER_LEFT_%d_TM' % i], output_dict['PRIMER_RIGHT_%d_TM' % i],
                left_pos, right_pos, left_size, right_size,
            ]))
    print("INFO: %d primers found" % valid_primers)
    return '\n'.join(parsed)
if __name__ == '__main__':
    # Pipeline: parse arguments -> junction windows -> template sequence ->
    # Primer3 -> raw dump and filtered CSV table.
    print("Welcome to CircPrimer")
    # chrom and strand stay module-level names on purpose:
    # get_backspliced_seq() is called with the windows only and reads both
    # from module scope.
    chrom, start, end, strand, outfile = get_input()
    junction_windows = get_circ_coordinates(start, end, strand)
    template = get_backspliced_seq(junction_windows)
    raw_report = run_primer3(template)

    # Keep Primer3's unprocessed output next to the final table.
    raw_path = "%s_raw_primer3.csv" % outfile
    with open(raw_path, 'w') as raw_handle:
        raw_handle.write(raw_report)

    table_path = "%s.csv" % outfile
    table = prep_output(raw_report)
    with open(table_path, 'w') as table_handle:
        table_handle.write(table)
    print("SUCCESS: Analysis complete.. Primers are saved in %s.csv" % outfile)