-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubmit_psipred.py
160 lines (122 loc) · 4.26 KB
/
submit_psipred.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 12 00:33:50 2020
Last modified 20210226 fix urllib.request, need to type python3 submit_psipred
For psipred, fasta file must be clean without the header
Skip proteins longer than 1500 aa (limit of PSIPRED)
Supports an ignore_existing option to skip IDs whose .ss2 result already exists
Problem: still doesn't work well with nohup, need to use python3 to run
Script to submit a list of Uniprot ID for PSIPRED prediction
It will automatically download the Uniprot fasta file and submit
then download the results to the same directory
See further
http://bioinf.cs.ucl.ac.uk/web_servers/web_services/
@author: kbui2
"""
import requests
import urllib, argparse, os, time, urllib.request
import os.path
""" Submit psipred job """
def psipredSubmit(pID, filepath, email):
    """Submit a sequence file as a 'psipred' job and return the job UUID.

    pID      -- sent as the job's 'submission_name'
    filepath -- path of the file uploaded as 'input_data'
    email    -- sent as the job's 'email' field

    Returns the value stored under the 'UUID' key of the JSON response.
    Raises KeyError if the response carries no 'UUID' key.
    """
    url = 'http://bioinf.cs.ucl.ac.uk/psipred/api/submission.json'
    data = {'job': 'psipred',
            'submission_name': pID,
            'email': email, }
    # Keep the upload handle inside a context manager so it is closed even
    # when the request or the JSON decoding raises.
    with open(filepath, 'rb') as upload:
        payload = {'input_data': (os.path.basename(filepath), upload)}
        r = requests.post(url, data=data, files=payload)
    reply = r.json()
    return reply['UUID']
""" Get psipred job status """
def psipredProgress(uuid):
    """Query the submission endpoint for *uuid* and return the decoded JSON."""
    # NOTE: flagged "Not done yet" by the original author.
    endpoint = 'http://bioinf.cs.ucl.ac.uk/psipred_new/api/submission/' + uuid
    response = requests.get(endpoint, params={'format': 'json'})
    return response.json()
""" Get psipred job results ss2, horiz file """
def psipredDownload(ss2, pID, outdir):
    """Download the ss2 and horiz result files for one job.

    ss2    -- server-side path of the .ss2 result, appended to the API URL
    pID    -- used as the base name of the files written
    outdir -- directory receiving <pID>.ss2 and <pID>.horiz
    """
    api_root = 'http://bioinf.cs.ucl.ac.uk/psipred/api'
    # The horiz path is the ss2 path with its extension swapped; replacing
    # '.ss2' by itself leaves the ss2 path untouched, so one loop serves both.
    for suffix in ('.ss2', '.horiz'):
        remote_path = str.replace(ss2, '.ss2', suffix)
        reply = requests.get(api_root + remote_path)
        with open(outdir + '/' + pID + suffix, 'wb') as sink:
            sink.write(reply.content)
""" Retrieve the fasta sequence from uniprot ID and write to an output file """
def retrieveFasta(pID, outfile):
    """Fetch the FASTA record for Uniprot ID *pID* and write it to *outfile*.

    The response body is decoded as UTF-8 before being written. Any error
    raised by urlopen propagates to the caller; in that case *outfile* is
    not created, because the file is opened only after the download.
    """
    print('Retrieving ' + pID)
    url = "http://www.uniprot.org/uniprot/" + pID + ".fasta"
    # Context managers close the HTTP response and the output file even if
    # decoding or writing fails.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')
    with open(outfile, 'w') as outhandle:
        outhandle.write(content)
def trimFasta(file, trimfile):
    """Copy *file* to *trimfile*, dropping every line that starts with '>'.

    All other lines are written unchanged, line endings included.
    """
    # Both handles are managed by the with-statement so neither leaks if
    # reading or writing raises part-way through.
    with open(file, 'r') as fastain, open(trimfile, 'w') as fastaout:
        fastaout.writelines(
            line for line in fastain if not line.startswith('>')
        )
def calcFastaLength(file):
    """Return the number of sequence characters in a FASTA file.

    Lines starting with '>' are skipped. Leading and trailing whitespace,
    including the line terminator, is not counted, so the result is the
    residue count rather than the character count of the file.
    """
    with open(file, 'r') as fastain:
        # strip() keeps the newline of each line out of the total; counting
        # it would inflate the length by one per line.
        return sum(
            len(line.strip())
            for line in fastain
            if not line.startswith('>')
        )
if __name__ == '__main__':
    # Command-line driver: for every Uniprot ID in --list, download the
    # FASTA record, strip its header, submit it to PSIPRED, poll until the
    # job finishes, then download the ss2/horiz results into --odir.
    parser = argparse.ArgumentParser(description='Automatically submit a list of Uniprot protein to PSIPRED')
    parser.add_argument('--list', help='Input of Uniprot ID list', required=True)
    parser.add_argument('--email', help='Email for job submission', required=False, default='[email protected]')
    parser.add_argument('--odir', help='Output directory for output', required=True)
    parser.add_argument('--ignore_existing', help='Ignore existing file (1/0)', required=False, default='0')
    args = parser.parse_args()

    # Limit of PSIPRED
    PSIPREDLIMIT = 1500
    # Seconds to wait between two status polls.
    POLL_INTERVAL = 300
    # NOTE(review): job states assumed to mean the job will never complete;
    # confirm the exact names against the PSIPRED web-service documentation.
    # If they do not match, polling behaves as before (waits for 'Complete').
    FAILED_STATES = ('Error', 'Crash')

    # One ID per line; blank lines are dropped so they do not turn into
    # requests for an empty Uniprot ID.
    with open(args.list, 'r') as listid:
        pIDlist = [line.strip() for line in listid if line.strip()]

    email = args.email
    outdir = args.odir
    ignore_existing = int(args.ignore_existing)

    for pID in pIDlist:
        if os.path.exists(outdir + "/" + pID + ".ss2") and ignore_existing == 1:
            print('Skip ' + pID + ' due to existing file')
            continue

        outfile = outdir + '/' + pID + '_full.fasta'
        trimfile = outdir + '/' + pID + '.fasta'
        retrieveFasta(pID, outfile)
        trimFasta(outfile, trimfile)

        # Measure once and reuse for both the limit check and the report.
        aalen = calcFastaLength(trimfile)
        if aalen > PSIPREDLIMIT:
            print('Skip due to length limit')
            continue
        print('AA length ' + str(aalen))

        print('Submit ' + trimfile)
        uuid = psipredSubmit(pID, trimfile, email)

        # Poll until the job reaches a terminal state.
        while True:
            time.sleep(POLL_INTERVAL)
            status = psipredProgress(uuid)
            state = status['state']
            print('Job ' + uuid + ' is ' + state)
            if state == 'Complete' or state in FAILED_STATES:
                break

        if state != 'Complete':
            # Move on to the next ID instead of polling a dead job forever.
            print('Job ' + uuid + ' ended as ' + state + ', skip ' + pID)
            continue

        out = status['submissions'][0]['results']
        # The position of the ss2 entry in the results list has changed
        # before; if downloads break, print(out) and re-check this index.
        ss2 = out[1]['data_path']
        print('Download results')
        psipredDownload(ss2, pID, outdir)