-
Notifications
You must be signed in to change notification settings - Fork 0
/
YeastORFRead.py
107 lines (81 loc) · 3.67 KB
/
YeastORFRead.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Return the set of yeast ORF sequences in either DNA or protein format.
# If they are not available locally, download them from yeastgenome.org.
# NOTE: Possibly look into NCBI if yeastgenome gives trouble
# http://www.ncbi.nlm.nih.gov/nuccore/nc_001144.5
import urllib
import gzip
import os #for abs path
import sys #for normal print
import ProjectDefinitions
# Directory under which the downloaded yeast sequence files are cached;
# the value comes from the project-wide ProjectDefinitions module.
data_directory = ProjectDefinitions.data_directory
# As of now this module requires Biopython to be installed on the system:
# http://biopython.org
from Bio import SeqIO
def yeast_main(is_dna_seq):
    """Return the parsed yeast records for the requested sequence type.

    Args:
        is_dna_seq: passed straight through to ``getYeastData``; truthy
            selects the DNA data set, falsy the protein data set.

    Returns:
        Whatever ``getYeastData`` returns for ``is_dna_seq``.
    """
    # These are the parsed records; the Name and Seq fields are of use
    return getYeastData(is_dna_seq)
def getYeastData(is_dna_seq):
    """Return the parsed yeast ORF records, downloading the data if needed.

    The decompressed FASTA file is looked for under ``data_directory``. When
    it cannot be opened, the gzip-compressed file is downloaded from
    yeastgenome.org, decompressed next to it and then parsed.

    Args:
        is_dna_seq: truthy to work on the ORF coding (DNA) sequences, falsy
            to work on the translated (protein) sequences.

    Returns:
        The records returned by ``extractORFNames`` for the FASTA file.

    Errors raised while creating directories, downloading or decompressing
    are not caught here and propagate to the caller.
    """
    # Do we want to do nucleotide or amino acid analysis?
    # os.path.join gives the same paths as plain concatenation when
    # data_directory ends with a separator, and correct ones when it doesn't.
    if is_dna_seq:
        zipped_filename = os.path.join(data_directory, "yeast_genome_dna.gz")
        filename = os.path.join(data_directory, "yeast_genome_dna.fasta")
        source_url = "http://downloads.yeastgenome.org/sequence/S288C_reference/orf_dna/orf_coding.fasta.gz"
        individual_root_dir = os.path.join(data_directory, "yeast_dna/")
    else:
        zipped_filename = os.path.join(data_directory, "yeast_genome_proteins.gz")
        filename = os.path.join(data_directory, "yeast_genome_proteins.fasta")
        source_url = "http://downloads.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans.fasta.gz"
        individual_root_dir = os.path.join(data_directory, "yeast_protein/")

    # Download the data if it isn't present
    if ProjectDefinitions.DEBUG:
        print("trying to open yeast data file at: " + filename)
    try:
        # Opening the file (instead of only testing that it exists) also
        # sends unreadable files down the download path.
        with open(filename, "r"):
            pass
        have_local_copy = True
    except IOError:
        have_local_copy = False

    if have_local_copy:
        if ProjectDefinitions.DEBUG:
            print("File opened successfully!")
    else:
        _downloadYeastData(source_url, zipped_filename, filename,
                           individual_root_dir)

    # for some reason biopython wants to give fasta files to blast
    # they need to be divided for parallelization though
    # return the full contents of the fasta file
    return extractORFNames(filename)


def _downloadYeastData(source_url, zipped_filename, filename,
                       individual_root_dir):
    """Download the gzip archive at source_url and decompress it to filename.

    Args:
        source_url: URL of the gzip-compressed FASTA file.
        zipped_filename: local path the compressed download is saved to.
        filename: local path the decompressed FASTA file is written to.
        individual_root_dir: cache sub-directory created alongside
            ``data_directory`` when missing.
    """
    import shutil  # only needed on the download path

    # Each cache directory is checked on its own, so a missing sub-directory
    # is still created when data_directory already exists.
    missing_dirs = [directory
                    for directory in (data_directory, individual_root_dir)
                    if not os.path.exists(directory)]
    if missing_dirs:
        sys.stdout.write("WARNING: cache directories were not found, creating... ")
        for directory in missing_dirs:
            # Re-check: creating data_directory may already have covered it
            if not os.path.exists(directory):
                os.makedirs(directory)
        print("OK!")

    print("WARNING: Couldn't find the file, downloading the compressed version...")
    print("source: " + source_url)
    print("destination: " + zipped_filename)
    # Download from previously known good location
    urllib.urlretrieve(source_url, zipped_filename)
    if ProjectDefinitions.DEBUG:
        sys.stdout.write("Download completed, decompressing...")

    # Unzip the results. The data is streamed in chunks rather than read into
    # memory at once, and both handles are closed even if an error occurs.
    decompressed = False
    try:
        with gzip.open(zipped_filename, 'rb') as compressed:
            with open(filename, 'wb') as fileout:
                shutil.copyfileobj(compressed, fileout)
        decompressed = True
    finally:
        # A truncated FASTA file would be mistaken for a valid cached copy on
        # the next run, so remove it when decompression did not complete.
        if not decompressed and os.path.exists(filename):
            os.remove(filename)

    if ProjectDefinitions.DEBUG:
        print("file decompressed")
# as of now this function requires biopython to be installed on the system
# http://biopython.org
def extractORFNames(filename):
    """Parse a FASTA file into a list of Biopython records.

    Args:
        filename: path of the FASTA file to parse.

    Returns:
        A list holding every record yielded by
        ``SeqIO.parse(filename, "fasta")``; empty when the file contains no
        records.
    """
    sys.stdout.write("parsing record... ")
    # Materialise the parser output so the records can be counted and indexed
    records = list(SeqIO.parse(filename, "fasta"))
    record_count = str(len(records))
    print("parse complete, " + record_count + " records found!")
    # Only dump a sample when there is one: indexing records[0] on an empty
    # file would raise IndexError in debug mode.
    if ProjectDefinitions.DEBUG and records:
        print("dumping one record of " + record_count + "...")
        print(records[0])
        # Convert explicitly instead of relying on str + Seq concatenation
        print("sequence: " + str(records[0].seq))
    return records
# Standard boilerplate to call the main() function to begin
# the program. This also ensures that the main isn't called when used as a module
if __name__ == "__main__":
    # Executed as a script: run with the non-DNA setting
    yeast_main(is_dna_seq=False)