Skip to content

Commit 6c1ba85

Browse files
committed
remove test results
1 parent 0a31858 commit 6c1ba85

File tree

764 files changed

+499
-1129704
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

764 files changed

+499
-1129704
lines changed

.gitignore

+4-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,5 @@
11
*.pyc
2-
*~
2+
*~
3+
\#*
4+
tests/*
5+
entrez/*

README.rst

-14
This file was deleted.

cgb/binding_model.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,9 @@ def build_bayesian_estimator(self, bg_scores):
7979
# Estimate the mean/std of the background scores.
8080
self._mu_bg, self._std_bg = np.mean(bg_scores), np.std(bg_scores)
8181

82+
print 'regulation parameters:', self._mu_m, self._std_m
83+
print 'background parameters:', self._mu_bg, self._std_bg
84+
8285
def binding_probability(self, seq, p_motif, alpha=1/350.0):
8386
"""Returns the probability of binding to the given seq.
8487
@@ -90,7 +93,6 @@ def binding_probability(self, seq, p_motif, alpha=1/350.0):
9093
float: the probability of TF-binding to the sequence.
9194
"""
9295
pssm_scores = self.score_seq(seq)
93-
p_bg = 1.0 - p_motif
9496
# Probability density functions
9597
pdf_m = scipy.stats.distributions.norm(self._mu_m, self._std_m).pdf
9698
pdf_bg = scipy.stats.distributions.norm(self._mu_bg, self._std_bg).pdf
@@ -99,7 +101,7 @@ def binding_probability(self, seq, p_motif, alpha=1/350.0):
99101
lh_bg = pdf_bg(pssm_scores)
100102
# Compute the likelihood ratio
101103
lh_ratio = np.exp(np.sum(np.log(lh_bg) - np.log(lh_m)))
102-
return 1 / (1 + lh_ratio * p_bg / p_motif)
104+
return 1 / (1 + lh_ratio * (1-p_motif) / p_motif)
103105

104106
# All of the following methods should be overridden in the subclass.
105107
@abstractmethod

cgb/bio_utils.py

+19
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,10 @@
11
"""Miscellaneous bioinformatics utility functions."""
22

3+
from subprocess import PIPE, Popen
4+
35
from Bio.Seq import Seq
46
from Bio.Alphabet import generic_dna
7+
from Bio import motifs
58

69

710
def complement(seq):
@@ -24,3 +27,19 @@ def reverse_complement(seq):
2427
string: the reverse complement sequence
2528
"""
2629
return str(Seq(seq, generic_dna).reverse_complement())
30+
31+
32+
def weblogo(seqs, filename):
    """Generates the sequence logo for the given sequences.

    Runs the WebLogo program (http://weblogo.threeplusone.com/) as a
    subprocess. The sequences are passed to it in FASTA format on standard
    input and the logo is written as a PNG image to the given file. Whatever
    the program prints to stdout/stderr is captured and discarded.

    Args:
        seqs (list): sequences used to build the logo.
        filename (string): path of the image file to be written.
    """
    # One FASTA record per sequence, named by its position in the input.
    fasta_records = ['>seq%d\n%s' % (index, sequence)
                     for index, sequence in enumerate(seqs)]
    fasta_input = '\n'.join(fasta_records)
    command = ['weblogo',
               '--format', 'png',
               '--fout', filename,
               '--color-scheme', 'classic',
               '--errorbars', 'YES']
    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE,
                    close_fds=True)
    process.communicate(input=fasta_input)

cgb/chromid.py

+18-13
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,8 @@ class Chromid:
2929
"""
3030

3131
def __init__(self, accession_number, genome):
    """Fetches the GenBank record and parses it into a Biopython SeqRecord.

    Args:
        accession_number (string): accession number handed to
            entrez_utils.get_genome_record.
        genome: the object stored as the genome of this chromid.
    """
    genbank_text = entrez_utils.get_genome_record(accession_number)
    # SeqIO.read needs a file-like handle, so wrap the raw text.
    genbank_handle = cStringIO.StringIO(genbank_text)
    self._record = SeqIO.read(genbank_handle, 'gb')
    self._genome = genome
3435

3536
@property
@@ -40,7 +41,7 @@ def genome(self):
4041
@cached_property
def record(self):
    """Returns the Biopython SeqRecord held in `_record`."""
    seq_record = self._record
    return seq_record
4445

4546
@property
4647
def accession_number(self):
@@ -127,7 +128,7 @@ def _directons(self):
127128
return [directon if directon[0].is_forward_strand else directon[::-1]
128129
for directon in directons]
129130

130-
def _operon_prediction(self, use_binding_sites=True):
131+
def _operon_prediction(self):
131132
"""Identifies all operons of the chromosome/plasmid.
132133
133134
Two neighboring genes in the same strand are considered to be in the
@@ -138,11 +139,10 @@ def _operon_prediction(self, use_binding_sites=True):
138139
This provides an adaptive threshold that takes into account the
139140
intergenic compression of different genomes.
140141
141-
Args:
142-
use_binding_sites (bool): If true, the binding site predictions are
143-
used to improve the operon predictions. If a gene with a putative
144-
binding site in its promoter is in the middle of an operon, the
145-
operon is split.
142+
If putative binding sites are identified for the genome, they are used
143+
to improve the operon predictions. If a gene with a putative
144+
binding site in its promoter is in the middle of an operon, the
145+
operon is split.
146146
"""
147147
my_logger.info("Predicting operons - %s (%s)" %
148148
(self.accession_number, self.genome.strain_name))
@@ -151,8 +151,11 @@ def _operon_prediction(self, use_binding_sites=True):
151151
# Compute the mean intergenic distance of directons' first two genes.
152152
mean_dist = mean([directon[0].distance(directon[1])
153153
for directon in directons if len(directon) > 1])
154+
154155
# Find genes with binding sites in their promoters
155-
genes_to_split = set(site.gene for site in self.genome.putative_sites)
156+
genes_to_split = (set(site.gene for site in self.genome.putative_sites)
157+
if self.genome.has_putative_sites
158+
else [])
156159

157160
directons_rest = self._directons()
158161
while directons_rest:
@@ -161,16 +164,18 @@ def _operon_prediction(self, use_binding_sites=True):
161164
for directon in processing:
162165
operon = [directon[0]]
163166
i = 1
164-
while (i < len(directon) and
165-
directon[i-1].distance(directon[i]) < mean_dist):
166-
if use_binding_sites and directon[i] in genes_to_split:
167+
while i < len(directon):
168+
if directon[i-1].distance(directon[i]) >= mean_dist:
169+
break
170+
if directon[i] in genes_to_split:
167171
break
168172
operon.append(directon[i])
169173
i += 1
170174
operons.append(operon)
171175
if i < len(directon):
172176
directons_rest.append(directon[i:])
173-
177+
my_logger.info("Number of operons in %s: %d" %
178+
(self.accession_number, len(operons)))
174179
return [Operon(opr) for opr in operons]
175180

176181
def find_closest_gene(self, pos):

cgb/entrez_utils.py

+13-4
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
Entrez.email = '[email protected]'
1414

1515
# The directory used to save NCBI records for later use.
16-
ENTREZ_DIRECTORY = tempfile.gettempdir()
16+
ENTREZ_DIRECTORY = '../entrez'
1717

1818

1919
def get_genome_record(accession):
@@ -23,15 +23,24 @@ def get_genome_record(accession):
2323
# Download and save Genbank record
2424
handle = Entrez.efetch(db='nuccore', id=accession,
2525
retmode='gbwithparts', rettype='text')
26+
record = handle.read()
2627
with open(genbank_file, 'w') as f:
27-
f.write(handle.read())
28+
f.write(record)
2829

2930
handle = open(genbank_file)
3031
return handle.read()
3132

3233

3334
def get_protein_record(accession):
    """Fetches the protein record from NCBI Protein database.

    The record is cached as '<accession>.gb' in ENTREZ_DIRECTORY. It is
    downloaded from NCBI only if no cached file exists; the returned text is
    always read from the cached file.

    Args:
        accession (string): accession number of the protein record.

    Returns:
        string: the contents of the GenBank record.
    """
    protein_file = os.path.join(ENTREZ_DIRECTORY, accession + '.gb')
    if not os.path.isfile(protein_file):
        # Download and save file
        handle = Entrez.efetch(db='protein', id=accession,
                               rettype='gb', retmode='text')
        try:
            record = handle.read()
        finally:
            # Release the network handle even if reading fails.
            handle.close()
        with open(protein_file, 'w') as cache:
            cache.write(record)

    # Close the cache file deterministically instead of leaking the handle.
    with open(protein_file) as cache:
        return cache.read()

cgb/gene.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ def upstream_forward_strand_gene():
8383
if up:
8484
loc_start = max(0, self.start-up)
8585
elif self.upstream_gene:
86-
loc_start = min(self.upstream_gene.end, self.start)
86+
loc_start = min(self.upstream_gene.end-50, self.start)
8787
else:
8888
loc_start = 0
8989
loc_end = self.start + down
@@ -94,7 +94,7 @@ def upstream_reverse_strand_gene():
9494
if up:
9595
loc_end = min(self.chromid.length, self.end+up)
9696
elif self.upstream_gene:
97-
loc_end = max(self.upstream_gene.start, self.end)
97+
loc_end = max(self.upstream_gene.start+50, self.end)
9898
else:
9999
loc_end = self.chromid.length
100100
loc_start = self.end - down

cgb/genome.py

+30-11
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,12 @@
88

99
from chromid import Chromid
1010
from blast import BLAST
11-
from pssm_model import PSSMModel
11+
from PSSM_model import PSSMModel
1212
from misc import weighted_choice
13+
from misc import temp_file_name
1314
from my_logger import my_logger
1415
from bio_utils import reverse_complement
16+
from bio_utils import weblogo
1517

1618

1719
Site = namedtuple('Site', 'chromid start end strand score gene')
@@ -40,8 +42,8 @@ def __init__(self, strain_name, accession_numbers):
4042
accession_numbers (list): List of corresponding accession numbers.
4143
"""
4244

43-
logging.debug('Creating genome: %s %s' %
44-
(strain_name, str(accession_numbers)))
45+
logging.info('Creating genome: %s %s' %
46+
(strain_name, str(accession_numbers)))
4547
self._strain_name = strain_name
4648
self._chromids = [Chromid(acc, self) for acc in accession_numbers]
4749
self._TF_instance = None # TF-instance in this genome
@@ -67,6 +69,11 @@ def operons(self):
6769
"""Returns all operons of the genome."""
6870
return [opr for chromid in self.chromids for opr in chromid.operons]
6971

72+
@cached_property
def length(self):
    """Returns the total length of the genome.

    The total is the sum of the lengths of all chromids of the genome.
    """
    total_length = 0
    for chromid in self.chromids:
        total_length += chromid.length
    return total_length
76+
7077
@property
7178
def num_operons(self):
7279
"""Returns the number of operons of the genome."""
@@ -154,13 +161,10 @@ def build_PSSM_model(self, collections, weights):
154161
"""
155162
# Create a PSSM model using site collections and associated weights.
156163
model = PSSMModel(collections, weights)
157-
# Generate and PSSM-score random sequences to estimate background
158-
# PSSM-score distribution.
159-
random_seqs = self.random_seqs(length=model.length, count=100)
160-
bg_scores = [model.score_seq(random_seq) for random_seq in random_seqs]
164+
random_seqs = self.random_seqs(length=model.length, count=100000)
161165
# Create a Bayesian estimator which is used to compute the probability
162166
# of TF-binding of any given sequence.
163-
model.build_bayesian_estimator(bg_scores)
167+
model.build_bayesian_estimator(random_seqs)
164168
self._TF_binding_model = model
165169

166170
def random_seqs(self, length, count):
@@ -247,12 +251,14 @@ def identify_TF_instance(self, proteins):
247251
if blast_hits:
248252
# If there are BLAST hits, return the one with the best e-value.
249253
TF, _ = min(blast_hits, key=lambda x: x[1])
254+
my_logger.info("%s" % TF.accession_number)
250255
else:
251256
# Otherwise, set the TF-instance to None.
252257
# The genome will be dropped from the analysis.
253258
TF = None
254259
my_logger.warning("No TF-instance found for %s. " %
255260
self.strain_name)
261+
256262
self._TF_instance = TF
257263

258264
def infer_regulation(self, prior, threshold=0.5, filename=None):
@@ -307,11 +313,24 @@ def _output_posterior_probabilities(self, scan_results, filename):
307313

308314
@property
def putative_sites(self):
    """Returns the list of putative sites in non-coding regions.

    Reads the `_putative_sites` attribute; accessing this property before
    that attribute is set raises AttributeError.
    """
    sites = self._putative_sites
    return sites
312318

319+
@property
def has_putative_sites(self):
    """Returns true if the putative binding sites are identified.

    The sites count as identified once the `_putative_sites` attribute
    exists on the object.
    """
    if hasattr(self, '_putative_sites'):
        return True
    return False
323+
324+
@property
def weblogo_from_putative_sites(self):
    """Returns the file of sequence logo built using putative sites.

    Each access obtains a new name from temp_file_name() and runs weblogo
    on the subsequences of all putative sites.
    """
    logo_file = temp_file_name()
    site_seqs = []
    for site in self.putative_sites:
        site_seqs.append(
            site.chromid.subsequence(site.start, site.end, site.strand))
    weblogo(site_seqs, logo_file)
    return logo_file
331+
313332
def identify_sites(self, promoter_up=300, filename=None):
314-
"""Returns the list of sites in non-coding regions..
333+
"""Returns the list of sites in non-coding regions.
315334
316335
It searches exclusively the [-promoter_up, +50] for all genes in the
317336
genome. It returns all sites with a score over threshold, and tags the
@@ -331,7 +350,7 @@ def identify_sites(self, promoter_up=300, filename=None):
331350
filename (string): the CSV file to report putative binding sites.
332351
"""
333352
# If already computed, return the stored results.
334-
if hasattr(self, '_putative_sites' ):
353+
if hasattr(self, '_putative_sites'):
335354
return self._putative_sites
336355

337356
threshold = self.TF_binding_model.threshold() # score threshold

0 commit comments

Comments
 (0)