ErillLab
diff --git a/‎.gitignore
+4-1 b/‎.gitignore
+4-1
diff --git a/‎README.rst
-14 b/‎README.rst
-14
diff --git a/‎cgb/binding_model.py
+4-2 b/‎cgb/binding_model.py
+4-2
diff --git a/‎cgb/bio_utils.py
+19 b/‎cgb/bio_utils.py
+19
diff --git a/‎cgb/chromid.py
+18-13 b/‎cgb/chromid.py
+18-13
diff --git a/‎cgb/entrez_utils.py
+13-4 b/‎cgb/entrez_utils.py
+13-4
diff --git a/‎cgb/gene.py
+2-2 b/‎cgb/gene.py
+2-2
diff --git a/‎cgb/genome.py
+30-11 b/‎cgb/genome.py
+30-11
@@ -1,2 +1,5 @@
 *.pyc
-*~
+*~
+\#*
+tests/*
+entrez/*
@@ -79,6 +79,9 @@ def build_bayesian_estimator(self, bg_scores):
         # Estimate the mean/std of the background scores.
         self._mu_bg, self._std_bg = np.mean(bg_scores), np.std(bg_scores)
 
+        print 'regulation parameters:', self._mu_m, self._std_m
+        print 'background parameters:', self._mu_bg, self._std_bg
+
     def binding_probability(self, seq, p_motif, alpha=1/350.0):
         """Returns the probability of binding to the given seq.
 
@@ -90,7 +93,6 @@ def binding_probability(self, seq, p_motif, alpha=1/350.0):
             float: the probability of TF-binding to the sequence.
         """
         pssm_scores = self.score_seq(seq)
-        p_bg = 1.0 - p_motif
         # Probability density functions
         pdf_m = scipy.stats.distributions.norm(self._mu_m, self._std_m).pdf
         pdf_bg = scipy.stats.distributions.norm(self._mu_bg, self._std_bg).pdf
@@ -99,7 +101,7 @@ def binding_probability(self, seq, p_motif, alpha=1/350.0):
         lh_bg = pdf_bg(pssm_scores)
         # Compute the likelihood ratio
         lh_ratio = np.exp(np.sum(np.log(lh_bg) - np.log(lh_m)))
-        return 1 / (1 + lh_ratio * p_bg / p_motif)
+        return 1 / (1 + lh_ratio * (1-p_motif) / p_motif)
 
     # All of the following methods should be overridden in the subclass.
     @abstractmethod
 
@@ -1,7 +1,10 @@
 """Miscellaneous bioinformatics utility functions."""
 
+from subprocess import PIPE, Popen
+
 from Bio.Seq import Seq
 from Bio.Alphabet import generic_dna
+from Bio import motifs
 
 
 def complement(seq):
@@ -24,3 +27,19 @@ def reverse_complement(seq):
         string: the reverse complement sequence
     """
     return str(Seq(seq, generic_dna).reverse_complement())
+
+
+def weblogo(seqs, filename):
+    """Generates the sequence logo for the given sequences.
+
+    Uses the WebLogo program (http://weblogo.threeplusone.com/).
+    """
+    # Sequences in FASTA format
+    fasta = '\n'.join('>seq%d\n%s' % (i, seq) for i, seq in enumerate(seqs))
+    p = Popen(['weblogo',
+               '--format', 'png',
+               '--fout', filename,
+               '--color-scheme', 'classic',
+               '--errorbars', 'YES'],
+              stdout=PIPE, stderr=PIPE, stdin=PIPE, close_fds=True)
+    p.communicate(input=fasta)
@@ -29,7 +29,8 @@ class Chromid:
     """
 
     def __init__(self, accession_number, genome):
-        self._record = entrez_utils.get_genome_record(accession_number)
+        raw_record = entrez_utils.get_genome_record(accession_number)
+        self._record = SeqIO.read(cStringIO.StringIO(raw_record), 'gb')
         self._genome = genome
 
     @property
@@ -40,7 +41,7 @@ def genome(self):
     @cached_property
     def record(self):
         """Returns the Biopython SeqRecord created from a GenBank file."""
-        return SeqIO.read(cStringIO.StringIO(self._record), 'gb')
+        return self._record
 
     @property
     def accession_number(self):
@@ -127,7 +128,7 @@ def _directons(self):
         return [directon if directon[0].is_forward_strand else directon[::-1]
                 for directon in directons]
 
-    def _operon_prediction(self, use_binding_sites=True):
+    def _operon_prediction(self):
         """Identifies all operons of the chromosome/plasmid.
 
         Two neighboring genes in the same strand are considered to be in the
@@ -138,11 +139,10 @@ def _operon_prediction(self, use_binding_sites=True):
         This provides an adaptive threshold that takes into account the
         intergenic compression of different genomes.
 
-        Args:
-            use_binding_sites (bool): If true, the binding site predictions are
-            used to improve the operon predictions. If a gene with a putative
-            binding site in its promoter is in the middle of an operon, the
-            operon is split.
+        If putative binding sites are identified for the genome, they are used
+        to improve the operon predictions. If a gene with a putative
+        binding site in its promoter is in the middle of an operon, the
+        operon is split.
         """
         my_logger.info("Predicting operons - %s (%s)" %
                        (self.accession_number, self.genome.strain_name))
@@ -151,8 +151,11 @@ def _operon_prediction(self, use_binding_sites=True):
         # Compute the mean intergenic distance of directons' first two genes.
         mean_dist = mean([directon[0].distance(directon[1])
                           for directon in directons if len(directon) > 1])
+
         # Find genes with binding sites in their promoters
-        genes_to_split = set(site.gene for site in self.genome.putative_sites)
+        genes_to_split = (set(site.gene for site in self.genome.putative_sites)
+                          if self.genome.has_putative_sites
+                          else [])
 
         directons_rest = self._directons()
         while directons_rest:
@@ -161,16 +164,18 @@ def _operon_prediction(self, use_binding_sites=True):
             for directon in processing:
                 operon = [directon[0]]
                 i = 1
-                while (i < len(directon) and
-                       directon[i-1].distance(directon[i]) < mean_dist):
-                    if use_binding_sites and directon[i] in genes_to_split:
+                while i < len(directon):
+                    if directon[i-1].distance(directon[i]) >= mean_dist:
+                        break
+                    if directon[i] in genes_to_split:
                         break
                     operon.append(directon[i])
                     i += 1
                 operons.append(operon)
                 if i < len(directon):
                     directons_rest.append(directon[i:])
-
+        my_logger.info("Number of operons in %s: %d" %
+                       (self.accession_number, len(operons)))
         return [Operon(opr) for opr in operons]
 
     def find_closest_gene(self, pos):
 
@@ -13,7 +13,7 @@
 Entrez.email = '[email protected]'
 
 # The directory used to save NCBI records for later use.
-ENTREZ_DIRECTORY = tempfile.gettempdir()
+ENTREZ_DIRECTORY = '../entrez'
 
 
 def get_genome_record(accession):
@@ -23,15 +23,24 @@ def get_genome_record(accession):
         # Download and save Genbank record
         handle = Entrez.efetch(db='nuccore', id=accession,
                                retmode='gbwithparts', rettype='text')
+        record = handle.read()
         with open(genbank_file, 'w') as f:
-            f.write(handle.read())
+            f.write(record)
 
     handle = open(genbank_file)
     return handle.read()
 
 
 def get_protein_record(accession):
     """Fetches the protein record from NCBI Protein database."""
-    handle = Entrez.efetch(db='protein', id=accession,
-                           rettype='gb', retmode='text')
+    protein_file = os.path.join(ENTREZ_DIRECTORY, accession+'.gb')
+    if not os.path.isfile(protein_file):
+        # Download and save file
+        handle = Entrez.efetch(db='protein', id=accession,
+                               rettype='gb', retmode='text')
+        record = handle.read()
+        with open(protein_file, 'w') as f:
+            f.write(record)
+
+    handle = open(protein_file)
     return handle.read()
@@ -83,7 +83,7 @@ def upstream_forward_strand_gene():
             if up:
                 loc_start = max(0, self.start-up)
             elif self.upstream_gene:
-                loc_start = min(self.upstream_gene.end, self.start)
+                loc_start = min(self.upstream_gene.end-50, self.start)
             else:
                 loc_start = 0
             loc_end = self.start + down
@@ -94,7 +94,7 @@ def upstream_reverse_strand_gene():
             if up:
                 loc_end = min(self.chromid.length, self.end+up)
             elif self.upstream_gene:
-                loc_end = max(self.upstream_gene.start, self.end)
+                loc_end = max(self.upstream_gene.start+50, self.end)
             else:
                 loc_end = self.chromid.length
             loc_start = self.end - down
 
@@ -8,10 +8,12 @@
 
 from chromid import Chromid
 from blast import BLAST
-from pssm_model import PSSMModel
+from PSSM_model import PSSMModel
 from misc import weighted_choice
+from misc import temp_file_name
 from my_logger import my_logger
 from bio_utils import reverse_complement
+from bio_utils import weblogo
 
 
 Site = namedtuple('Site', 'chromid start end strand score gene')
@@ -40,8 +42,8 @@ def __init__(self, strain_name, accession_numbers):
             accession_numbers (list): List of corresponding accession numbers.
         """
 
-        logging.debug('Creating genome: %s %s' %
-                      (strain_name, str(accession_numbers)))
+        logging.info('Creating genome: %s %s' %
+                     (strain_name, str(accession_numbers)))
         self._strain_name = strain_name
         self._chromids = [Chromid(acc, self) for acc in accession_numbers]
         self._TF_instance = None   # TF-instance in this genome
@@ -67,6 +69,11 @@ def operons(self):
         """Returns all operons of the genome."""
         return [opr for chromid in self.chromids for opr in chromid.operons]
 
+    @cached_property
+    def length(self):
+        """Returns the total length of the genome."""
+        return sum(c.length for c in self.chromids)
+
     @property
     def num_operons(self):
         """Returns the number of operons of the genome."""
@@ -154,13 +161,10 @@ def build_PSSM_model(self, collections, weights):
         """
         # Create a PSSM model using site collections and associated weights.
         model = PSSMModel(collections, weights)
-        # Generate and PSSM-score random sequences to estimate background
-        # PSSM-score distribution.
-        random_seqs = self.random_seqs(length=model.length, count=100)
-        bg_scores = [model.score_seq(random_seq) for random_seq in random_seqs]
+        random_seqs = self.random_seqs(length=model.length, count=100000)
         # Create a Bayesian estimator which is used to compute the probability
         # of TF-binding of any given sequence.
-        model.build_bayesian_estimator(bg_scores)
+        model.build_bayesian_estimator(random_seqs)
         self._TF_binding_model = model
 
     def random_seqs(self, length, count):
@@ -247,12 +251,14 @@ def identify_TF_instance(self, proteins):
         if blast_hits:
             # If there are BLAST hits, return the one with the best e-value.
             TF, _ = min(blast_hits, key=lambda x: x[1])
+            my_logger.info("%s" % TF.accession_number)
         else:
             # Otherwise, set the TF-instance to None.
             # The genome will be dropped from the analysis.
             TF = None
             my_logger.warning("No TF-instance found for %s. " %
                               self.strain_name)
+
         self._TF_instance = TF
 
     def infer_regulation(self, prior, threshold=0.5, filename=None):
@@ -307,11 +313,24 @@ def _output_posterior_probabilities(self, scan_results, filename):
 
     @property
     def putative_sites(self):
-        """Returns the lsit of putative sites in non-coding regions."""
+        """Returns the list of putative sites in non-coding regions."""
         return self._putative_sites
 
+    @property
+    def has_putative_sites(self):
+        """Returns True if the putative binding sites have been identified."""
+        return hasattr(self, '_putative_sites')
+
+    @property
+    def weblogo_from_putative_sites(self):
+        """Returns the name of the logo file built from the putative sites."""
+        filename = temp_file_name()
+        weblogo([site.chromid.subsequence(site.start, site.end, site.strand)
+                 for site in self.putative_sites], filename)
+        return filename
+
     def identify_sites(self, promoter_up=300, filename=None):
-        """Returns the list of sites in non-coding regions..
+        """Returns the list of sites in non-coding regions.
 
         It searches exclusively the [-promoter_up, +50] for all genes in the
         genome. It returns all sites with a score over threshold, and tags the
@@ -331,7 +350,7 @@ def identify_sites(self, promoter_up=300, filename=None):
             filename (string): the CSV file to report putative binding sites.
         """
         # If already computed, return the stored results.
-        if hasattr(self, '_putative_sites' ):
+        if hasattr(self, '_putative_sites'):
             return self._putative_sites
 
         threshold = self.TF_binding_model.threshold()  # score threshold
-Original file line number
+Diff line change
 *.pyc
 -*~
 +*~
 +\#*
 +tests/*
 +entrez/*