project cleanup

sefakilic · sefakilic · commit b75a25ffd50d · 2016-05-11T13:22:03.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,6 @@
 *~
 \#*
 tests/*
-entrez/*
+entrez_cache/*.gb
 BayesTraitsV2
 *.pkl
diff --git a/cgb/__init__.py b/cgb/__init__.py
diff --git a/cgb/bayestraits_wrapper.py b/cgb/bayestraits_wrapper.py
@@ -2,10 +2,10 @@
 import os
 import re
 
-from misc import temp_file_name
+from .misc import temp_file_name
 
 # Path to BayesTraits executable
-BAYES_TRAITS = './BayesTraitsV2'
+BAYES_TRAITS = os.path.join(os.path.dirname(__file__), 'BayesTraitsV2')
 
 
 def generate_tree_file(phylo):
diff --git a/cgb/binding_model.py b/cgb/binding_model.py
diff --git a/cgb/bio_utils.py b/cgb/bio_utils.py
@@ -4,7 +4,6 @@
 
 from Bio.Seq import Seq
 from Bio.Alphabet import generic_dna
-from Bio import motifs
 
 
 def complement(seq):
diff --git a/cgb/blast.py b/cgb/blast.py
@@ -5,7 +5,7 @@
 
 from Bio.Blast import NCBIXML
 
-from misc import temp_file_name
+from .misc import temp_file_name
 
 
 class BLAST:
diff --git a/cgb/chromid.py b/cgb/chromid.py
@@ -4,12 +4,12 @@
 from Bio import SeqIO
 from cached_property import cached_property
 
-import entrez_utils
-import bio_utils
-from gene import Gene
-from operon import Operon
-from misc import mean
-from my_logger import my_logger
+from . import entrez_utils
+from . import bio_utils
+from .gene import Gene
+from .operon import Operon
+from .misc import mean
+from .my_logger import my_logger
 
 
 class Chromid:
diff --git a/cgb/entrez_utils.py b/cgb/entrez_utils.py
@@ -6,14 +6,13 @@
 """
 
 import os
-import tempfile
 
 from Bio import Entrez
 
 Entrez.email = 'sefa1@umbc.edu'
 
 # The directory used to save NCBI records for later use.
-ENTREZ_DIRECTORY = '../entrez'
+ENTREZ_DIRECTORY = os.path.join(os.path.dirname(__file__), 'entrez_cache')
 
 
 def get_genome_record(accession):
diff --git a/cgb/gene.py b/cgb/gene.py
@@ -1,8 +1,8 @@
 from cached_property import cached_property
 
-from my_logger import my_logger
-from protein import Protein
-from blast import BlastNoHitFoundException
+from .my_logger import my_logger
+from .protein import Protein
+from .blast import BlastNoHitFoundException
 
 
 class Gene:
diff --git a/cgb/genome.py b/cgb/genome.py
@@ -6,14 +6,14 @@
 from tqdm import tqdm
 from Bio.motifs import jaspar
 
-from chromid import Chromid
-from blast import BLAST
-from PSSM_model import PSSMModel
-from misc import weighted_choice
-from misc import temp_file_name
-from my_logger import my_logger
-from bio_utils import reverse_complement
-from bio_utils import weblogo
+from .chromid import Chromid
+from .blast import BLAST
+from .pssm_model import PSSMModel
+from .misc import weighted_choice
+from .misc import temp_file_name
+from .my_logger import my_logger
+from .bio_utils import reverse_complement
+from .bio_utils import weblogo
 
 
 Site = namedtuple('Site', 'chromid start end strand score gene')
@@ -161,10 +161,11 @@ def build_PSSM_model(self, collections, weights):
         """
         # Create a PSSM model using site collections and associated weights.
         model = PSSMModel(collections, weights)
-        random_seqs = self.random_seqs(length=model.length, count=100000)
+        bg_seq = ''.join(self.random_seqs(model.length, 100000/model.length))
+        bg_scores = model.score_seq(bg_seq)
         # Create a Bayesian estimator which is used to compute the probability
         # of TF-binding of any given sequence.
-        model.build_bayesian_estimator(random_seqs)
+        model.build_bayesian_estimator(bg_scores)
         self._TF_binding_model = model
 
     def random_seqs(self, length, count):
diff --git a/cgb/logging.conf b/cgb/logging.conf
diff --git a/cgb/main.py b/cgb/main.py
@@ -1,17 +1,17 @@
 import os
 import pickle
 
-from genome import Genome
-from protein import Protein
-from site_collection import SiteCollection
-from my_logger import my_logger
-from phylo import Phylo
-from user_input import UserInput
-from orthologous_group import construct_orthologous_groups
-from orthologous_group import orthologous_groups_to_csv
-from orthologous_group import ancestral_state_reconstruction
-from orthologous_group import ancestral_states_to_csv
-from visualization import all_plots
+from .genome import Genome
+from .protein import Protein
+from .site_collection import SiteCollection
+from .my_logger import my_logger
+from .phylo import Phylo
+from .user_input import UserInput
+from .orthologous_group import construct_orthologous_groups
+from .orthologous_group import orthologous_groups_to_csv
+from .orthologous_group import ancestral_state_reconstruction
+from .orthologous_group import ancestral_states_to_csv
+from .visualization import all_plots
 
 
 def directory(*paths):
@@ -249,7 +249,7 @@ def get_prior(genome, user_input, weights):
 
     print genome.TF_binding_model.IC
     prior = (genome.length /
-             2**genome.TF_binding_model.IC /
+             2**genome.TF_binding_model.IC / 10 /
              genome.num_operons)
     my_logger.info("Prior for %s: %f" % (genome.strain_name, prior))
     return prior
@@ -378,16 +378,19 @@ def perform_ancestral_state_reconstruction(user_input, genomes,
     phylo.draw(os.path.join(output_dir, "phylogeny.png"))
 
 
-def main():
+def run_analysis():
     """The entry-point for the pipeline."""
     # Read user input and configuration from two files
-    user_input = UserInput('../tests/test6/input.json',
-                           '../tests/test6/config.json')
+    user_input = UserInput('./tests/test4/input.json',
+                           './tests/test4/config.json')
+    pickle.dump(user_input, open('user_input.pkl', 'w'))
+
     # Make output directory
     directory(user_input.output_dir)
 
     # Create proteins
     proteins = create_proteins(user_input)
+    pickle.dump(proteins, open('proteins.pkl', 'w'))
 
     # Create genomes
     genomes = create_genomes(user_input)
@@ -418,13 +421,11 @@ def main():
         all_regulated_genes.extend(regulated_genes)
         # Output operons
         output_operons(user_input, genome)
-
-    pickle.dump(genomes, open('genome.pkl', 'w'))
+    pickle.dump(genomes, open('genomes.pkl', 'w'))
 
     # Create orthologous groups
     ortholog_groups = create_orthologous_groups(
         user_input, all_regulated_genes, genomes)
-
     pickle.dump(ortholog_groups, open('orthos.pkl', 'w'))
 
     # Ancestral state reconstruction step
diff --git a/cgb/misc.py b/cgb/misc.py
diff --git a/cgb/my_logger.py b/cgb/my_logger.py
@@ -5,10 +5,10 @@
 package. See Python logging module documentation
 (https://docs.python.org/2/library/logging.html) for more information.
 """
-
+import os
 import logging
 from logging.config import fileConfig
 
-fileConfig('logging.conf')
+fileConfig(os.path.join(os.path.dirname(__file__), 'logging.conf'))
 
 my_logger = logging.getLogger()
diff --git a/cgb/operon.py b/cgb/operon.py
diff --git a/cgb/orthologous_group.py b/cgb/orthologous_group.py
@@ -4,9 +4,10 @@
 
 from tqdm import tqdm
 
-from misc import weighted_choice
-import bayestraits_wrapper
-from my_logger import my_logger
+from . import misc
+from . import visualization
+from . import bayestraits_wrapper
+from .my_logger import my_logger
 
 
 class OrthologousGroup:
@@ -61,7 +62,7 @@ def discretize_regulation_states(self, phylo):
             states = ['1', '0', 'A']
             probabilities = [terminal_states[(node.name, state)]
                              for state in states]
-            trait[node.name], = weighted_choice(states, probabilities)
+            trait[node.name], = misc.weighted_choice(states, probabilities)
         return trait
 
     def bootstrap_traits(self, phylo, sample_size):
@@ -130,6 +131,14 @@ def most_likely_state_at(self, node_name):
         return max(['1', '0', 'A'],
                    key=lambda x: self.regulation_states[(node_name, x)])
 
+    def ancestral_state_reconstruction_svg_view(self, phylo):
+        """Returns the contents of the SVG file drawn by view_by_gene."""
+        svg_path = misc.temp_file_name(suffix='.svg')
+        ete_tree = visualization.biopython_to_ete3(phylo.tree)
+        visualization.view_by_gene(ete_tree, self, svg_path)
+        with open(svg_path) as svg_file:
+            return svg_file.read()
+
     def __repr__(self):
         return str(self.genes)
 
@@ -172,6 +181,9 @@ def construct_orthologous_groups(genes, genomes):
         # Create the orthologous group with gene + orthologs on all other
         # genomes [if there are orthologs in the respective genomes]
         groups.append(OrthologousGroup([gene] + [rbh for rbh in rbhs if rbh]))
+
+        if len(groups) > 10:
+            break
     return groups
 
 
diff --git a/cgb/phylo.py b/cgb/phylo.py
@@ -10,7 +10,8 @@
 from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
 from cached_property import cached_property
 
-from misc import unique
+from . import visualization
+from . import misc
 
 # Structure of a Nexus tree-only file
 NEX_TEMPLATE = """\
@@ -53,7 +54,7 @@ def __init__(self, proteins, names=None, distance_model='identity',
             distance_model (string): see DistanceCalculator.protein_models
             tree_algorithm (string): 'nj' or 'upgma'
         """
-        self._proteins = unique(proteins, lambda p: p.accession_number)
+        self._proteins = misc.unique(proteins, lambda p: p.accession_number)
         self._names = names
         if names:
             assert len(proteins) == len(names)
@@ -167,3 +168,14 @@ def draw(self, filename):
         """Draws tree and saves it into the given file."""
         BioPhylo.draw(self.tree, do_show=False)
         pylab.savefig(filename)
+
+    @property
+    def svg_view(self):
+        """Returns the tree drawn in SVG format, as a string."""
+        # The drawing is done on the tree converted from Bio.Phylo to ETE3.
+        ete_tree = visualization.biopython_to_ete3(self.tree)
+        # NOTE: the temporary file is not removed after it is read.
+        svg_path = misc.temp_file_name(suffix='.svg')
+        visualization.tree_svg_plot(ete_tree, svg_path)
+        with open(svg_path) as svg_file:
+            return svg_file.read()
diff --git a/cgb/protein.py b/cgb/protein.py
@@ -6,8 +6,7 @@
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
 
-import entrez_utils
-from my_logger import my_logger
+from . import entrez_utils
 
 
 class Protein:
diff --git a/cgb/pssm_model.py b/cgb/pssm_model.py
@@ -3,10 +3,10 @@
 from Bio.motifs.matrix import PositionWeightMatrix
 
 
-from binding_model import TFBindingModel
-from misc import log2
-from misc import temp_file_name
-from bio_utils import weblogo
+from .binding_model import TFBindingModel
+from .misc import log2
+from .misc import temp_file_name
+from .bio_utils import weblogo
 
 
 class PSSMModel(TFBindingModel):
@@ -31,8 +31,6 @@ def __init__(self, collections, weights,
         """Constructor for the PSSMModel class."""
         super(PSSMModel, self).__init__(collections, background)
         self._pwm = self._combine_pwms([c.pwm for c in collections], weights)
-        self._collections = collections
-        self._weights = weights
 
     @cached_property
     def pwm(self):
@@ -111,6 +109,7 @@ def score_seq(self, seq, both=True):
         if both:
             scores = [log2(2**score + 2**rc_score)
                       for score, rc_score in zip(scores, rc_scores)]
+
         return scores
 
     @property
diff --git a/cgb/site_collection.py b/cgb/site_collection.py
diff --git a/cgb/user_input.py b/cgb/user_input.py
@@ -47,6 +47,11 @@ def sites_list(self):
         """Returns the lists of binding sites."""
         return [m['sites'] for m in self._input['motifs']]
 
+    @property
+    def protein_accessions_and_sites(self):
+        """Pairs each protein accession with its list of binding sites."""
+        return zip(self.protein_accessions, self.sites_list)
+
     @property
     def output_dir(self):
         """Returns the directory to be used for logging."""
diff --git a/cgb/visualization.py b/cgb/visualization.py
@@ -1,7 +1,6 @@
 
 import os
 import copy
-import math
 
 from ete3 import Tree
 from ete3 import TreeStyle, NodeStyle
@@ -11,7 +10,7 @@
 import numpy as np
 from tqdm import tqdm
 
-from my_logger import my_logger
+from .my_logger import my_logger
 
 
 def biopython_to_ete3(biopython_tree):
@@ -25,6 +24,11 @@ def biopython_to_ete3(biopython_tree):
     return t
 
 
+def tree_svg_plot(tree, file, **kwargs):
+    """Renders the given tree into `file` using a default TreeStyle."""
+    tree.render(file, tree_style=TreeStyle())
+
+
 def rgb2hex(red, green, blue):
     """Converts the given rgb values to hexadecimal color code.
 
diff --git a/entrez_cache/README.rst b/entrez_cache/README.rst
@@ -0,0 +1,3 @@
+
+This directory contains the genome and protein records downloaded via NCBI
+Entrez.

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1,3 @@` |
| | 1 | `+` |
| | 2 | `+This directory contains the genome and protein records downloaded via NCBI` |
| | 3 | `+Entrez.` |