Skip to content

Commit b75a25f

Browse files
committed
project cleanup
1 parent d109d4e commit b75a25f

23 files changed

+96
-62
lines changed

.gitignore

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,6 @@
22
*~
33
\#*
44
tests/*
5-
entrez/*
5+
entrez_cache/*.gb
66
BayesTraitsV2
77
*.pkl

src/__init__.py cgb/__init__.py

File renamed without changes.

src/bayestraits_wrapper.py cgb/bayestraits_wrapper.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,10 @@
22
import os
33
import re
44

5-
from misc import temp_file_name
5+
from .misc import temp_file_name
66

77
# Path to BayesTraits executable
8-
BAYES_TRAITS = './BayesTraitsV2'
8+
BAYES_TRAITS = os.path.join(os.path.dirname(__file__), 'BayesTraitsV2')
99

1010

1111
def generate_tree_file(phylo):
File renamed without changes.

src/bio_utils.py cgb/bio_utils.py

-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44

55
from Bio.Seq import Seq
66
from Bio.Alphabet import generic_dna
7-
from Bio import motifs
87

98

109
def complement(seq):

src/blast.py cgb/blast.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
from Bio.Blast import NCBIXML
77

8-
from misc import temp_file_name
8+
from .misc import temp_file_name
99

1010

1111
class BLAST:

src/chromid.py cgb/chromid.py

+6 -6
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,12 @@
44
from Bio import SeqIO
55
from cached_property import cached_property
66

7-
import entrez_utils
8-
import bio_utils
9-
from gene import Gene
10-
from operon import Operon
11-
from misc import mean
12-
from my_logger import my_logger
7+
from . import entrez_utils
8+
from . import bio_utils
9+
from .gene import Gene
10+
from .operon import Operon
11+
from .misc import mean
12+
from .my_logger import my_logger
1313

1414

1515
class Chromid:

src/entrez_utils.py cgb/entrez_utils.py

+1 -2
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,13 @@
66
"""
77

88
import os
9-
import tempfile
109

1110
from Bio import Entrez
1211

1312
Entrez.email = '[email protected]'
1413

1514
# The directory used to save NCBI records for later use.
16-
ENTREZ_DIRECTORY = '../entrez'
15+
ENTREZ_DIRECTORY = os.path.join(os.path.dirname(__file__), 'entrez_cache')
1716

1817

1918
def get_genome_record(accession):

src/gene.py cgb/gene.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
from cached_property import cached_property
22

3-
from my_logger import my_logger
4-
from protein import Protein
5-
from blast import BlastNoHitFoundException
3+
from .my_logger import my_logger
4+
from .protein import Protein
5+
from .blast import BlastNoHitFoundException
66

77

88
class Gene:

src/genome.py cgb/genome.py

+11 -10
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,14 @@
66
from tqdm import tqdm
77
from Bio.motifs import jaspar
88

9-
from chromid import Chromid
10-
from blast import BLAST
11-
from PSSM_model import PSSMModel
12-
from misc import weighted_choice
13-
from misc import temp_file_name
14-
from my_logger import my_logger
15-
from bio_utils import reverse_complement
16-
from bio_utils import weblogo
9+
from .chromid import Chromid
10+
from .blast import BLAST
11+
from .pssm_model import PSSMModel
12+
from .misc import weighted_choice
13+
from .misc import temp_file_name
14+
from .my_logger import my_logger
15+
from .bio_utils import reverse_complement
16+
from .bio_utils import weblogo
1717

1818

1919
Site = namedtuple('Site', 'chromid start end strand score gene')
@@ -161,10 +161,11 @@ def build_PSSM_model(self, collections, weights):
161161
"""
162162
# Create a PSSM model using site collections and associated weights.
163163
model = PSSMModel(collections, weights)
164-
random_seqs = self.random_seqs(length=model.length, count=100000)
164+
bg_seq = ''.join(self.random_seqs(model.length, 100000/model.length))
165+
bg_scores = model.score_seq(bg_seq)
165166
# Create a Bayesian estimator which is used to compute the probability
166167
# of TF-binding of any given sequence.
167-
model.build_bayesian_estimator(random_seqs)
168+
model.build_bayesian_estimator(bg_scores)
168169
self._TF_binding_model = model
169170

170171
def random_seqs(self, length, count):

src/logging.conf cgb/logging.conf

File renamed without changes.

src/main.py cgb/main.py

+19 -18
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,17 @@
11
import os
22
import pickle
33

4-
from genome import Genome
5-
from protein import Protein
6-
from site_collection import SiteCollection
7-
from my_logger import my_logger
8-
from phylo import Phylo
9-
from user_input import UserInput
10-
from orthologous_group import construct_orthologous_groups
11-
from orthologous_group import orthologous_groups_to_csv
12-
from orthologous_group import ancestral_state_reconstruction
13-
from orthologous_group import ancestral_states_to_csv
14-
from visualization import all_plots
4+
from .genome import Genome
5+
from .protein import Protein
6+
from .site_collection import SiteCollection
7+
from .my_logger import my_logger
8+
from .phylo import Phylo
9+
from .user_input import UserInput
10+
from .orthologous_group import construct_orthologous_groups
11+
from .orthologous_group import orthologous_groups_to_csv
12+
from .orthologous_group import ancestral_state_reconstruction
13+
from .orthologous_group import ancestral_states_to_csv
14+
from .visualization import all_plots
1515

1616

1717
def directory(*paths):
@@ -249,7 +249,7 @@ def get_prior(genome, user_input, weights):
249249

250250
print genome.TF_binding_model.IC
251251
prior = (genome.length /
252-
2**genome.TF_binding_model.IC /
252+
2**genome.TF_binding_model.IC / 10 /
253253
genome.num_operons)
254254
my_logger.info("Prior for %s: %f" % (genome.strain_name, prior))
255255
return prior
@@ -378,16 +378,19 @@ def perform_ancestral_state_reconstruction(user_input, genomes,
378378
phylo.draw(os.path.join(output_dir, "phylogeny.png"))
379379

380380

381-
def main():
381+
def run_analysis():
382382
"""The entry-point for the pipeline."""
383383
# Read user input and configuration from two files
384-
user_input = UserInput('../tests/test6/input.json',
385-
'../tests/test6/config.json')
384+
user_input = UserInput('./tests/test4/input.json',
385+
'./tests/test4/config.json')
386+
pickle.dump(user_input, open('user_input.pkl', 'w'))
387+
386388
# Make output directory
387389
directory(user_input.output_dir)
388390

389391
# Create proteins
390392
proteins = create_proteins(user_input)
393+
pickle.dump(proteins, open('proteins.pkl', 'w'))
391394

392395
# Create genomes
393396
genomes = create_genomes(user_input)
@@ -418,13 +421,11 @@ def main():
418421
all_regulated_genes.extend(regulated_genes)
419422
# Output operons
420423
output_operons(user_input, genome)
421-
422-
pickle.dump(genomes, open('genome.pkl', 'w'))
424+
pickle.dump(genomes, open('genomes.pkl', 'w'))
423425

424426
# Create orthologous groups
425427
ortholog_groups = create_orthologous_groups(
426428
user_input, all_regulated_genes, genomes)
427-
428429
pickle.dump(ortholog_groups, open('orthos.pkl', 'w'))
429430

430431
# Ancestral state reconstruction step

src/misc.py cgb/misc.py

File renamed without changes.

src/my_logger.py cgb/my_logger.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,10 @@
55
package. See Python logging module documentation
66
(https://docs.python.org/2/library/logging.html) for more information.
77
"""
8-
8+
import os
99
import logging
1010
from logging.config import fileConfig
1111

12-
fileConfig('logging.conf')
12+
fileConfig(os.path.join(os.path.dirname(__file__), 'logging.conf'))
1313

1414
my_logger = logging.getLogger()

src/operon.py cgb/operon.py

File renamed without changes.

src/orthologous_group.py cgb/orthologous_group.py

+16 -4
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@
44

55
from tqdm import tqdm
66

7-
from misc import weighted_choice
8-
import bayestraits_wrapper
9-
from my_logger import my_logger
7+
from . import misc
8+
from . import visualization
9+
from . import bayestraits_wrapper
10+
from .my_logger import my_logger
1011

1112

1213
class OrthologousGroup:
@@ -61,7 +62,7 @@ def discretize_regulation_states(self, phylo):
6162
states = ['1', '0', 'A']
6263
probabilities = [terminal_states[(node.name, state)]
6364
for state in states]
64-
trait[node.name], = weighted_choice(states, probabilities)
65+
trait[node.name], = misc.weighted_choice(states, probabilities)
6566
return trait
6667

6768
def bootstrap_traits(self, phylo, sample_size):
@@ -130,6 +131,14 @@ def most_likely_state_at(self, node_name):
130131
return max(['1', '0', 'A'],
131132
key=lambda x: self.regulation_states[(node_name, x)])
132133

134+
def ancestral_state_reconstruction_svg_view(self, phylo):
135+
temp_file = misc.temp_file_name(suffix='.svg')
136+
t = visualization.biopython_to_ete3(phylo.tree)
137+
visualization.view_by_gene(t, self, temp_file)
138+
with open(temp_file) as f:
139+
contents = f.read()
140+
return contents
141+
133142
def __repr__(self):
134143
return str(self.genes)
135144

@@ -172,6 +181,9 @@ def construct_orthologous_groups(genes, genomes):
172181
# Create the orthologous group with gene + orthologs on all other
173182
# genomes [if there are orthologs in the respective genomes]
174183
groups.append(OrthologousGroup([gene] + [rbh for rbh in rbhs if rbh]))
184+
185+
if len(groups) > 10:
186+
break
175187
return groups
176188

177189

src/phylo.py cgb/phylo.py

+14 -2
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,8 @@
1010
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
1111
from cached_property import cached_property
1212

13-
from misc import unique
13+
from . import visualization
14+
from . import misc
1415

1516
# Structure of a Nexus tree-only file
1617
NEX_TEMPLATE = """\
@@ -53,7 +54,7 @@ def __init__(self, proteins, names=None, distance_model='identity',
5354
distance_model (string): see DistanceCalculator.protein_models
5455
tree_algorithm (string): 'nj' or 'upgma'
5556
"""
56-
self._proteins = unique(proteins, lambda p: p.accession_number)
57+
self._proteins = misc.unique(proteins, lambda p: p.accession_number)
5758
self._names = names
5859
if names:
5960
assert len(proteins) == len(names)
@@ -167,3 +168,14 @@ def draw(self, filename):
167168
"""Draws tree and saves it into the given file."""
168169
BioPhylo.draw(self.tree, do_show=False)
169170
pylab.savefig(filename)
171+
172+
@property
173+
def svg_view(self):
174+
"""Draws the tree in SVG format."""
175+
# Convert the tree from Biopython's Bio.Phylo.Tree to ETE3 Treenode
176+
t = visualization.biopython_to_ete3(self.tree)
177+
temp_file = misc.temp_file_name(suffix='.svg')
178+
visualization.tree_svg_plot(t, temp_file)
179+
with open(temp_file) as f:
180+
contents = f.read()
181+
return contents

src/protein.py cgb/protein.py

+1 -2
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,7 @@
66
from Bio import SeqIO
77
from Bio.SeqRecord import SeqRecord
88

9-
import entrez_utils
10-
from my_logger import my_logger
9+
from . import entrez_utils
1110

1211

1312
class Protein:

src/pssm_model.py cgb/pssm_model.py

+5 -6
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,10 @@
33
from Bio.motifs.matrix import PositionWeightMatrix
44

55

6-
from binding_model import TFBindingModel
7-
from misc import log2
8-
from misc import temp_file_name
9-
from bio_utils import weblogo
6+
from .binding_model import TFBindingModel
7+
from .misc import log2
8+
from .misc import temp_file_name
9+
from .bio_utils import weblogo
1010

1111

1212
class PSSMModel(TFBindingModel):
@@ -31,8 +31,6 @@ def __init__(self, collections, weights,
3131
"""Constructor for the PSSMModel class."""
3232
super(PSSMModel, self).__init__(collections, background)
3333
self._pwm = self._combine_pwms([c.pwm for c in collections], weights)
34-
self._collections = collections
35-
self._weights = weights
3634

3735
@cached_property
3836
def pwm(self):
@@ -111,6 +109,7 @@ def score_seq(self, seq, both=True):
111109
if both:
112110
scores = [log2(2**score + 2**rc_score)
113111
for score, rc_score in zip(scores, rc_scores)]
112+
114113
return scores
115114

116115
@property
File renamed without changes.

src/user_input.py cgb/user_input.py

+5
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,11 @@ def sites_list(self):
4747
"""Returns the lists of binding sites."""
4848
return [m['sites'] for m in self._input['motifs']]
4949

50+
@property
51+
def protein_accessions_and_sites(self):
52+
"""Zips protein accessions and binding sites."""
53+
return zip(self.protein_accessions, self.sites_list)
54+
5055
@property
5156
def output_dir(self):
5257
"""Returns the directory to be used for logging."""

src/visualization.py cgb/visualization.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11

22
import os
33
import copy
4-
import math
54

65
from ete3 import Tree
76
from ete3 import TreeStyle, NodeStyle
@@ -11,7 +10,7 @@
1110
import numpy as np
1211
from tqdm import tqdm
1312

14-
from my_logger import my_logger
13+
from .my_logger import my_logger
1514

1615

1716
def biopython_to_ete3(biopython_tree):
@@ -25,6 +24,11 @@ def biopython_to_ete3(biopython_tree):
2524
return t
2625

2726

27+
def tree_svg_plot(tree, file, **kwargs):
28+
ts = TreeStyle()
29+
tree.render(file, tree_style=ts)
30+
31+
2832
def rgb2hex(red, green, blue):
2933
"""Converts the given rgb values to hexadecimal color code.
3034

entrez_cache/README.rst

+3
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
2+
This directory contains the genome and protein records downloaded via NCBI
3+
Entrez.

0 commit comments

Comments
 (0)