Commit 68c6576 (0 parents)
Showing 20 changed files with 2,027 additions and 0 deletions.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@
.idea
*.pyc
__pycache__
@@ -0,0 +1,21 @@
The MIT License

Copyright © 2018 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., Inc., Kenilworth, NJ, USA.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
@@ -0,0 +1,27 @@
--------------------------------------------------
Third party dependencies listed by License type
[Format: name (Python module) - URL]
--------------------------------------------------

Biopython License Agreement
* Biopython (biopython) - https://github.com/biopython/biopython/blob/d5718bb7e3ee505b859b39c03f8ffad8a9a0be2f/LICENSE.rst

OSI Approved (new BSD)
* scikit-learn (scikit-learn) - https://github.com/scikit-learn/scikit-learn/blob/2e85c8608c93ad0e3290414c4e5e650b87d44b27/COPYING
* hmmlearn (hmmlearn) - https://github.com/hmmlearn/hmmlearn/blob/1f60373d28c427a2a05c9ea26231c717772066dc/LICENSE.txt

BSD 3-Clause License
* Pandas (pandas) - https://github.com/pandas-dev/pandas/blob/5aba6659e422e985683cfb46c07c3364a02b6e5b/AUTHORS.md
* HMMER - https://github.com/EddyRivasLab/hmmer/blob/3e38d667761e0a98a263079cb4a90e49d4b720d5/LICENSE

MIT License (MIT)
* Keras (keras) - https://github.com/keras-team/keras/blob/dc698c5486117780b643eda0a2f60a8753625b8a/LICENSE

Apache Software License (Apache 2.0)
* TensorFlow (tensorflow) - https://github.com/tensorflow/tensorflow/blob/6b6d843ccab78f9f91c3b98a43ca09ffecad4747/LICENSE

Python Software Foundation License (BSD)
* Matplotlib (matplotlib) - https://matplotlib.org/users/license.html

GNU General Public License v3.0
* Prodigal - https://github.com/hyattpd/Prodigal/blob/b1321f0899c4d7a835583feb344e2c9a5bd908d1/LICENSE
@@ -0,0 +1,43 @@
# DeepBGC: Biosynthetic Gene Cluster detection and classification

## Install DeepBGC

- Run `pip install deepbgc` to install the `deepbgc` python module.

## Prerequisites

- Install Python 3.6 (version 3.7 is not supported by TensorFlow yet)
- Install Prodigal and put the `prodigal` binary on your PATH: https://github.com/hyattpd/Prodigal/releases
- Install HMMER and put the `hmmscan` and `hmmpress` binaries on your PATH: http://hmmer.org/download.html
- Download and **extract** the Pfam database from: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam31.0/Pfam-A.hmm.gz

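The prerequisite steps can be sketched in shell. This is a minimal sketch, not part of the DeepBGC commit: the `$HOME/bin` location is an illustrative assumption, and the download commands are left commented since the binaries and database come from the URLs above.

```bash
# Sketch of the prerequisite setup ($HOME/bin is just one common choice
# for user-installed binaries, not required by DeepBGC).

# 1. Put the downloaded prodigal, hmmscan and hmmpress binaries on PATH:
export PATH="$HOME/bin:$PATH"

# 2. Download and *extract* the Pfam database; DeepBGC expects the
#    uncompressed Pfam-A.hmm file:
# wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam31.0/Pfam-A.hmm.gz
# gunzip -k Pfam-A.hmm.gz   # -k keeps the original .gz archive

# 3. Sanity check that all three binaries resolve on PATH:
command -v prodigal hmmscan hmmpress || echo "some prerequisite binaries are missing"
```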
## Use DeepBGC

### Detection

Detect BGCs in a genomic sequence.

```bash
# Show detection help
deepbgc detect --help

# Detect BGCs in a nucleotide sequence
deepbgc detect --model DeepBGCDetector_v0.0.1.pkl --pfam Pfam-A.hmm --output myCandidates/ myInputSequence.fa

# Detect BGCs with >0.9 score in existing Pfam CSV sequence
deepbgc detect --model myModel.pkl --output myStrictCandidates/ -s 0.9 myCandidates/myCandidates.pfam.csv
```

### Classification

Classify BGCs into one or more classes.

```bash
# Show classification help
deepbgc classify --help

# Predict biosynthetic class of detected BGCs
deepbgc classify --model RandomForestMIBiGClasses_v0.0.1.pkl --output myCandidates/myCandidates.classes.csv myCandidates/myCandidates.candidates.csv
```
@@ -0,0 +1,3 @@
VERSION = '0.0.1'

from .pipeline import DeepBGCModel
Empty file.
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
import argparse


class BaseCommand(ABC):
    """
    Base abstract class for commands
    """
    command = ''
    help = ""

    def __init__(self, args):
        self.args = args

    @classmethod
    def add_subparser(cls, subparsers):
        parser = subparsers.add_parser(cls.command, description=cls.help, help=cls.help,
                                       formatter_class=argparse.RawTextHelpFormatter)
        parser.set_defaults(func=cls)
        return parser

    @abstractmethod
    def run(self):
        raise NotImplementedError()
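The `parser.set_defaults(func=cls)` line is what lets a top-level entry point dispatch to the selected command class after parsing. A minimal, self-contained sketch of that pattern (the `GreetCommand` and `main` below are hypothetical illustrations, not part of this commit):

```python
import argparse
from abc import ABC, abstractmethod


class BaseCommand(ABC):
    """Simplified stand-in mirroring the BaseCommand pattern."""
    command = ''
    help = ''

    def __init__(self, args):
        self.args = args

    @classmethod
    def add_subparser(cls, subparsers):
        parser = subparsers.add_parser(cls.command, help=cls.help)
        # Store the command class itself on the namespace,
        # so main() can instantiate it after parsing.
        parser.set_defaults(func=cls)
        return parser

    @abstractmethod
    def run(self):
        raise NotImplementedError()


class GreetCommand(BaseCommand):
    command = 'greet'
    help = 'Print a greeting.'

    @classmethod
    def add_subparser(cls, subparsers):
        parser = super().add_subparser(subparsers)
        parser.add_argument('name')
        return parser

    def run(self):
        return 'Hello, {}!'.format(self.args.name)


def main(argv):
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    GreetCommand.add_subparser(subparsers)
    args = parser.parse_args(argv)
    # args.func is the class registered by set_defaults(func=cls)
    return args.func(args).run()
```

Usage: `main(['greet', 'world'])` returns `'Hello, world!'` by constructing `GreetCommand` from the parsed namespace and calling its `run()`.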
@@ -0,0 +1,87 @@
import pandas as pd
from deepbgc.commands.base import BaseCommand
import os
import pickle
import numpy as np

SCORE_COLUMN = 'deepbgc_score'


class ClassifyCommand(BaseCommand):
    command = 'classify'
    help = """Classify BGCs into one or more classes.
Examples:
  deepbgc classify --model myClassifier.pkl --output classes.csv inputSequence.fa
"""

    def __init__(self, args):
        super().__init__(args)
        self.output_path = args.output
        self.input_path = args.input
        self.model_path = args.model

    @classmethod
    def add_subparser(cls, subparsers):
        parser = super().add_subparser(subparsers)

        parser.add_argument('-o', '--output', required=True, help="Output CSV file path.")
        parser.add_argument('-m', '--model', required=True, help="Trained classification model file path.")
        parser.add_argument(dest='input', help="Input candidate CSV file path.")

    def run(self):
        candidates = pd.read_csv(self.input_path)
        if 'candidate_hash' not in candidates.columns:
            raise AttributeError('Input CSV is not a candidate CSV file, "candidate_hash" column should be present.')

        candidates = candidates.set_index('candidate_hash')

        with open(self.model_path, 'rb') as f:
            model = pickle.load(f)

        vectors = domain_set_vectors(candidates)

        predictions = predict_classes(vectors, model)
        predictions.to_csv(self.output_path, index=False)
        print('Saved {} predictions to {}'.format(len(predictions), self.output_path))


def domain_set_vectors(candidates):
    candidate_pfam_ids = [pfam_ids.split(';') for pfam_ids in candidates['pfam_ids']]
    unique_pfam_ids = sorted(set(p for ids in candidate_pfam_ids for p in ids))
    print('Getting domain set vectors for {} candidates with {} unique Pfam IDs...'.format(len(candidates), len(unique_pfam_ids)))
    vectors = pd.DataFrame(np.zeros((len(candidates), len(unique_pfam_ids))), columns=unique_pfam_ids)
    for i, pfam_ids in enumerate(candidate_pfam_ids):
        # Mark the Pfam IDs present in this candidate
        # (loc on the RangeIndex avoids chained assignment)
        vectors.loc[i, pfam_ids] = 1
    return vectors


def predict_classes(samples, model, add_classes_list=True):
    if not hasattr(model, 'input_columns'):
        raise AttributeError('Trained model does not contain the "input_columns" attribute.')
    if not hasattr(model, 'label_columns'):
        raise AttributeError('Trained model does not contain the "label_columns" attribute.')

    # Set columns expected by the model but missing from the input to 0
    missing_columns = set(model.input_columns).difference(samples.columns)
    for col in missing_columns:
        samples[col] = 0
    print('Warning: Setting {} missing columns to 0'.format(len(missing_columns)))
    samples = samples[model.input_columns]

    # predict_proba returns one (n_samples, 2) array per label;
    # keep column 1 (probability of the label being present) and transpose
    results = np.array([r[:, 1] for r in model.predict_proba(samples.values)]).transpose()
    predictions = pd.DataFrame(results, index=samples.index, columns=model.label_columns)
    if add_classes_list:
        predictions['classes'] = [';'.join(model.label_columns[x >= 0.5]) for x in results]

    return predictions


def sequence_id_from_filename(path):
    """
    Create a basic sequence_id from a file name without extension
    :param path: Path of file
    :return: file name without extension that can be used as sequence_id
    """
    return os.path.splitext(os.path.basename(path))[0]
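`predict_classes` assumes `model.predict_proba` returns one `(n_samples, 2)` probability array per label (as scikit-learn multi-label ensembles do) and transposes out the "present" column. A minimal sketch of that reshaping and the 0.5 cutoff, using plain lists and made-up numbers (the label names are illustrative, not taken from the trained model):

```python
# Hypothetical predict_proba output: one (n_samples, 2) array per label,
# columns = [P(label absent), P(label present)], for 2 samples, 2 labels.
per_label_probas = [
    [[0.9, 0.1], [0.2, 0.8]],  # label "Polyketide"
    [[0.4, 0.6], [0.7, 0.3]],  # label "NRP"
]

# Equivalent of np.array([r[:, 1] for r in ...]).transpose():
# keep only P(present) and flip to shape (n_samples, n_labels).
n_samples = len(per_label_probas[0])
results = [[probas[i][1] for probas in per_label_probas]
           for i in range(n_samples)]
# results == [[0.1, 0.6], [0.8, 0.3]]

# The >= 0.5 cutoff then yields the per-sample 'classes' string:
labels = ['Polyketide', 'NRP']
classes = [';'.join(l for l, p in zip(labels, row) if p >= 0.5)
           for row in results]
# classes == ['NRP', 'Polyketide']
```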
@@ -0,0 +1,74 @@
import pandas as pd
from deepbgc.commands.base import BaseCommand
from deepbgc.converter import SequenceToPfamCSVConverter
import os
from deepbgc.detector import DeepBGCDetector

SCORE_COLUMN = 'deepbgc_score'


class DetectCommand(BaseCommand):
    command = 'detect'
    help = """Detect BGCs in a genomic sequence.
Examples:
  # Detect BGCs in FASTA sequence with default settings
  deepbgc detect --model myModel.pkl --output myDetections/ --pfam Pfam-A.hmm inputSequence.fa
  # Detect BGCs with >0.9 score in existing Pfam CSV sequence
  deepbgc detect --model myModel.pkl --output myStrictDetections/ -s 0.9 myDetections/myDetections.pfam.csv
"""

    def __init__(self, args):
        super().__init__(args)
        self.output_path = args.output
        self.output_basename = os.path.basename(self.output_path)
        self.input_path = args.input
        self.model_path = args.model
        self.score_threshold = args.score
        self.converter = SequenceToPfamCSVConverter(db_path=args.pfam)

    @classmethod
    def add_subparser(cls, subparsers):
        parser = super().add_subparser(subparsers)

        parser.add_argument('-o', '--output', required=True, help="Output folder path.")
        parser.add_argument('-m', '--model', required=True, help="Trained detection model file path.")
        parser.add_argument('-p', '--pfam', required=False, help="Pfam DB (Pfam-A.hmm) file path.")
        parser.add_argument('-s', '--score', default=0.5, type=float,
                            help="Average protein-wise DeepBGC score threshold for extracting BGC regions from domain sequences.")
        parser.add_argument(dest='input', help="Input pfam CSV file path.")

    def _outpath(self, suffix, extension):
        return os.path.join(self.output_path, '{}.{}.{}'.format(self.output_basename, suffix, extension))

    def run(self):
        try:
            os.makedirs(self.output_path, exist_ok=True)
        except FileExistsError:
            raise AttributeError("Output directory already exists: {}".format(self.output_path))
        except Exception as e:
            raise AttributeError("Output directory not writable: {}".format(self.output_path), e)

        domain_path = self._outpath('pfam', 'csv')
        if not self.converter.convert(self.input_path, domain_path):
            # Input was already a pfam CSV file, use original path
            domain_path = self.input_path

        domains = pd.read_csv(domain_path)
        detector = DeepBGCDetector(model=self.model_path)

        candidates = detector.detect(domains, score_threshold=self.score_threshold)

        cand_path = self._outpath('candidates', 'csv')
        candidates.to_csv(cand_path, index=False)
        print('Saved {} detected BGCs to {}'.format(len(candidates), cand_path))


def sequence_id_from_filename(path):
    """
    Create a basic sequence_id from a file name without extension
    :param path: Path of file
    :return: file name without extension that can be used as sequence_id
    """
    return os.path.splitext(os.path.basename(path))[0]
@@ -0,0 +1,37 @@
from deepbgc.commands.base import BaseCommand
from deepbgc.converter import SequenceToPfamCSVConverter


class PfamCommand(BaseCommand):
    command = 'pfam'
    help = """Convert a genomic BGC sequence into a pfam domain CSV file by detecting proteins and pfam domains.
Examples:
  # Detect proteins and pfam domains in a FASTA sequence and save the result as a CSV file
  deepbgc pfam --pfam Pfam-A.hmm inputSequence.fa outputPfamSequence.csv
"""

    def __init__(self, args):
        super().__init__(args)
        self.input_path = args.input
        self.output_path = args.output
        self.converter = SequenceToPfamCSVConverter(db_path=args.pfam)

    @classmethod
    def add_subparser(cls, subparsers):
        parser = super().add_subparser(subparsers)

        # parser.add_argument('--mode', default='auto', choices=['auto', 'nucl', 'prot', 'pfam'],
        #                     help="Input modes: \n"
        #                          "--mode auto: Automatic based on file extension.\n"
        #                          "--mode nucl: Nucleotide sequence without annotated genes. Will detect genes and pfam domains.\n"
        #                          "--mode prot: Protein sequence. Will detect pfam domains.")
        parser.add_argument('-p', '--pfam', required=True, help="Pfam DB (Pfam-A.hmm) file path.")
        parser.add_argument(dest='input', help="Input sequence file path.")
        parser.add_argument(dest='output', help="Output pfam CSV file path.")

    def run(self):
        self.converter.convert(self.input_path, self.output_path)
        print()
        print('Saved Pfam CSV to: {}'.format(self.output_path))