Merge pull request #17 from Merck/develop

Enable using path to model instead of model name.
Merck · Sep 6, 2019 · 32cb566 · 32cb566
2 parents a9383eb + 1418263
commit 32cb566
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -60,11 +60,16 @@ Proteins and Pfam domains are detected automatically if not already annotated (H
 # Show command help docs
 deepbgc pipeline --help
 
-# Detect and classify BGCs in mySequence.fa using DeepBGC algorithm and save the output to mySequence directory.
+# Detect and classify BGCs in mySequence.fa using the DeepBGC detector.
 deepbgc pipeline mySequence.fa
+
+# Detect and classify BGCs in mySequence.fa using a custom DeepBGC detector trained on your own data.
+deepbgc pipeline --detector path/to/myDetector.pkl mySequence.fa
 ```
 
-This will produce a directory with multiple files and a README.txt with file descriptions.
+This will produce a `mySequence` directory with multiple files and a README.txt with file descriptions.
+
+See the [Train DeepBGC on your own data](#train-deepbgc-on-your-own-data) section below for more information about training a custom detector or classifier.
 
 #### Example output
 
@@ -98,7 +103,7 @@ JSON template for DeepBGC LSTM with pfam2vec is structured as follows:
     "timesteps": 256, - Number of pfam2vec vectors trained in one batch
     "validation_size": 0, - Fraction of training data to use for validation (if validation data is not provided explicitly). For example, use 0.2 to hold out 20% of the training data for validation.
     "verbose": 1, - Verbosity during training
-    "num_epochs": 1000, - Number of epochs to train for
+    "num_epochs": 1000, - Number of passes over your training set during training. You probably want to use a lower number if not using early stopping on validation data.
     "early_stopping" : { - Stop model training early based on validation performance
       "monitor": "val_auc_roc", - Use validation AUC ROC to observe performance
       "min_delta": 0.0001, - Stop training when the monitored metric did not improve by more than 0.0001 in the last epochs
@@ -124,3 +129,12 @@ JSON template for DeepBGC LSTM with pfam2vec is structured as follows:
 }
 ```
 
+### Using your trained model
+
+Since version `0.1.10` you can provide a direct path to the detector or classifier model like so:
+```bash
+deepbgc pipeline \
+    mySequence.fa \
+    --detector path/to/myDetector.pkl \
+    --classifier path/to/myClassifier.pkl 
+```
diff --git a/deepbgc/__version__.py b/deepbgc/__version__.py
@@ -1 +1 @@
-__version__ = '0.1.9'
+__version__ = '0.1.10-dev'
diff --git a/deepbgc/command/pipeline.py b/deepbgc/command/pipeline.py
@@ -63,7 +63,7 @@ def add_arguments(self, parser):
         no_models_message = 'run "deepbgc download" to download models'
         detector_names = util.get_available_models('detector')
         group.add_argument('-d', '--detector', dest='detectors', action='append', default=[],
-                           help="Trained detection model name ({}). "
+                           help="Trained detection model name ({}) or path to trained model pickle file. "
                                 "Can be provided multiple times (-d first -d second).".format(', '.join(detector_names) or no_models_message))
         group.add_argument('--no-detector', action='store_true', help="Disable BGC detection.")
         group.add_argument('-l', '--label', dest='labels', action='append', default=[], help="Label for detected clusters (equal to --detector by default). "
@@ -80,7 +80,7 @@ def add_arguments(self, parser):
         group = parser.add_argument_group('BGC classification options', '')
         classifier_names = util.get_available_models('classifier')
         group.add_argument('-c', '--classifier', dest='classifiers', action='append', default=[],
-                            help="Trained classification model name ({}). "
+                            help="Trained classification model name ({}) or path to trained model pickle file. "
                                  "Can be provided multiple times (-c first -c second).".format(', '.join(classifier_names) or no_models_message))
         group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification.")
         group.add_argument('--classifier-score', default=0.5, type=float,
@@ -117,9 +117,9 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
             elif len(labels) != len(detectors):
                 raise ValueError('A separate label should be provided for each of the detectors: {}'.format(detectors))
 
-            for detector_name, label in zip(detectors, labels):
+            for detector, label in zip(detectors, labels):
                 steps.append(DeepBGCDetector(
-                    detector=detector_name,
+                    detector=detector,
                     label=label,
                     score_threshold=score,
                     merge_max_protein_gap=merge_max_protein_gap,
@@ -148,8 +148,8 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
         writers.append(ReadmeWriter(out_path=os.path.join(output, 'README.txt'), root_path=output, writers=writers))
 
         if not no_classifier:
-            for classifier_name in classifiers:
-                steps.append(DeepBGCClassifier(classifier=classifier_name, score_threshold=classifier_score))
+            for classifier in classifiers:
+                steps.append(DeepBGCClassifier(classifier=classifier, score_threshold=classifier_score))
 
         # Create temp and evaluation dir
         if not os.path.exists(tmp_path):

diff --git a/deepbgc/pipeline/classifier.py b/deepbgc/pipeline/classifier.py
@@ -6,15 +6,21 @@
 from deepbgc.models.wrapper import SequenceModelWrapper
 from deepbgc.pipeline.step import PipelineStep
 import six
+import os
 
 class DeepBGCClassifier(PipelineStep):
 
     def __init__(self, classifier, score_threshold=0.5):
         if classifier is None or not isinstance(classifier, six.string_types):
-            raise ValueError('Expected classifier name, got {}'.format(classifier))
+            raise ValueError('Expected classifier name or path, got {}'.format(classifier))
+        if os.path.exists(classifier) or os.path.sep in classifier:
+            classifier_path = classifier
+            # Set classifier name to filename without suffix
+            classifier, _ = os.path.splitext(os.path.basename(classifier))
+        else:
+            classifier_path = util.get_model_path(classifier, 'classifier')
         self.classifier_name = classifier
         self.score_threshold = score_threshold
-        classifier_path = util.get_model_path(self.classifier_name, 'classifier')
         self.model = SequenceModelWrapper.load(classifier_path)
         self.total_class_counts = pd.Series()
 

diff --git a/deepbgc/pipeline/detector.py b/deepbgc/pipeline/detector.py
@@ -8,13 +8,21 @@
 from deepbgc.pipeline.step import PipelineStep
 import collections
 import six
+import os
 
 class DeepBGCDetector(PipelineStep):
     def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
                  merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
         self.score_threshold = score_threshold
         if detector is None or not isinstance(detector, six.string_types):
-            raise ValueError('Expected detector name, got {}'.format(detector))
+            raise ValueError('Expected detector name or path, got {}'.format(detector))
+        if os.path.exists(detector) or os.path.sep in detector:
+            model_path = detector
+            # Set detector name to filename without suffix
+            detector, _ = os.path.splitext(os.path.basename(detector))
+        else:
+            model_path = util.get_model_path(detector, 'detector')
+
         self.detector_name = detector
         self.detector_label = label or self.detector_name
         self.score_column = util.format_bgc_score_column(self.detector_name)
@@ -24,7 +32,6 @@ def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_
         self.min_proteins = min_proteins
         self.min_domains = min_domains
         self.min_bio_domains = min_bio_domains
-        model_path = util.get_model_path(self.detector_name, 'detector')
         self.model = SequenceModelWrapper.load(model_path)
         self.num_detected = 0