Skip to content

Commit

Permalink
Merge pull request #17 from Merck/develop
Browse files Browse the repository at this point in the history
Enable using path to model instead of model name.
  • Loading branch information
prihoda committed Sep 6, 2019
2 parents a9383eb + 1418263 commit 32cb566
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 14 deletions.
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,16 @@ Proteins and Pfam domains are detected automatically if not already annotated (H
# Show command help docs
deepbgc pipeline --help

# Detect and classify BGCs in mySequence.fa using DeepBGC algorithm and save the output to mySequence directory.
# Detect and classify BGCs in mySequence.fa using DeepBGC detector.
deepbgc pipeline mySequence.fa

# Detect and classify BGCs in mySequence.fa using custom DeepBGC detector trained on your own data.
deepbgc pipeline --detector path/to/myDetector.pkl mySequence.fa
```

This will produce a directory with multiple files and a README.txt with file descriptions.
This will produce a `mySequence` directory with multiple files and a README.txt with file descriptions.

See the [Train DeepBGC on your own data](#train-deepbgc-on-your-own-data) section below for more information about training a custom detector or classifier.

#### Example output

Expand Down Expand Up @@ -98,7 +103,7 @@ JSON template for DeepBGC LSTM with pfam2vec is structured as follows:
"timesteps": 256, - Number of pfam2vec vectors trained in one batch
"validation_size": 0, - Fraction of training data to use for validation (if validation data is not provided explicitly). For example, use 0.2 to hold out 20% of the training data for validation.
"verbose": 1, - Verbosity during training
"num_epochs": 1000, - Number of epochs to train for
"num_epochs": 1000, - Number of passes over your training set during training. You probably want to use a lower number if not using early stopping on validation data.
"early_stopping" : { - Stop model training when validation performance stops improving
"monitor": "val_auc_roc", - Use validation AUC ROC to observe performance
"min_delta": 0.0001, - Stop training when the monitored metric has not improved by more than 0.0001 over the last epochs
Expand All @@ -124,3 +129,12 @@ JSON template for DeepBGC LSTM with pfam2vec is structured as follows:
}
```

### Using your trained model

Since version `0.1.10` you can provide a direct path to the detector or classifier model like so:
```bash
deepbgc pipeline \
mySequence.fa \
--detector path/to/myDetector.pkl \
--classifier path/to/myClassifier.pkl
```
2 changes: 1 addition & 1 deletion deepbgc/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.9'
__version__ = '0.1.10-dev'
12 changes: 6 additions & 6 deletions deepbgc/command/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def add_arguments(self, parser):
no_models_message = 'run "deepbgc download" to download models'
detector_names = util.get_available_models('detector')
group.add_argument('-d', '--detector', dest='detectors', action='append', default=[],
help="Trained detection model name ({}). "
help="Trained detection model name ({}) or path to trained model pickle file. "
"Can be provided multiple times (-d first -d second).".format(', '.join(detector_names) or no_models_message))
group.add_argument('--no-detector', action='store_true', help="Disable BGC detection.")
group.add_argument('-l', '--label', dest='labels', action='append', default=[], help="Label for detected clusters (equal to --detector by default). "
Expand All @@ -80,7 +80,7 @@ def add_arguments(self, parser):
group = parser.add_argument_group('BGC classification options', '')
classifier_names = util.get_available_models('classifier')
group.add_argument('-c', '--classifier', dest='classifiers', action='append', default=[],
help="Trained classification model name ({}). "
help="Trained classification model name ({}) or path to trained model pickle file. "
"Can be provided multiple times (-c first -c second).".format(', '.join(classifier_names) or no_models_message))
group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification.")
group.add_argument('--classifier-score', default=0.5, type=float,
Expand Down Expand Up @@ -117,9 +117,9 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
elif len(labels) != len(detectors):
raise ValueError('A separate label should be provided for each of the detectors: {}'.format(detectors))

for detector_name, label in zip(detectors, labels):
for detector, label in zip(detectors, labels):
steps.append(DeepBGCDetector(
detector=detector_name,
detector=detector,
label=label,
score_threshold=score,
merge_max_protein_gap=merge_max_protein_gap,
Expand Down Expand Up @@ -148,8 +148,8 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
writers.append(ReadmeWriter(out_path=os.path.join(output, 'README.txt'), root_path=output, writers=writers))

if not no_classifier:
for classifier_name in classifiers:
steps.append(DeepBGCClassifier(classifier=classifier_name, score_threshold=classifier_score))
for classifier in classifiers:
steps.append(DeepBGCClassifier(classifier=classifier, score_threshold=classifier_score))

# Create temp and evaluation dir
if not os.path.exists(tmp_path):
Expand Down
10 changes: 8 additions & 2 deletions deepbgc/pipeline/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,21 @@
from deepbgc.models.wrapper import SequenceModelWrapper
from deepbgc.pipeline.step import PipelineStep
import six
import os

class DeepBGCClassifier(PipelineStep):

def __init__(self, classifier, score_threshold=0.5):
if classifier is None or not isinstance(classifier, six.string_types):
raise ValueError('Expected classifier name, got {}'.format(classifier))
raise ValueError('Expected classifier name or path, got {}'.format(classifier))
if os.path.exists(classifier) or os.path.sep in classifier:
classifier_path = classifier
# Set classifier name to filename without suffix
classifier, _ = os.path.splitext(os.path.basename(classifier))
else:
classifier_path = util.get_model_path(classifier, 'classifier')
self.classifier_name = classifier
self.score_threshold = score_threshold
classifier_path = util.get_model_path(self.classifier_name, 'classifier')
self.model = SequenceModelWrapper.load(classifier_path)
self.total_class_counts = pd.Series()

Expand Down
11 changes: 9 additions & 2 deletions deepbgc/pipeline/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,21 @@
from deepbgc.pipeline.step import PipelineStep
import collections
import six
import os

class DeepBGCDetector(PipelineStep):
def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
self.score_threshold = score_threshold
if detector is None or not isinstance(detector, six.string_types):
raise ValueError('Expected detector name, got {}'.format(detector))
raise ValueError('Expected detector name or path, got {}'.format(detector))
if os.path.exists(detector) or os.path.sep in detector:
model_path = detector
# Set detector name to filename without suffix
detector, _ = os.path.splitext(os.path.basename(detector))
else:
model_path = util.get_model_path(detector, 'detector')

self.detector_name = detector
self.detector_label = label or self.detector_name
self.score_column = util.format_bgc_score_column(self.detector_name)
Expand All @@ -24,7 +32,6 @@ def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_
self.min_proteins = min_proteins
self.min_domains = min_domains
self.min_bio_domains = min_bio_domains
model_path = util.get_model_path(self.detector_name, 'detector')
self.model = SequenceModelWrapper.load(model_path)
self.num_detected = 0

Expand Down

0 comments on commit 32cb566

Please sign in to comment.