Merge pull request #20 from Merck/develop

Improve download help annotation, add default values to help annotations. Fix pfam description annotation.
Merck · Oct 1, 2019 · a27f647 · a27f647
2 parents b01119c + 4a6d6de
commit a27f647
Show file tree

Hide file tree

Showing 10 changed files with 49 additions and 38 deletions.
diff --git a/deepbgc/__version__.py b/deepbgc/__version__.py
@@ -1 +1 @@
-__version__ = '0.1.10'
+__version__ = '0.1.11-dev'
diff --git a/deepbgc/command/download.py b/deepbgc/command/download.py
@@ -11,7 +11,12 @@
 
 class DownloadCommand(BaseCommand):
     command = 'download'
-    help = """Download trained models and other file dependencies to the DeepBGC downloads directory."""
+    help = """
+    Download trained models and other file dependencies to the DeepBGC downloads directory.
+    
+    By default, files are downloaded to: '{}'
+    Set {} env variable to specify a different downloads directory.
+    """.format(util.get_default_downloads_dir(), util.DEEPBGC_DOWNLOADS_DIR)
 
     def add_arguments(self, parser):
         pass

diff --git a/deepbgc/command/pipeline.py b/deepbgc/command/pipeline.py
@@ -53,38 +53,38 @@ class PipelineCommand(BaseCommand):
 
     def add_arguments(self, parser):
 
-        parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV).")
+        parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV)")
 
-        parser.add_argument('-o', '--output', required=False, help="Custom output directory path.")
-        parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
+        parser.add_argument('-o', '--output', required=False, help="Custom output directory path")
+        parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times")
         parser.add_argument('--minimal-output', dest='is_minimal_output', action='store_true', default=False,
-                            help="Produce minimal output with just the GenBank sequence file.")
+                            help="Produce minimal output with just the GenBank sequence file")
         group = parser.add_argument_group('BGC detection options', '')
         no_models_message = 'run "deepbgc download" to download models'
         detector_names = util.get_available_models('detector')
         group.add_argument('-d', '--detector', dest='detectors', action='append', default=[],
                            help="Trained detection model name ({}) or path to trained model pickle file. "
-                                "Can be provided multiple times (-d first -d second).".format(', '.join(detector_names) or no_models_message))
-        group.add_argument('--no-detector', action='store_true', help="Disable BGC detection.")
+                                "Can be provided multiple times (-d first -d second)".format(', '.join(detector_names) or no_models_message))
+        group.add_argument('--no-detector', action='store_true', help="Disable BGC detection")
         group.add_argument('-l', '--label', dest='labels', action='append', default=[], help="Label for detected clusters (equal to --detector by default). "
-                                                                                             "If multiple detectors are provided, a label should be provided for each one.")
+                                                                                             "If multiple detectors are provided, a label should be provided for each one")
         group.add_argument('-s', '--score', default=0.5, type=float,
-                            help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences.")
-        group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins.")
-        group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides.")
-        group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length.")
-        group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC.")
-        group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC.")
-        group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic protein domains in a BGC (from antiSMASH ClusterFinder).")
+                            help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences (default: %(default)s)")
+        group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins (default: %(default)s)")
+        group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides (default: %(default)s)")
+        group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length (default: %(default)s)")
+        group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC (default: %(default)s)")
+        group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC (default: %(default)s)")
+        group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic (as defined by antiSMASH) protein domains in a BGC (default: %(default)s)")
 
         group = parser.add_argument_group('BGC classification options', '')
         classifier_names = util.get_available_models('classifier')
         group.add_argument('-c', '--classifier', dest='classifiers', action='append', default=[],
                             help="Trained classification model name ({}) or path to trained model pickle file. "
-                                 "Can be provided multiple times (-c first -c second).".format(', '.join(classifier_names) or no_models_message))
-        group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification.")
+                                 "Can be provided multiple times (-c first -c second)".format(', '.join(classifier_names) or no_models_message))
+        group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification")
         group.add_argument('--classifier-score', default=0.5, type=float,
-                            help="DeepBGC classification score threshold for assigning classes to BGCs (inclusive).")
+                            help="DeepBGC classification score threshold for assigning classes to BGCs (default: %(default)s)")
 
     def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier,
             is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,

diff --git a/deepbgc/command/prepare.py b/deepbgc/command/prepare.py
@@ -31,11 +31,11 @@ class PrepareCommand(BaseCommand):
   """
 
     def add_arguments(self, parser):
-        parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank).")
+        parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank)")
         group = parser.add_argument_group('required arguments', '')
-        parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
-        group.add_argument('--output-gbk', required=False, help="Output GenBank file path.")
-        group.add_argument('--output-tsv', required=False, help="Output TSV file path.")
+        parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times")
+        group.add_argument('--output-gbk', required=False, help="Output GenBank file path")
+        group.add_argument('--output-tsv', required=False, help="Output TSV file path")
 
     def run(self, inputs, limit_to_record, output_gbk, output_tsv):
         first_output = output_gbk or output_tsv

diff --git a/deepbgc/command/train.py b/deepbgc/command/train.py
@@ -26,23 +26,23 @@ class TrainCommand(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument("-m", "--model", dest="model", required=True,
-                            help="Path to JSON model config file.")
+                            help="Path to JSON model config file")
         parser.add_argument('-t', '--target', required=False, default='in_cluster',
-                            help="Target column to predict in sequence prediction.")
+                            help="Target column to predict in sequence prediction")
         parser.add_argument('-o', '--output', required=True,
-                            help="Output trained model file path.")
+                            help="Output trained model file path")
         parser.add_argument('-l', '--log', required=False,
-                            help="Progress log output path (e.g. TensorBoard).")
+                            help="Progress log output path (e.g. TensorBoard)")
         parser.add_argument('-c', '--classes', required=False,
                             help="Class TSV file path - train a sequence classifier "
-                                 "using provided classes (binary columns), indexed by sequence_id column.")
+                                 "using provided classes (binary columns), indexed by sequence_id column")
         parser.add_argument("--config", nargs=2, action='append', default=[],
-                            help="Variables in model JSON file to replace (e.g. --config PFAM2VEC path/to/pfam2vec.csv).")
+                            help="Variables in model JSON file to replace (e.g. --config PFAM2VEC path/to/pfam2vec.csv)")
         parser.add_argument('-v', '--validation', action='append', required=False,
-                            help="Validation sequence file path. Repeat to specify multiple files.")
+                            help="Validation sequence file path. Repeat to specify multiple files")
         parser.add_argument("--verbose", dest="verbose", required=False, default=2, type=int,
-                            help="Verbosity level (0=none, 1=progress bar, 2=once per epoch).", metavar="INT")
-        parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths.")
+                            help="Verbosity level: 0=none, 1=progress bar, 2=once per epoch (default: %(default)s)", metavar="INT")
+        parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths")
 
     def run(self, inputs, output, model, target, classes, config, log, validation, verbose):
 

diff --git a/deepbgc/pipeline/classifier.py b/deepbgc/pipeline/classifier.py
@@ -13,7 +13,7 @@ class DeepBGCClassifier(PipelineStep):
     def __init__(self, classifier, score_threshold=0.5):
         if classifier is None or not isinstance(classifier, six.string_types):
             raise ValueError('Expected classifier name or path, got {}'.format(classifier))
-        if os.path.exists(classifier) or os.path.sep in classifier:
+        if (os.path.exists(classifier) or os.path.sep in classifier) and not os.path.isdir(classifier):
             classifier_path = classifier
             # Set classifier name to filename without suffix
             classifier, _ = os.path.splitext(os.path.basename(classifier))

diff --git a/deepbgc/pipeline/detector.py b/deepbgc/pipeline/detector.py
@@ -16,7 +16,7 @@ def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_
         self.score_threshold = score_threshold
         if detector is None or not isinstance(detector, six.string_types):
             raise ValueError('Expected detector name or path, got {}'.format(detector))
-        if os.path.exists(detector) or os.path.sep in detector:
+        if (os.path.exists(detector) or os.path.sep in detector) and not os.path.isdir(detector):
             model_path = detector
             # Set detector name to filename without suffix
             detector, _ = os.path.splitext(os.path.basename(detector))

diff --git a/deepbgc/pipeline/pfam.py b/deepbgc/pipeline/pfam.py
@@ -94,6 +94,7 @@ def annotate(self):
 
         # Read descriptions from Pfam clan TSV
         pfam_descriptions = self._get_pfam_descriptions()
+        print('DESCRIPTIONS: {}'.format(pfam_descriptions))
 
         # Extract all matched domain hits
         num = 0
@@ -117,7 +118,8 @@ def annotate(self):
                     'locus_tag': [query.id],
                     'database': [PFAM_DB_VERSION],
                 }
-                description = pfam_descriptions.get(pfam_id)
+                short_pfam_id = pfam_id.rsplit('.', maxsplit=1)[0]
+                description = pfam_descriptions.get(short_pfam_id)
                 if description:
                     qualifiers['description'] = [description]
                 pfam = SeqFeature(

diff --git a/deepbgc/util.py b/deepbgc/util.py
@@ -331,11 +331,15 @@ def get_data_release_version():
     return os.environ.get(DEEPBGC_DATA_RELEASE_VERSION, DATA_RELEASE_VERSION)
 
 
+def get_default_downloads_dir():
+    return user_data_dir("deepbgc", version="data")
+
+
 def get_downloads_dir(versioned=True):
     downloads_dir = os.environ.get(DEEPBGC_DOWNLOADS_DIR)
     data_release_version = get_data_release_version()
     if not downloads_dir:
-        downloads_dir = user_data_dir("deepbgc", version="data")
+        downloads_dir = get_default_downloads_dir()
     version = data_release_version if versioned else 'common'
     return os.path.join(downloads_dir, version)
 

diff --git a/test/integration/pipeline/test_integration_pfam.py b/test/integration/pipeline/test_integration_pfam.py
@@ -26,9 +26,9 @@ def test_integration_pfam_annotator(tmpdir):
     assert pfam.location.start == 249
     assert pfam.location.end == 696
     assert pfam.location.strand == -1
-    assert pfam.qualifiers.get('PFAM_ID') == ['PF00005']
+    assert pfam.qualifiers.get('db_xref') == ['PF00005.26']
     assert pfam.qualifiers.get('locus_tag') == ['AAK73498.1']
     assert pfam.qualifiers.get('description') == ['ABC transporter']
-    assert pfam.qualifiers.get('database') == ['Pfam-A.31.0.hmm']
+    assert pfam.qualifiers.get('database') == ['31.0']
 
     assert_sorted_features(record)