diff --git a/deepbgc/__version__.py b/deepbgc/__version__.py index 850505a..b561bc6 100644 --- a/deepbgc/__version__.py +++ b/deepbgc/__version__.py @@ -1 +1 @@ -__version__ = '0.1.10' +__version__ = '0.1.11-dev' diff --git a/deepbgc/command/download.py b/deepbgc/command/download.py index d35cf4f..d535b31 100644 --- a/deepbgc/command/download.py +++ b/deepbgc/command/download.py @@ -11,7 +11,12 @@ class DownloadCommand(BaseCommand): command = 'download' - help = """Download trained models and other file dependencies to the DeepBGC downloads directory.""" + help = """ + Download trained models and other file dependencies to the DeepBGC downloads directory. + + By default, files are downloaded to: '{}' + Set {} env variable to specify a different downloads directory. + """.format(util.get_default_downloads_dir(), util.DEEPBGC_DOWNLOADS_DIR) def add_arguments(self, parser): pass diff --git a/deepbgc/command/pipeline.py b/deepbgc/command/pipeline.py index 1e80338..2764b5e 100644 --- a/deepbgc/command/pipeline.py +++ b/deepbgc/command/pipeline.py @@ -53,38 +53,38 @@ class PipelineCommand(BaseCommand): def add_arguments(self, parser): - parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV).") + parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV)") - parser.add_argument('-o', '--output', required=False, help="Custom output directory path.") - parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. 
Can be provided multiple times") parser.add_argument('--minimal-output', dest='is_minimal_output', action='store_true', default=False, - help="Produce minimal output with just the GenBank sequence file.") + help="Produce minimal output with just the GenBank sequence file") group = parser.add_argument_group('BGC detection options', '') no_models_message = 'run "deepbgc download" to download models' detector_names = util.get_available_models('detector') group.add_argument('-d', '--detector', dest='detectors', action='append', default=[], help="Trained detection model name ({}) or path to trained model pickle file. " - "Can be provided multiple times (-d first -d second).".format(', '.join(detector_names) or no_models_message)) - group.add_argument('--no-detector', action='store_true', help="Disable BGC detection.") + "Can be provided multiple times (-d first -d second)".format(', '.join(detector_names) or no_models_message)) + group.add_argument('--no-detector', action='store_true', help="Disable BGC detection") group.add_argument('-l', '--label', dest='labels', action='append', default=[], help="Label for detected clusters (equal to --detector by default). 
" - "If multiple detectors are provided, a label should be provided for each one.") + "If multiple detectors are provided, a label should be provided for each one") group.add_argument('-s', '--score', default=0.5, type=float, - help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences.") - group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins.") - group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides.") - group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length.") - group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC.") - group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC.") - group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic protein domains in a BGC (from antiSMASH ClusterFinder).") + help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences (default: %(default)s)") + group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins (default: %(default)s)") + group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides (default: %(default)s)") + group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length (default: %(default)s)") + group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC (default: %(default)s)") + group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC (default: %(default)s)") + group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic (as defined by 
antiSMASH) protein domains in a BGC (default: %(default)s)") group = parser.add_argument_group('BGC classification options', '') classifier_names = util.get_available_models('classifier') group.add_argument('-c', '--classifier', dest='classifiers', action='append', default=[], help="Trained classification model name ({}) or path to trained model pickle file. " - "Can be provided multiple times (-c first -c second).".format(', '.join(classifier_names) or no_models_message)) - group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification.") + "Can be provided multiple times (-c first -c second)".format(', '.join(classifier_names) or no_models_message)) + group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification") group.add_argument('--classifier-score', default=0.5, type=float, - help="DeepBGC classification score threshold for assigning classes to BGCs (inclusive).") + help="DeepBGC classification score threshold for assigning classes to BGCs (default: %(default)s)") def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier, is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl, diff --git a/deepbgc/command/prepare.py b/deepbgc/command/prepare.py index 2ddd602..284018f 100644 --- a/deepbgc/command/prepare.py +++ b/deepbgc/command/prepare.py @@ -31,11 +31,11 @@ class PrepareCommand(BaseCommand): """ def add_arguments(self, parser): - parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank).") + parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank)") group = parser.add_argument_group('required arguments', '') - parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. 
Can be provided multiple times.") - group.add_argument('--output-gbk', required=False, help="Output GenBank file path.") - group.add_argument('--output-tsv', required=False, help="Output TSV file path.") + parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times") + group.add_argument('--output-gbk', required=False, help="Output GenBank file path") + group.add_argument('--output-tsv', required=False, help="Output TSV file path") def run(self, inputs, limit_to_record, output_gbk, output_tsv): first_output = output_gbk or output_tsv diff --git a/deepbgc/command/train.py b/deepbgc/command/train.py index f2a8872..2662d49 100644 --- a/deepbgc/command/train.py +++ b/deepbgc/command/train.py @@ -26,23 +26,23 @@ class TrainCommand(BaseCommand): def add_arguments(self, parser): parser.add_argument("-m", "--model", dest="model", required=True, - help="Path to JSON model config file.") + help="Path to JSON model config file") parser.add_argument('-t', '--target', required=False, default='in_cluster', - help="Target column to predict in sequence prediction.") + help="Target column to predict in sequence prediction") parser.add_argument('-o', '--output', required=True, - help="Output trained model file path.") + help="Output trained model file path") parser.add_argument('-l', '--log', required=False, - help="Progress log output path (e.g. TensorBoard).") + help="Progress log output path (e.g. TensorBoard)") parser.add_argument('-c', '--classes', required=False, help="Class TSV file path - train a sequence classifier " - "using provided classes (binary columns), indexed by sequence_id column.") + "using provided classes (binary columns), indexed by sequence_id column") parser.add_argument("--config", nargs=2, action='append', default=[], - help="Variables in model JSON file to replace (e.g. --config PFAM2VEC path/to/pfam2vec.csv).") + help="Variables in model JSON file to replace (e.g. 
--config PFAM2VEC path/to/pfam2vec.csv)") parser.add_argument('-v', '--validation', action='append', required=False, - help="Validation sequence file path. Repeat to specify multiple files.") + help="Validation sequence file path. Repeat to specify multiple files") parser.add_argument("--verbose", dest="verbose", required=False, default=2, type=int, - help="Verbosity level (0=none, 1=progress bar, 2=once per epoch).", metavar="INT") - parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths.") + help="Verbosity level: 0=none, 1=progress bar, 2=once per epoch (default: %(default)s)", metavar="INT") + parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths") def run(self, inputs, output, model, target, classes, config, log, validation, verbose): diff --git a/deepbgc/pipeline/classifier.py b/deepbgc/pipeline/classifier.py index 6a74949..55b4a14 100644 --- a/deepbgc/pipeline/classifier.py +++ b/deepbgc/pipeline/classifier.py @@ -13,7 +13,7 @@ class DeepBGCClassifier(PipelineStep): def __init__(self, classifier, score_threshold=0.5): if classifier is None or not isinstance(classifier, six.string_types): raise ValueError('Expected classifier name or path, got {}'.format(classifier)) - if os.path.exists(classifier) or os.path.sep in classifier: + if (os.path.exists(classifier) or os.path.sep in classifier) and not os.path.isdir(classifier): classifier_path = classifier # Set classifier name to filename without suffix classifier, _ = os.path.splitext(os.path.basename(classifier)) diff --git a/deepbgc/pipeline/detector.py b/deepbgc/pipeline/detector.py index d6b681f..a99d422 100644 --- a/deepbgc/pipeline/detector.py +++ b/deepbgc/pipeline/detector.py @@ -16,7 +16,7 @@ def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_ self.score_threshold = score_threshold if detector is None or not isinstance(detector, six.string_types): raise ValueError('Expected detector name or path, 
got {}'.format(detector)) - if os.path.exists(detector) or os.path.sep in detector: + if (os.path.exists(detector) or os.path.sep in detector) and not os.path.isdir(detector): model_path = detector # Set detector name to filename without suffix detector, _ = os.path.splitext(os.path.basename(detector)) diff --git a/deepbgc/pipeline/pfam.py b/deepbgc/pipeline/pfam.py index 5424bc9..4a07765 100644 --- a/deepbgc/pipeline/pfam.py +++ b/deepbgc/pipeline/pfam.py @@ -117,7 +117,8 @@ def annotate(self): 'locus_tag': [query.id], 'database': [PFAM_DB_VERSION], } - description = pfam_descriptions.get(pfam_id) + short_pfam_id = pfam_id.rsplit('.', 1)[0] + description = pfam_descriptions.get(short_pfam_id) if description: qualifiers['description'] = [description] pfam = SeqFeature( diff --git a/deepbgc/util.py b/deepbgc/util.py index 829bf52..96cc63e 100644 --- a/deepbgc/util.py +++ b/deepbgc/util.py @@ -331,11 +331,15 @@ def get_data_release_version(): return os.environ.get(DEEPBGC_DATA_RELEASE_VERSION, DATA_RELEASE_VERSION) +def get_default_downloads_dir(): + return user_data_dir("deepbgc", version="data") + + def get_downloads_dir(versioned=True): downloads_dir = os.environ.get(DEEPBGC_DOWNLOADS_DIR) data_release_version = get_data_release_version() if not downloads_dir: - downloads_dir = user_data_dir("deepbgc", version="data") + downloads_dir = get_default_downloads_dir() version = data_release_version if versioned else 'common' return os.path.join(downloads_dir, version) diff --git a/test/integration/pipeline/test_integration_pfam.py b/test/integration/pipeline/test_integration_pfam.py index cd66c6a..389628e 100644 --- a/test/integration/pipeline/test_integration_pfam.py +++ b/test/integration/pipeline/test_integration_pfam.py @@ -26,9 +26,9 @@ 
def test_integration_pfam_annotator(tmpdir): assert pfam.location.start == 249 assert pfam.location.end == 696 assert pfam.location.strand == -1 - assert pfam.qualifiers.get('PFAM_ID') == ['PF00005'] + assert pfam.qualifiers.get('db_xref') == ['PF00005.26'] assert pfam.qualifiers.get('locus_tag') == ['AAK73498.1'] assert pfam.qualifiers.get('description') == ['ABC transporter'] - assert pfam.qualifiers.get('database') == ['Pfam-A.31.0.hmm'] + assert pfam.qualifiers.get('database') == ['31.0'] assert_sorted_features(record) \ No newline at end of file