Skip to content

Commit

Permalink
Merge pull request #20 from Merck/develop
Browse files Browse the repository at this point in the history
Improve download help annotation, add default values to help annotations
Fix pfam description annotation
  • Loading branch information
prihoda committed Oct 1, 2019
2 parents b01119c + 4a6d6de commit a27f647
Show file tree
Hide file tree
Showing 10 changed files with 49 additions and 38 deletions.
2 changes: 1 addition & 1 deletion deepbgc/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.10'
__version__ = '0.1.11-dev'
7 changes: 6 additions & 1 deletion deepbgc/command/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@

class DownloadCommand(BaseCommand):
command = 'download'
help = """Download trained models and other file dependencies to the DeepBGC downloads directory."""
# Command help text. The placeholders are filled in when the class body is
# executed, so the default downloads directory is resolved at import time.
help = """
Download trained models and other file dependencies to the DeepBGC downloads directory.
By default, files are downloaded to: '{}'
Set {} env variable to specify a different downloads directory.
""".format(util.get_default_downloads_dir(), util.DEEPBGC_DOWNLOADS_DIR)

def add_arguments(self, parser):
    """Register command-specific arguments on ``parser``.

    This command defines no arguments of its own, so the body is empty.
    """
    pass
Expand Down
34 changes: 17 additions & 17 deletions deepbgc/command/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,38 +53,38 @@ class PipelineCommand(BaseCommand):

def add_arguments(self, parser):

parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV).")
parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV)")

parser.add_argument('-o', '--output', required=False, help="Custom output directory path.")
parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
parser.add_argument('-o', '--output', required=False, help="Custom output directory path")
parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times")
parser.add_argument('--minimal-output', dest='is_minimal_output', action='store_true', default=False,
help="Produce minimal output with just the GenBank sequence file.")
help="Produce minimal output with just the GenBank sequence file")
group = parser.add_argument_group('BGC detection options', '')
no_models_message = 'run "deepbgc download" to download models'
detector_names = util.get_available_models('detector')
group.add_argument('-d', '--detector', dest='detectors', action='append', default=[],
help="Trained detection model name ({}) or path to trained model pickle file. "
"Can be provided multiple times (-d first -d second).".format(', '.join(detector_names) or no_models_message))
group.add_argument('--no-detector', action='store_true', help="Disable BGC detection.")
"Can be provided multiple times (-d first -d second)".format(', '.join(detector_names) or no_models_message))
group.add_argument('--no-detector', action='store_true', help="Disable BGC detection")
group.add_argument('-l', '--label', dest='labels', action='append', default=[], help="Label for detected clusters (equal to --detector by default). "
"If multiple detectors are provided, a label should be provided for each one.")
"If multiple detectors are provided, a label should be provided for each one")
group.add_argument('-s', '--score', default=0.5, type=float,
help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences.")
group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins.")
group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides.")
group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length.")
group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC.")
group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC.")
group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic protein domains in a BGC (from antiSMASH ClusterFinder).")
help="Average protein-wise DeepBGC score threshold for extracting BGC regions from Pfam sequences (default: %(default)s)")
group.add_argument('--merge-max-protein-gap', default=0, type=int, help="Merge detected BGCs within given number of proteins (default: %(default)s)")
group.add_argument('--merge-max-nucl-gap', default=0, type=int, help="Merge detected BGCs within given number of nucleotides (default: %(default)s)")
group.add_argument('--min-nucl', default=1, type=int, help="Minimum BGC nucleotide length (default: %(default)s)")
group.add_argument('--min-proteins', default=1, type=int, help="Minimum number of proteins in a BGC (default: %(default)s)")
group.add_argument('--min-domains', default=1, type=int, help="Minimum number of protein domains in a BGC (default: %(default)s)")
group.add_argument('--min-bio-domains', default=0, type=int, help="Minimum number of known biosynthetic (as defined by antiSMASH) protein domains in a BGC (default: %(default)s)")

group = parser.add_argument_group('BGC classification options', '')
classifier_names = util.get_available_models('classifier')
group.add_argument('-c', '--classifier', dest='classifiers', action='append', default=[],
help="Trained classification model name ({}) or path to trained model pickle file. "
"Can be provided multiple times (-c first -c second).".format(', '.join(classifier_names) or no_models_message))
group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification.")
"Can be provided multiple times (-c first -c second)".format(', '.join(classifier_names) or no_models_message))
group.add_argument('--no-classifier', action='store_true', help="Disable BGC classification")
group.add_argument('--classifier-score', default=0.5, type=float,
help="DeepBGC classification score threshold for assigning classes to BGCs (inclusive).")
help="DeepBGC classification score threshold for assigning classes to BGCs (default: %(default)s)")

def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier,
is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,
Expand Down
8 changes: 4 additions & 4 deletions deepbgc/command/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ class PrepareCommand(BaseCommand):
"""

def add_arguments(self, parser):
parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank).")
parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path(s) (FASTA/GenBank)")
group = parser.add_argument_group('required arguments', '')
parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
group.add_argument('--output-gbk', required=False, help="Output GenBank file path.")
group.add_argument('--output-tsv', required=False, help="Output TSV file path.")
parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times")
group.add_argument('--output-gbk', required=False, help="Output GenBank file path")
group.add_argument('--output-tsv', required=False, help="Output TSV file path")

def run(self, inputs, limit_to_record, output_gbk, output_tsv):
first_output = output_gbk or output_tsv
Expand Down
18 changes: 9 additions & 9 deletions deepbgc/command/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,23 @@ class TrainCommand(BaseCommand):

def add_arguments(self, parser):
parser.add_argument("-m", "--model", dest="model", required=True,
help="Path to JSON model config file.")
help="Path to JSON model config file")
parser.add_argument('-t', '--target', required=False, default='in_cluster',
help="Target column to predict in sequence prediction.")
help="Target column to predict in sequence prediction")
parser.add_argument('-o', '--output', required=True,
help="Output trained model file path.")
help="Output trained model file path")
parser.add_argument('-l', '--log', required=False,
help="Progress log output path (e.g. TensorBoard).")
help="Progress log output path (e.g. TensorBoard)")
parser.add_argument('-c', '--classes', required=False,
help="Class TSV file path - train a sequence classifier "
"using provided classes (binary columns), indexed by sequence_id column.")
"using provided classes (binary columns), indexed by sequence_id column")
parser.add_argument("--config", nargs=2, action='append', default=[],
help="Variables in model JSON file to replace (e.g. --config PFAM2VEC path/to/pfam2vec.csv).")
help="Variables in model JSON file to replace (e.g. --config PFAM2VEC path/to/pfam2vec.csv)")
parser.add_argument('-v', '--validation', action='append', required=False,
help="Validation sequence file path. Repeat to specify multiple files.")
help="Validation sequence file path. Repeat to specify multiple files")
parser.add_argument("--verbose", dest="verbose", required=False, default=2, type=int,
help="Verbosity level (0=none, 1=progress bar, 2=once per epoch).", metavar="INT")
parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths.")
help="Verbosity level: 0=none, 1=progress bar, 2=once per epoch (default: %(default)s)", metavar="INT")
parser.add_argument(dest='inputs', nargs='+', help="Training sequences (Pfam TSV) file paths")

def run(self, inputs, output, model, target, classes, config, log, validation, verbose):

Expand Down
2 changes: 1 addition & 1 deletion deepbgc/pipeline/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class DeepBGCClassifier(PipelineStep):
def __init__(self, classifier, score_threshold=0.5):
if classifier is None or not isinstance(classifier, six.string_types):
raise ValueError('Expected classifier name or path, got {}'.format(classifier))
if os.path.exists(classifier) or os.path.sep in classifier:
if (os.path.exists(classifier) or os.path.sep in classifier) and not os.path.isdir(classifier):
classifier_path = classifier
# Set classifier name to filename without suffix
classifier, _ = os.path.splitext(os.path.basename(classifier))
Expand Down
2 changes: 1 addition & 1 deletion deepbgc/pipeline/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_
self.score_threshold = score_threshold
if detector is None or not isinstance(detector, six.string_types):
raise ValueError('Expected detector name or path, got {}'.format(detector))
if os.path.exists(detector) or os.path.sep in detector:
if (os.path.exists(detector) or os.path.sep in detector) and not os.path.isdir(detector):
model_path = detector
# Set detector name to filename without suffix
detector, _ = os.path.splitext(os.path.basename(detector))
Expand Down
4 changes: 3 additions & 1 deletion deepbgc/pipeline/pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def annotate(self):

# Read descriptions from Pfam clan TSV (looked up per domain hit below).
# The mapping is intentionally not printed: dumping it to stdout on every
# run was leftover debug output.
pfam_descriptions = self._get_pfam_descriptions()

# Extract all matched domain hits
num = 0
Expand All @@ -117,7 +118,8 @@ def annotate(self):
'locus_tag': [query.id],
'database': [PFAM_DB_VERSION],
}
description = pfam_descriptions.get(pfam_id)
# Strip the trailing ".<version>" from the accession before the lookup;
# presumably the description table is keyed by the unversioned ID.
# maxsplit is passed positionally because Python 2's str.rsplit() does not
# accept keyword arguments (the project supports Python 2 via six).
short_pfam_id = pfam_id.rsplit('.', 1)[0]
description = pfam_descriptions.get(short_pfam_id)
if description:
qualifiers['description'] = [description]
pfam = SeqFeature(
Expand Down
6 changes: 5 additions & 1 deletion deepbgc/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,15 @@ def get_data_release_version():
return os.environ.get(DEEPBGC_DATA_RELEASE_VERSION, DATA_RELEASE_VERSION)


def get_default_downloads_dir():
    """Return the downloads directory used when no override is configured.

    The path comes from ``user_data_dir`` called with application name
    ``"deepbgc"`` and version ``"data"``.
    """
    default_dir = user_data_dir("deepbgc", version="data")
    return default_dir


def get_downloads_dir(versioned=True):
    """Return the directory that holds downloaded DeepBGC files.

    The base directory is read from the environment variable whose name is
    stored in ``DEEPBGC_DOWNLOADS_DIR``. When that variable is unset or
    empty, ``get_default_downloads_dir()`` supplies the base directory.

    :param versioned: if true, the data release version is appended as a
        sub-directory; otherwise the sub-directory ``'common'`` is used.
    :return: path of the selected sub-directory inside the base directory.
    """
    downloads_dir = os.environ.get(DEEPBGC_DOWNLOADS_DIR)
    data_release_version = get_data_release_version()
    if not downloads_dir:
        # Use the shared helper only, so the fallback location is defined in
        # exactly one place (the help text reports the same value).
        downloads_dir = get_default_downloads_dir()
    version = data_release_version if versioned else 'common'
    return os.path.join(downloads_dir, version)

Expand Down
4 changes: 2 additions & 2 deletions test/integration/pipeline/test_integration_pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def test_integration_pfam_annotator(tmpdir):
assert pfam.location.start == 249
assert pfam.location.end == 696
assert pfam.location.strand == -1
assert pfam.qualifiers.get('PFAM_ID') == ['PF00005']
assert pfam.qualifiers.get('db_xref') == ['PF00005.26']
assert pfam.qualifiers.get('locus_tag') == ['AAK73498.1']
assert pfam.qualifiers.get('description') == ['ABC transporter']
assert pfam.qualifiers.get('database') == ['Pfam-A.31.0.hmm']
assert pfam.qualifiers.get('database') == ['31.0']

assert_sorted_features(record)

0 comments on commit a27f647

Please sign in to comment.