diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d942898 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea +*.pyc +__pycache__ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cbcf338 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright © 2018 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., Inc., Kenilworth, NJ, USA." + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/LICENSES_THIRD_PARTY b/LICENSES_THIRD_PARTY new file mode 100644 index 0000000..f7a0b3c --- /dev/null +++ b/LICENSES_THIRD_PARTY @@ -0,0 +1,27 @@ +-------------------------------------------------- +Third party dependencies listed by License type +[Format: name (Python module) - URL] +-------------------------------------------------- + +Biopython License Agreement +* Biopython (biopython) - https://github.com/biopython/biopython/blob/d5718bb7e3ee505b859b39c03f8ffad8a9a0be2f/LICENSE.rst + +OSI Approved (new BSD) +* scikit-learn (scikit-learn) - https://github.com/scikit-learn/scikit-learn/blob/2e85c8608c93ad0e3290414c4e5e650b87d44b27/COPYING +* hmmlearn (hmmlearn) - https://github.com/hmmlearn/hmmlearn/blob/1f60373d28c427a2a05c9ea26231c717772066dc/LICENSE.txt + +BSD 3-Clause License +* Pandas (pandas) - https://github.com/pandas-dev/pandas/blob/5aba6659e422e985683cfb46c07c3364a02b6e5b/AUTHORS.md +* HMMER - https://github.com/EddyRivasLab/hmmer/blob/3e38d667761e0a98a263079cb4a90e49d4b720d5/LICENSE + +MIT License (MIT) +* Keras (keras) - https://github.com/keras-team/keras/blob/dc698c5486117780b643eda0a2f60a8753625b8a/LICENSE + +Apache Software License (Apache 2.0) +* TensorFlow (tensorflow) - https://github.com/tensorflow/tensorflow/blob/6b6d843ccab78f9f91c3b98a43ca09ffecad4747/LICENSE + +Python Software Foundation License (BSD) +* Matplotlib (matplotlib) - https://matplotlib.org/users/license.html + +GNU General Public License v3.0 +* Prodigal - https://github.com/hyattpd/Prodigal/blob/b1321f0899c4d7a835583feb344e2c9a5bd908d1/LICENSE diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f171c0 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# DeepBGC: Biosynthetic Gene Cluster detection and classification. + +## Install DeepBGC + +- Run `pip install deepbgc` to install the `deepbgc` python module. 
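+
+Once installed, a typical run chains the `deepbgc` subcommands as sketched below (all file and model names here are placeholders; see the Detection and Classification sections for details):
+
+```bash
+# Optional: convert a genomic sequence into a Pfam domain CSV ("detect" also does this automatically when --pfam is given)
+deepbgc pfam --pfam Pfam-A.hmm myInputSequence.fa myInputSequence.pfam.csv
+
+# Detect BGC candidates in the sequence
+deepbgc detect --model DeepBGCDetector_v0.0.1.pkl --pfam Pfam-A.hmm --output myCandidates/ myInputSequence.fa
+
+# Classify the detected candidates into biosynthetic classes
+deepbgc classify --model RandomForestMIBiGClasses_v0.0.1.pkl --output myCandidates/myCandidates.classes.csv myCandidates/myCandidates.candidates.csv
+```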
+ +## Prerequisities + +- Install Python 3.6 (version 3.7 is not supported by TensorFlow yet) +- Install Prodigal and put the `prodigal` binary it on your PATH: https://github.com/hyattpd/Prodigal/releases +- Install HMMER and put the `hmmscan` and `hmmpress` binaries on your PATH: http://hmmer.org/download.html +- Download and **extract** Pfam database from: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam31.0/Pfam-A.hmm.gz + +## Use DeepBGC + +### Detection + +Detect BGCs in a genomic sequence. + +```bash +# Show detection help +deepbgc detect --help + +# Detect BGCs in a nucleotide sequence +deepbgc detect --model DeepBGCDetector_v0.0.1.pkl --pfam Pfam-A.hmm --output myCandidates/ myInputSequence.fa + +# Detect BGCs with >0.9 score in existing Pfam CSV sequence +deepbgc detect --model myModel.pkl --output myStrictCandidates/ -s 0.9 myCandidates/myCandidates.pfam.csv + +``` + +### Classification + +Classify BGCs into one or more classes. + +```bash +# Show classification help +deepbgc classify --help + +# Predict biosynthetic class of detected BGCs +deepbgc classify --model RandomForestMIBiGClasses_v0.0.1.pkl --output myCandidates/myCandidates.classes.csv myCandidates/myCandidates.candidates.csv + +``` diff --git a/deepbgc/__init__.py b/deepbgc/__init__.py new file mode 100644 index 0000000..b911eeb --- /dev/null +++ b/deepbgc/__init__.py @@ -0,0 +1,3 @@ +VERSION = '0.0.1' + +from .pipeline import DeepBGCModel \ No newline at end of file diff --git a/deepbgc/commands/__init__.py b/deepbgc/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/deepbgc/commands/base.py b/deepbgc/commands/base.py new file mode 100644 index 0000000..649f9b3 --- /dev/null +++ b/deepbgc/commands/base.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod +import argparse + + +class BaseCommand(ABC): + """ + Base abstract class for commands + """ + command = '' + help = "" + + def __init__(self, args): + self.args = args + + @classmethod + def add_subparser(cls, subparsers): + parser = subparsers.add_parser(cls.command, description=cls.help, help=cls.help, + formatter_class=argparse.RawTextHelpFormatter) + parser.set_defaults(func=cls) + return parser + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/deepbgc/commands/classify.py b/deepbgc/commands/classify.py new file mode 100644 index 0000000..c2656ad --- /dev/null +++ b/deepbgc/commands/classify.py @@ -0,0 +1,87 @@ +import pandas as pd +from deepbgc.commands.base import BaseCommand +import os +import pickle +import numpy as np + +SCORE_COLUMN = 'deepbgc_score' + +class ClassifyCommand(BaseCommand): + command = 'classify' + help = """Classify BGCs into one or more classes. 
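+
+The input is the candidate CSV produced by "deepbgc detect" (it must contain the "candidate_hash" and "pfam_ids" columns).
+The output is a CSV with one score column per class and a "classes" column listing all classes scoring at least 0.5.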
+ +Examples: + + deepbgc classify --model myClassifier.pkl --output classes.csv inputSequence.fa + """ + + def __init__(self, args): + super().__init__(args) + self.output_path = args.output + self.input_path = args.input + self.model_path = args.model + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + parser.add_argument('-o', '--output', required=True, help="Output CSV file path.") + parser.add_argument('-m', '--model', required=True, help="Trained classification model file path.") + parser.add_argument(dest='input', help="Input candidate CSV file path.") + + def run(self): + candidates = pd.read_csv(self.input_path) + if 'candidate_hash' not in candidates.columns: + raise AttributeError('Input CSV is not a candidate CSV file, "candidate_hash" column should be present.') + + candidates = candidates.set_index('candidate_hash') + + with open(self.model_path, 'rb') as f: + model = pickle.load(f) + + vectors = domain_set_vectors(candidates) + + predictions = predict_classes(vectors, model) + predictions.to_csv(self.output_path, index=False) + print('Saved {} predictions to {}'.format(len(predictions), self.output_path)) + + +def domain_set_vectors(candidates): + candidate_pfam_ids = [pfam_ids.split(';') for pfam_ids in candidates['pfam_ids']] + unique_pfam_ids = sorted(list(set([p for ids in candidate_pfam_ids for p in ids]))) + print('Getting domain set vectors for {} candidates with {} unique Pfam IDs...'.format(len(candidates), len(unique_pfam_ids))) + vectors = pd.DataFrame(np.zeros((len(candidates), len(unique_pfam_ids))), columns=unique_pfam_ids) + for i, pfam_ids in enumerate(candidate_pfam_ids): + vectors.iloc[i][pfam_ids] = 1 + return vectors + + +def predict_classes(samples, model, add_classes_list=True): + # Set missing columns to 0 + if not hasattr(model, 'input_columns'): + raise AttributeError('Trained model does not contain the "input_columns" attribute.') + if not hasattr(model, 'label_columns'): + raise AttributeError('Trained model does not contain the "label_columns" attribute.') + + missing_columns = set(model.input_columns).difference(samples.columns) + for col in missing_columns: + samples[col] = 0 + #print('Missing columns:\n{}'.format(sorted(list(missing_columns)))) + print('Warning: Setting {} missing columns to 0'.format(len(missing_columns))) + samples = samples[model.input_columns] + + results = np.array([r[:,1] for r in model.predict_proba(samples.values)]).transpose() + predictions = pd.DataFrame(results, index=samples.index, columns=model.label_columns) + if add_classes_list: + predictions['classes'] = [';'.join(model.label_columns[x >= 0.5]) for x in results] + + return predictions + +def sequence_id_from_filename(path): + """ + Create a basic sequence_id from a file name without extension + :param path: Path of file + :return: file name without extension that can be used as sequence_id + """ + return os.path.splitext(os.path.basename(path))[0] + diff --git a/deepbgc/commands/detect.py b/deepbgc/commands/detect.py new file mode 100644 index 0000000..b054b94 --- /dev/null +++ b/deepbgc/commands/detect.py @@ -0,0 +1,74 @@ +import pandas as pd +from deepbgc.commands.base import BaseCommand +from deepbgc.converter import SequenceToPfamCSVConverter +import os +from deepbgc.detector import DeepBGCDetector + +SCORE_COLUMN = 'deepbgc_score' + +class DetectCommand(BaseCommand): + command = 'detect' + help = """Detect BGCs in a genomic sequence. 
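+
+The input is a nucleotide FASTA/GenBank file, or a Pfam CSV file produced by "deepbgc pfam".
+Results are written into the output folder and named after it, e.g. myCandidates/myCandidates.pfam.csv
+(detected Pfam domains, when starting from a sequence) and myCandidates/myCandidates.candidates.csv (detected BGC candidates).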
+ +Examples: + + # Detect BGCs in FASTA sequence with default settings + deepbgc detect --model myModel.pkl --output myDetections/ --pfam Pfam-A.hmm inputSequence.fa + + # Detect BGCs with >0.9 score in existing Pfam CSV sequence + deepbgc detect --model myModel.pkl --output myStrictDetections/ -s 0.9 myDetections/myDetections.pfam.csv + """ + + def __init__(self, args): + super().__init__(args) + self.output_path = args.output + self.output_basename = os.path.basename(self.output_path) + self.input_path = args.input + self.model_path = args.model + self.score_threshold = args.score + self.converter = SequenceToPfamCSVConverter(db_path=args.pfam) + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + parser.add_argument('-o', '--output', required=True, help="Output folder path.") + parser.add_argument('-m', '--model', required=True, help="Trained detection model file path.") + parser.add_argument('-p', '--pfam', required=False, help="Pfam DB (Pfam-A.hmm) file path.") + parser.add_argument('-s', '--score', default=0.5, type=float, help="Average protein-wise DeepBGC score threshold for extracting BGC regions from domain sequences.") + parser.add_argument(dest='input', help="Input pfam CSV file path.") + + def _outpath(self, suffix, extension): + return os.path.join(self.output_path, '{}.{}.{}'.format(self.output_basename, suffix, extension)) + + def run(self): + try: + os.makedirs(self.output_path, exist_ok=True) + except FileExistsError: + raise AttributeError("Output directory already exists: {}".format(self.output_path)) + except Exception as e: + raise AttributeError("Output directory not writable: {}".format(self.output_path), e) + + domain_path = self._outpath('pfam', 'csv') + if not self.converter.convert(self.input_path, domain_path): + # Input was already a pfam CSV file, use original path + domain_path = self.input_path + + domains = pd.read_csv(domain_path) + detector = DeepBGCDetector(model=self.model_path) + + candidates = detector.detect(domains, score_threshold=self.score_threshold) + + cand_path = self._outpath('candidates', 'csv') + candidates.to_csv(cand_path, index=False) + print('Saved {} detected BGCs to {}'.format(len(candidates), cand_path)) + + +def sequence_id_from_filename(path): + """ + Create a basic sequence_id from a file name without extension + :param path: Path of file + :return: file name without extension that can be used as sequence_id + """ + return os.path.splitext(os.path.basename(path))[0] + diff --git a/deepbgc/commands/pfam.py b/deepbgc/commands/pfam.py new file mode 100644 index 0000000..9855099 --- /dev/null +++ b/deepbgc/commands/pfam.py @@ -0,0 +1,37 @@ +from deepbgc.commands.base import BaseCommand +from deepbgc.converter import SequenceToPfamCSVConverter + + +class PfamCommand(BaseCommand): + command = 'pfam' + help = """Convert genomic BGCs sequence into a pfam domain CSV file by detecting proteins and pfam domains. 
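+
+Genes are predicted using Prodigal and Pfam domains are detected using hmmscan against the given Pfam-A.hmm database.
+Domain hits with e-value above 0.01 are discarded. The output CSV contains one row per detected domain
+with the sequence ID, protein ID, gene coordinates and strand, and the Pfam ID.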
+ +Examples: + + # Detect proteins and pfam domains in a FASTA sequence and save the result as csv file + deepbgc pfam --pfam Pfam-A.hmm inputSequence.fa outputPfamSequence.csv + """ + + def __init__(self, args): + super().__init__(args) + self.input_path = args.input + self.output_path = args.output + self.converter = SequenceToPfamCSVConverter(db_path=args.pfam) + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + # parser.add_argument('--mode', default='auto', choices=['auto', 'nucl', 'prot', 'pfam'], + # help="Input modes: \n" + # "--mode auto: Automatic based on file extension.\n" + # "--mode nucl: Nucleotide sequence without annotated genes. Will detect genes and pfam domains. \n" + # "--mode prot: Protein sequence. Will detect pfam domains.)") + parser.add_argument('-p', '--pfam', required=True, help="Pfam DB (Pfam-A.hmm) file path.") + parser.add_argument(dest='input', help="Input sequence file path.") + parser.add_argument(dest='output', help="Output pfam CSV file path.") + + def run(self): + self.converter.convert(self.input_path, self.output_path) + print() + print('Saved Pfam CSV to: {}'.format(self.output_path)) diff --git a/deepbgc/converter.py b/deepbgc/converter.py new file mode 100644 index 0000000..b527c45 --- /dev/null +++ b/deepbgc/converter.py @@ -0,0 +1,213 @@ +import shutil +import subprocess +import os +import numpy as np +import pandas as pd +import tempfile + +MAX_EVALUE = 0.01 + +class SequenceToPfamCSVConverter: + def __init__(self, db_path): + self.db_path = db_path + + def convert(self, input_path, output_path): + + format = guess_format(input_path) + + if not format: + raise NotImplementedError("Filetype not recognized: {}".format(input_path)) + elif format == 'csv': + # Input is already a CSV file. 
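+            # Nothing to convert - returning False tells the caller to keep using the original CSV path.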
+ return False + + if not self.db_path: + raise AttributeError('Pfam DB path not specified.') + + from Bio import SeqIO + with tempfile.TemporaryDirectory() as tempdir: + sequences = SeqIO.parse(input_path, format) + if not sequences: + raise AttributeError("No sequences in {} file: {}".format(format, input_path)) + + with open(output_path, 'w') as outfile: + for i, sequence in enumerate(sequences): + print("="*80) + print('Processing sequence: {}'.format(sequence.id)) + print("="*80) + print() + + #from Bio.Alphabet import NucleotideAlphabet + #if not isinstance(sequence.seq.alphabet, NucleotideAlphabet): + # raise AttributeError("Unsupported alphabet: {}".format(sequence.seq.alphabet)) + + nucl_path = os.path.join(tempdir, 'nucl.fa') + SeqIO.write(sequence, nucl_path, 'fasta') + + protein_path = os.path.join(tempdir, 'proteins.fa') + save_protein_sequence(nucl_path, protein_path) + + domtbl_path = os.path.join(tempdir, 'pfam.tbl') + detect_pfam_domains(protein_path, self.db_path, domtbl_path) + + gene_locations = get_prodigal_gene_locations(protein_path) + + domains = domtbl_to_df(domtbl_path, gene_locations=gene_locations) + domains.insert(0, 'sequence_id', sequence.id) + domains.to_csv(outfile, index=False, header=(i == 0)) + return True + +def guess_format(file_path): + _, ext = os.path.splitext(file_path) + if ext in ['.fa', '.fasta']: + return 'fasta' + elif ext in ['.gbk', '.gb', '.genbank']: + return 'genbank' + elif ext in ['.csv']: + return 'csv' + return None + +def get_prodigal_gene_locations(protein_path): + from Bio import SeqIO + proteins = SeqIO.parse(protein_path, 'fasta') + locations = [] + for protein in proteins: + splits = protein.description.split('#') + try: + locations.append({ + 'protein_id': protein.id, + 'start': int(splits[1]), + 'end': int(splits[2]), + 'strand': int(splits[3]) + }) + except Exception as e: + raise AttributeError('Invalid Prodigal gene description: "{}"'.format(protein.description), e) + return pd.DataFrame(locations).set_index('protein_id') + + +def save_protein_sequence(input_path, protein_path): + if not shutil.which('prodigal'): + raise Exception("Prodigal needs to be installed and available on PATH in order to detect genes.") + + print('Detecting genes using Prodigal...') + + FNULL = open(os.devnull, 'w') + subprocess.call(['prodigal', '-i', input_path, '-a', protein_path], stdout=FNULL, stderr=FNULL) + + if not os.path.exists(protein_path): + # TODO improve message + raise Exception("Unexpected error detecting genes using Prodigal") + + +def detect_pfam_domains(protein_path, db_path, domtbl_path): + if not shutil.which('hmmscan') or not shutil.which('hmmpress'): + raise Exception( + "HMMscan and HMMpress needs to be installed and available on PATH in order to detect pfam domains.") + + pressed_db_path = db_path + '.h3m' + if not os.path.exists(pressed_db_path): + print('Pressing pfam DB...') + subprocess.call(['hmmpress', db_path]) + + if not os.path.exists(pressed_db_path): + # TODO improve message + raise Exception("Unexpected error running HMMpress on Pfam DB") + + print('Detecting pfam domains using HMMscan, this might take a while...') + FNULL = open(os.devnull, 'w') + subprocess.call(['hmmscan', '--domtblout', domtbl_path, db_path, protein_path], stdout=FNULL) + + if not os.path.exists(domtbl_path): + # TODO improve message + raise Exception("Unexpected error detecting protein domains using HMMscan") + + +SUPPORTED_FORMATS = [ + 'proteins2fasta' +] + + +def domtbl_to_df(domtbl_path, format=None, gene_locations=None): + """ + Pfam 
hmmscan tabular format into internal Domain DataFrame format, one protein domain per line + :param domtbl_path: Path to HMMscan tabular format result file + :param format: Format of the protein fasta file that was passed into HMMscan (supported: proteins2fasta or None). + If it was generated by proteins2fasta, we can extract other values from the sequence ID. + :param gene_locations: DataFrame of (start, end, strand) indexed by protein ID + :return: Domain DataFrame, one protein domain per line + """ + from Bio import SearchIO + # Read domain matches in all proteins + queries = SearchIO.parse(domtbl_path, 'hmmscan3-domtab') + + # Extract all matched domain hits + domains = [] + for query in queries: + query_domains = [] + for hit in query.hits: + best_index = np.argmin([hsp.evalue for hsp in hit.hsps]) + best_hsp = hit.hsps[best_index] + pfam_id = hit.accession.split('.')[0] + evalue = float(best_hsp.evalue) + if evalue > MAX_EVALUE: + continue + query_domains.append({ + 'pfam_id': pfam_id, + 'query_id': query.id, + 'domain_start': int(best_hsp.query_start), + 'domain_end': int(best_hsp.query_end) + }) + domains += sorted(query_domains, key=lambda x: x['domain_start']) + + domains = pd.DataFrame(domains) + + num_domains = len(domains) + print('Detected {} Pfam domain hits'.format(num_domains)) + + fields = ['pfam_id', 'domain_start', 'domain_end'] + + # Use sequence id generated by proteins2fasta.py to get our gene info directly. + if format == 'proteins2fasta': + domains['sequence_id'] = domains['query_id'].apply(lambda s: s.split('|')[0]) + domains['locus_tag'] = domains['query_id'].apply(lambda s: s.split('|')[1]) + domains['protein_id'] = domains['query_id'].apply(lambda s: s.split('|')[2]) + domains['gene_start'] = domains['query_id'].apply(lambda s: normalize_gene_coord(s.split('|')[3].split('-')[0])) + domains['gene_end'] = domains['query_id'].apply(lambda s: normalize_gene_coord(s.split('|')[3].split('-')[1])) + domains['gene_strand'] = domains['query_id'].apply(lambda s: s.split('|')[4]) + fields = ['contig_id', 'locus_tag', 'protein_id', 'gene_start', 'gene_end', 'gene_strand', 'pfam_id', + 'domain_start', 'domain_end'] + elif format is None: + missing_ids = set(domains['query_id']).difference(gene_locations.index.values) + if len(missing_ids): + raise AttributeError("There are {} protein IDs missing: {}".format(len(missing_ids), list(missing_ids)[:10])) + locations = gene_locations.loc[domains['query_id']] + domains['protein_id'] = locations.index.values + domains['gene_start'] = locations['start'].values + domains['gene_end'] = locations['end'].values + domains['gene_strand'] = locations['strand'].values + + fields = ['protein_id', 'gene_start', 'gene_end', 'gene_strand'] + fields + + else: + raise AttributeError( + 'Format {} not supported, use one of {} or define the protein file.'.format(format, SUPPORTED_FORMATS)) + domains = domains[fields] + + return domains + + +def normalize_gene_coord(gene_coord): + """ + Normalize imprecise gene coordinates, will pfam <0 into 0 and >1234 into 1234. + :param gene_coord: Gene coordinate as int or str + :return: normalized numeric gene coordinate + """ + if isinstance(gene_coord, int): + return gene_coord + if isinstance(gene_coord, str): + # TODO: Can we turn <0 into 0 and >1234 into 1234? 
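+        # The prefix is stripped below, so "<0" is parsed as 0 and ">1234" as 1234.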
+ if gene_coord.startswith('<') or gene_coord.startswith('>'): + gene_coord = gene_coord[1:] + if gene_coord.isnumeric(): + return int(gene_coord) + raise AttributeError('Invalid gene coord {} ({})'.format(gene_coord, type(gene_coord))) diff --git a/deepbgc/detection/__init__.py b/deepbgc/detection/__init__.py new file mode 100644 index 0000000..ffe2ca6 --- /dev/null +++ b/deepbgc/detection/__init__.py @@ -0,0 +1,3 @@ +from .hmm_discrete import DiscreteHMM, GeneBorderHMM, ClusterFinderHMM +from .hmm_gaussian import GaussianHMM +from .rnn import KerasRNN \ No newline at end of file diff --git a/deepbgc/detection/hmm_discrete.py b/deepbgc/detection/hmm_discrete.py new file mode 100644 index 0000000..8896de5 --- /dev/null +++ b/deepbgc/detection/hmm_discrete.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python +# David Prihoda +# HMM models for BGC domain-level prediction +# Emission probability can be calculated from positive and negative training samples. +# Starting and transition probability have to be provided. + +import numpy as np +from hmmlearn import hmm +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin +import pickle +import os + + +class HMM(BaseEstimator, ClassifierMixin): + """ + HMM model parent class providing Sklearn mixins and saving/loading functionality + """ + def save(self, path): + with open(path, 'wb') as f: + pickle.dump(self, f) + return self + + @classmethod + def load(cls, path): + with open(path, 'rb') as f: + return pickle.load(f) + + +class DiscreteHMM(HMM): + + def get_sample_vector(self, X): + """ + Turn pfam IDs into integers based on our vocabulary + :param X: DataFrame of domains with pfam_id column + :return: numpy array of numbers representing given words in our vocabulary + """ + return np.array([self.vocabulary_.get(o, -1) for o in X['pfam_id']]) + + def predict(self, X: pd.DataFrame): + """ + Get BGC prediction score for a Domain DataFrame + :param X: DataFrame with pfam domains + :return: numpy array of BGC prediction scores for each domain in X + """ + word_vector = self.get_sample_vector(X) + # Predict posterior probability using our HMM + logprob, posteriors = self.model_.score_samples(word_vector.reshape(-1, 1)) + # BGC state probability is in second column + return posteriors[:,1] + + def _get_pfam_counts(self, X, y): + """ + Get number of occurences of each pfam ID in negative (non-BGC) and positive (BGC) states + :param X: Domain DataFrame with pfam_id column + :param y: Series of states for each domain (0 = non-BGC, 1 = BGC) + :return: DataFrame with number of positive and negative occurences (pos and neg columns) of each pfam_id (index). 
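+
+        Illustrative example (hypothetical Pfam IDs): for X with pfam_id ['PF00001', 'PF00001', 'PF00002']
+        and y = [1, 1, 0], the result has pos = {PF00001: 2, PF00002: 0} and neg = {PF00001: 0, PF00002: 1}.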
+ """ + counts = X[['pfam_id']].drop_duplicates().set_index('pfam_id') + unique_y = set(y) + if unique_y != {0, 1}: + raise AttributeError('Invalid target values, expected {0, 1} got '+str(unique_y)) + counts['pos'] = X[y == 1]['pfam_id'].value_counts() + counts['neg'] = X[y == 0]['pfam_id'].value_counts() + return counts.fillna(0) + + def _construct_model(self, startprob, transmat, emissionprob, vocabulary): + """ + Create internal HMM model with given matrices and store it to self.model_ + :param startprob: Starting probability [negative_starting_prob, positive_starting_prob] + :param transmat: Transition matrix (An array where the [i][j]-th element corresponds to the posterior probability of transitioning between the i-th to j-th) + :param emissionprob: Emission probability [[neg_pfam1, neg_pfam2, ...], [pos_pfam1, pos_pfam2, ...]] with pfam IDs indexed by their vocabulary index numbers + :param vocabulary: Vocabulary dictionary with {pfam_id: index_number_in_emission} + :return: self + """ + self.model_ = hmm.MultinomialHMM(n_components=2) + if isinstance(startprob, list): + startprob = np.array(startprob) + if isinstance(transmat, list): + transmat = np.array(transmat) + self.model_.startprob_ = startprob + self.model_.transmat_ = transmat + self.model_.emissionprob_ = emissionprob + self.vocabulary_ = vocabulary + return self + + def fit(self, X_list, y_list, sample_weights=None, startprob=None, transmat=None, verbose=0, + default_emission_count=0.01, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + """ + Create and train internal HMM model based on list of positive and negative samples. + Emission probability will be calculated from samples. Starting and transition probability have to be provided. + + :param X_list: List of samples (Domain DataFrames) + :param y_list: List of sample states (0 or 1), one value for each sample (DataFrame) + :param sample_weights: List of sample weights, marking their contribution to the emission probability. If not provided, will be set to 1 for all samples. + :param startprob: Starting probability [negative_starting_prob, positive_starting_prob] + :param transmat: Transition matrix (An array where the [i][j]-th element corresponds to the posterior probability of transitioning between the i-th to j-th) + :param verbose: Verbosity (0 = no output, 1 = plot top pfams for positive and negative states) + :param default_emission_count: Emission value for the other state for pfams that appear only in the positive / negative state + :param debug_progress_path: Not used in HMM models. + :param validation_X_list: List of validation samples, not used in HMM models. + :param validation_y_list: List of validation states, not used in HMM models. 
+ :return: self + """ + if validation_X_list: + print('DiscreteHMM: Validation is present but has no effect yet.') + if startprob is None: + raise ValueError('Calculating start probability not supported yet, specify startprob explicitly') + if transmat is None: + raise ValueError('Calculating transition matrix not supported yet, specify transmat explicitly') + + if sample_weights is not None: + zipped = enumerate(zip(X_list, y_list, sample_weights)) + weighted_counts = [self._get_pfam_counts(X, y) * weight for i, (X, y, weight) in zipped] + all_counts = pd.concat(weighted_counts).reset_index().groupby('pfam_id').sum().sort_index() + else: + X: pd.DataFrame = pd.concat(X_list) + y: pd.DataFrame = pd.concat(y_list) + all_counts = self._get_pfam_counts(X, y).sort_index() + + if verbose: + print('Top positive:') + print(all_counts.sort_values(by='pos', ascending=False).head(3)) + print('Top negative:') + print(all_counts.sort_values(by='neg', ascending=False).head(3)) + + # For a pfam_id that appears only in the positive / negative state, set the default emission count instead of 0 + all_counts.replace(0, default_emission_count, inplace=True) + + # Vocabulary stores map of pfam_id -> index in emission vector + vocabulary = {pfam_id: i for i, pfam_id in enumerate(all_counts.index)} + + emissions = all_counts[['neg', 'pos']].values + # Divide each state's emission counts by the total number of observations to get emission frequency + emissions /= emissions.sum(axis=0) + # Add default emissions for unseen pfam_ids to the end (will be indexed by -1) + emissions = np.concatenate([emissions, np.array([[0.5, 0.5]])]) + + self._construct_model(startprob, transmat, emissions.T, vocabulary) + return self + + def get_sample_emissions(self, sample): + word_index = self.get_sample_vector(sample) + return pd.DataFrame({ + 'OUT': [None if x == -1 else self.model_.emissionprob_[0][x] for x in word_index], + 'BGC': [None if x == -1 else self.model_.emissionprob_[1][x] for x in word_index] + }) + + +class GeneBorderHMM(HMM): + """ + HMM that only changes its state at gene borders. + Implemented by turning each input symbol (pfam ID) into a tuple of (pfam ID, is_at_gene_end) + and each negative and positive state into four states with tuples (positive/negative, is_at_gene_end) + + Emissions at gene ends have 0 emission probability in states that are not at gene ends and vice versa. + Transitions can only happen from states where is_at_gene_end = True, which means probability is set to 0 for all other transitions. 
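+
+    For example, the two original states expand into four: (OUT, in-gene), (OUT, gene-end), (BGC, in-gene)
+    and (BGC, gene-end), corresponding to the OUT_IN_GENE, OUT_GENE_END, BGC_IN_GENE and BGC_GENE_END
+    emission columns returned by get_sample_emissions.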
+ """ + def _convert_startprob(self, startprob): + if startprob is None: + return + # Start probability + start_out = startprob[0] + start_bgc = startprob[1] + return np.array([start_out / 2, start_out / 2, start_bgc / 2, start_bgc / 2]) + #print('Converted to four state start probability:') + #print(self.model.startprob_) + + def _convert_transmat(self, transmat, X_list, verbose=0): + if transmat is None: + return + + num_gene_end = sum([sum(get_sample_gene_ends(X['protein_id'])) for X in X_list]) + num_total = sum([len(X) for X in X_list]) + frac_in_gene_end = num_gene_end / num_total + if verbose: + print('Gene end: {} ({}/{})'.format(frac_in_gene_end, num_gene_end, num_total)) + + # Transition probability + out2bgc = transmat[0][1] * frac_in_gene_end + out2out = 1 - out2bgc + bgc2out = transmat[1][0] * frac_in_gene_end + bgc2bgc = 1 - bgc2out + + converted = np.array([ + [0.5, 0.5, 0, 0], + [out2out / 2, out2out / 2, out2bgc / 2, out2bgc / 2], + [0, 0, 0.5, 0.5], + [bgc2out / 2, bgc2out / 2, bgc2bgc / 2, bgc2bgc / 2] + ]) + if verbose: + print('Converted to four state transitions:') + print(converted) + return converted + + def _convert_emission(self, old_emissionprob, old_vocabulary): + # Emissions + num_words = len(old_vocabulary) + out_emissions = old_emissionprob[0][:-1] + bgc_emissions = old_emissionprob[1][:-1] + zero_emissions = np.zeros(num_words) + default_emission = old_emissionprob[0][-1] + + emissionprob = np.zeros((4, num_words * 2 + 2)) + emissionprob[0] = np.concatenate([out_emissions, zero_emissions, [default_emission, 0]]) + emissionprob[1] = np.concatenate([zero_emissions, out_emissions, [0, default_emission]]) + emissionprob[2] = np.concatenate([bgc_emissions, zero_emissions, [default_emission, 0]]) + emissionprob[3] = np.concatenate([zero_emissions, bgc_emissions, [0, default_emission]]) + + # Vocabulary + vocabulary = {} + for pfam_id, word_index in old_vocabulary.items(): + vocabulary[(pfam_id, False)] = word_index + vocabulary[(pfam_id, True)] = word_index + num_words + + return emissionprob, vocabulary + + def _get_word_index(self, pfam_id, is_gene_end): + default_index = -1 if is_gene_end else -2 + return self.vocabulary_.get((pfam_id, is_gene_end), default_index) + + def get_sample_vector(self, X): + is_gene_end = get_sample_gene_ends(X['protein_id']) + if not any(is_gene_end): + print('Warning: no gene end predicted: '+str(X.head(1))) + return np.array([self._get_word_index(x, is_gene_end[i]) for i, x in enumerate(X['pfam_id'].values)]) + + def predict(self, X): + sample_vector = self.get_sample_vector(X) + prev_level = np.geterr()['divide'] + np.seterr(divide='ignore') + logprob, posteriors = self.model_.score_samples(sample_vector.reshape(-1, 1)) + np.seterr(divide=prev_level) + # final prediction is maximum of the probability of the last two states + prediction = posteriors[:,2:] + return np.max(prediction, axis=1) + + def fit(self, X_list, y_list, startprob=None, transmat=None, verbose=1, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + if validation_X_list: + print('GeneBorderHMM: Validation is present but has no effect yet.') + if verbose: + print('Training two state model...') + + two_state_model = DiscreteHMM() + two_state_model.fit(X_list, y_list, startprob=startprob, transmat=transmat, verbose=verbose) + + emission, self.vocabulary_ = self._convert_emission(two_state_model.model_.emissionprob_, two_state_model.vocabulary_) + + self.model_ = hmm.MultinomialHMM(n_components=4) + self.model_.startprob_ = 
self._convert_startprob(startprob) + self.model_.transmat_ = self._convert_transmat(transmat, X_list) + self.model_.emissionprob_ = emission + return self + + def get_sample_emissions(self, X): + sample_vector = self.get_sample_vector(X) + return pd.DataFrame({ + 'OUT_IN_GENE': [None if x < 0 else self.model_.emissionprob_[0][x] for x in sample_vector], + 'OUT_GENE_END': [None if x < 0 else self.model_.emissionprob_[1][x] for x in sample_vector], + 'BGC_IN_GENE': [None if x < 0 else self.model_.emissionprob_[2][x] for x in sample_vector], + 'BGC_GENE_END': [None if x < 0 else self.model_.emissionprob_[3][x] for x in sample_vector] + }) + + +class ClusterFinderHMM(DiscreteHMM): + """ + Wrapper that loads the ClusterFinder trained model from the pickled starting, transition and emission matrices. + """ + def fit(self, X_unused, y_unused, param_dir=None, **kwargs): + + with open(os.path.join(param_dir, 'NewTS_all_B_index.pkl'), 'rb') as pfile: + cf_vocabulary = pickle.load(pfile) + + # Start probability + with open(os.path.join(param_dir, 'SP_arr.pkl'), 'rb') as pfile: + cf_start = pickle.load(pfile, encoding='latin1') + + # Transition probability between states + with open(os.path.join(param_dir, 'TP_arr_A.pkl'), 'rb') as pfile: + cf_transition = pickle.load(pfile, encoding='latin1') + + # Emission probability for each state and pfam + with open(os.path.join(param_dir, 'NewTS_all_B_reduced_6filter.pkl'), 'rb') as pfile: + cf_emission = pickle.load(pfile, encoding='latin1') + + # Add default emission to the end of the emission matrix + # Default emission is used when the observed sequence contains words that didn't appear in our vocabulary + # The value actually does not matter as long as it's the same for both states + cf_default_emission = 1.6026668376177961e-07 + cf_default_emissions = np.array([[cf_default_emission], [cf_default_emission]]) + cf_emission = np.append(cf_emission, cf_default_emissions, axis=1) + print('Default emission', cf_default_emission) + + # Create HMM with given parameters + # States are flipped to use more intuitive NONBGC=0, BGC=1 + startprob = np.array([cf_start[1], cf_start[0]]) + transmat = np.array([[cf_transition[1][1], cf_transition[1][0]], + [cf_transition[0][1], cf_transition[0][0]]]) + emissionprob = np.array([cf_emission[1], cf_emission[0]]) + print('Start probability (0=NONBGC, 1=BGC):\n', startprob) + print('Transition probability (0=NONBGC, 1=BGC):\n', transmat) + print('Emission probability (0=NONBGC, 1=BGC):\n', emissionprob) + + self._construct_model(startprob=startprob, transmat=transmat, emissionprob=emissionprob, vocabulary=cf_vocabulary) + return self + + +def get_sample_gene_ends(gene_ids): + """ + For list of Gene IDs, return list of boolean values that mark whether the next gene is different (or we are at end of sequence) + :param gene_ids: List of gene IDs + :return: list of boolean values that mark whether the next gene is different (or we are at end of sequence) + """ + gene_ends = list(gene_ids[:-1].values != gene_ids[1:].values) + [True] + return np.array(gene_ends).astype(np.uint8) + diff --git a/deepbgc/detection/hmm_gaussian.py b/deepbgc/detection/hmm_gaussian.py new file mode 100644 index 0000000..8c0d0a2 --- /dev/null +++ b/deepbgc/detection/hmm_gaussian.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# David Prihoda +# Gaussian HMM model for BGC domain-level prediction +# Experimental, did not get satisfactory results + +import pandas as pd +from sklearn import mixture +import numpy as np +from sklearn.base import BaseEstimator, 
ClassifierMixin +import pickle +from hmmlearn import hmm + + +class GaussianHMM(BaseEstimator, ClassifierMixin): + + def __init__(self, num_pos_means=5, num_neg_means=5, covariance_type="diag", meta=None): + self.num_pos_means = num_pos_means + self.num_neg_means = num_neg_means + self.covariance_type = covariance_type + self.meta = meta or {} + + def predict(self, X): + # Predict posterior probability using our HMM + logprob, posteriors = self.model_.score_samples(X) + # BGC state probability is in second column + return posteriors[:,1] + + def fit(self, X_list, y_list, startprob=None, transmat=None, verbose=1, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + if validation_X_list: + print('GaussianHMM: Validation is present but has no effect yet.') + if startprob is None: + raise ValueError('Calculating start probability not supported yet, specify startprob explicitly') + if transmat is None: + raise ValueError('Calculating transition matrix not supported yet, specify transmat explicitly') + + X = np.concatenate(X_list) + y = np.concatenate(y_list) + pos_vectors = X[y == 1] + neg_vectors = X[y == 0] + + if verbose: + print('Training positive GMM on {} vectors'.format(len(pos_vectors))) + pos_gmm = mixture.GaussianMixture(n_components=self.num_pos_means, covariance_type=self.covariance_type) + pos_gmm.fit(pos_vectors) + + if verbose: + print('Training negative GMM on {} vectors'.format(len(neg_vectors))) + neg_gmm = mixture.GaussianMixture(n_components=self.num_neg_means, covariance_type=self.covariance_type) + neg_gmm.fit(neg_vectors) + + self.model_ = GMMHMM2(n_components=2, covariance_type=self.covariance_type, verbose=bool(verbose)) + self.model_.startprob_ = startprob + self.model_.transmat_ = transmat + self.model_.gmms_ = np.array([neg_gmm, pos_gmm]) + return self + + def save(self, path): + pickle.dump(self, path) + return self + + @classmethod + def load(cls, path): + return pickle.load(path) + + def get_sample_emissions(self, X): + feature_matrix = self.features.get_feature_matrix(X) + return pd.DataFrame({ + 'OUT': self.model_.gmms_[0].score_samples(feature_matrix), + 'BGC': self.model_.gmms_[1].score_samples(feature_matrix) + }) + + +class GMMHMM2(hmm.GMMHMM): + def __init__(self, n_components=1, + startprob_prior=1.0, transmat_prior=1.0, + covariance_type='diag', covars_prior=1e-2, + algorithm="viterbi", random_state=None, + n_iter=10, tol=1e-2, verbose=False, + params="stmcw", init_params="stmcw"): + hmm._BaseHMM.__init__(self, n_components, + startprob_prior=startprob_prior, + transmat_prior=transmat_prior, + algorithm=algorithm, random_state=random_state, + n_iter=n_iter, tol=tol, verbose=verbose, + params=params, init_params=init_params) + + self.covariance_type = covariance_type + self.covars_prior = covars_prior + self.gmms_ = [] + + def _compute_log_likelihood(self, X): + return np.array([g.score_samples(X) for g in self.gmms_]).T diff --git a/deepbgc/detection/rnn.py b/deepbgc/detection/rnn.py new file mode 100644 index 0000000..1e2a868 --- /dev/null +++ b/deepbgc/detection/rnn.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python +# David Prihoda +# Generic LSTM wrapper used for the DeepBGC model + +import numpy as np +import tensorflow as tf +from sklearn.model_selection import train_test_split +from sklearn.base import BaseEstimator, ClassifierMixin + + +class KerasRNN(BaseEstimator, ClassifierMixin): + """ + Generic LSTM wrapper used for the DeepBGC model + """ + def __init__(self, trained_model=None, batch_size=1, hidden_size=128, 
loss='binary_crossentropy', stateful=True, + activation='sigmoid', return_sequences=True): + from keras.models import Sequential + if trained_model is not None: + self.model: Sequential = trained_model + # Set the attributes from the model object to be able to clone and cross-validate a loaded model + self.batch_size = trained_model.layers[0].batch_input_shape[0] + self.hidden_size = trained_model.layers[0].layer.units + self.stateful = trained_model.layers[0].layer.stateful + self.loss = trained_model.loss + self.activation = trained_model.layers[-1].layer.activation + self.return_sequences = trained_model.layers[0].layer.return_sequences + else: + self.model: Sequential = None + self.batch_size = batch_size + self.hidden_size = hidden_size + self.loss = loss + self.stateful = stateful + self.activation = activation + self.return_sequences = return_sequences + + def _build_model(self, input_size, stacked_sizes=None, fully_connected_sizes=None, optimizer_name=None, learning_rate=None, decay=None, gpus=0, custom_batch_size=None): + """ + Build Keras Sequential model architecture with given parameters + :param input_size: Dimensionality of input vector (number of features) + :param stacked_sizes: Add given number of additional Bi-LSTM layers after first Bi-LSTM layer, provided as list of sizes + :param fully_connected_sizes: Add a given number of additional fully connected layers after the Bi-LSTM layers, provided as list of sizes + :param optimizer_name: Name of Keras optimizer, default 'adam' + :param learning_rate: Keras learning rate + :param decay: Optimizer decay + :param gpus: Number of gpus to train on (Not implemented) + :param custom_batch_size: Use different batch size than self.batch_size + :return: Keras Sequential model + """ + from keras.layers.core import Dense + from keras.layers.recurrent import LSTM + from keras.layers.wrappers import TimeDistributed, Bidirectional + from keras.models import Sequential + from keras import optimizers + if stacked_sizes is None: + stacked_sizes = [] + if fully_connected_sizes is None: + fully_connected_sizes = [] + + model = Sequential() + + model.add(Bidirectional( + layer=LSTM( + units=self.hidden_size, + return_sequences=True, + dropout=0.2, + recurrent_dropout=0.2, + stateful=self.stateful + ), + batch_input_shape=(custom_batch_size or self.batch_size, None, input_size) + )) + + for size in stacked_sizes: + model.add(Bidirectional(layer=LSTM(units=size, return_sequences=True, stateful=self.stateful))) + + for size in fully_connected_sizes: + model.add(TimeDistributed(Dense(size, activation='sigmoid'))) + + model.add(TimeDistributed(Dense(1, activation='sigmoid'))) + + if gpus > 1: + raise NotImplementedError("Multi GPU model not implemented due to input size mismatch.") + #model = multi_gpu_model(model, gpus=gpus) + + if optimizer_name is None: + optimizer_name = "adam" + + optimizer_args = {} + if learning_rate is not None: + optimizer_args['lr'] = learning_rate + if decay is not None: + optimizer_args['decay'] = decay + + if optimizer_name == 'adam': + optimizer = optimizers.Adam(**optimizer_args) + elif optimizer_args: + raise ValueError('Optimizer {} not implemented for custom params yet'.format(optimizer_name)) + else: + optimizer = optimizer_name + + print('Using optimizer', optimizer_name, optimizer_args) + model.compile(loss=self.loss, optimizer=optimizer, sample_weight_mode='temporal', metrics=["accuracy", precision, recall, auc_roc]) + return model + + def fit(self, X_list, y_list, timesteps=128, validation_size=0.33, 
num_epochs=10, verbose=1, + debug_progress_path=None, fully_connected_sizes=None, + shuffle=True, gpus=0, stacked_sizes=None, early_stop_mode=None, early_stop_monitor=None, early_stop_min_delta=0.005, early_stop_patience=10, + positive_weight=None, weighted=False, optimizer=None, learning_rate=None, decay=None, + validation_X_list=None, validation_y_list=None): + """ + Train Keras Sequential model using provided list of positive / negative samples. + Training is done in given number of epochs with additional stopping criteria. + In each epoch, we go over all samples in X_list, which are shuffled randomly and merged together into artificial genomes. + + :param X_list: List of DataFrames (samples) where each DataFrame contains protein domains represented by numeric vectors + :param y_list: List of output values, one value for each sample where 0 = negative sample (non-BGC), 1 = positive sample (BGC) + :param timesteps: Number of timesteps (protein domains) in one batch + :param validation_size: Fraction of samples to use for testing + :param num_epochs: Number of epochs. If early stopping is defined, this serves as a limit of maximum number of epochs. + :param verbose: Verbosity (0 = silent, 1 = verbose, 2 = very verbose) + :param debug_progress_path: Log Tensorboard information in given folder + :param fully_connected_sizes: Add a given number of additional fully connected layers after the Bi-LSTM layers, provided as list of sizes + :param shuffle: Whether to shuffle samples within each epoch. If not used, make sure that positive and negative samples are already shuffled in the list. + :param gpus: Number of gpus to use (not implemented!) + :param stacked_sizes: Add given number of additional Bi-LSTM layers after first Bi-LSTM layer, provided as list of sizes + :param early_stop_mode: Keras early stopping mode (use max for increasing metrics like AUC ROC, use min for decreasing metrics like Loss) + :param early_stop_monitor: Metric to observe for early stopping (e.g. val_auc_roc) + :param early_stop_min_delta: Minimum change to observed metric needed to continue training + :param early_stop_patience: Number of epochs to get maximum value of the observed metric from, if that value does not improve over the previous maximum, stop training + :param positive_weight: Weight of positive samples (single number). Can be used to counter imbalance in training data. + :param weighted: Calculate positive weight automatically as num negatives / num positive samples in input training data (y_list). + :param optimizer: Name of Keras optimizer, default 'adam'. + :param learning_rate: Keras learning rate + :param decay: Keras optimizer decay. 
+ :param validation_X_list: List of DataFrames (samples) used to observe validation performance + :param validation_y_list: List of output values for validation samples, one value for each sample where 0 = negative sample (non-BGC), 1 = positive sample (BGC) + :return: self + """ + + import keras + + if not isinstance(X_list, list): + raise AttributeError('Expected X_list to be list, got ' + str(type(X_list))) + + if not isinstance(y_list, list): + raise AttributeError('Expected y_list to be list, got ' + str(type(X_list))) + + if weighted: + if positive_weight: + raise ValueError('Positive weight cannot be specified together with weighted=true.') + num_neg = _count_samples(y_list, 0) + num_pos = _count_samples(y_list, 1) + positive_weight = num_neg / num_pos + print('Negative: {}, Positive: {}'.format(num_neg, num_pos)) + print('Weighing positives based on ratio, weight:', positive_weight) + + input_size = X_list[0].shape[1] + + train_model = self._build_model(input_size, stacked_sizes, fully_connected_sizes=fully_connected_sizes, optimizer_name=optimizer, learning_rate=learning_rate, decay=decay, gpus=gpus) + self.model = self._build_model(input_size, stacked_sizes, fully_connected_sizes=fully_connected_sizes, optimizer_name=optimizer, learning_rate=learning_rate, decay=decay, gpus=gpus, custom_batch_size=1) + + X_train, y_train = X_list, y_list + validation_data, validation_num_batches = None, None + + if validation_X_list: + if positive_weight: + print('Warning: Not using positive_weight "{}" on external validation set!'.format(positive_weight)) + if validation_size: + print('Warning: LSTM validation size {} specified but ignored, ' + 'because external validation set is also present.'.format(validation_size)) + + print('Validating on external validation set of {} samples'.format(len(validation_X_list))) + validation_data = _repeat_to_fill_batch_size(validation_X_list, validation_y_list, self.batch_size, input_size) + validation_num_batches = None + elif validation_size: + print('Validating on {:.1f}% of input set'.format(validation_size*100)) + X_train, X_validation, y_train, y_validation = train_test_split(X_list, y_list, test_size=validation_size) + + get_validation_gen, validation_num_batches = _build_generator( + X_validation, + y_validation, + batch_size=self.batch_size, + timesteps=timesteps, + input_size=input_size, + shuffle=shuffle, + positive_weight=positive_weight + ) + validation_data = get_validation_gen() + + get_train_gen, train_num_batches = _build_generator( + X_train, + y_train, + batch_size=self.batch_size, + timesteps=timesteps, + input_size=input_size, + shuffle=shuffle, + positive_weight=positive_weight, + ) + train_gen = get_train_gen() + + + callbacks = [] + if debug_progress_path: + tb = keras.callbacks.TensorBoard(log_dir=debug_progress_path, histogram_freq=0, batch_size=self.batch_size, + write_graph=True, + write_grads=False, write_images=False, + embeddings_layer_names=None, embeddings_metadata=None) + callbacks.append(tb) + + if early_stop_monitor: + if not early_stop_mode: + raise ValueError('Keras early_stop_mode has to be specified (min, max, auto) to enable early_stop_monitor.') + + callbacks.append(keras.callbacks.EarlyStopping( + min_delta=early_stop_min_delta, + monitor=early_stop_monitor, + patience=early_stop_patience, + mode=early_stop_mode, + verbose=1 + )) + + with _get_device(gpus): + history = train_model.fit_generator( + generator=train_gen, + steps_per_epoch=train_num_batches, + shuffle=False, + epochs=num_epochs, + 
validation_data=validation_data, + validation_steps=validation_num_batches, + callbacks=callbacks, + verbose=verbose + ) + + trained_weights = train_model.get_weights() + self.model.set_weights(trained_weights) + + return history + + def predict(self, X): + """ + Predict given sample DataFrame/numpy matrix of numeric protein vectors + :param X: DataFrame/numpy matrix of protein vectors + :return: BGC prediction score for each protein vector + """ + if len(X.shape) != 2: + raise AttributeError('Can only be called on a single 2-dimensional feature matrix.') + + if self.model is None: + raise AttributeError('Cannot predict using untrained model.') + + batch_matrix = X.reshape(1, X.shape[0], X.shape[1]) + + # TODO do we need to reset here? + self.model.reset_states() + probs = self.model.predict(batch_matrix, batch_size=1) + return probs[0,:,0] + + def save(self, path): + if self.model is None: + raise AttributeError('Cannot save untrained model.') + self.model.save(path) + return self + + @classmethod + def load(cls, path): + import keras + model = keras.models.load_model(path, custom_objects={'precision': precision, 'recall': recall, 'auc_roc': auc_roc}) + return KerasRNN(trained_model=model) + + def __getstate__(self): + """ + Get representation of object that can be pickled + :return: objects to be pickled + """ + attrs = self.__dict__.copy() + del attrs['model'] + + if self.model is None: + return attrs, None, None + return attrs, self.model.to_json(), self.model.get_weights() + + def __setstate__(self, state): + from keras.models import Sequential, model_from_json + """ + Load object from pickled representation + :param state: attributes of model generated by __getstate__ + """ + attrs, architecture, weights = state + + self.__dict__.update(attrs) + + if architecture is None: + self.model = None + else: + self.model: Sequential = model_from_json(architecture) + self.model.set_weights(weights) + +def rotate(l, n): + m = n % len(l) + return l[m:] + l[:m] + +def _noop(): + return None + +def _yield_single_pair(a, b): + yield a, b + +def _repeat_to_fill_batch_size(X_list, y_list, batch_size, input_size): + """ + Fill matrix of batch_size rows with samples from X_list in a way that all samples are (approximately) evenly present. + Create batch_size rows, each row as long as the longest sample in X_list (max_len). + For row on index i, include concatenated sequence of X_list starting from sample i (sequence is trimmed to max_len). + :param X_list: list of samples + :param y_list: list of sample responses + :param batch_size: how many rows to create + :param input_size: number of columns in sample + :return: Filled matrix of batch_size rows with samples from X_list in a way that all samples are (approximately) evenly present. 
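+
+    Illustrative example: with X_list = [A, B, C] and batch_size = 3, the three rows are built from the
+    concatenations A+B+C, B+C+A and C+A+B, each trimmed to the length of the longest sample.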
+ """ + if len(X_list) > batch_size: + raise AttributeError('Cannot repeat more samples than batch_size.') + + max_len = max([X.shape[0] for X in X_list]) + + fill_shape = (batch_size, max_len, input_size) + fill_num_values = fill_shape[0] * fill_shape[1] * fill_shape[2] + print('Filling to batch size shape {} ({}M values)...'.format(fill_shape, int(fill_num_values / 1000000))) + + X_filled = np.zeros(shape=fill_shape) + y_filled = np.zeros(shape=(fill_shape[0], fill_shape[1], 1)) + + for i in range(0, batch_size): + X_filled[i] = np.concatenate(rotate(X_list, i))[:max_len] + y_filled[i][:,0] = np.concatenate(rotate(y_list, i))[:max_len] + + print('Filling done.') + return X_filled, y_filled + + +def _build_generator(X_list, y_list, batch_size, timesteps, input_size, shuffle, positive_weight): + """ + Build looping generator of training batches. Will return the generator and the number of batches in each epoch. + In each epoch, all samples are randomly split into batch_size "chunks", each "chunk" in batch can be trained in parallel. + Samples in each chunk are shuffled and merged into one whole sequence. + The whole sequences are separated into batches of given fixed given number of timesteps (protein vectors). + So the number of batches is defined so that we go over the whole sequence (length of the longest "chunk" sequence divided by the number of timesteps). + + :param X_list: List of samples. Each sample is a matrix/DataFrame of protein domain vectors. + :param y_list: List of sample outputs. + :param batch_size: Number of parallel "chunks" in a training batch + :param timesteps: Number of timesteps (protein domain vectors) in a training batch + :param input_size: Size of the protein domain vector + :param shuffle: Whether to shuffle samples within each epoch. If not used, make sure that positive and negative samples are already shuffled in the list. + :param positive_weight: Weight of positive samples (single number). If provided, a triple of (X_batch, y_batch, weights_batch) are provided + :return: Tuple of (batch generator, number of batches in each epoch). 
+ Each batch will contain the X input (batch_size, timesteps, input_size) and y output (batch_size, timesteps, 1) + """ + if not X_list: + return _noop, None + from keras.preprocessing.sequence import pad_sequences + seq_length = sum([len(X) for X in X_list]) + X_arr = np.array(X_list) + y_arr = np.array(y_list) + num_batches = int(np.ceil(np.ceil(seq_length / batch_size) / timesteps)) + maxlen = num_batches * timesteps + print('Initializing generator of {} batches from sequence length {}'.format(num_batches, seq_length)) + + def generator(): + while True: + # shuffle the samples + if shuffle: + shuffled = np.random.permutation(len(X_list)) + # split samples into batch_size chunks + X_batches = np.array_split(X_arr[shuffled] if shuffle else X_arr, batch_size) + y_batches = np.array_split(y_arr[shuffled] if shuffle else y_arr, batch_size) + + # merge the samples in each chunk into one sequence + X_batches = [np.concatenate(b) if b.size else np.empty(0) for b in X_batches] + y_batches = [np.concatenate(b) if b.size else np.empty(0) for b in y_batches] + + # pad the sequences with zeros to the length of the longest chunk sequence + X_batches = pad_sequences(X_batches, maxlen=maxlen, dtype=np.float, + padding='post', truncating='post') + y_batches = pad_sequences(y_batches, maxlen=maxlen, dtype=np.float, + padding='post', truncating='post') + + # Reshape array so that it can be indexed as [batch number][chunk][timestep][input feature] + # This will produce an array of dimension (num_batches, batch_size, timesteps, input_size) + # And output array of dimension (num_batches, batch_size, timesteps, 1) + X_batches = np.swapaxes(X_batches.reshape(batch_size, num_batches, timesteps, input_size), 0, 1) + y_batches = np.swapaxes(y_batches.reshape(batch_size, num_batches, timesteps, 1), 0, 1) + + # print('Generated {}x{} batches: X {}, y {}'.format(num_batches, self.batch_size, X_batches.shape, y_batches.shape)) + + if positive_weight: + # Provide array of weights for each input vector based on the positive weight + weight_batches = np.ones(y_batches.shape) + weight_batches[y_batches == 1] = positive_weight + weight_batches = np.swapaxes(weight_batches.reshape(batch_size, num_batches, timesteps), 0, 1) + for X_batch, y_batch, weight_batch in zip(X_batches, y_batches, weight_batches): + yield X_batch, y_batch, weight_batch + else: + for X_batch, y_batch in zip(X_batches, y_batches): + yield X_batch, y_batch + + return generator, num_batches + +def _count_samples(y_list, klass): + return np.sum([np.mean(y == klass) for y in y_list]) + +def _split_matrix_into_batches(X, batch_size): + if len(X.shape) != 2: + raise AttributeError('Can only be called on a single 2-dimensional feature matrix.') + return X.reshape(batch_size, X.shape[0], X.shape[1]) + +def _pad_matrix_to_be_divisible(X, divisible_by): + from keras.preprocessing.sequence import pad_sequences + remainder = X.shape[0] % divisible_by + if not remainder: + return X + maxlen = X.shape[0] + divisible_by - remainder + return pad_sequences([X], maxlen=maxlen, dtype=np.float, padding='post', truncating='post')[0] + + +def _get_device(gpus): + if gpus == 0: + return tf.device('/cpu:0') + elif gpus >= 1: + return tf.device('/device:GPU:0') # TODO: can we get just the first GPU? + else: + raise AttributeError('GPUs has to be an integer >= 0') + + +def precision(y_true, y_pred): + """Precision metric. + + Only computes a batch-wise average of precision. 
+ + Computes the precision, a metric for multi-label classification of + how many selected items are relevant. + """ + import keras.backend as K + true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) + predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) + precision = true_positives / (predicted_positives + K.epsilon()) + return precision + + +def recall(y_true, y_pred): + """Recall metric. + + Only computes a batch-wise average of recall. + + Computes the recall, a metric for multi-label classification of + how many relevant items are selected. + """ + import keras.backend as K + true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) + possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) + recall = true_positives / (possible_positives + K.epsilon()) + return recall + + +def auc_roc(y_true, y_pred): + """ + Defines AUC ROC metric callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505 + """ + # any tensorflow metric + value, update_op = tf.metrics.auc(y_true, y_pred) + + # find all variables created for this metric + metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]] + + # Add metric variables to GLOBAL_VARIABLES collection. + # They will be initialized for a new session. + for v in metric_vars: + tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v) + + # force to update metric values + with tf.control_dependencies([update_op]): + value = tf.identity(value) + return value diff --git a/deepbgc/detector.py b/deepbgc/detector.py new file mode 100644 index 0000000..870eb33 --- /dev/null +++ b/deepbgc/detector.py @@ -0,0 +1,181 @@ +import pandas as pd +import hashlib +import numpy as np +from deepbgc.pipeline import DeepBGCModel + +SCORE_COLUMN = 'deepbgc_score' + + +class DeepBGCDetector: + def __init__(self, model): + if isinstance(model, str): + print('Reading model from path: ', model) + model = DeepBGCModel.load(model) + self.model = model + + def predict_domain_bgc_score(self, domains): + """ + Get BGC score for given Domain DataFrame, add it as SCORE_COLUMN column. + :param domains: Domain DataFrame, multiple samples marked by different 'sequence_id' will be predicted separately + :return: Original Domain DataFrame with SCORE_COLUMN column added. 
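+ + Illustrative usage sketch (the model path and Domain DataFrame are placeholders, not defined in this module): + >>> detector = DeepBGCDetector(model='myModel.pkl') + >>> scored = detector.predict_domain_bgc_score(domain_df) + >>> scored[['sequence_id', 'pfam_id', 'deepbgc_score']].head()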
+ """ + + samples = [group for i, group in domains.groupby('sequence_id')] + scores = [] + for sample in samples: + score = sample.copy() + score[SCORE_COLUMN] = self.model.predict(sample) + scores.append(score) + + merged: pd.DataFrame = pd.concat(scores) + return merged + + def detect(self, domains, score_threshold=0.5): + if 'sequence_id' in domains.columns: + sequences = domains.groupby('sequence_id') + print('Detecting in {} sequences: {}'.format(len(sequences), domains['sequence_id'].unique())) + else: + sequences = [(None, domains)] + print('Detecting in single sequence...') + + all_candidates = [] + for sequence_id, sequence_domains in sequences: + scores = self.predict_domain_bgc_score(domains) + candidates = threshold_sequence_candidates(scores, score_threshold, merge_max_protein_gap=0, merge_max_nucl_gap=0) + if sequence_id: + candidates.insert(0, 'sequence_id', sequence_id) + all_candidates.append(candidates) + return pd.concat(all_candidates) + + +def agg_concat(s): + """ + Join given list with semicolons + :param s: list of objects to join + :return: joined string + """ + return ';'.join(s) + + +def average_protein_score(domains, concat_domains=True): + """ + Average scores into a SCORE_COLUMN column by protein using the 'protein_id' and other PROTEIN_GROUP_COLS. + :param domains: DataFrame from the Domain CSV file + :param concat_domains: Whether to include a ';'-concatenated list of pfam_ids for each protein. + :return: DataFrame of proteins with averaged SCORE_COLUMN column + """ + PROTEIN_GROUP_COLS = ['protein_id'] + PROTEIN_EXTRA_COLS = ['in_cluster', 'gene_start', 'gene_end', 'gene_strand'] + + extra_cols = [col for col in PROTEIN_EXTRA_COLS if col in domains.columns] + all_cols = extra_cols + PROTEIN_GROUP_COLS + if concat_domains: + all_cols.append('pfam_id') + copy = domains[all_cols].copy() + copy[SCORE_COLUMN] = domains[SCORE_COLUMN] + per_gene = copy.groupby(all_cols, sort=False) + if concat_domains: + return per_gene.agg({'pfam_id': agg_concat, SCORE_COLUMN: 'mean'}) \ + .rename(columns={'pfam_id': 'pfam_ids'}) \ + .reset_index() + else: + return per_gene.mean().reset_index() + + +def get_candidate(start, end, pfam_ids, protein_ids, protein_scores): + """ + Get single BGC candidate dictionary + :param start: nucleotide coordinate start + :param end: nucleotide coordinate end + :param pfam_ids: list of pfam ids in candidate + :param protein_ids: list of protein ids in candidate + :param protein_scores: list of protein model score outputs + :return: BGC candidate dictionary + """ + return { + 'nucl_start': start, + 'nucl_end': end, + 'num_proteins': len(protein_ids), + 'num_domains': len(pfam_ids), + 'protein_ids': ';'.join(protein_ids), + 'pfam_ids': ';'.join(pfam_ids), + SCORE_COLUMN: np.mean(protein_scores) + } + + +def threshold_sequence_candidates(domain_scores, threshold, merge_max_protein_gap=0, merge_max_nucl_gap=0): + """ + Get a BGC candidate DataFrame for domain scores in a single contig. 
+ Generated by averaging domain scores by protein and then merging consecutive proteins with score satisfying given threshold + :param domain_scores: DataFrame of domains and their SCORE_COLUMN column + :param threshold: Averaged protein score threshold (inclusive) used to include or discard BGC proteins + :param merge_max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them + :param merge_max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them + :return: DataFrame of BGC candidates + """ + + protein_scores = average_protein_score(domain_scores, concat_domains=True) + + candidates = [] + candidate_start = None + candidate_end = None + candidate_domains = [] + candidate_proteins = [] + candidate_scores = [] + gap_domains = [] + gap_proteins = [] + gap_scores = [] + for i, protein in protein_scores.iterrows(): + score = protein[SCORE_COLUMN] + # Inactive protein, add to gap + if score < threshold: + gap_proteins.append(protein['protein_id']) + gap_domains += protein['pfam_ids'].split(';') + gap_scores.append(score) + # We just changed from active to inactive, add previous region as candidate + if candidate_start is not None: + candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_scores)) + candidate_start = None + candidate_end = None + candidate_domains = [] + candidate_proteins = [] + candidate_scores = [] + # Active protein + else: + if not candidate_start: + candidate_start = protein['gene_start'] + if candidates: + # Check if we should merge with the previous candidate + prev_start, prev_end, prev_domains, prev_proteins, prev_scores = candidates[-1] + if len(gap_proteins) <= merge_max_protein_gap or (candidate_start - prev_end) <= merge_max_nucl_gap: + # Remove previous candidate and continue where it started + candidates = candidates[:-1] + candidate_start = prev_start + candidate_domains = prev_domains + gap_domains + candidate_proteins = prev_proteins + gap_proteins + candidate_scores = prev_scores + gap_scores + + candidate_end = protein['gene_end'] + candidate_proteins.append(protein['protein_id']) + candidate_domains += protein['pfam_ids'].split(';') + candidate_scores.append(score) + gap_domains = [] + gap_proteins = [] + gap_scores = [] + + # Last protein was active, add previous region as candidate + if candidate_start is not None: + candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_scores)) + + cands = pd.DataFrame([get_candidate(*args) for args in candidates]) + if cands.empty: + return cands + + cands['nucl_start'] = cands['nucl_start'].astype('int64') + cands['nucl_end'] = cands['nucl_end'].astype('int64') + cands['nucl_length'] = cands['nucl_end'] - cands['nucl_start'] + 1 + cands['candidate_hash'] = cands['pfam_ids'].apply( + lambda pfam_ids: hashlib.md5(pfam_ids.encode('utf-8')).hexdigest()) + cands = cands[['candidate_hash', SCORE_COLUMN, 'nucl_length', 'nucl_start', 'nucl_end', + 'num_domains', 'num_proteins', 'protein_ids', 'pfam_ids']] + return cands diff --git a/deepbgc/features.py b/deepbgc/features.py new file mode 100644 index 0000000..8dd6d45 --- /dev/null +++ b/deepbgc/features.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +# David Prihoda +# Feature transformers that turn Domain DataFrames into protein feature vector matrices + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +import pandas as pd +import sys + + +class ListTransformer(BaseEstimator, 
TransformerMixin): + """ + Wrapper for other transformers, will transform each DataFrame in a list by each transformer and merge the results. + """ + def __init__(self, transformers): + self.transformers = transformers + + def transform(self, X, y=None): + if X is None: + return None + if not self.transformers: + return X + if isinstance(X, list): + return [self.transform(X[i], y[i] if y else None) for i in range(0, len(X))] + if not isinstance(X, pd.DataFrame): + raise AttributeError('X has to be a pd.DataFrame or list, got '+str(type(X))) + return np.concatenate([t.transform(X, y) for t in self.transformers], axis=1) + + def fit(self, X_list, y_list=None): + if X_list is None: + return self + if not isinstance(X_list, list): + raise AttributeError('X_list has to be a list, got'+str(type(X_list))) + if X_list: + for t in self.transformers: + t.fit(pd.concat(X_list), pd.concat(y_list)) + return self + + @classmethod + def from_config(cls, transformer_configs): + transformers = [] + for params in transformer_configs: + classname = params.get('type') + transformer = getattr(sys.modules[__name__], classname) + trans_args = {k: v for k, v in params.items() if k != 'type'} + transformers.append(transformer(**trans_args)) + return ListTransformer(transformers) + + +class Pfam2VecTransformer(BaseEstimator, TransformerMixin): + """ + Get pfam2vec matrix for a Domain DataFrame + """ + def __init__(self, vector_path): + self.vector_path = vector_path + if vector_path.endswith('.csv'): + self.vectors = pd.read_csv(vector_path).set_index('pfam_id') + elif vector_path.endswith('.pkl') or vector_path.endswith('.pickle'): + self.vectors = pd.read_pickle(vector_path) + elif vector_path.endswith('.bin'): + import word2vec + model = word2vec.load(vector_path, kind='bin') + self.vectors = pd.DataFrame(model.vectors, index=model.vocab) + else: + raise ValueError("File type {} not supported for Pfam2Vec, use .csv, .pkl, .pickle or .bin".format(vector_path)) + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return self.vectors.reindex(index=X['pfam_id']).fillna(0) + + def fit(self, X, y=None): + return self + + +class RandomVecTransformer(BaseEstimator, TransformerMixin): + """ + Get random vector matrix for a Domain DataFrame. Each unique pfam_id will have the same random vector throughout the sequence. + """ + + def __init__(self, dimensions=100): + self.dimensions = dimensions + self.zero_vector = np.zeros(self.dimensions) + self.vectors = {} + self.random = np.random.RandomState(seed=0) + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return np.array([self.vectors.get(pfam_id, self.zero_vector) for pfam_id in X['pfam_id']]) + #print(X.iloc[0]['pfam_id'], vectors[0]) + #return vectors + + def fit(self, X, y=None): + for pfam_id in X['pfam_id'].unique(): + if pfam_id not in self.vectors: + self.vectors[pfam_id] = self.random.rand(self.dimensions) + return self + + +class EmissionProbabilityTransformer(BaseEstimator, TransformerMixin): + """ + Get emission probability feature column for given Domain DataFrame. Based on HMM emissions. 
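+ + Illustrative example with made-up counts: if 'PF00001' is observed 30 times in negative samples out of 300 negative domain observations in total, + and 5 times in positive samples out of 50 positive observations, its fitted feature vector is [30/300, 5/50] = [0.1, 0.1]. + Pfam IDs not seen during fit() are transformed to zero vectors.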
+ """ + def __init__(self): + self.emissions = None + + def fit(self, X, y=None): + unique_y = set(y) + if unique_y != {0, 1}: + raise AttributeError('Invalid target values, expected {0, 1} got ' + str(unique_y)) + counts = pd.DataFrame(data={}, index=X['pfam_id'].unique()) + counts['neg'] = X[y == 0]['pfam_id'].value_counts() + counts['pos'] = X[y == 1]['pfam_id'].value_counts() + counts = counts.fillna(0) + # Divide each state's emission counts by the total number of observations to get emission frequency + self.emissions = counts / counts.sum(axis=0) + return self + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + vectors = self.emissions.reindex(index=X['pfam_id'], fill_value=0) + return vectors + + +class PositiveProbabilityTransformer(BaseEstimator, TransformerMixin): + """ + Get "positive probability" feature columns for given Domain DataFrame. + Each pfam_id will get two columns: Positive probability and Total probability + Positive probability = probability of being in positive state while seeing given pfam, + which is equivalent to number of occurences of given pfam in the positive state divided by number of occurences in all states + Total probability = probability of seeing given pfam in general, + which is equivalent to number of occurences divided by total length of input sequence + """ + def __init__(self): + self.probs = None + + def fit(self, X, y=None): + vals = pd.DataFrame({'pfam_id': X['pfam_id'], 'y': y}) + negweight = sum(y) / sum(y == 0) + total_num_weighted = sum(y) + sum(y == 0) * negweight + probs = {} + for pfam_id, pfam_y in vals.groupby('pfam_id')['y']: + num_pos = sum(pfam_y) + num_neg = sum(pfam_y == 0) + num_weighted = num_pos + num_neg * negweight + prob = num_pos / num_weighted + prob = (prob - 0.5) * 2 + pfam_frac = num_weighted / total_num_weighted + probs[pfam_id] = [prob, pfam_frac] + self.probs = pd.DataFrame(probs).transpose() + return self + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + vectors = self.probs.reindex(index=X['pfam_id'], fill_value=0) + return vectors + + +class OneHotEncodingTransformer(BaseEstimator, TransformerMixin): + """ + Create a binary one-hot-encoding vector from Domain CSV files. + """ + def __init__(self): + self.pfam_ids = [] + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return pd.get_dummies(X['pfam_id']).reindex(columns=self.pfam_ids, fill_value=0) + + def fit(self, X, y=None): + self.pfam_ids = np.union1d(self.pfam_ids, X['pfam_id']) + return self + + +class ProteinBorderTransformer(BaseEstimator, TransformerMixin): + """ + Get gene beginning / gene end binary flags from Domain CSV files. + """ + + def __init__(self, field='protein_id'): + self.field = field + + def transform(self, X, y=None): + # current != next (exclude last element because we don't have a next value) + borders = list(X[self.field][:-1].values != X[self.field][1:].values) + gene_ends = np.array(borders + [True]).reshape(-1, 1) + gene_beginnings = np.array([True] + borders).reshape(-1, 1) + return np.concatenate([gene_ends, gene_beginnings], axis=1).astype(np.uint8) + + def fit(self, X, y=None): + return self + + +class GeneDistanceTransformer(BaseEstimator, TransformerMixin): + """ + Returns vector specifying nucleotide distance from the end of previous gene to the start of current gene for first domain in each protein. + Distance between domains in same protein is 0. Distance in first domain of given sample is equal to its gene_start. + + TODO: Warning! 
Do not use the distance transformer for merged samples - the distance would be invalid on sample borders. + """ + + def __init__(self, norm_distance): + print('TODO: Warning! Do not use the distance transformer for merged samples - the distance would be invalid on sample borders.') + self.norm_distance = norm_distance + + def transform(self, X, y=None): + gene_starts = X['gene_start'].values + gene_ends = X['gene_end'].values + previous_gene_ends = np.concatenate([[0], gene_ends[:-1]]) + distances = (gene_starts - previous_gene_ends) / self.norm_distance + # replace negative values with zeros + distances *= distances >= 0 + return distances.astype(np.float32).reshape(-1, 1) + + def fit(self, X, y=None): + return self + +class ColumnSelectTransformer(BaseEstimator, TransformerMixin): + """ + Select given columns of input DataFrame + """ + def __init__(self, columns): + self.columns = columns + + def transform(self, X, y=None): + # Select columns by label (DataFrame.select expects a callable, not a list of labels) + return X[self.columns].values + + def fit(self, X, y=None): + return self \ No newline at end of file diff --git a/deepbgc/main.py b/deepbgc/main.py new file mode 100755 index 0000000..ea560c6 --- /dev/null +++ b/deepbgc/main.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +import argparse +from deepbgc.commands.detect import DetectCommand +from deepbgc.commands.pfam import PfamCommand +from deepbgc.commands.classify import ClassifyCommand +import sys + +COMMANDS = [ + PfamCommand, + DetectCommand, + ClassifyCommand +] + +def _fix_subparsers(subparsers): + if sys.version_info[0] == 3: + subparsers.required = True + subparsers.dest = 'cmd' + + +class DeepBGCParser(argparse.ArgumentParser): + def error(self, message): + self.print_help() + self.exit(2, "{}\n".format(message)) + + +def main(argv=None): + parser = DeepBGCParser(prog='deepbgc', + description='DeepBGC - Biosynthetic Gene Cluster detection and classification.', + formatter_class=argparse.RawTextHelpFormatter) + + # Sub commands + subparsers = parser.add_subparsers( + title='Available Commands', + metavar='COMMAND', + dest='cmd', + help='Use: deepbgc COMMAND --help for command-specific help.') + + _fix_subparsers(subparsers) + + for CommandClass in COMMANDS: + CommandClass.add_subparser(subparsers) + + args = parser.parse_args(argv) + + # Initialize command object + cmd = args.func(args) + # Run command + cmd.run() + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/deepbgc/pipeline.py b/deepbgc/pipeline.py new file mode 100644 index 0000000..9b093c1 --- /dev/null +++ b/deepbgc/pipeline.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# David Prihoda +# Wrapper for a BGC detection model that handles feature transformation and loading model definitions from JSON + +from deepbgc import features +import pickle +import json +from sklearn.base import BaseEstimator, ClassifierMixin +from pprint import pprint + + +class DeepBGCModel(BaseEstimator, ClassifierMixin): + """ + Wrapper for a BGC detection model that handles feature transformation and loading model definitions from JSON + """ + def __init__(self, transformer: features.ListTransformer, model, fit_params: dict, color=None, label=None): + """ + + :param transformer: ListTransformer used to transform Domain DataFrames into feature matrices + :param model: New instance of a BGC detection model + :param fit_params: Params to pass to the fit function of given model + :param color: Model color stored for plotting purposes + :param label: Model label stored for plotting purposes + """ + self.transformer = transformer + self.model = model + 
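# The transformer converts Domain DataFrames into feature matrices for the underlying model; + # fit_params are stored here and merged with any extra keyword arguments passed to fit() below. +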
self.fit_params = fit_params + self.color = color + self.label = label + + def fit(self, samples, y, validation_samples=None, validation_y=None, **extra_fit_params): + """ + Train model with given list of samples, observe performance on given validation samples. + Domain DataFrames are converted to feature matrices using the pipeline's feature transformer. + :param samples: List of Domain DataFrames, each DataFrame contains one BGC or non-BGC sample's sequence of protein domains. + :param y: List of output values, one value for each sequence + :param validation_samples: List of validation samples + :param validation_y: List of validation sample outputs + :param extra_fit_params: Extra fitting parameters to pass to the fit function of given model + :return: self + """ + if validation_y is None: + validation_y = [] + if validation_samples is None: + validation_samples = [] + + self.transformer.fit(samples, y) + + train_X_list = self.transformer.transform(samples, y) + validation_X_list = self.transformer.transform(validation_samples, validation_y) + + merged_params = self.fit_params.copy() + merged_params.update(extra_fit_params) + return self.model.fit(train_X_list, y, validation_X_list=validation_X_list, validation_y_list=validation_y, **merged_params) + + def predict(self, sample): + X_list = self.transformer.transform(sample) + return self.model.predict(X_list) + + @classmethod + def from_config(cls, config, meta_only=False) -> 'DeepBGCModel': + """ + Load model configuration from a JSON config + :param config: Path to JSON config or loaded config dict + :param meta_only: Do not create feature transformers + :return: Untrained pipeline based on given config + """ + if isinstance(config, str): + with open(config) as f: + config = json.loads(f.read()) + elif isinstance(config, dict): + pass + else: + raise AttributeError('Invalid config type "{}": {}'.format(type(config), config)) + + print('Loaded model:') + pprint(config) + + color = config.get('color', 'grey') + label = config.get('label') + build_params = config.get('build_params', {}) + fit_params = config.get('fit_params', {}) + input_params = config.get('input_params', {}) + + # Get class from "models" module. Don't forget to import the class in models.__init__ first! 
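+ # NOTE: no 'models' module is added in this diff and none is imported above - the detector model + # classes are assumed to come from a 'models' package, e.g. (hypothetical import sketch): + # from deepbgc import models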
+ clf_class = getattr(models, config.get('type')) + + # Create a new model instance + model = clf_class(**build_params) + + if meta_only: + transformer = None + else: + feature_params = input_params.get('features', []) + transformer = features.ListTransformer.from_config(feature_params) + + return DeepBGCModel(transformer=transformer, model=model, fit_params=fit_params, color=color, label=label) + + def save(self, path) -> 'DeepBGCModel': + with open(path, 'wb') as f: + pickle.dump(self, f) + return self + + @classmethod + def load(cls, path) -> 'DeepBGCModel': + with open(path, 'rb') as f: + return pickle.load(f) + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a1d65ac --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +from setuptools import setup, find_packages +from deepbgc import VERSION + +install_requires = [ + 'argparse', + 'biopython', + 'scikit-learn', + 'pandas', + 'keras', + 'tensorflow', + 'hmmlearn', + 'matplotlib' +] + +setup( + name='deepbgc', + version=VERSION, + description='DeepBGC - Biosynthetic Gene Cluster detection and classification', + long_description=open('README.md', 'r').read(), + author='David Příhoda, Geoffrey Hannigan', + packages=find_packages(), + author_email='david.prihoda1@merck.com', + license='MIT', + install_requires=install_requires, + keywords='biosynthetic gene clusters, bgc detection, deep learning, pfam2vec', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Programming Language :: Python :: 3', + ], + include_package_data=True, + url='http://www.merck.com', + entry_points={ + 'console_scripts': ['deepbgc = deepbgc.main:main'] + } +)