diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d942898 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea +*.pyc +__pycache__ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cbcf338 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright © 2018 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., Inc., Kenilworth, NJ, USA." + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/LICENSES_THIRD_PARTY b/LICENSES_THIRD_PARTY new file mode 100644 index 0000000..f7a0b3c --- /dev/null +++ b/LICENSES_THIRD_PARTY @@ -0,0 +1,27 @@ +-------------------------------------------------- +Third party dependencies listed by License type +[Format: name (Python module) - URL] +-------------------------------------------------- + +Biopython License Agreement +* Biopython (biopython) - https://github.com/biopython/biopython/blob/d5718bb7e3ee505b859b39c03f8ffad8a9a0be2f/LICENSE.rst + +OSI Approved (new BSD) +* scikit-learn (scikit-learn) - https://github.com/scikit-learn/scikit-learn/blob/2e85c8608c93ad0e3290414c4e5e650b87d44b27/COPYING +* hmmlearn (hmmlearn) - https://github.com/hmmlearn/hmmlearn/blob/1f60373d28c427a2a05c9ea26231c717772066dc/LICENSE.txt + +BSD 3-Clause License +* Pandas (pandas) - https://github.com/pandas-dev/pandas/blob/5aba6659e422e985683cfb46c07c3364a02b6e5b/AUTHORS.md +* HMMER - https://github.com/EddyRivasLab/hmmer/blob/3e38d667761e0a98a263079cb4a90e49d4b720d5/LICENSE + +MIT License (MIT) +* Keras (keras) - https://github.com/keras-team/keras/blob/dc698c5486117780b643eda0a2f60a8753625b8a/LICENSE + +Apache Software License (Apache 2.0) +* TensorFlow (tensorflow) - https://github.com/tensorflow/tensorflow/blob/6b6d843ccab78f9f91c3b98a43ca09ffecad4747/LICENSE + +Python Software Foundation License (BSD) +* Matplotlib (matplotlib) - https://matplotlib.org/users/license.html + +GNU General Public License v3.0 +* Prodigal - https://github.com/hyattpd/Prodigal/blob/b1321f0899c4d7a835583feb344e2c9a5bd908d1/LICENSE diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f171c0 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# DeepBGC: Biosynthetic Gene Cluster detection and classification. + +## Install DeepBGC + +- Run `pip install deepbgc` to install the `deepbgc` python module. 
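+
+Once installed, a typical run chains the `deepbgc` subcommands as sketched below (all file and model names here are placeholders; see the Detection and Classification sections for details):
+
+```bash
+# Optional: convert a genomic sequence into a Pfam domain CSV ("detect" also does this automatically when --pfam is given)
+deepbgc pfam --pfam Pfam-A.hmm myInputSequence.fa myInputSequence.pfam.csv
+
+# Detect BGC candidates in the sequence
+deepbgc detect --model DeepBGCDetector_v0.0.1.pkl --pfam Pfam-A.hmm --output myCandidates/ myInputSequence.fa
+
+# Classify the detected candidates into biosynthetic classes
+deepbgc classify --model RandomForestMIBiGClasses_v0.0.1.pkl --output myCandidates/myCandidates.classes.csv myCandidates/myCandidates.candidates.csv
+```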
+ +## Prerequisities + +- Install Python 3.6 (version 3.7 is not supported by TensorFlow yet) +- Install Prodigal and put the `prodigal` binary it on your PATH: https://github.com/hyattpd/Prodigal/releases +- Install HMMER and put the `hmmscan` and `hmmpress` binaries on your PATH: http://hmmer.org/download.html +- Download and **extract** Pfam database from: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam31.0/Pfam-A.hmm.gz + +## Use DeepBGC + +### Detection + +Detect BGCs in a genomic sequence. + +```bash +# Show detection help +deepbgc detect --help + +# Detect BGCs in a nucleotide sequence +deepbgc detect --model DeepBGCDetector_v0.0.1.pkl --pfam Pfam-A.hmm --output myCandidates/ myInputSequence.fa + +# Detect BGCs with >0.9 score in existing Pfam CSV sequence +deepbgc detect --model myModel.pkl --output myStrictCandidates/ -s 0.9 myCandidates/myCandidates.pfam.csv + +``` + +### Classification + +Classify BGCs into one or more classes. + +```bash +# Show classification help +deepbgc classify --help + +# Predict biosynthetic class of detected BGCs +deepbgc classify --model RandomForestMIBiGClasses_v0.0.1.pkl --output myCandidates/myCandidates.classes.csv myCandidates/myCandidates.candidates.csv + +``` diff --git a/deepbgc/__init__.py b/deepbgc/__init__.py new file mode 100644 index 0000000..b911eeb --- /dev/null +++ b/deepbgc/__init__.py @@ -0,0 +1,3 @@ +VERSION = '0.0.1' + +from .pipeline import DeepBGCModel \ No newline at end of file diff --git a/deepbgc/commands/__init__.py b/deepbgc/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/deepbgc/commands/base.py b/deepbgc/commands/base.py new file mode 100644 index 0000000..649f9b3 --- /dev/null +++ b/deepbgc/commands/base.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod +import argparse + + +class BaseCommand(ABC): + """ + Base abstract class for commands + """ + command = '' + help = "" + + def __init__(self, args): + self.args = args + + @classmethod + def add_subparser(cls, subparsers): + parser = subparsers.add_parser(cls.command, description=cls.help, help=cls.help, + formatter_class=argparse.RawTextHelpFormatter) + parser.set_defaults(func=cls) + return parser + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/deepbgc/commands/classify.py b/deepbgc/commands/classify.py new file mode 100644 index 0000000..c2656ad --- /dev/null +++ b/deepbgc/commands/classify.py @@ -0,0 +1,87 @@ +import pandas as pd +from deepbgc.commands.base import BaseCommand +import os +import pickle +import numpy as np + +SCORE_COLUMN = 'deepbgc_score' + +class ClassifyCommand(BaseCommand): + command = 'classify' + help = """Classify BGCs into one or more classes. 
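+
+The input is the candidate CSV produced by "deepbgc detect" (it must contain the "candidate_hash" and "pfam_ids" columns).
+The output is a CSV with one score column per class and a "classes" column listing all classes scoring at least 0.5.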
+ +Examples: + + deepbgc classify --model myClassifier.pkl --output classes.csv inputSequence.fa + """ + + def __init__(self, args): + super().__init__(args) + self.output_path = args.output + self.input_path = args.input + self.model_path = args.model + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + parser.add_argument('-o', '--output', required=True, help="Output CSV file path.") + parser.add_argument('-m', '--model', required=True, help="Trained classification model file path.") + parser.add_argument(dest='input', help="Input candidate CSV file path.") + + def run(self): + candidates = pd.read_csv(self.input_path) + if 'candidate_hash' not in candidates.columns: + raise AttributeError('Input CSV is not a candidate CSV file, "candidate_hash" column should be present.') + + candidates = candidates.set_index('candidate_hash') + + with open(self.model_path, 'rb') as f: + model = pickle.load(f) + + vectors = domain_set_vectors(candidates) + + predictions = predict_classes(vectors, model) + predictions.to_csv(self.output_path, index=False) + print('Saved {} predictions to {}'.format(len(predictions), self.output_path)) + + +def domain_set_vectors(candidates): + candidate_pfam_ids = [pfam_ids.split(';') for pfam_ids in candidates['pfam_ids']] + unique_pfam_ids = sorted(list(set([p for ids in candidate_pfam_ids for p in ids]))) + print('Getting domain set vectors for {} candidates with {} unique Pfam IDs...'.format(len(candidates), len(unique_pfam_ids))) + vectors = pd.DataFrame(np.zeros((len(candidates), len(unique_pfam_ids))), columns=unique_pfam_ids) + for i, pfam_ids in enumerate(candidate_pfam_ids): + vectors.iloc[i][pfam_ids] = 1 + return vectors + + +def predict_classes(samples, model, add_classes_list=True): + # Set missing columns to 0 + if not hasattr(model, 'input_columns'): + raise AttributeError('Trained model does not contain the "input_columns" attribute.') + if not hasattr(model, 'label_columns'): + raise AttributeError('Trained model does not contain the "label_columns" attribute.') + + missing_columns = set(model.input_columns).difference(samples.columns) + for col in missing_columns: + samples[col] = 0 + #print('Missing columns:\n{}'.format(sorted(list(missing_columns)))) + print('Warning: Setting {} missing columns to 0'.format(len(missing_columns))) + samples = samples[model.input_columns] + + results = np.array([r[:,1] for r in model.predict_proba(samples.values)]).transpose() + predictions = pd.DataFrame(results, index=samples.index, columns=model.label_columns) + if add_classes_list: + predictions['classes'] = [';'.join(model.label_columns[x >= 0.5]) for x in results] + + return predictions + +def sequence_id_from_filename(path): + """ + Create a basic sequence_id from a file name without extension + :param path: Path of file + :return: file name without extension that can be used as sequence_id + """ + return os.path.splitext(os.path.basename(path))[0] + diff --git a/deepbgc/commands/detect.py b/deepbgc/commands/detect.py new file mode 100644 index 0000000..b054b94 --- /dev/null +++ b/deepbgc/commands/detect.py @@ -0,0 +1,74 @@ +import pandas as pd +from deepbgc.commands.base import BaseCommand +from deepbgc.converter import SequenceToPfamCSVConverter +import os +from deepbgc.detector import DeepBGCDetector + +SCORE_COLUMN = 'deepbgc_score' + +class DetectCommand(BaseCommand): + command = 'detect' + help = """Detect BGCs in a genomic sequence. 
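+
+The input is a nucleotide FASTA/GenBank file, or a Pfam CSV file produced by "deepbgc pfam".
+Results are written into the output folder and named after it, e.g. myCandidates/myCandidates.pfam.csv
+(detected Pfam domains, when starting from a sequence) and myCandidates/myCandidates.candidates.csv (detected BGC candidates).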
+ +Examples: + + # Detect BGCs in FASTA sequence with default settings + deepbgc detect --model myModel.pkl --output myDetections/ --pfam Pfam-A.hmm inputSequence.fa + + # Detect BGCs with >0.9 score in existing Pfam CSV sequence + deepbgc detect --model myModel.pkl --output myStrictDetections/ -s 0.9 myDetections/myDetections.pfam.csv + """ + + def __init__(self, args): + super().__init__(args) + self.output_path = args.output + self.output_basename = os.path.basename(self.output_path) + self.input_path = args.input + self.model_path = args.model + self.score_threshold = args.score + self.converter = SequenceToPfamCSVConverter(db_path=args.pfam) + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + parser.add_argument('-o', '--output', required=True, help="Output folder path.") + parser.add_argument('-m', '--model', required=True, help="Trained detection model file path.") + parser.add_argument('-p', '--pfam', required=False, help="Pfam DB (Pfam-A.hmm) file path.") + parser.add_argument('-s', '--score', default=0.5, type=float, help="Average protein-wise DeepBGC score threshold for extracting BGC regions from domain sequences.") + parser.add_argument(dest='input', help="Input pfam CSV file path.") + + def _outpath(self, suffix, extension): + return os.path.join(self.output_path, '{}.{}.{}'.format(self.output_basename, suffix, extension)) + + def run(self): + try: + os.makedirs(self.output_path, exist_ok=True) + except FileExistsError: + raise AttributeError("Output directory already exists: {}".format(self.output_path)) + except Exception as e: + raise AttributeError("Output directory not writable: {}".format(self.output_path), e) + + domain_path = self._outpath('pfam', 'csv') + if not self.converter.convert(self.input_path, domain_path): + # Input was already a pfam CSV file, use original path + domain_path = self.input_path + + domains = pd.read_csv(domain_path) + detector = DeepBGCDetector(model=self.model_path) + + candidates = detector.detect(domains, score_threshold=self.score_threshold) + + cand_path = self._outpath('candidates', 'csv') + candidates.to_csv(cand_path, index=False) + print('Saved {} detected BGCs to {}'.format(len(candidates), cand_path)) + + +def sequence_id_from_filename(path): + """ + Create a basic sequence_id from a file name without extension + :param path: Path of file + :return: file name without extension that can be used as sequence_id + """ + return os.path.splitext(os.path.basename(path))[0] + diff --git a/deepbgc/commands/pfam.py b/deepbgc/commands/pfam.py new file mode 100644 index 0000000..9855099 --- /dev/null +++ b/deepbgc/commands/pfam.py @@ -0,0 +1,37 @@ +from deepbgc.commands.base import BaseCommand +from deepbgc.converter import SequenceToPfamCSVConverter + + +class PfamCommand(BaseCommand): + command = 'pfam' + help = """Convert genomic BGCs sequence into a pfam domain CSV file by detecting proteins and pfam domains. 
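+
+Genes are predicted using Prodigal and Pfam domains are detected using hmmscan against the given Pfam-A.hmm database.
+Domain hits with e-value above 0.01 are discarded. The output CSV contains one row per detected domain
+with the sequence ID, protein ID, gene coordinates and strand, and the Pfam ID.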
+ +Examples: + + # Detect proteins and pfam domains in a FASTA sequence and save the result as csv file + deepbgc pfam --pfam Pfam-A.hmm inputSequence.fa outputPfamSequence.csv + """ + + def __init__(self, args): + super().__init__(args) + self.input_path = args.input + self.output_path = args.output + self.converter = SequenceToPfamCSVConverter(db_path=args.pfam) + + @classmethod + def add_subparser(cls, subparsers): + parser = super().add_subparser(subparsers) + + # parser.add_argument('--mode', default='auto', choices=['auto', 'nucl', 'prot', 'pfam'], + # help="Input modes: \n" + # "--mode auto: Automatic based on file extension.\n" + # "--mode nucl: Nucleotide sequence without annotated genes. Will detect genes and pfam domains. \n" + # "--mode prot: Protein sequence. Will detect pfam domains.)") + parser.add_argument('-p', '--pfam', required=True, help="Pfam DB (Pfam-A.hmm) file path.") + parser.add_argument(dest='input', help="Input sequence file path.") + parser.add_argument(dest='output', help="Output pfam CSV file path.") + + def run(self): + self.converter.convert(self.input_path, self.output_path) + print() + print('Saved Pfam CSV to: {}'.format(self.output_path)) diff --git a/deepbgc/converter.py b/deepbgc/converter.py new file mode 100644 index 0000000..b527c45 --- /dev/null +++ b/deepbgc/converter.py @@ -0,0 +1,213 @@ +import shutil +import subprocess +import os +import numpy as np +import pandas as pd +import tempfile + +MAX_EVALUE = 0.01 + +class SequenceToPfamCSVConverter: + def __init__(self, db_path): + self.db_path = db_path + + def convert(self, input_path, output_path): + + format = guess_format(input_path) + + if not format: + raise NotImplementedError("Filetype not recognized: {}".format(input_path)) + elif format == 'csv': + # Input is already a CSV file. 
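+            # Nothing to convert - returning False tells the caller to keep using the original CSV path.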
+ return False + + if not self.db_path: + raise AttributeError('Pfam DB path not specified.') + + from Bio import SeqIO + with tempfile.TemporaryDirectory() as tempdir: + sequences = SeqIO.parse(input_path, format) + if not sequences: + raise AttributeError("No sequences in {} file: {}".format(format, input_path)) + + with open(output_path, 'w') as outfile: + for i, sequence in enumerate(sequences): + print("="*80) + print('Processing sequence: {}'.format(sequence.id)) + print("="*80) + print() + + #from Bio.Alphabet import NucleotideAlphabet + #if not isinstance(sequence.seq.alphabet, NucleotideAlphabet): + # raise AttributeError("Unsupported alphabet: {}".format(sequence.seq.alphabet)) + + nucl_path = os.path.join(tempdir, 'nucl.fa') + SeqIO.write(sequence, nucl_path, 'fasta') + + protein_path = os.path.join(tempdir, 'proteins.fa') + save_protein_sequence(nucl_path, protein_path) + + domtbl_path = os.path.join(tempdir, 'pfam.tbl') + detect_pfam_domains(protein_path, self.db_path, domtbl_path) + + gene_locations = get_prodigal_gene_locations(protein_path) + + domains = domtbl_to_df(domtbl_path, gene_locations=gene_locations) + domains.insert(0, 'sequence_id', sequence.id) + domains.to_csv(outfile, index=False, header=(i == 0)) + return True + +def guess_format(file_path): + _, ext = os.path.splitext(file_path) + if ext in ['.fa', '.fasta']: + return 'fasta' + elif ext in ['.gbk', '.gb', '.genbank']: + return 'genbank' + elif ext in ['.csv']: + return 'csv' + return None + +def get_prodigal_gene_locations(protein_path): + from Bio import SeqIO + proteins = SeqIO.parse(protein_path, 'fasta') + locations = [] + for protein in proteins: + splits = protein.description.split('#') + try: + locations.append({ + 'protein_id': protein.id, + 'start': int(splits[1]), + 'end': int(splits[2]), + 'strand': int(splits[3]) + }) + except Exception as e: + raise AttributeError('Invalid Prodigal gene description: "{}"'.format(protein.description), e) + return pd.DataFrame(locations).set_index('protein_id') + + +def save_protein_sequence(input_path, protein_path): + if not shutil.which('prodigal'): + raise Exception("Prodigal needs to be installed and available on PATH in order to detect genes.") + + print('Detecting genes using Prodigal...') + + FNULL = open(os.devnull, 'w') + subprocess.call(['prodigal', '-i', input_path, '-a', protein_path], stdout=FNULL, stderr=FNULL) + + if not os.path.exists(protein_path): + # TODO improve message + raise Exception("Unexpected error detecting genes using Prodigal") + + +def detect_pfam_domains(protein_path, db_path, domtbl_path): + if not shutil.which('hmmscan') or not shutil.which('hmmpress'): + raise Exception( + "HMMscan and HMMpress needs to be installed and available on PATH in order to detect pfam domains.") + + pressed_db_path = db_path + '.h3m' + if not os.path.exists(pressed_db_path): + print('Pressing pfam DB...') + subprocess.call(['hmmpress', db_path]) + + if not os.path.exists(pressed_db_path): + # TODO improve message + raise Exception("Unexpected error running HMMpress on Pfam DB") + + print('Detecting pfam domains using HMMscan, this might take a while...') + FNULL = open(os.devnull, 'w') + subprocess.call(['hmmscan', '--domtblout', domtbl_path, db_path, protein_path], stdout=FNULL) + + if not os.path.exists(domtbl_path): + # TODO improve message + raise Exception("Unexpected error detecting protein domains using HMMscan") + + +SUPPORTED_FORMATS = [ + 'proteins2fasta' +] + + +def domtbl_to_df(domtbl_path, format=None, gene_locations=None): + """ + Pfam 
hmmscan tabular format into internal Domain DataFrame format, one protein domain per line + :param domtbl_path: Path to HMMscan tabular format result file + :param format: Format of the protein fasta file that was passed into HMMscan (supported: proteins2fasta or None). + If it was generated by proteins2fasta, we can extract other values from the sequence ID. + :param gene_locations: DataFrame of (start, end, strand) indexed by protein ID + :return: Domain DataFrame, one protein domain per line + """ + from Bio import SearchIO + # Read domain matches in all proteins + queries = SearchIO.parse(domtbl_path, 'hmmscan3-domtab') + + # Extract all matched domain hits + domains = [] + for query in queries: + query_domains = [] + for hit in query.hits: + best_index = np.argmin([hsp.evalue for hsp in hit.hsps]) + best_hsp = hit.hsps[best_index] + pfam_id = hit.accession.split('.')[0] + evalue = float(best_hsp.evalue) + if evalue > MAX_EVALUE: + continue + query_domains.append({ + 'pfam_id': pfam_id, + 'query_id': query.id, + 'domain_start': int(best_hsp.query_start), + 'domain_end': int(best_hsp.query_end) + }) + domains += sorted(query_domains, key=lambda x: x['domain_start']) + + domains = pd.DataFrame(domains) + + num_domains = len(domains) + print('Detected {} Pfam domain hits'.format(num_domains)) + + fields = ['pfam_id', 'domain_start', 'domain_end'] + + # Use sequence id generated by proteins2fasta.py to get our gene info directly. + if format == 'proteins2fasta': + domains['sequence_id'] = domains['query_id'].apply(lambda s: s.split('|')[0]) + domains['locus_tag'] = domains['query_id'].apply(lambda s: s.split('|')[1]) + domains['protein_id'] = domains['query_id'].apply(lambda s: s.split('|')[2]) + domains['gene_start'] = domains['query_id'].apply(lambda s: normalize_gene_coord(s.split('|')[3].split('-')[0])) + domains['gene_end'] = domains['query_id'].apply(lambda s: normalize_gene_coord(s.split('|')[3].split('-')[1])) + domains['gene_strand'] = domains['query_id'].apply(lambda s: s.split('|')[4]) + fields = ['contig_id', 'locus_tag', 'protein_id', 'gene_start', 'gene_end', 'gene_strand', 'pfam_id', + 'domain_start', 'domain_end'] + elif format is None: + missing_ids = set(domains['query_id']).difference(gene_locations.index.values) + if len(missing_ids): + raise AttributeError("There are {} protein IDs missing: {}".format(len(missing_ids), list(missing_ids)[:10])) + locations = gene_locations.loc[domains['query_id']] + domains['protein_id'] = locations.index.values + domains['gene_start'] = locations['start'].values + domains['gene_end'] = locations['end'].values + domains['gene_strand'] = locations['strand'].values + + fields = ['protein_id', 'gene_start', 'gene_end', 'gene_strand'] + fields + + else: + raise AttributeError( + 'Format {} not supported, use one of {} or define the protein file.'.format(format, SUPPORTED_FORMATS)) + domains = domains[fields] + + return domains + + +def normalize_gene_coord(gene_coord): + """ + Normalize imprecise gene coordinates, will pfam <0 into 0 and >1234 into 1234. + :param gene_coord: Gene coordinate as int or str + :return: normalized numeric gene coordinate + """ + if isinstance(gene_coord, int): + return gene_coord + if isinstance(gene_coord, str): + # TODO: Can we turn <0 into 0 and >1234 into 1234? 
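+        # The prefix is stripped below, so "<0" is parsed as 0 and ">1234" as 1234.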
+ if gene_coord.startswith('<') or gene_coord.startswith('>'): + gene_coord = gene_coord[1:] + if gene_coord.isnumeric(): + return int(gene_coord) + raise AttributeError('Invalid gene coord {} ({})'.format(gene_coord, type(gene_coord))) diff --git a/deepbgc/detection/__init__.py b/deepbgc/detection/__init__.py new file mode 100644 index 0000000..ffe2ca6 --- /dev/null +++ b/deepbgc/detection/__init__.py @@ -0,0 +1,3 @@ +from .hmm_discrete import DiscreteHMM, GeneBorderHMM, ClusterFinderHMM +from .hmm_gaussian import GaussianHMM +from .rnn import KerasRNN \ No newline at end of file diff --git a/deepbgc/detection/hmm_discrete.py b/deepbgc/detection/hmm_discrete.py new file mode 100644 index 0000000..8896de5 --- /dev/null +++ b/deepbgc/detection/hmm_discrete.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python +# David Prihoda +# HMM models for BGC domain-level prediction +# Emission probability can be calculated from positive and negative training samples. +# Starting and transition probability have to be provided. + +import numpy as np +from hmmlearn import hmm +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin +import pickle +import os + + +class HMM(BaseEstimator, ClassifierMixin): + """ + HMM model parent class providing Sklearn mixins and saving/loading functionality + """ + def save(self, path): + with open(path, 'wb') as f: + pickle.dump(self, f) + return self + + @classmethod + def load(cls, path): + with open(path, 'rb') as f: + return pickle.load(f) + + +class DiscreteHMM(HMM): + + def get_sample_vector(self, X): + """ + Turn pfam IDs into integers based on our vocabulary + :param X: DataFrame of domains with pfam_id column + :return: numpy array of numbers representing given words in our vocabulary + """ + return np.array([self.vocabulary_.get(o, -1) for o in X['pfam_id']]) + + def predict(self, X: pd.DataFrame): + """ + Get BGC prediction score for a Domain DataFrame + :param X: DataFrame with pfam domains + :return: numpy array of BGC prediction scores for each domain in X + """ + word_vector = self.get_sample_vector(X) + # Predict posterior probability using our HMM + logprob, posteriors = self.model_.score_samples(word_vector.reshape(-1, 1)) + # BGC state probability is in second column + return posteriors[:,1] + + def _get_pfam_counts(self, X, y): + """ + Get number of occurences of each pfam ID in negative (non-BGC) and positive (BGC) states + :param X: Domain DataFrame with pfam_id column + :param y: Series of states for each domain (0 = non-BGC, 1 = BGC) + :return: DataFrame with number of positive and negative occurences (pos and neg columns) of each pfam_id (index). 
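+
+        Illustrative example (hypothetical Pfam IDs): for X with pfam_id ['PF00001', 'PF00001', 'PF00002']
+        and y = [1, 1, 0], the result has pos = {PF00001: 2, PF00002: 0} and neg = {PF00001: 0, PF00002: 1}.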
+ """ + counts = X[['pfam_id']].drop_duplicates().set_index('pfam_id') + unique_y = set(y) + if unique_y != {0, 1}: + raise AttributeError('Invalid target values, expected {0, 1} got '+str(unique_y)) + counts['pos'] = X[y == 1]['pfam_id'].value_counts() + counts['neg'] = X[y == 0]['pfam_id'].value_counts() + return counts.fillna(0) + + def _construct_model(self, startprob, transmat, emissionprob, vocabulary): + """ + Create internal HMM model with given matrices and store it to self.model_ + :param startprob: Starting probability [negative_starting_prob, positive_starting_prob] + :param transmat: Transition matrix (An array where the [i][j]-th element corresponds to the posterior probability of transitioning between the i-th to j-th) + :param emissionprob: Emission probability [[neg_pfam1, neg_pfam2, ...], [pos_pfam1, pos_pfam2, ...]] with pfam IDs indexed by their vocabulary index numbers + :param vocabulary: Vocabulary dictionary with {pfam_id: index_number_in_emission} + :return: self + """ + self.model_ = hmm.MultinomialHMM(n_components=2) + if isinstance(startprob, list): + startprob = np.array(startprob) + if isinstance(transmat, list): + transmat = np.array(transmat) + self.model_.startprob_ = startprob + self.model_.transmat_ = transmat + self.model_.emissionprob_ = emissionprob + self.vocabulary_ = vocabulary + return self + + def fit(self, X_list, y_list, sample_weights=None, startprob=None, transmat=None, verbose=0, + default_emission_count=0.01, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + """ + Create and train internal HMM model based on list of positive and negative samples. + Emission probability will be calculated from samples. Starting and transition probability have to be provided. + + :param X_list: List of samples (Domain DataFrames) + :param y_list: List of sample states (0 or 1), one value for each sample (DataFrame) + :param sample_weights: List of sample weights, marking their contribution to the emission probability. If not provided, will be set to 1 for all samples. + :param startprob: Starting probability [negative_starting_prob, positive_starting_prob] + :param transmat: Transition matrix (An array where the [i][j]-th element corresponds to the posterior probability of transitioning between the i-th to j-th) + :param verbose: Verbosity (0 = no output, 1 = plot top pfams for positive and negative states) + :param default_emission_count: Emission value for the other state for pfams that appear only in the positive / negative state + :param debug_progress_path: Not used in HMM models. + :param validation_X_list: List of validation samples, not used in HMM models. + :param validation_y_list: List of validation states, not used in HMM models. 
+ :return: self + """ + if validation_X_list: + print('DiscreteHMM: Validation is present but has no effect yet.') + if startprob is None: + raise ValueError('Calculating start probability not supported yet, specify startprob explicitly') + if transmat is None: + raise ValueError('Calculating transition matrix not supported yet, specify transmat explicitly') + + if sample_weights is not None: + zipped = enumerate(zip(X_list, y_list, sample_weights)) + weighted_counts = [self._get_pfam_counts(X, y) * weight for i, (X, y, weight) in zipped] + all_counts = pd.concat(weighted_counts).reset_index().groupby('pfam_id').sum().sort_index() + else: + X: pd.DataFrame = pd.concat(X_list) + y: pd.DataFrame = pd.concat(y_list) + all_counts = self._get_pfam_counts(X, y).sort_index() + + if verbose: + print('Top positive:') + print(all_counts.sort_values(by='pos', ascending=False).head(3)) + print('Top negative:') + print(all_counts.sort_values(by='neg', ascending=False).head(3)) + + # For a pfam_id that appears only in the positive / negative state, set the default emission count instead of 0 + all_counts.replace(0, default_emission_count, inplace=True) + + # Vocabulary stores map of pfam_id -> index in emission vector + vocabulary = {pfam_id: i for i, pfam_id in enumerate(all_counts.index)} + + emissions = all_counts[['neg', 'pos']].values + # Divide each state's emission counts by the total number of observations to get emission frequency + emissions /= emissions.sum(axis=0) + # Add default emissions for unseen pfam_ids to the end (will be indexed by -1) + emissions = np.concatenate([emissions, np.array([[0.5, 0.5]])]) + + self._construct_model(startprob, transmat, emissions.T, vocabulary) + return self + + def get_sample_emissions(self, sample): + word_index = self.get_sample_vector(sample) + return pd.DataFrame({ + 'OUT': [None if x == -1 else self.model_.emissionprob_[0][x] for x in word_index], + 'BGC': [None if x == -1 else self.model_.emissionprob_[1][x] for x in word_index] + }) + + +class GeneBorderHMM(HMM): + """ + HMM that only changes its state at gene borders. + Implemented by turning each input symbol (pfam ID) into a tuple of (pfam ID, is_at_gene_end) + and each negative and positive state into four states with tuples (positive/negative, is_at_gene_end) + + Emissions at gene ends have 0 emission probability in states that are not at gene ends and vice versa. + Transitions can only happen from states where is_at_gene_end = True, which means probability is set to 0 for all other transitions. 
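+
+    For example, the two original states expand into four: (OUT, in-gene), (OUT, gene-end), (BGC, in-gene)
+    and (BGC, gene-end), corresponding to the OUT_IN_GENE, OUT_GENE_END, BGC_IN_GENE and BGC_GENE_END
+    emission columns returned by get_sample_emissions.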
+ """ + def _convert_startprob(self, startprob): + if startprob is None: + return + # Start probability + start_out = startprob[0] + start_bgc = startprob[1] + return np.array([start_out / 2, start_out / 2, start_bgc / 2, start_bgc / 2]) + #print('Converted to four state start probability:') + #print(self.model.startprob_) + + def _convert_transmat(self, transmat, X_list, verbose=0): + if transmat is None: + return + + num_gene_end = sum([sum(get_sample_gene_ends(X['protein_id'])) for X in X_list]) + num_total = sum([len(X) for X in X_list]) + frac_in_gene_end = num_gene_end / num_total + if verbose: + print('Gene end: {} ({}/{})'.format(frac_in_gene_end, num_gene_end, num_total)) + + # Transition probability + out2bgc = transmat[0][1] * frac_in_gene_end + out2out = 1 - out2bgc + bgc2out = transmat[1][0] * frac_in_gene_end + bgc2bgc = 1 - bgc2out + + converted = np.array([ + [0.5, 0.5, 0, 0], + [out2out / 2, out2out / 2, out2bgc / 2, out2bgc / 2], + [0, 0, 0.5, 0.5], + [bgc2out / 2, bgc2out / 2, bgc2bgc / 2, bgc2bgc / 2] + ]) + if verbose: + print('Converted to four state transitions:') + print(converted) + return converted + + def _convert_emission(self, old_emissionprob, old_vocabulary): + # Emissions + num_words = len(old_vocabulary) + out_emissions = old_emissionprob[0][:-1] + bgc_emissions = old_emissionprob[1][:-1] + zero_emissions = np.zeros(num_words) + default_emission = old_emissionprob[0][-1] + + emissionprob = np.zeros((4, num_words * 2 + 2)) + emissionprob[0] = np.concatenate([out_emissions, zero_emissions, [default_emission, 0]]) + emissionprob[1] = np.concatenate([zero_emissions, out_emissions, [0, default_emission]]) + emissionprob[2] = np.concatenate([bgc_emissions, zero_emissions, [default_emission, 0]]) + emissionprob[3] = np.concatenate([zero_emissions, bgc_emissions, [0, default_emission]]) + + # Vocabulary + vocabulary = {} + for pfam_id, word_index in old_vocabulary.items(): + vocabulary[(pfam_id, False)] = word_index + vocabulary[(pfam_id, True)] = word_index + num_words + + return emissionprob, vocabulary + + def _get_word_index(self, pfam_id, is_gene_end): + default_index = -1 if is_gene_end else -2 + return self.vocabulary_.get((pfam_id, is_gene_end), default_index) + + def get_sample_vector(self, X): + is_gene_end = get_sample_gene_ends(X['protein_id']) + if not any(is_gene_end): + print('Warning: no gene end predicted: '+str(X.head(1))) + return np.array([self._get_word_index(x, is_gene_end[i]) for i, x in enumerate(X['pfam_id'].values)]) + + def predict(self, X): + sample_vector = self.get_sample_vector(X) + prev_level = np.geterr()['divide'] + np.seterr(divide='ignore') + logprob, posteriors = self.model_.score_samples(sample_vector.reshape(-1, 1)) + np.seterr(divide=prev_level) + # final prediction is maximum of the probability of the last two states + prediction = posteriors[:,2:] + return np.max(prediction, axis=1) + + def fit(self, X_list, y_list, startprob=None, transmat=None, verbose=1, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + if validation_X_list: + print('GeneBorderHMM: Validation is present but has no effect yet.') + if verbose: + print('Training two state model...') + + two_state_model = DiscreteHMM() + two_state_model.fit(X_list, y_list, startprob=startprob, transmat=transmat, verbose=verbose) + + emission, self.vocabulary_ = self._convert_emission(two_state_model.model_.emissionprob_, two_state_model.vocabulary_) + + self.model_ = hmm.MultinomialHMM(n_components=4) + self.model_.startprob_ = 
self._convert_startprob(startprob) + self.model_.transmat_ = self._convert_transmat(transmat, X_list) + self.model_.emissionprob_ = emission + return self + + def get_sample_emissions(self, X): + sample_vector = self.get_sample_vector(X) + return pd.DataFrame({ + 'OUT_IN_GENE': [None if x < 0 else self.model_.emissionprob_[0][x] for x in sample_vector], + 'OUT_GENE_END': [None if x < 0 else self.model_.emissionprob_[1][x] for x in sample_vector], + 'BGC_IN_GENE': [None if x < 0 else self.model_.emissionprob_[2][x] for x in sample_vector], + 'BGC_GENE_END': [None if x < 0 else self.model_.emissionprob_[3][x] for x in sample_vector] + }) + + +class ClusterFinderHMM(DiscreteHMM): + """ + Wrapper that loads the ClusterFinder trained model from the pickled starting, transition and emission matrices. + """ + def fit(self, X_unused, y_unused, param_dir=None, **kwargs): + + with open(os.path.join(param_dir, 'NewTS_all_B_index.pkl'), 'rb') as pfile: + cf_vocabulary = pickle.load(pfile) + + # Start probability + with open(os.path.join(param_dir, 'SP_arr.pkl'), 'rb') as pfile: + cf_start = pickle.load(pfile, encoding='latin1') + + # Transition probability between states + with open(os.path.join(param_dir, 'TP_arr_A.pkl'), 'rb') as pfile: + cf_transition = pickle.load(pfile, encoding='latin1') + + # Emission probability for each state and pfam + with open(os.path.join(param_dir, 'NewTS_all_B_reduced_6filter.pkl'), 'rb') as pfile: + cf_emission = pickle.load(pfile, encoding='latin1') + + # Add default emission to the end of the emission matrix + # Default emission is used when the observed sequence contains words that didn't appear in our vocabulary + # The value actually does not matter as long as it's the same for both states + cf_default_emission = 1.6026668376177961e-07 + cf_default_emissions = np.array([[cf_default_emission], [cf_default_emission]]) + cf_emission = np.append(cf_emission, cf_default_emissions, axis=1) + print('Default emission', cf_default_emission) + + # Create HMM with given parameters + # States are flipped to use more intuitive NONBGC=0, BGC=1 + startprob = np.array([cf_start[1], cf_start[0]]) + transmat = np.array([[cf_transition[1][1], cf_transition[1][0]], + [cf_transition[0][1], cf_transition[0][0]]]) + emissionprob = np.array([cf_emission[1], cf_emission[0]]) + print('Start probability (0=NONBGC, 1=BGC):\n', startprob) + print('Transition probability (0=NONBGC, 1=BGC):\n', transmat) + print('Emission probability (0=NONBGC, 1=BGC):\n', emissionprob) + + self._construct_model(startprob=startprob, transmat=transmat, emissionprob=emissionprob, vocabulary=cf_vocabulary) + return self + + +def get_sample_gene_ends(gene_ids): + """ + For list of Gene IDs, return list of boolean values that mark whether the next gene is different (or we are at end of sequence) + :param gene_ids: List of gene IDs + :return: list of boolean values that mark whether the next gene is different (or we are at end of sequence) + """ + gene_ends = list(gene_ids[:-1].values != gene_ids[1:].values) + [True] + return np.array(gene_ends).astype(np.uint8) + diff --git a/deepbgc/detection/hmm_gaussian.py b/deepbgc/detection/hmm_gaussian.py new file mode 100644 index 0000000..8c0d0a2 --- /dev/null +++ b/deepbgc/detection/hmm_gaussian.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# David Prihoda +# Gaussian HMM model for BGC domain-level prediction +# Experimental, did not get satisfactory results + +import pandas as pd +from sklearn import mixture +import numpy as np +from sklearn.base import BaseEstimator, 
ClassifierMixin +import pickle +from hmmlearn import hmm + + +class GaussianHMM(BaseEstimator, ClassifierMixin): + + def __init__(self, num_pos_means=5, num_neg_means=5, covariance_type="diag", meta=None): + self.num_pos_means = num_pos_means + self.num_neg_means = num_neg_means + self.covariance_type = covariance_type + self.meta = meta or {} + + def predict(self, X): + # Predict posterior probability using our HMM + logprob, posteriors = self.model_.score_samples(X) + # BGC state probability is in second column + return posteriors[:,1] + + def fit(self, X_list, y_list, startprob=None, transmat=None, verbose=1, debug_progress_path=None, validation_X_list=None, validation_y_list=None): + if validation_X_list: + print('GaussianHMM: Validation is present but has no effect yet.') + if startprob is None: + raise ValueError('Calculating start probability not supported yet, specify startprob explicitly') + if transmat is None: + raise ValueError('Calculating transition matrix not supported yet, specify transmat explicitly') + + X = np.concatenate(X_list) + y = np.concatenate(y_list) + pos_vectors = X[y == 1] + neg_vectors = X[y == 0] + + if verbose: + print('Training positive GMM on {} vectors'.format(len(pos_vectors))) + pos_gmm = mixture.GaussianMixture(n_components=self.num_pos_means, covariance_type=self.covariance_type) + pos_gmm.fit(pos_vectors) + + if verbose: + print('Training negative GMM on {} vectors'.format(len(neg_vectors))) + neg_gmm = mixture.GaussianMixture(n_components=self.num_neg_means, covariance_type=self.covariance_type) + neg_gmm.fit(neg_vectors) + + self.model_ = GMMHMM2(n_components=2, covariance_type=self.covariance_type, verbose=bool(verbose)) + self.model_.startprob_ = startprob + self.model_.transmat_ = transmat + self.model_.gmms_ = np.array([neg_gmm, pos_gmm]) + return self + + def save(self, path): + pickle.dump(self, path) + return self + + @classmethod + def load(cls, path): + return pickle.load(path) + + def get_sample_emissions(self, X): + feature_matrix = self.features.get_feature_matrix(X) + return pd.DataFrame({ + 'OUT': self.model_.gmms_[0].score_samples(feature_matrix), + 'BGC': self.model_.gmms_[1].score_samples(feature_matrix) + }) + + +class GMMHMM2(hmm.GMMHMM): + def __init__(self, n_components=1, + startprob_prior=1.0, transmat_prior=1.0, + covariance_type='diag', covars_prior=1e-2, + algorithm="viterbi", random_state=None, + n_iter=10, tol=1e-2, verbose=False, + params="stmcw", init_params="stmcw"): + hmm._BaseHMM.__init__(self, n_components, + startprob_prior=startprob_prior, + transmat_prior=transmat_prior, + algorithm=algorithm, random_state=random_state, + n_iter=n_iter, tol=tol, verbose=verbose, + params=params, init_params=init_params) + + self.covariance_type = covariance_type + self.covars_prior = covars_prior + self.gmms_ = [] + + def _compute_log_likelihood(self, X): + return np.array([g.score_samples(X) for g in self.gmms_]).T diff --git a/deepbgc/detection/rnn.py b/deepbgc/detection/rnn.py new file mode 100644 index 0000000..1e2a868 --- /dev/null +++ b/deepbgc/detection/rnn.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python +# David Prihoda +# Generic LSTM wrapper used for the DeepBGC model + +import numpy as np +import tensorflow as tf +from sklearn.model_selection import train_test_split +from sklearn.base import BaseEstimator, ClassifierMixin + + +class KerasRNN(BaseEstimator, ClassifierMixin): + """ + Generic LSTM wrapper used for the DeepBGC model + """ + def __init__(self, trained_model=None, batch_size=1, hidden_size=128, 
loss='binary_crossentropy', stateful=True, + activation='sigmoid', return_sequences=True): + from keras.models import Sequential + if trained_model is not None: + self.model: Sequential = trained_model + # Set the attributes from the model object to be able to clone and cross-validate a loaded model + self.batch_size = trained_model.layers[0].batch_input_shape[0] + self.hidden_size = trained_model.layers[0].layer.units + self.stateful = trained_model.layers[0].layer.stateful + self.loss = trained_model.loss + self.activation = trained_model.layers[-1].layer.activation + self.return_sequences = trained_model.layers[0].layer.return_sequences + else: + self.model: Sequential = None + self.batch_size = batch_size + self.hidden_size = hidden_size + self.loss = loss + self.stateful = stateful + self.activation = activation + self.return_sequences = return_sequences + + def _build_model(self, input_size, stacked_sizes=None, fully_connected_sizes=None, optimizer_name=None, learning_rate=None, decay=None, gpus=0, custom_batch_size=None): + """ + Build Keras Sequential model architecture with given parameters + :param input_size: Dimensionality of input vector (number of features) + :param stacked_sizes: Add given number of additional Bi-LSTM layers after first Bi-LSTM layer, provided as list of sizes + :param fully_connected_sizes: Add a given number of additional fully connected layers after the Bi-LSTM layers, provided as list of sizes + :param optimizer_name: Name of Keras optimizer, default 'adam' + :param learning_rate: Keras learning rate + :param decay: Optimizer decay + :param gpus: Number of gpus to train on (Not implemented) + :param custom_batch_size: Use different batch size than self.batch_size + :return: Keras Sequential model + """ + from keras.layers.core import Dense + from keras.layers.recurrent import LSTM + from keras.layers.wrappers import TimeDistributed, Bidirectional + from keras.models import Sequential + from keras import optimizers + if stacked_sizes is None: + stacked_sizes = [] + if fully_connected_sizes is None: + fully_connected_sizes = [] + + model = Sequential() + + model.add(Bidirectional( + layer=LSTM( + units=self.hidden_size, + return_sequences=True, + dropout=0.2, + recurrent_dropout=0.2, + stateful=self.stateful + ), + batch_input_shape=(custom_batch_size or self.batch_size, None, input_size) + )) + + for size in stacked_sizes: + model.add(Bidirectional(layer=LSTM(units=size, return_sequences=True, stateful=self.stateful))) + + for size in fully_connected_sizes: + model.add(TimeDistributed(Dense(size, activation='sigmoid'))) + + model.add(TimeDistributed(Dense(1, activation='sigmoid'))) + + if gpus > 1: + raise NotImplementedError("Multi GPU model not implemented due to input size mismatch.") + #model = multi_gpu_model(model, gpus=gpus) + + if optimizer_name is None: + optimizer_name = "adam" + + optimizer_args = {} + if learning_rate is not None: + optimizer_args['lr'] = learning_rate + if decay is not None: + optimizer_args['decay'] = decay + + if optimizer_name == 'adam': + optimizer = optimizers.Adam(**optimizer_args) + elif optimizer_args: + raise ValueError('Optimizer {} not implemented for custom params yet'.format(optimizer_name)) + else: + optimizer = optimizer_name + + print('Using optimizer', optimizer_name, optimizer_args) + model.compile(loss=self.loss, optimizer=optimizer, sample_weight_mode='temporal', metrics=["accuracy", precision, recall, auc_roc]) + return model + + def fit(self, X_list, y_list, timesteps=128, validation_size=0.33, 
num_epochs=10, verbose=1, + debug_progress_path=None, fully_connected_sizes=None, + shuffle=True, gpus=0, stacked_sizes=None, early_stop_mode=None, early_stop_monitor=None, early_stop_min_delta=0.005, early_stop_patience=10, + positive_weight=None, weighted=False, optimizer=None, learning_rate=None, decay=None, + validation_X_list=None, validation_y_list=None): + """ + Train Keras Sequential model using provided list of positive / negative samples. + Training is done in given number of epochs with additional stopping criteria. + In each epoch, we go over all samples in X_list, which are shuffled randomly and merged together into artificial genomes. + + :param X_list: List of DataFrames (samples) where each DataFrame contains protein domains represented by numeric vectors + :param y_list: List of output values, one value for each sample where 0 = negative sample (non-BGC), 1 = positive sample (BGC) + :param timesteps: Number of timesteps (protein domains) in one batch + :param validation_size: Fraction of samples to use for testing + :param num_epochs: Number of epochs. If early stopping is defined, this serves as a limit of maximum number of epochs. + :param verbose: Verbosity (0 = silent, 1 = verbose, 2 = very verbose) + :param debug_progress_path: Log Tensorboard information in given folder + :param fully_connected_sizes: Add a given number of additional fully connected layers after the Bi-LSTM layers, provided as list of sizes + :param shuffle: Whether to shuffle samples within each epoch. If not used, make sure that positive and negative samples are already shuffled in the list. + :param gpus: Number of gpus to use (not implemented!) + :param stacked_sizes: Add given number of additional Bi-LSTM layers after first Bi-LSTM layer, provided as list of sizes + :param early_stop_mode: Keras early stopping mode (use max for increasing metrics like AUC ROC, use min for decreasing metrics like Loss) + :param early_stop_monitor: Metric to observe for early stopping (e.g. val_auc_roc) + :param early_stop_min_delta: Minimum change to observed metric needed to continue training + :param early_stop_patience: Number of epochs to get maximum value of the observed metric from, if that value does not improve over the previous maximum, stop training + :param positive_weight: Weight of positive samples (single number). Can be used to counter imbalance in training data. + :param weighted: Calculate positive weight automatically as num negatives / num positive samples in input training data (y_list). + :param optimizer: Name of Keras optimizer, default 'adam'. + :param learning_rate: Keras learning rate + :param decay: Keras optimizer decay. 
+ :param validation_X_list: List of DataFrames (samples) used to observe validation performance + :param validation_y_list: List of output values for validation samples, one value for each sample where 0 = negative sample (non-BGC), 1 = positive sample (BGC) + :return: self + """ + + import keras + + if not isinstance(X_list, list): + raise AttributeError('Expected X_list to be list, got ' + str(type(X_list))) + + if not isinstance(y_list, list): + raise AttributeError('Expected y_list to be list, got ' + str(type(X_list))) + + if weighted: + if positive_weight: + raise ValueError('Positive weight cannot be specified together with weighted=true.') + num_neg = _count_samples(y_list, 0) + num_pos = _count_samples(y_list, 1) + positive_weight = num_neg / num_pos + print('Negative: {}, Positive: {}'.format(num_neg, num_pos)) + print('Weighing positives based on ratio, weight:', positive_weight) + + input_size = X_list[0].shape[1] + + train_model = self._build_model(input_size, stacked_sizes, fully_connected_sizes=fully_connected_sizes, optimizer_name=optimizer, learning_rate=learning_rate, decay=decay, gpus=gpus) + self.model = self._build_model(input_size, stacked_sizes, fully_connected_sizes=fully_connected_sizes, optimizer_name=optimizer, learning_rate=learning_rate, decay=decay, gpus=gpus, custom_batch_size=1) + + X_train, y_train = X_list, y_list + validation_data, validation_num_batches = None, None + + if validation_X_list: + if positive_weight: + print('Warning: Not using positive_weight "{}" on external validation set!'.format(positive_weight)) + if validation_size: + print('Warning: LSTM validation size {} specified but ignored, ' + 'because external validation set is also present.'.format(validation_size)) + + print('Validating on external validation set of {} samples'.format(len(validation_X_list))) + validation_data = _repeat_to_fill_batch_size(validation_X_list, validation_y_list, self.batch_size, input_size) + validation_num_batches = None + elif validation_size: + print('Validating on {:.1f}% of input set'.format(validation_size*100)) + X_train, X_validation, y_train, y_validation = train_test_split(X_list, y_list, test_size=validation_size) + + get_validation_gen, validation_num_batches = _build_generator( + X_validation, + y_validation, + batch_size=self.batch_size, + timesteps=timesteps, + input_size=input_size, + shuffle=shuffle, + positive_weight=positive_weight + ) + validation_data = get_validation_gen() + + get_train_gen, train_num_batches = _build_generator( + X_train, + y_train, + batch_size=self.batch_size, + timesteps=timesteps, + input_size=input_size, + shuffle=shuffle, + positive_weight=positive_weight, + ) + train_gen = get_train_gen() + + + callbacks = [] + if debug_progress_path: + tb = keras.callbacks.TensorBoard(log_dir=debug_progress_path, histogram_freq=0, batch_size=self.batch_size, + write_graph=True, + write_grads=False, write_images=False, + embeddings_layer_names=None, embeddings_metadata=None) + callbacks.append(tb) + + if early_stop_monitor: + if not early_stop_mode: + raise ValueError('Keras early_stop_mode has to be specified (min, max, auto) to enable early_stop_monitor.') + + callbacks.append(keras.callbacks.EarlyStopping( + min_delta=early_stop_min_delta, + monitor=early_stop_monitor, + patience=early_stop_patience, + mode=early_stop_mode, + verbose=1 + )) + + with _get_device(gpus): + history = train_model.fit_generator( + generator=train_gen, + steps_per_epoch=train_num_batches, + shuffle=False, + epochs=num_epochs, + 
validation_data=validation_data, + validation_steps=validation_num_batches, + callbacks=callbacks, + verbose=verbose + ) + + trained_weights = train_model.get_weights() + self.model.set_weights(trained_weights) + + return history + + def predict(self, X): + """ + Predict given sample DataFrame/numpy matrix of numeric protein vectors + :param X: DataFrame/numpy matrix of protein vectors + :return: BGC prediction score for each protein vector + """ + if len(X.shape) != 2: + raise AttributeError('Can only be called on a single 2-dimensional feature matrix.') + + if self.model is None: + raise AttributeError('Cannot predict using untrained model.') + + batch_matrix = X.reshape(1, X.shape[0], X.shape[1]) + + # TODO do we need to reset here? + self.model.reset_states() + probs = self.model.predict(batch_matrix, batch_size=1) + return probs[0,:,0] + + def save(self, path): + if self.model is None: + raise AttributeError('Cannot save untrained model.') + self.model.save(path) + return self + + @classmethod + def load(cls, path): + import keras + model = keras.models.load_model(path, custom_objects={'precision': precision, 'recall': recall, 'auc_roc': auc_roc}) + return KerasRNN(trained_model=model) + + def __getstate__(self): + """ + Get representation of object that can be pickled + :return: objects to be pickled + """ + attrs = self.__dict__.copy() + del attrs['model'] + + if self.model is None: + return attrs, None, None + return attrs, self.model.to_json(), self.model.get_weights() + + def __setstate__(self, state): + from keras.models import Sequential, model_from_json + """ + Load object from pickled representation + :param state: attributes of model generated by __getstate__ + """ + attrs, architecture, weights = state + + self.__dict__.update(attrs) + + if architecture is None: + self.model = None + else: + self.model: Sequential = model_from_json(architecture) + self.model.set_weights(weights) + +def rotate(l, n): + m = n % len(l) + return l[m:] + l[:m] + +def _noop(): + return None + +def _yield_single_pair(a, b): + yield a, b + +def _repeat_to_fill_batch_size(X_list, y_list, batch_size, input_size): + """ + Fill matrix of batch_size rows with samples from X_list in a way that all samples are (approximately) evenly present. + Create batch_size rows, each row as long as the longest sample in X_list (max_len). + For row on index i, include concatenated sequence of X_list starting from sample i (sequence is trimmed to max_len). + :param X_list: list of samples + :param y_list: list of sample responses + :param batch_size: how many rows to create + :param input_size: number of columns in sample + :return: Filled matrix of batch_size rows with samples from X_list in a way that all samples are (approximately) evenly present. 
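+
+    Illustrative example: with X_list = [A, B, C] and batch_size = 3, the three rows are built from the
+    concatenations A+B+C, B+C+A and C+A+B, each trimmed to the length of the longest sample.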
+ """ + if len(X_list) > batch_size: + raise AttributeError('Cannot repeat more samples than batch_size.') + + max_len = max([X.shape[0] for X in X_list]) + + fill_shape = (batch_size, max_len, input_size) + fill_num_values = fill_shape[0] * fill_shape[1] * fill_shape[2] + print('Filling to batch size shape {} ({}M values)...'.format(fill_shape, int(fill_num_values / 1000000))) + + X_filled = np.zeros(shape=fill_shape) + y_filled = np.zeros(shape=(fill_shape[0], fill_shape[1], 1)) + + for i in range(0, batch_size): + X_filled[i] = np.concatenate(rotate(X_list, i))[:max_len] + y_filled[i][:,0] = np.concatenate(rotate(y_list, i))[:max_len] + + print('Filling done.') + return X_filled, y_filled + + +def _build_generator(X_list, y_list, batch_size, timesteps, input_size, shuffle, positive_weight): + """ + Build looping generator of training batches. Will return the generator and the number of batches in each epoch. + In each epoch, all samples are randomly split into batch_size "chunks", each "chunk" in batch can be trained in parallel. + Samples in each chunk are shuffled and merged into one whole sequence. + The whole sequences are separated into batches of given fixed given number of timesteps (protein vectors). + So the number of batches is defined so that we go over the whole sequence (length of the longest "chunk" sequence divided by the number of timesteps). + + :param X_list: List of samples. Each sample is a matrix/DataFrame of protein domain vectors. + :param y_list: List of sample outputs. + :param batch_size: Number of parallel "chunks" in a training batch + :param timesteps: Number of timesteps (protein domain vectors) in a training batch + :param input_size: Size of the protein domain vector + :param shuffle: Whether to shuffle samples within each epoch. If not used, make sure that positive and negative samples are already shuffled in the list. + :param positive_weight: Weight of positive samples (single number). If provided, a triple of (X_batch, y_batch, weights_batch) are provided + :return: Tuple of (batch generator, number of batches in each epoch). 
+ Each batch will contain the X input (batch_size, timesteps, input_size) and y output (batch_size, timesteps, 1) + """ + if not X_list: + return _noop, None + from keras.preprocessing.sequence import pad_sequences + seq_length = sum([len(X) for X in X_list]) + X_arr = np.array(X_list) + y_arr = np.array(y_list) + num_batches = int(np.ceil(np.ceil(seq_length / batch_size) / timesteps)) + maxlen = num_batches * timesteps + print('Initializing generator of {} batches from sequence length {}'.format(num_batches, seq_length)) + + def generator(): + while True: + # shuffle the samples + if shuffle: + shuffled = np.random.permutation(len(X_list)) + # split samples into batch_size chunks + X_batches = np.array_split(X_arr[shuffled] if shuffle else X_arr, batch_size) + y_batches = np.array_split(y_arr[shuffled] if shuffle else y_arr, batch_size) + + # merge the samples in each chunk into one sequence + X_batches = [np.concatenate(b) if b.size else np.empty(0) for b in X_batches] + y_batches = [np.concatenate(b) if b.size else np.empty(0) for b in y_batches] + + # pad the sequences with zeros to the length of the longest chunk sequence + X_batches = pad_sequences(X_batches, maxlen=maxlen, dtype=np.float, + padding='post', truncating='post') + y_batches = pad_sequences(y_batches, maxlen=maxlen, dtype=np.float, + padding='post', truncating='post') + + # Reshape array so that it can be indexed as [batch number][chunk][timestep][input feature] + # This will produce an array of dimension (num_batches, batch_size, timesteps, input_size) + # And output array of dimension (num_batches, batch_size, timesteps, 1) + X_batches = np.swapaxes(X_batches.reshape(batch_size, num_batches, timesteps, input_size), 0, 1) + y_batches = np.swapaxes(y_batches.reshape(batch_size, num_batches, timesteps, 1), 0, 1) + + # print('Generated {}x{} batches: X {}, y {}'.format(num_batches, self.batch_size, X_batches.shape, y_batches.shape)) + + if positive_weight: + # Provide array of weights for each input vector based on the positive weight + weight_batches = np.ones(y_batches.shape) + weight_batches[y_batches == 1] = positive_weight + weight_batches = np.swapaxes(weight_batches.reshape(batch_size, num_batches, timesteps), 0, 1) + for X_batch, y_batch, weight_batch in zip(X_batches, y_batches, weight_batches): + yield X_batch, y_batch, weight_batch + else: + for X_batch, y_batch in zip(X_batches, y_batches): + yield X_batch, y_batch + + return generator, num_batches + +def _count_samples(y_list, klass): + return np.sum([np.mean(y == klass) for y in y_list]) + +def _split_matrix_into_batches(X, batch_size): + if len(X.shape) != 2: + raise AttributeError('Can only be called on a single 2-dimensional feature matrix.') + return X.reshape(batch_size, X.shape[0], X.shape[1]) + +def _pad_matrix_to_be_divisible(X, divisible_by): + from keras.preprocessing.sequence import pad_sequences + remainder = X.shape[0] % divisible_by + if not remainder: + return X + maxlen = X.shape[0] + divisible_by - remainder + return pad_sequences([X], maxlen=maxlen, dtype=np.float, padding='post', truncating='post')[0] + + +def _get_device(gpus): + if gpus == 0: + return tf.device('/cpu:0') + elif gpus >= 1: + return tf.device('/device:GPU:0') # TODO: can we get just the first GPU? + else: + raise AttributeError('GPUs has to be an integer >= 0') + + +def precision(y_true, y_pred): + """Precision metric. + + Only computes a batch-wise average of precision. 
+ + Computes the precision, a metric for multi-label classification of + how many selected items are relevant. + """ + import keras.backend as K + true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) + predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) + precision = true_positives / (predicted_positives + K.epsilon()) + return precision + + +def recall(y_true, y_pred): + """Recall metric. + + Only computes a batch-wise average of recall. + + Computes the recall, a metric for multi-label classification of + how many relevant items are selected. + """ + import keras.backend as K + true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) + possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) + recall = true_positives / (possible_positives + K.epsilon()) + return recall + + +def auc_roc(y_true, y_pred): + """ + Defines AUC ROC metric callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505 + """ + # any tensorflow metric + value, update_op = tf.metrics.auc(y_true, y_pred) + + # find all variables created for this metric + metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]] + + # Add metric variables to GLOBAL_VARIABLES collection. + # They will be initialized for a new session. + for v in metric_vars: + tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v) + + # force to update metric values + with tf.control_dependencies([update_op]): + value = tf.identity(value) + return value diff --git a/deepbgc/detector.py b/deepbgc/detector.py new file mode 100644 index 0000000..870eb33 --- /dev/null +++ b/deepbgc/detector.py @@ -0,0 +1,181 @@ +import pandas as pd +import hashlib +import numpy as np +from deepbgc.pipeline import DeepBGCModel + +SCORE_COLUMN = 'deepbgc_score' + + +class DeepBGCDetector: + def __init__(self, model): + if isinstance(model, str): + print('Reading model from path: ', model) + model = DeepBGCModel.load(model) + self.model = model + + def predict_domain_bgc_score(self, domains): + """ + Get BGC score for given Domain DataFrame, add it as SCORE_COLUMN column. + :param domains: Domain DataFrame, multiple samples marked by different 'sequence_id' will be predicted separately + :return: Original Domain DataFrame with SCORE_COLUMN column added. 
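+ + Illustrative usage sketch (the model path and Domain DataFrame are placeholders, not defined in this module): + >>> detector = DeepBGCDetector(model='myModel.pkl') + >>> scored = detector.predict_domain_bgc_score(domain_df) + >>> scored[['sequence_id', 'pfam_id', 'deepbgc_score']].head()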
+ """ + + samples = [group for i, group in domains.groupby('sequence_id')] + scores = [] + for sample in samples: + score = sample.copy() + score[SCORE_COLUMN] = self.model.predict(sample) + scores.append(score) + + merged: pd.DataFrame = pd.concat(scores) + return merged + + def detect(self, domains, score_threshold=0.5): + if 'sequence_id' in domains.columns: + sequences = domains.groupby('sequence_id') + print('Detecting in {} sequences: {}'.format(len(sequences), domains['sequence_id'].unique())) + else: + sequences = [(None, domains)] + print('Detecting in single sequence...') + + all_candidates = [] + for sequence_id, sequence_domains in sequences: + scores = self.predict_domain_bgc_score(domains) + candidates = threshold_sequence_candidates(scores, score_threshold, merge_max_protein_gap=0, merge_max_nucl_gap=0) + if sequence_id: + candidates.insert(0, 'sequence_id', sequence_id) + all_candidates.append(candidates) + return pd.concat(all_candidates) + + +def agg_concat(s): + """ + Join given list with semicolons + :param s: list of objects to join + :return: joined string + """ + return ';'.join(s) + + +def average_protein_score(domains, concat_domains=True): + """ + Average scores into a SCORE_COLUMN column by protein using the 'protein_id' and other PROTEIN_GROUP_COLS. + :param domains: DataFrame from the Domain CSV file + :param concat_domains: Whether to include a ';'-concatenated list of pfam_ids for each protein. + :return: DataFrame of proteins with averaged SCORE_COLUMN column + """ + PROTEIN_GROUP_COLS = ['protein_id'] + PROTEIN_EXTRA_COLS = ['in_cluster', 'gene_start', 'gene_end', 'gene_strand'] + + extra_cols = [col for col in PROTEIN_EXTRA_COLS if col in domains.columns] + all_cols = extra_cols + PROTEIN_GROUP_COLS + if concat_domains: + all_cols.append('pfam_id') + copy = domains[all_cols].copy() + copy[SCORE_COLUMN] = domains[SCORE_COLUMN] + per_gene = copy.groupby(all_cols, sort=False) + if concat_domains: + return per_gene.agg({'pfam_id': agg_concat, SCORE_COLUMN: 'mean'}) \ + .rename(columns={'pfam_id': 'pfam_ids'}) \ + .reset_index() + else: + return per_gene.mean().reset_index() + + +def get_candidate(start, end, pfam_ids, protein_ids, protein_scores): + """ + Get single BGC candidate dictionary + :param start: nucleotide coordinate start + :param end: nucleotide coordinate end + :param pfam_ids: list of pfam ids in candidate + :param protein_ids: list of protein ids in candidate + :param protein_scores: list of protein model score outputs + :return: BGC candidate dictionary + """ + return { + 'nucl_start': start, + 'nucl_end': end, + 'num_proteins': len(protein_ids), + 'num_domains': len(pfam_ids), + 'protein_ids': ';'.join(protein_ids), + 'pfam_ids': ';'.join(pfam_ids), + SCORE_COLUMN: np.mean(protein_scores) + } + + +def threshold_sequence_candidates(domain_scores, threshold, merge_max_protein_gap=0, merge_max_nucl_gap=0): + """ + Get a BGC candidate DataFrame for domain scores in a single contig. 
+ Generated by averaging domain scores by protein and then merging consecutive proteins with score satisfying given threshold + :param domain_scores: DataFrame of domains and their SCORE_COLUMN column + :param threshold: Averaged protein score threshold (inclusive) used to include or discard BGC proteins + :param merge_max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them + :param merge_max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them + :return: DataFrame of BGC candidates + """ + + protein_scores = average_protein_score(domain_scores, concat_domains=True) + + candidates = [] + candidate_start = None + candidate_end = None + candidate_domains = [] + candidate_proteins = [] + candidate_scores = [] + gap_domains = [] + gap_proteins = [] + gap_scores = [] + for i, protein in protein_scores.iterrows(): + score = protein[SCORE_COLUMN] + # Inactive protein, add to gap + if score < threshold: + gap_proteins.append(protein['protein_id']) + gap_domains += protein['pfam_ids'].split(';') + gap_scores.append(score) + # We just changed from active to inactive, add previous region as candidate + if candidate_start is not None: + candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_scores)) + candidate_start = None + candidate_end = None + candidate_domains = [] + candidate_proteins = [] + candidate_scores = [] + # Active protein + else: + if not candidate_start: + candidate_start = protein['gene_start'] + if candidates: + # Check if we should merge with the previous candidate + prev_start, prev_end, prev_domains, prev_proteins, prev_scores = candidates[-1] + if len(gap_proteins) <= merge_max_protein_gap or (candidate_start - prev_end) <= merge_max_nucl_gap: + # Remove previous candidate and continue where it started + candidates = candidates[:-1] + candidate_start = prev_start + candidate_domains = prev_domains + gap_domains + candidate_proteins = prev_proteins + gap_proteins + candidate_scores = prev_scores + gap_scores + + candidate_end = protein['gene_end'] + candidate_proteins.append(protein['protein_id']) + candidate_domains += protein['pfam_ids'].split(';') + candidate_scores.append(score) + gap_domains = [] + gap_proteins = [] + gap_scores = [] + + # Last protein was active, add previous region as candidate + if candidate_start is not None: + candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_scores)) + + cands = pd.DataFrame([get_candidate(*args) for args in candidates]) + if cands.empty: + return cands + + cands['nucl_start'] = cands['nucl_start'].astype('int64') + cands['nucl_end'] = cands['nucl_end'].astype('int64') + cands['nucl_length'] = cands['nucl_end'] - cands['nucl_start'] + 1 + cands['candidate_hash'] = cands['pfam_ids'].apply( + lambda pfam_ids: hashlib.md5(pfam_ids.encode('utf-8')).hexdigest()) + cands = cands[['candidate_hash', SCORE_COLUMN, 'nucl_length', 'nucl_start', 'nucl_end', + 'num_domains', 'num_proteins', 'protein_ids', 'pfam_ids']] + return cands diff --git a/deepbgc/features.py b/deepbgc/features.py new file mode 100644 index 0000000..8dd6d45 --- /dev/null +++ b/deepbgc/features.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +# David Prihoda +# Feature transformers that turn Domain DataFrames into protein feature vector matrices + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +import pandas as pd +import sys + + +class ListTransformer(BaseEstimator, 
TransformerMixin): + """ + Wrapper for other transformers, will transform each DataFrame in a list by each transformer and merge the results. + """ + def __init__(self, transformers): + self.transformers = transformers + + def transform(self, X, y=None): + if X is None: + return None + if not self.transformers: + return X + if isinstance(X, list): + return [self.transform(X[i], y[i] if y else None) for i in range(0, len(X))] + if not isinstance(X, pd.DataFrame): + raise AttributeError('X has to be a pd.DataFrame or list, got '+str(type(X))) + return np.concatenate([t.transform(X, y) for t in self.transformers], axis=1) + + def fit(self, X_list, y_list=None): + if X_list is None: + return self + if not isinstance(X_list, list): + raise AttributeError('X_list has to be a list, got'+str(type(X_list))) + if X_list: + for t in self.transformers: + t.fit(pd.concat(X_list), pd.concat(y_list)) + return self + + @classmethod + def from_config(cls, transformer_configs): + transformers = [] + for params in transformer_configs: + classname = params.get('type') + transformer = getattr(sys.modules[__name__], classname) + trans_args = {k: v for k, v in params.items() if k != 'type'} + transformers.append(transformer(**trans_args)) + return ListTransformer(transformers) + + +class Pfam2VecTransformer(BaseEstimator, TransformerMixin): + """ + Get pfam2vec matrix for a Domain DataFrame + """ + def __init__(self, vector_path): + self.vector_path = vector_path + if vector_path.endswith('.csv'): + self.vectors = pd.read_csv(vector_path).set_index('pfam_id') + elif vector_path.endswith('.pkl') or vector_path.endswith('.pickle'): + self.vectors = pd.read_pickle(vector_path) + elif vector_path.endswith('.bin'): + import word2vec + model = word2vec.load(vector_path, kind='bin') + self.vectors = pd.DataFrame(model.vectors, index=model.vocab) + else: + raise ValueError("File type {} not supported for Pfam2Vec, use .csv, .pkl, .pickle or .bin".format(vector_path)) + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return self.vectors.reindex(index=X['pfam_id']).fillna(0) + + def fit(self, X, y=None): + return self + + +class RandomVecTransformer(BaseEstimator, TransformerMixin): + """ + Get random vector matrix for a Domain DataFrame. Each unique pfam_id will have the same random vector throughout the sequence. + """ + + def __init__(self, dimensions=100): + self.dimensions = dimensions + self.zero_vector = np.zeros(self.dimensions) + self.vectors = {} + self.random = np.random.RandomState(seed=0) + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return np.array([self.vectors.get(pfam_id, self.zero_vector) for pfam_id in X['pfam_id']]) + #print(X.iloc[0]['pfam_id'], vectors[0]) + #return vectors + + def fit(self, X, y=None): + for pfam_id in X['pfam_id'].unique(): + if pfam_id not in self.vectors: + self.vectors[pfam_id] = self.random.rand(self.dimensions) + return self + + +class EmissionProbabilityTransformer(BaseEstimator, TransformerMixin): + """ + Get emission probability feature column for given Domain DataFrame. Based on HMM emissions. 
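+ + Illustrative example with made-up counts: if 'PF00001' is observed 30 times in negative samples out of 300 negative domain observations in total, + and 5 times in positive samples out of 50 positive observations, its fitted feature vector is [30/300, 5/50] = [0.1, 0.1]. + Pfam IDs not seen during fit() are transformed to zero vectors.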
+ """ + def __init__(self): + self.emissions = None + + def fit(self, X, y=None): + unique_y = set(y) + if unique_y != {0, 1}: + raise AttributeError('Invalid target values, expected {0, 1} got ' + str(unique_y)) + counts = pd.DataFrame(data={}, index=X['pfam_id'].unique()) + counts['neg'] = X[y == 0]['pfam_id'].value_counts() + counts['pos'] = X[y == 1]['pfam_id'].value_counts() + counts = counts.fillna(0) + # Divide each state's emission counts by the total number of observations to get emission frequency + self.emissions = counts / counts.sum(axis=0) + return self + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + vectors = self.emissions.reindex(index=X['pfam_id'], fill_value=0) + return vectors + + +class PositiveProbabilityTransformer(BaseEstimator, TransformerMixin): + """ + Get "positive probability" feature columns for given Domain DataFrame. + Each pfam_id will get two columns: Positive probability and Total probability + Positive probability = probability of being in positive state while seeing given pfam, + which is equivalent to number of occurences of given pfam in the positive state divided by number of occurences in all states + Total probability = probability of seeing given pfam in general, + which is equivalent to number of occurences divided by total length of input sequence + """ + def __init__(self): + self.probs = None + + def fit(self, X, y=None): + vals = pd.DataFrame({'pfam_id': X['pfam_id'], 'y': y}) + negweight = sum(y) / sum(y == 0) + total_num_weighted = sum(y) + sum(y == 0) * negweight + probs = {} + for pfam_id, pfam_y in vals.groupby('pfam_id')['y']: + num_pos = sum(pfam_y) + num_neg = sum(pfam_y == 0) + num_weighted = num_pos + num_neg * negweight + prob = num_pos / num_weighted + prob = (prob - 0.5) * 2 + pfam_frac = num_weighted / total_num_weighted + probs[pfam_id] = [prob, pfam_frac] + self.probs = pd.DataFrame(probs).transpose() + return self + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + vectors = self.probs.reindex(index=X['pfam_id'], fill_value=0) + return vectors + + +class OneHotEncodingTransformer(BaseEstimator, TransformerMixin): + """ + Create a binary one-hot-encoding vector from Domain CSV files. + """ + def __init__(self): + self.pfam_ids = [] + + def transform(self, X, y=None): + # Turn each pfam ID into a vector + return pd.get_dummies(X['pfam_id']).reindex(columns=self.pfam_ids, fill_value=0) + + def fit(self, X, y=None): + self.pfam_ids = np.union1d(self.pfam_ids, X['pfam_id']) + return self + + +class ProteinBorderTransformer(BaseEstimator, TransformerMixin): + """ + Get gene beginning / gene end binary flags from Domain CSV files. + """ + + def __init__(self, field='protein_id'): + self.field = field + + def transform(self, X, y=None): + # current != next (exclude last element because we don't have a next value) + borders = list(X[self.field][:-1].values != X[self.field][1:].values) + gene_ends = np.array(borders + [True]).reshape(-1, 1) + gene_beginnings = np.array([True] + borders).reshape(-1, 1) + return np.concatenate([gene_ends, gene_beginnings], axis=1).astype(np.uint8) + + def fit(self, X, y=None): + return self + + +class GeneDistanceTransformer(BaseEstimator, TransformerMixin): + """ + Returns vector specifying nucleotide distance from the end of previous gene to the start of current gene for first domain in each protein. + Distance between domains in same protein is 0. Distance in first domain of given sample is equal to its gene_start. + + TODO: Warning! 
Do not use the distance transformer for merged samples - the distance would be invalid on sample borders. + """ + + def __init__(self, norm_distance): + print('TODO: Warning! Do not use the distance transformer for merged samples - the distance would be invalid on sample borders.') + self.norm_distance = norm_distance + + def transform(self, X, y=None): + gene_starts = X['gene_start'].values + gene_ends = X['gene_end'].values + previous_gene_ends = np.concatenate([[0], gene_ends[:-1]]) + distances = (gene_starts - previous_gene_ends) / self.norm_distance + # replace negative values with zeros + distances *= distances >= 0 + return distances.astype(np.float32).reshape(-1, 1) + + def fit(self, X, y=None): + return self + +class ColumnSelectTransformer(BaseEstimator, TransformerMixin): + """ + Select given columns of input DataFrame + """ + def __init__(self, columns): + self.columns = columns + + def transform(self, X, y=None): + # Select columns by label (DataFrame.select expects a callable, not a list of labels) + return X[self.columns].values + + def fit(self, X, y=None): + return self \ No newline at end of file diff --git a/deepbgc/main.py b/deepbgc/main.py new file mode 100755 index 0000000..ea560c6 --- /dev/null +++ b/deepbgc/main.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +import argparse +from deepbgc.commands.detect import DetectCommand +from deepbgc.commands.pfam import PfamCommand +from deepbgc.commands.classify import ClassifyCommand +import sys + +COMMANDS = [ + PfamCommand, + DetectCommand, + ClassifyCommand +] + +def _fix_subparsers(subparsers): + if sys.version_info[0] == 3: + subparsers.required = True + subparsers.dest = 'cmd' + + +class DeepBGCParser(argparse.ArgumentParser): + def error(self, message): + self.print_help() + self.exit(2, "{}\n".format(message)) + + +def main(argv=None): + parser = DeepBGCParser(prog='deepbgc', + description='DeepBGC - Biosynthetic Gene Cluster detection and classification.', + formatter_class=argparse.RawTextHelpFormatter) + + # Sub commands + subparsers = parser.add_subparsers( + title='Available Commands', + metavar='COMMAND', + dest='cmd', + help='Use: deepbgc COMMAND --help for command-specific help.') + + _fix_subparsers(subparsers) + + for CommandClass in COMMANDS: + CommandClass.add_subparser(subparsers) + + args = parser.parse_args(argv) + + # Initialize command object + cmd = args.func(args) + # Run command + cmd.run() + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/deepbgc/pipeline.py b/deepbgc/pipeline.py new file mode 100644 index 0000000..9b093c1 --- /dev/null +++ b/deepbgc/pipeline.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# David Prihoda +# Wrapper for a BGC detection model that handles feature transformation and loading model definitions from JSON + +from deepbgc import features +import pickle +import json +from sklearn.base import BaseEstimator, ClassifierMixin +from pprint import pprint + + +class DeepBGCModel(BaseEstimator, ClassifierMixin): + """ + Wrapper for a BGC detection model that handles feature transformation and loading model definitions from JSON + """ + def __init__(self, transformer: features.ListTransformer, model, fit_params: dict, color=None, label=None): + """ + + :param transformer: ListTransformer used to transform Domain DataFrames into feature matrices + :param model: New instance of a BGC detection model + :param fit_params: Params to pass to the fit function of given model + :param color: Model color stored for plotting purposes + :param label: Model label stored for plotting purposes + """ + self.transformer = transformer + self.model = model + 
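# The transformer converts Domain DataFrames into feature matrices for the underlying model; + # fit_params are stored here and merged with any extra keyword arguments passed to fit() below. +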
self.fit_params = fit_params + self.color = color + self.label = label + + def fit(self, samples, y, validation_samples=None, validation_y=None, **extra_fit_params): + """ + Train model with given list of samples, observe performance on given validation samples. + Domain DataFrames are converted to feature matrices using the pipeline's feature transformer. + :param samples: List of Domain DataFrames, each DataFrame contains one BGC or non-BGC sample's sequence of protein domains. + :param y: List of output values, one value for each sequence + :param validation_samples: List of validation samples + :param validation_y: List of validation sample outputs + :param extra_fit_params: Extra fitting parameters to pass to the fit function of given model + :return: self + """ + if validation_y is None: + validation_y = [] + if validation_samples is None: + validation_samples = [] + + self.transformer.fit(samples, y) + + train_X_list = self.transformer.transform(samples, y) + validation_X_list = self.transformer.transform(validation_samples, validation_y) + + merged_params = self.fit_params.copy() + merged_params.update(extra_fit_params) + return self.model.fit(train_X_list, y, validation_X_list=validation_X_list, validation_y_list=validation_y, **merged_params) + + def predict(self, sample): + X_list = self.transformer.transform(sample) + return self.model.predict(X_list) + + @classmethod + def from_config(cls, config, meta_only=False) -> 'DeepBGCModel': + """ + Load model configuration from a JSON config + :param config: Path to JSON config or loaded config dict + :param meta_only: Do not create feature transformers + :return: Untrained pipeline based on given config + """ + if isinstance(config, str): + with open(config) as f: + config = json.loads(f.read()) + elif isinstance(config, dict): + pass + else: + raise AttributeError('Invalid config type "{}": {}'.format(type(config), config)) + + print('Loaded model:') + pprint(config) + + color = config.get('color', 'grey') + label = config.get('label') + build_params = config.get('build_params', {}) + fit_params = config.get('fit_params', {}) + input_params = config.get('input_params', {}) + + # Get class from "models" module. Don't forget to import the class in models.__init__ first! 
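+ # NOTE: no 'models' module is added in this diff and none is imported above - the detector model + # classes are assumed to come from a 'models' package, e.g. (hypothetical import sketch): + # from deepbgc import models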
+ clf_class = getattr(models, config.get('type')) + + # Create a new model instance + model = clf_class(**build_params) + + if meta_only: + transformer = None + else: + feature_params = input_params.get('features', []) + transformer = features.ListTransformer.from_config(feature_params) + + return DeepBGCModel(transformer=transformer, model=model, fit_params=fit_params, color=color, label=label) + + def save(self, path) -> 'DeepBGCModel': + with open(path, 'wb') as f: + pickle.dump(self, f) + return self + + @classmethod + def load(cls, path) -> 'DeepBGCModel': + with open(path, 'rb') as f: + return pickle.load(f) + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a1d65ac --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +from setuptools import setup, find_packages +from deepbgc import VERSION + +install_requires = [ + 'argparse', + 'biopython', + 'scikit-learn', + 'pandas', + 'keras', + 'tensorflow', + 'hmmlearn', + 'matplotlib' +] + +setup( + name='deepbgc', + version=VERSION, + description='DeepBGC - Biosynthetic Gene Cluster detection and classification', + long_description=open('README.md', 'r').read(), + author='David Příhoda, Geoffrey Hannigan', + packages=find_packages(), + author_email='david.prihoda1@merck.com', + license='MIT', + install_requires=install_requires, + keywords='biosynthetic gene clusters, bgc detection, deep learning, pfam2vec', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Programming Language :: Python :: 3', + ], + include_package_data=True, + url='http://www.merck.com', + entry_points={ + 'console_scripts': ['deepbgc = deepbgc.main:main'] + } +)