diff --git a/build.gradle b/build.gradle index 6b97dd0291..a4a30bf5d0 100644 --- a/build.gradle +++ b/build.gradle @@ -53,8 +53,8 @@ configure(allProjs) { ext { scalaVersion = '2.11' scalaVersionRevision = '8' - scalaTestVersion = '3.0.0' - scalaCheckVersion = '1.13.5' + scalaTestVersion = '3.0.5' + scalaCheckVersion = '1.14.0' junitVersion = '4.11' avroVersion = '1.7.7' sparkVersion = '2.2.1' @@ -68,7 +68,7 @@ configure(allProjs) { jodaConvertVersion = '1.8.1' algebirdVersion = '0.12.3' jacksonVersion = '2.7.3' - luceneVersion = '7.1.0' + luceneVersion = '7.3.0' enumeratumVersion = '1.4.12' scoptVersion = '3.5.0' googleLibPhoneNumberVersion = '8.8.5' @@ -179,7 +179,8 @@ configure(allProjs) { header = rootProject.file('LICENSE.txt') ignoreFailures = true include '**/*.java', '**/*.scala' - exclude '**/com/salesforce/op/utils/io/DirectMapreduceOutputCommitter.scala', + exclude '**/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala', + '**/com/salesforce/op/utils/io/DirectMapreduceOutputCommitter.scala', '**/com/salesforce/op/test/TestSparkContext.scala', '**/com/salesforce/op/test/TempDirectoryTest.scala', '**/com/salesforce/op/utils/io/DirectOutputCommitter.scala', diff --git a/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala b/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala index a1a2306dfb..7ea30826c8 100644 --- a/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala +++ b/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala @@ -33,13 +33,14 @@ package com.salesforce.op.cli.gen.templates import com.salesforce.op.features.{FeatureBuilder => FB} import com.salesforce.op.features.types._ + /** * This is a template for generating binary feature handling in a generated project */ class BinaryFeatureTemplate { private[templates] def feature = // BEGIN - FB.Binary[SampleObject] - .extract(o => Option(o.codeGeneration_binaryField_codeGeneration).map(_.booleanValue).toBinary) + FB.Binary[SampleObject] + .extract(o => Option(o.codeGeneration_binaryField_codeGeneration).map(_.booleanValue).toBinary) // END } diff --git a/core/build.gradle b/core/build.gradle index b931f91b28..5290a7e4d0 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -1,5 +1,6 @@ dependencies { compile project(':readers') + testRuntime project(':models') testCompile project(':testkit') // Google libphonenumber @@ -16,6 +17,7 @@ dependencies { // Lucene text analysis compile "org.apache.lucene:lucene-analyzers-common:$luceneVersion" compile "org.apache.lucene:lucene-analyzers-kuromoji:$luceneVersion" + compile "org.apache.lucene:lucene-analyzers-opennlp:$luceneVersion" compile "org.apache.lucene:lucene-suggest:$luceneVersion" // Scopt diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala index c112981524..c6a01d72c0 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala @@ -31,14 +31,18 @@ package com.salesforce.op -import com.salesforce.op.features.{Feature, OPFeature} +import com.salesforce.op.features.OPFeature import com.salesforce.op.filters.RawFeatureFilter import com.salesforce.op.readers.Reader import com.salesforce.op.stages.OPStage +import com.salesforce.op.stages.impl.preparators.CorrelationType +import com.salesforce.op.stages.impl.selector.ModelSelectorBase +import com.salesforce.op.utils.spark.RichDataset._ import 
com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.stages.FitStagesUtil +import com.salesforce.op.utils.stages.FitStagesUtil.{CutDAG, FittedDAG, Layer, StagesDAG} import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.Transformer +import org.apache.spark.ml.{Estimator, Transformer} import org.apache.spark.sql.{DataFrame, SparkSession} import scala.collection.mutable.{MutableList => MList} @@ -163,7 +167,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { Try { stage.set(stage.getParam(k), v) } orElse { - Try { ReflectionUtils.reflectSetterMethod(stage, k).get.apply(v) } + Try { ReflectionUtils.reflectSetterMethod(stage, k, Seq(v)) } } if (setStage.isFailure) log.error( s"Setting parameter $k with value $v for stage $stage with params ${stage.params.toList} failed with an error", @@ -180,7 +184,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { */ private def setStagesDAG(features: Array[OPFeature]): OpWorkflow.this.type = { // Unique stages layered by distance - val uniqueStagesLayered = DAG.compute(features) + val uniqueStagesLayered = FitStagesUtil.computeDAG(features) if (log.isDebugEnabled) { val total = uniqueStagesLayered.map(_.length).sum @@ -311,11 +315,18 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { */ def train(persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): OpWorkflowModel = { - val rawData = generateRawData() - // Update features with fitted stages - val fittedStages = fitStages(data = rawData, stagesToFit = stages, persistEveryKStages) - val newResultFeatures = resultFeatures.map(_.copyWithNewStages(fittedStages)) + val (fittedStages, newResultFeatures) = + if (stages.exists(_.isInstanceOf[Estimator[_]])) { + val rawData = generateRawData() + + // Update features with fitted stages + val fittedStgs = fitStages(data = rawData, stagesToFit = stages, persistEveryKStages) + val newResultFtrs = resultFeatures.map(_.copyWithNewStages(fittedStgs)) + fittedStgs -> newResultFtrs + } else { + stages -> resultFeatures + } val model = new OpWorkflowModel(uid, getParameters()) @@ -327,6 +338,93 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { reader.map(model.setReader).getOrElse(model) } + /** + * Fit the estimators to return a sequence of only transformers + * Modified version of Spark 2.x Pipeline + * + * @param data dataframe to fit on + * @param stagesToFit stages that need to be converted to transformers + * @param persistEveryKStages persist data in transforms every k stages for performance improvement + * @return fitted transformers + */ + protected def fitStages(data: DataFrame, stagesToFit: Array[OPStage], persistEveryKStages: Int) + (implicit spark: SparkSession): Array[OPStage] = { + + // TODO may want to make workflow take an optional reserve fraction + val splitters = stagesToFit.collect { case s: ModelSelectorBase[_, _] => s.splitter }.flatten + val splitter = splitters.reduceOption { (a, b) => + if (a.getReserveTestFraction > b.getReserveTestFraction) a else b + } + val (train, test) = splitter.map(_.split(data)).getOrElse((data, spark.emptyDataFrame)) + val hasTest = !test.isEmpty + + val dag = FitStagesUtil.computeDAG(resultFeatures) + .map(_.filter(s => stagesToFit.contains(s._1))) + .filter(_.nonEmpty) + + // Search for the last estimator + val indexOfLastEstimator: Option[Int] = + dag.collect { case seq if 
seq.exists(_._1.isInstanceOf[Estimator[_]]) => seq.head._2 }.lastOption + + // doing regular workflow fit without workflow level CV + if (!isWorkflowCV) { + FitStagesUtil.fitAndTransformDAG( + dag = dag, + train = train, + test = test, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages + ).transformers + } else { + // doing workflow level CV/TS + // Extract Model Selector and Split the DAG into + val CutDAG(modelSelectorOpt, before, during, after) = FitStagesUtil.cutDAG(dag) + + log.info("Applying initial DAG before CV/TS. Stages: {}", before.flatMap(_.map(_._1.stageName)).mkString(", ")) + val FittedDAG(beforeTrain, beforeTest, beforeTransformers) = FitStagesUtil.fitAndTransformDAG( + dag = before, + train = train, + test = test, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages + ) + + // Break up catalyst (cause it chokes) by converting into rdd, persisting it and then back to dataframe + val (trainRDD, testRDD) = (beforeTrain.rdd.persist(), beforeTest.rdd.persist()) + val (trainFixed, testFixed) = ( + spark.createDataFrame(trainRDD, beforeTrain.schema), + spark.createDataFrame(testRDD, beforeTest.schema) + ) + + modelSelectorOpt match { + case None => beforeTransformers + case Some((modelSelector, distance)) => + // estimate best model + log.info("Estimate best Model with CV/TS. Stages included in CV are: {}, {}", + during.flatMap(_.map(_._1.stageName)).mkString(", "), modelSelector.uid: Any + ) + modelSelector.findBestEstimator(trainFixed, during, persistEveryKStages) + val remainingDAG: StagesDAG = (during :+ (Array(modelSelector -> distance): Layer)) ++ after + + log.info("Applying DAG after CV/TS. Stages: {}", remainingDAG.flatMap(_.map(_._1.stageName)).mkString(", ")) + val fitted = FitStagesUtil.fitAndTransformDAG( + dag = remainingDAG, + train = trainFixed, + test = testFixed, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages, + fittedTransformers = beforeTransformers + ).transformers + trainRDD.unpersist() + testRDD.unpersist() + fitted + } + } + } + /** * Replaces any estimators in this workflow with their corresponding fit models from the OpWorkflowModel * passed in. Note that the Stages UIDs must EXACTLY correspond in order to be replaced so the same features @@ -352,15 +450,25 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { def loadModel(path: String): OpWorkflowModel = new OpWorkflowModelReader(this).load(path) /** - * Returns a dataframe containing all the columns generated up to the feature input + * Returns a dataframe containing all the columns generated up to and including the feature input * * @param feature input feature to compute up to * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return Dataframe containing columns corresponding to all of the features generated before the feature given + * @return Dataframe containing columns corresponding to all of the features generated up to the feature given */ def computeDataUpTo(feature: OPFeature, persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): DataFrame = { - computeDataUpTo(stopStage = findOriginStageId(feature), fitted = false, persistEveryKStages) + if (findOriginStageId(feature).isEmpty) { + log.warn("Could not find origin stage for feature in workflow!! 
Defaulting to generate raw features.") + generateRawData() + } else { + val rawData = generateRawData() + val stagesToFit = FitStagesUtil.computeDAG(Array(feature)).flatMap(_.map(_._1)) + val fittedStages = fitStages(rawData, stagesToFit, persistEveryKStages) + val updatedFeature = feature.copyWithNewStages(fittedStages) + val dag = FitStagesUtil.computeDAG(Array(updatedFeature)) + applyTransformationsDAG(rawData, dag, persistEveryKStages) + } } /** @@ -383,16 +491,35 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { * @tparam T Type of the data read in */ @Experimental - def withRawFeatureFilter[T](trainingReader: Option[Reader[T]], scoringReader: Option[Reader[T]], - bins: Int = 100, minFillRate: Double = 0.001, maxFillDifference: Double = 0.90, - maxFillRatioDiff: Double = 20.0, maxJSDivergence: Double = 0.90, protectedFeatures: Array[OPFeature] = Array.empty + def withRawFeatureFilter[T]( + trainingReader: Option[Reader[T]], + scoringReader: Option[Reader[T]], + bins: Int = 100, + minFillRate: Double = 0.001, + maxFillDifference: Double = 0.90, + maxFillRatioDiff: Double = 20.0, + maxJSDivergence: Double = 0.90, + maxCorrelation: Double = 0.95, + correlationType: CorrelationType = CorrelationType.Pearson, + protectedFeatures: Array[OPFeature] = Array.empty ): this.type = { val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]]) require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" + "as the reader for the workflow") val protectedRawFeatures = protectedFeatures.flatMap(_.rawFeatures).map(_.name).toSet - rawFeatureFilter = Option( new RawFeatureFilter(training.get, scoringReader, bins, minFillRate, - maxFillDifference, maxFillRatioDiff, maxJSDivergence, protectedRawFeatures) ) + rawFeatureFilter = Option { + new RawFeatureFilter( + trainingReader = training.get, + scoreReader = scoringReader, + bins = bins, + minFill = minFillRate, + maxFillDifference = maxFillDifference, + maxFillRatioDiff = maxFillRatioDiff, + maxJSDivergence = maxJSDivergence, + maxCorrelation = maxCorrelation, + correlationType = correlationType, + protectedFeatures = protectedRawFeatures) + } this } diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala index bec870c95b..ee0e268244 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala @@ -31,84 +31,33 @@ package com.salesforce.op -import com.salesforce.op.DAG.{Layer, StagesDAG} +import com.salesforce.op.utils.stages.FitStagesUtil._ +import com.salesforce.op.utils.stages.FitStagesUtil import com.salesforce.op.features.OPFeature import com.salesforce.op.features.types.FeatureType import com.salesforce.op.readers.{CustomReader, Reader, ReaderKey} -import com.salesforce.op.stages.impl.selector.ModelSelectorBase import com.salesforce.op.stages.{FeatureGeneratorStage, OPStage, OpTransformer} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.stages.FitStagesUtil +import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.slf4j.LoggerFactory -import scala.collection.mutable.ListBuffer import scala.reflect.runtime.universe.WeakTypeTag -private[op] case object DAG { - - private[op] type Layer = Array[(OPStage, Int)] - private[op] type StagesDAG = 
Array[Layer] - - /** - * Computes stages DAG - * - * @param features array if features in workflow - * @return unique stages layered by distance (desc order) - */ - def compute(features: Array[OPFeature]): StagesDAG = { - - val (failures, parents) = features.map(_.parentStages()).partition(_.isFailure) - - if (failures.nonEmpty) { - throw new IllegalArgumentException("Failed to compute stages DAG", failures.head.failed.get) - } - - // Stages sorted by distance - val sortedByDistance: Array[(OPStage, Int)] = parents.flatMap(_.get) - - // Stages layered by distance - val layeredByDistance: StagesDAG = createLayers(sortedByDistance) - - - // Unique stages layered by distance - layeredByDistance - .foldLeft(Set.empty[OPStage], Array.empty[Array[(OPStage, Int)]]) { - case ((seen, filtered), uncleaned) => - // filter out any seen stages. also add distinct to filter out any duplicate stages in layer - val unseen = uncleaned.filterNot(v => seen.contains(v._1)).distinct - val nowSeen = seen ++ unseen.map(_._1) - (nowSeen, filtered :+ unseen) - }._2 - } - - /** - * Layers Stages by distance - * - * @param stages stages sorted by distance - * @return stages layered by distance - */ - def createLayers(stages: Array[(OPStage, Int)]): StagesDAG = { - stages.groupBy(_._2).toArray - .map(_._2.sortBy(_._1.getOutputFeatureName)) - .sortBy(s => -s.head._2) - } -} - /** * Parameters for pipelines and pipeline models */ private[op] trait OpWorkflowCore { - @transient implicit protected lazy val log = LoggerFactory.getLogger(this.getClass) + @transient protected lazy val log = LoggerFactory.getLogger(this.getClass) // the uid of the stage def uid: String - // Model Selector - private[op] type MS = ModelSelectorBase[_ <: Model[_], _ <: Estimator[_]] + // whether the CV/TV is performed on the workflow level + private[op] var isWorkflowCV = false // the data reader for the workflow or model private[op] var reader: Option[Reader[_]] = None @@ -138,6 +87,19 @@ private[op] trait OpWorkflowCore { this } + /** + * :: Experimental :: + * Decides whether the cross-validation/train-validation-split will be done at workflow level + * This will remove issues with data leakage, however it will impact the runtime + * + * @return this workflow that will train part of the DAG in the cross-validation/train validation split + */ + @Experimental + final def withWorkflowCV: this.type = { + isWorkflowCV = true + this + } + /** * Set data reader that will be used to generate data frame for stages @@ -256,83 +218,6 @@ private[op] trait OpWorkflowCore { */ protected def generateRawData()(implicit spark: SparkSession): DataFrame - /** - * Fit the estimators to return a sequence of only transformers - * Modified version of Spark 2.x Pipeline - * - * @param data dataframe to fit on - * @param stagesToFit stages that need to be converted to transformers - * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return fitted transformers - */ - protected def fitStages(data: DataFrame, stagesToFit: Array[OPStage], persistEveryKStages: Int) - (implicit spark: SparkSession): Array[OPStage] = { - - // TODO may want to make workflow take an optional reserve fraction - val splitters = stagesToFit.collect{ case s: ModelSelectorBase[_, _] => s.splitter }.flatten - val splitter = splitters.reduceOption{ (a, b) => if (a.getReserveTestFraction > b.getReserveTestFraction) a else b } - val (train, test) = splitter.map(_.split(data)).getOrElse{ (data, spark.emptyDataFrame) } - val hasTest = !test.isEmpty - 
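Taken together, the new workflow-level cross-validation switch and the widened raw-feature-filter signature are chained directly on the workflow. A minimal sketch — `prediction`, `trainReader`, and `scoreReader` are hypothetical placeholders, and an implicit `SparkSession` is assumed in scope for `train()`:

```scala
import com.salesforce.op.OpWorkflow
import com.salesforce.op.stages.impl.preparators.CorrelationType

// Sketch only: `prediction`, `trainReader` and `scoreReader` are placeholders.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction)
  .withWorkflowCV // new: fit the CV/TS-relevant part of the DAG inside CV to avoid leakage
  .withRawFeatureFilter(
    trainingReader = Some(trainReader),
    scoringReader = Some(scoreReader),
    maxCorrelation = 0.95,                     // new parameter in this change
    correlationType = CorrelationType.Pearson  // new parameter in this change
  )
val model = workflow.train() // requires an implicit SparkSession
```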
- val dag = DAG.compute(resultFeatures) - .map(_.filter(s => stagesToFit.contains(s._1))) - .filter(_.nonEmpty) - - // Search for the last estimator - val indexOfLastEstimator = dag - .collect { case seq if seq.exists( _._1.isInstanceOf[Estimator[_]] ) => seq.head._2 } - .lastOption - - val transformers = ListBuffer.empty[OPStage] - - dag.foldLeft((train.toDF(), test.toDF())) { - case ((currTrain, currTest), stagesLayer) => - val index = stagesLayer.head._2 - - val (newTrain, newTest, fitTransform) = FitStagesUtil.fitAndTransform( - train = currTrain, - test = currTest, - stages = stagesLayer.map(_._1), - transformData = indexOfLastEstimator.exists(_ < index), // only need to update for fit before last estimator - persistEveryKStages = persistEveryKStages, - doTest = Some(hasTest) - ) - - transformers.append(fitTransform: _*) - newTrain -> newTest - } - transformers.toArray - } - - - /** - * Returns a Dataframe containing all the columns generated up to the stop stage - * @param stopStage last stage to apply - * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return Dataframe containing columns corresponding to all of the features generated before the feature given - */ - protected def computeDataUpTo(stopStage: Option[Int], fitted: Boolean, persistEveryKStages: Int) - (implicit spark: SparkSession): DataFrame = { - if (stopStage.isEmpty) { - log.warn("Could not find origin stage for feature in workflow!! Defaulting to generate raw features.") - generateRawData() - } else { - val featureStages = stages.slice(0, stopStage.get) - log.info("Found parent stage and computing features up to that stage:\n{}", - featureStages.map(s => s.uid + " --> " + s.getOutputFeatureName).mkString("\n") - ) - val rawData = generateRawData() - - if (!fitted) { - val stages = fitStages(rawData, featureStages, persistEveryKStages) - .map(_.asInstanceOf[Transformer]) - FitStagesUtil.applySparkTransformations(rawData, stages, persistEveryKStages) // TODO use DAG transform - } else { - featureStages.foldLeft(rawData)((data, stage) => stage.asInstanceOf[Transformer].transform(data)) - } - } - } - /** * Returns a dataframe containing all the columns generated up to the feature input * @@ -353,64 +238,6 @@ private[op] trait OpWorkflowCore { df.saveAvro(path) } - /** - * Method that cut DAG in order to perform proper CV/TS - * - * @param dag DAG in the workflow to be cut - * @return (Model Selector, nonCVTS DAG -to be done outside of CV/TS, CVTS DAG -to apply in the CV/TS) - */ - protected[op] def cutDAG(dag: StagesDAG): (Option[MS], StagesDAG, StagesDAG) = { - if (dag.isEmpty) (None, Array.empty, Array.empty) else { - // creates Array containing every Model Selector in the DAG - val modelSelectorArrays = dag.flatten.collect { case (ms: MS, dist: Int) => (ms, dist) } - val modelSelector = modelSelectorArrays.toList match { - case Nil => None - case List(ms) => Option(ms) - case modelSelectors => throw new IllegalArgumentException( - s"OpWorkflow can contain at most 1 Model Selector. Found ${modelSelectors.length} Model Selectors :" + - s" ${modelSelectors.map(_._1).mkString(",")}") - } - - // nonCVTS and CVTS DAGs - val (nonCVTSDAG: StagesDAG, cVTSDAG: StagesDAG) = modelSelector.map { case (ms, dist) => - // Optimize the DAG by removing stages unrelated to ModelSelector - val modelSelectorDAG = DAG.compute(Array(ms.getOutput())).dropRight(1) - - // Create the DAG without Model Selector. It will be used to compute the final nonCVTS DAG. 
- val nonMSDAG: StagesDAG = { - dag.filter(_.exists(_._2 >= dist)).toList match { - case stages :: Nil => Array(stages.filterNot(_._1.isInstanceOf[MS])) - case xs :+ x => xs.toArray :+ x.filterNot(_._1.isInstanceOf[MS]) - } - }.filter(!_.isEmpty) // Remove empty layers - - // Index of first CVTS stage in ModelSelector DAG - val firstCVTSIndex = modelSelectorDAG.toList.indexWhere(_.exists(stage => { - val inputs = stage._1.getTransientFeatures() - inputs.exists(_.isResponse) && inputs.exists(!_.isResponse) - })) - - // If no CVTS stages, the whole DAG is not in the CV/TS - if (firstCVTSIndex == -1) (nonMSDAG, Array.empty[Layer]) else { - - val cVTSDAG = modelSelectorDAG.drop(firstCVTSIndex) - - // nonCVTSDAG is the complementary DAG - // The rule is "nonCVTSDAG = nonMSDAG - CVTSDAG" - val nonCVTSDAG = { - val flattenedCVTSDAG = cVTSDAG.flatten.map(_._1) - nonMSDAG.map(_.filterNot { case (stage: OPStage, _) => flattenedCVTSDAG.contains(stage) }) - .filter(!_.isEmpty) // Remove empty layers - } - - (nonCVTSDAG, cVTSDAG) - } - }.getOrElse((Array.empty[Layer], Array.empty[Layer])) - (modelSelector.map(_._1), nonCVTSDAG, cVTSDAG) - } - } - - /** * Efficiently applies all fitted stages grouping by level in the DAG where possible * @@ -425,6 +252,9 @@ private[op] trait OpWorkflowCore { )(implicit spark: SparkSession): DataFrame = { // A holder for the last persisted rdd var lastPersisted: Option[DataFrame] = None + if (dag.exists(_.exists(_._1.isInstanceOf[Estimator[_]]))) { + throw new IllegalArgumentException("Cannot apply transformations to DAG that contains estimators") + } // Apply stages layer by layer dag.foldLeft(rawData) { case (df, stagesLayer) => diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala index a40f2ae8f8..665dde3277 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala @@ -38,15 +38,14 @@ import com.salesforce.op.readers.DataFrameFieldNames._ import com.salesforce.op.stages.{OPStage, OpPipelineStage, OpTransformer} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichMetadata._ -import org.apache.spark.ml._ -import org.apache.spark.rdd.RDD +import com.salesforce.op.utils.stages.FitStagesUtil +import org.apache.spark.ml.Estimator import org.apache.spark.sql.types.Metadata -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, SparkSession} import org.json4s.JValue import org.json4s.JsonAST.{JField, JObject} import org.json4s.jackson.JsonMethods.{pretty, render} -import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @@ -93,15 +92,22 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams } /** - * Returns a dataframe containing all the columns generated up to the feature input + * Returns a dataframe containing all the columns generated up to and including the feature input * * @param feature input feature to compute up to * @throws IllegalArgumentException if a feature is not part of this workflow - * @return Dataframe containing columns corresponding to all of the features generated before the feature given + * @return Dataframe containing columns corresponding to all of the features generated up to the feature given */ def computeDataUpTo(feature: OPFeature, persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): DataFrame = { - 
computeDataUpTo(stopStage = findOriginStageId(feature), fitted = true, persistEveryKStages = persistEveryKStages) + if (findOriginStageId(feature).isEmpty) { + log.warn("Could not find origin stage for feature in workflow!! Defaulting to generate raw features.") + generateRawData() + } else { + val fittedFeature = feature.copyWithNewStages(stages) + val dag = FitStagesUtil.computeDAG(Array(fittedFeature)) + applyTransformationsDAG(generateRawData(), dag, persistEveryKStages) + } } /** @@ -123,7 +129,6 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams * @throws IllegalArgumentException if a feature is not part of this workflow * @return Updated instance of feature */ - // TODO change this method to give you raw features for use in stacked workflows def getUpdatedFeatures(features: Array[OPFeature]): Array[OPFeature] = { val allFeatures = rawFeatures ++ blacklistedFeatures ++ stages.map(_.getOutput()) features.map{f => allFeatures.find(_.sameOrigin(f)) @@ -307,9 +312,8 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams require(persistEveryKStages >= 1, s"persistEveryKStages value of $persistEveryKStages is invalid must be >= 1") // TODO: replace 'stages' with 'stagesDag'. (is a breaking change for serialization, but would simplify scoreFn) - // Pre-compute transformations dag - val dag = DAG.compute(resultFeatures) + val dag = FitStagesUtil.computeDAG(resultFeatures) (path: Option[String]) => { // Generate the dataframe with raw features diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala index 1a63fdc36d..324145b3c0 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala @@ -95,7 +95,6 @@ trait RichDateFeature { trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, others: Array[FeatureLike[Date]] = Array.empty ): FeatureLike[OPVector] = { - // vectorize DateList f.toDateList().vectorize(dateListPivot = dateListPivot, referenceDate = referenceDate, trackNulls = trackNulls, others = others.map(_.toDateList())) } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala b/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala index 6fbc2c9da5..e621cc76eb 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala @@ -64,18 +64,20 @@ trait RichFeaturesCollection { * Convert features into a single vector feature using the feature engineering steps most likely to provide * good results based on the types of the individual features passed in * + * @param label optional label feature to be passed into stages that require the label column * @return vector feature */ - def transmogrify(): FeatureLike[OPVector] = - Transmogrifier.transmogrify(features.toSeq)(TransmogrifierDefaults).combine() + def transmogrify(label: Option[FeatureLike[RealNN]] = None): FeatureLike[OPVector] = + Transmogrifier.transmogrify(features = features.toSeq, label = label)(TransmogrifierDefaults).combine() /** * Convert features into a single vector feature using the feature engineering steps most likely to provide - * good results based on the types of the individual features passed in. 
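The optional label now threads through `transmogrify` to label-aware stages such as the auto-bucketizer. A minimal sketch with hypothetical feature handles:

```scala
// `age` and `income` are hypothetical numeric features; `survived` is a RealNN label.
val featureVector: FeatureLike[OPVector] =
  Seq(age, income).transmogrify(label = Some(survived))
```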
+ * good results based on the types of the individual features passed in * + * @param label optional label feature to be passed into stages that require the label column * @return vector feature */ - def autoTransform(): FeatureLike[OPVector] = transmogrify() + def autoTransform(label: Option[FeatureLike[RealNN]] = None): FeatureLike[OPVector] = transmogrify(label = label) } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala index 0661ef6077..07e3e2c96d 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala @@ -252,6 +252,7 @@ trait RichMapFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. * @param hashAlgorithm hash algorithm to use @@ -276,6 +277,7 @@ trait RichMapFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[TextMap]] = Array.empty @@ -298,6 +300,7 @@ trait RichMapFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -381,6 +384,7 @@ trait RichMapFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. 
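The new `hashSpaceStrategy` parameter sits alongside the older `forceSharedHashSpace` boolean. A hedged sketch: the import path and the `Shared` member are assumptions based on the enum's name, `comments` is a hypothetical `TextMap` feature, and the remaining `vectorize` parameters are assumed to keep their defaults, as the visible ones do:

```scala
import com.salesforce.op.stages.impl.feature.HashSpaceStrategy // assumed path

// Hash all text-map features into one shared space rather than per-feature spaces.
val hashed = comments.vectorize(
  hashSpaceStrategy = HashSpaceStrategy.Shared // assumed enum member
)
```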
* @param hashAlgorithm hash algorithm to use @@ -405,6 +409,7 @@ trait RichMapFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[TextAreaMap]] = Array.empty @@ -427,6 +432,7 @@ trait RichMapFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -518,7 +524,7 @@ trait RichMapFeature { } /** - * Apply RealMapVectorizer on any OPMap that has double values + * Apply RealMapVectorizer or auto bucketizer (when label is present) on any OPMap that has double values * * @param others other features of the same type * @param defaultValue value to give missing keys on pivot @@ -526,6 +532,10 @@ trait RichMapFeature { * @param whiteListKeys keys to whitelist * @param blackListKeys keys to blacklist * @param trackNulls option to keep track of values that were missing + * @param label optional label column to be passed into autoBucketizer if present + * @param trackInvalid option to keep track of invalid values, + * eg. NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree * * @return an OPVector feature */ @@ -536,17 +546,29 @@ trait RichMapFeature { whiteListKeys: Array[String] = Array.empty, blackListKeys: Array[String] = Array.empty, others: Array[FeatureLike[T]] = Array.empty, - trackNulls: Boolean = TransmogrifierDefaults.TrackNulls + trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - new RealMapVectorizer[T]() - .setInput(f +: others) - .setFillWithMean(fillWithMean) - .setDefaultValue(defaultValue) - .setCleanKeys(cleanKeys) - .setWhiteListKeys(whiteListKeys) - .setBlackListKeys(blackListKeys) - .setTrackNulls(trackNulls) - .getOutput() + label match { + case None => + new RealMapVectorizer[T]() + .setInput(f +: others) + .setFillWithMean(fillWithMean) + .setDefaultValue(defaultValue) + .setCleanKeys(cleanKeys) + .setWhiteListKeys(whiteListKeys) + .setBlackListKeys(blackListKeys) + .setTrackNulls(trackNulls) + .getOutput() + case Some(lbl) => + autoBucketize( + label = lbl, trackNulls = trackNulls, trackInvalid = trackInvalid, + minInfoGain = minInfoGain, cleanKeys = cleanKeys, + whiteListKeys = whiteListKeys, blackListKeys = blackListKeys + ) + } } } @@ -590,7 +612,7 @@ trait RichMapFeature { } /** - * Apply IntegralMapVectorizer on any OPMap that has long values + * Apply IntegralMapVectorizer or auto bucketizer (when label is present) on any OPMap that has long values * * @param others other features of the same type * @param defaultValue value to give missing keys on pivot @@ -598,6 +620,10 @@ trait RichMapFeature { * @param whiteListKeys keys to whitelist * @param blackListKeys keys to blacklist * @param trackNulls option to keep track of values that were missing + * @param label optional label column to be passed into autoBucketizer if 
present + * @param trackInvalid option to keep track of invalid values, + * eg. NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree * * @return an OPVector feature */ @@ -608,17 +634,29 @@ trait RichMapFeature { whiteListKeys: Array[String] = Array.empty, blackListKeys: Array[String] = Array.empty, others: Array[FeatureLike[T]] = Array.empty, - trackNulls: Boolean = TransmogrifierDefaults.TrackNulls + trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - new IntegralMapVectorizer[T]() - .setInput(f +: others) - .setFillWithMode(fillWithMode) - .setDefaultValue(defaultValue) - .setCleanKeys(cleanKeys) - .setWhiteListKeys(whiteListKeys) - .setBlackListKeys(blackListKeys) - .setTrackNulls(trackNulls) - .getOutput() + label match { + case None => + new IntegralMapVectorizer[T]() + .setInput(f +: others) + .setFillWithMode(fillWithMode) + .setDefaultValue(defaultValue) + .setCleanKeys(cleanKeys) + .setWhiteListKeys(whiteListKeys) + .setBlackListKeys(blackListKeys) + .setTrackNulls(trackNulls) + .getOutput() + case Some(lbl) => + autoBucketize( + label = lbl, trackNulls = trackNulls, trackInvalid = trackInvalid, + minInfoGain = minInfoGain, cleanKeys = cleanKeys, + whiteListKeys = whiteListKeys, blackListKeys = blackListKeys + ) + } } } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala index 95378c8ca2..539dfac834 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala @@ -316,22 +316,37 @@ trait RichNumericFeature { * @param fillValue value to pull in place of nulls * @param trackNulls keep tract of when nulls occur by adding a second column to the vector with a null indicator * @param fillWithMean replace missing values with mean (as apposed to constant provided in fillValue) - * @return + * @param trackInvalid option to keep track of invalid values, + * eg. 
NaN, -/+Inf or values that fall outside the buckets
+   * @param minInfoGain  minimum info gain, one of the stopping criteria of the Decision Tree for the autoBucketizer
+   * @param label        optional label column to be passed into autoBucketizer if present
+   * @return a vector feature containing the raw Features with filled missing values and the bucketized
+   *         features if a label argument is passed
   */
  def vectorize
  (
    fillValue: Double,
    fillWithMean: Boolean,
    trackNulls: Boolean,
-   others: Array[FeatureLike[T]] = Array.empty
+   others: Array[FeatureLike[T]] = Array.empty,
+   trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid,
+   minInfoGain: Double = TransmogrifierDefaults.MinInfoGain,
+   label: Option[FeatureLike[RealNN]] = None
  ): FeatureLike[OPVector] = {
-   val stage = new RealVectorizer[T]()
-     .setInput(f +: others)
-     .setTrackNulls(trackNulls)
+   val features = f +: others
+   val stage = new RealVectorizer[T]().setInput(features).setTrackNulls(trackNulls)
    if (fillWithMean) stage.setFillWithMean else stage.setFillWithConstant(fillValue)
-   stage.getOutput()
+   val filledValues = stage.getOutput()
+   label match {
+     case None =>
+       filledValues
+     case Some(lbl) =>
+       val bucketized = features.map(
+         _.autoBucketize(label = lbl, trackNulls = false, trackInvalid = trackInvalid, minInfoGain = minInfoGain)
+       )
+       new VectorsCombiner().setInput(filledValues +: bucketized).getOutput()
+   }
  }
-
}
@@ -418,6 +433,10 @@ trait RichNumericFeature {
   * @param minVariance Minimum amount of variance allowed for each feature and label
   * @param removeBadFeatures If set to true, this will automatically remove all the bad features
   *                          from the feature vector
+  * @param removeFeatureGroup remove all features descended from a parent feature
+  * @param protectTextSharedHash protect text shared hash from related null indicators and other hashes
+  * @param categoricalLabel If true, treat label as categorical. If not set, check number of distinct labels to
+  *                         decide whether a label should be treated as categorical.
   * @return sanity checked feature vector
   */
  // scalastyle:off
@@ -434,6 +453,7 @@ trait RichNumericFeature {
    minVariance: Double = SanityChecker.MinVariance,
    removeBadFeatures: Boolean = SanityChecker.RemoveBadFeatures,
    removeFeatureGroup: Boolean = SanityChecker.RemoveFeatureGroup,
+   protectTextSharedHash: Boolean = SanityChecker.ProtectTextSharedHash,
    categoricalLabel: Option[Boolean] = None
  ): FeatureLike[OPVector] = {
    // scalastyle:on
@@ -449,6 +469,7 @@ trait RichNumericFeature {
      .setMinVariance(minVariance)
      .setRemoveBadFeatures(removeBadFeatures)
      .setRemoveFeatureGroup(removeFeatureGroup)
+     .setProtectTextSharedHash(protectTextSharedHash)
      .setInput(f, featureVector)

    categoricalLabel.foreach(checker.setCategoricalLabel)
@@ -578,18 +599,36 @@ trait RichNumericFeature {
   * @param fillValue value to pull in place of nulls
   * @param trackNulls keep tract of when nulls occur by adding a second column to the vector with a null indicator
   * @param fillWithMode replace missing values with mode (as apposed to constant provided in fillValue)
-  * @return
+  * @param trackInvalid option to keep track of invalid values,
+  *                     eg. 
NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree for the autoBucketizer + * @param label optional label column to be passed into autoBucketizer if present + * @return a vector feature containing the raw Features with filled missing values and the bucketized + * features if a label argument is passed */ def vectorize ( fillValue: Long, fillWithMode: Boolean, trackNulls: Boolean, - others: Array[FeatureLike[T]] = Array.empty + others: Array[FeatureLike[T]] = Array.empty, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - val stage = new IntegralVectorizer().setInput(f +: others).setTrackNulls(trackNulls) + val features = f +: others + val stage = new IntegralVectorizer[T]().setInput(features).setTrackNulls(trackNulls) if (fillWithMode) stage.setFillWithMode else stage.setFillWithConstant(fillValue) - stage.getOutput() + val filledValues = stage.getOutput() + label match { + case None => + filledValues + case Some(lbl) => + val bucketized = features.map( + _.autoBucketize(label = lbl, trackNulls = false, trackInvalid = trackInvalid, minInfoGain = minInfoGain) + ) + new VectorsCombiner().setInput(filledValues +: bucketized).getOutput() + } } } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala index baf25c1e07..46f4a91bf3 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala @@ -191,6 +191,7 @@ trait RichTextFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. * @param hashAlgorithm hash algorithm to use @@ -215,6 +216,7 @@ trait RichTextFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[T]] = Array.empty @@ -237,6 +239,7 @@ trait RichTextFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -249,8 +252,8 @@ trait RichTextFeature { * The indices are in [0, numLabels), ordered by label frequencies. * So the most frequent label gets index 0. 
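The label-aware numeric `vectorize` above can be exercised as follows — a sketch using the exact new signature, with hypothetical `income` (Real) and `survived` (RealNN) features:

```scala
// With a label, each feature is also auto-bucketized by a decision tree and the
// buckets are appended to the filled values via VectorsCombiner.
val incomeVec: FeatureLike[OPVector] = income.vectorize(
  fillValue = 0.0,
  fillWithMean = true,
  trackNulls = true,
  label = Some(survived)
)
```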
   *
-  * @param unseenName name to give strings that appear in transform but not in fit
-  * @param handleInvalid how to transform values not seen in fitting
+  * @param unseenName    name to give strings that appear in transform but not in fit
+  * @param handleInvalid how to transform values not seen in fitting
   * @see [[OpIndexToString]] for the inverse transformation
   *
   * @return indexed real feature
@@ -263,7 +266,7 @@ trait RichTextFeature {
      case StringIndexerHandleInvalid.NoFilter => f.transformWith( new OpStringIndexerNoFilter[T]().setUnseenName(unseenName) )
-     case _ => f.transformWith( new OpStringIndexer[T]().setHandleInvalid(handleInvalid) )
+     case _ => f.transformWith(new OpStringIndexer[T]().setHandleInvalid(handleInvalid))
    }
  }
@@ -273,10 +276,10 @@ trait RichTextFeature {
   * @param languageDetector    a language detector instance
   * @param analyzer            a text analyzer instance
   * @param autoDetectLanguage  indicates whether to attempt language detection
-  * @param defaultLanguage     default language to assume in case autoDetectLanguage is disabled or
-  *                            failed to make a good enough prediction.
   * @param autoDetectThreshold Language detection threshold. If none of the detected languages have
   *                            confidence greater than the threshold then defaultLanguage is used.
+  * @param defaultLanguage     default language to assume in case autoDetectLanguage is disabled or
+  *                            failed to make a good enough prediction.
   * @param minTokenLength      minimum token length, >= 1.
   * @param toLowercase         indicates whether to convert all characters to lowercase before analyzing
   * @return tokenized feature
@@ -303,10 +306,10 @@ trait RichTextFeature {
   * Tokenize text using [[LuceneTextAnalyzer]] with [[OptimaizeLanguageDetector]]
   *
   * @param autoDetectLanguage  indicates whether to attempt language detection
-  * @param defaultLanguage     a language to assume in case no language was detected or
-  *                            when autoDetectLanguage is set to false
   * @param autoDetectThreshold Language detection threshold. If none of the detected languages have
   *                            confidence greater than the threshold then defaultLanguage is used.
+  * @param defaultLanguage     default language to assume in case autoDetectLanguage is disabled or
+  *                            failed to make a good enough prediction.
   * @param minTokenLength      minimum token length, >= 1.
   * @param toLowercase         indicates whether to convert all characters to lowercase before analyzing
   * @param stripHtml           indicates whether to strip HTML tags from the text or not before analyzing
@@ -379,6 +382,37 @@ trait RichTextFeature {
    def detectLanguages(languageDetector: LanguageDetector = LangDetector.DefaultDetector): FeatureLike[RealMap] =
      f.transformWith(new LangDetector[T](languageDetector))

+   /**
+    * Find named entities in the text using the OpenNLP [[OpenNLPAnalyzer]]
+    *
+    * @param languageDetector    a language detector instance
+    * @param analyzer            a text analyzer instance
+    * @param sentenceSplitter    sentence splitter
+    * @param tagger              named entity recognition tagger
+    * @param autoDetectLanguage  indicates whether to attempt language detection
+    * @param autoDetectThreshold Language detection threshold. If none of the detected languages have
+    *                            confidence greater than the threshold then defaultLanguage is used.
+    * @param defaultLanguage     default language to assume in case autoDetectLanguage is disabled or
+    *                            failed to make a good enough prediction.
+    * @return named entity sets feature
+    */
+   def recognizeEntities
+   (
+     languageDetector: LanguageDetector = NameEntityRecognizer.LanguageDetector,
+     analyzer: TextAnalyzer = NameEntityRecognizer.Analyzer,
+     sentenceSplitter: SentenceSplitter = NameEntityRecognizer.Splitter,
+     tagger: NameEntityTagger[_ <: TaggerResult] = NameEntityRecognizer.Tagger,
+     autoDetectLanguage: Boolean = NameEntityRecognizer.AutoDetectLanguage,
+     autoDetectThreshold: Double = NameEntityRecognizer.AutoDetectThreshold,
+     defaultLanguage: Language = NameEntityRecognizer.DefaultLanguage
+   ): FeatureLike[MultiPickListMap] = {
+     f.transformWith(
+       new NameEntityRecognizer[T](languageDetector, analyzer, sentenceSplitter, tagger)
+         .setAutoDetectLanguage(autoDetectLanguage)
+         .setAutoDetectThreshold(autoDetectThreshold)
+         .setDefaultLanguage(defaultLanguage)
+     )
+   }
  }

  implicit class RichPhoneFeature(val f: FeatureLike[Phone]) {
@@ -480,10 +514,10 @@ trait RichTextFeature {
     * 0 if invalid and with an optional second element idicating if the phone number was null
     *
     * @param defaultRegion region against which to check phone validity
-    * @param isStrict strict validation means cannot have extra digits
-    * @param trackNulls produce column indicating if the number was null
-    * @param fillValue value to fill in for nulls in vactor creation
-    * @param others other phone numbers to vectorize
+    * @param isStrict      strict validation means cannot have extra digits
+    * @param trackNulls    produce column indicating if the number was null
+    * @param fillValue     value to fill in for nulls in vector creation
+    * @param others        other phone numbers to vectorize
     * @return vector feature containing information about phone number
     */
    def vectorize(
@@ -503,12 +537,14 @@ trait RichTextFeature {
    /**
     * Extract email prefixes
+    *
     * @return email prefix
     */
    def toEmailPrefix: FeatureLike[Text] = f.map[Text](_.prefix.toText, "prefix")

    /**
     * Extract email domains
+    *
     * @return email domain
     */
    def toEmailDomain: FeatureLike[Text] = f.map[Text](_.domain.toText, "domain")
@@ -518,10 +554,10 @@ trait RichTextFeature {
     * and keeping the top K occurrences of each feature, along with an extra column per feature
     * indicating how many values were not in the top K.
     *
-    * @param others Other [[Email]] features
-    * @param topK How many values to keep in the vector
+    * @param others     Other [[Email]] features
+    * @param topK       How many values to keep in the vector
     * @param minSupport Min times a value must occur to be retained in pivot
-    * @param cleanText If true, ignores capitalization and punctuations when grouping categories
+    * @param cleanText  If true, ignores capitalization and punctuations when grouping categories
     * @param trackNulls keep an extra column that indicated if feature was null
     * @return The vectorized features
     */
@@ -563,6 +599,7 @@ trait RichTextFeature {
    /**
     * Verifies if the url is of correct form of "Uniform Resource Identifiers (URI): Generic Syntax"
     * RFC2396 (http://www.ietf.org/rfc/rfc2396.txt)
+    *
     * @param protocols url protocols to consider valid, i.e. http, https, ftp etc.
     */
    def isValidUrl(protocols: Array[String]): FeatureLike[Binary] = f.exists(_.isValid(protocols))
@@ -572,10 +609,10 @@ trait RichTextFeature {
     * and keeping the top K occurrences of each feature, along with an extra column per feature
     * indicating how many values were not in the top K. 
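Since every argument of `recognizeEntities` defaults to the `NameEntityRecognizer` companion's settings, usage is a one-liner; `description` is a hypothetical `Text` feature:

```scala
// Returns a MultiPickListMap feature of tagged entities.
val entities: FeatureLike[MultiPickListMap] = description.recognizeEntities()
```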
* - * @param others Other [[URL]] features - * @param topK How many values to keep in the vector + * @param others Other [[URL]] features + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -613,12 +650,12 @@ trait RichTextFeature { * Extracts Base64 features (MIME type etc.), * then converts those into PickList features and vectorizes them. * - * @param topK number of values to keep for each key - * @param minSupport min times a value must occur to be retained in pivot - * @param cleanText clean text before pivoting + * @param topK number of values to keep for each key + * @param minSupport min times a value must occur to be retained in pivot + * @param cleanText clean text before pivoting * @param trackNulls keep an extra column that indicated if feature was null - * @param typeHint MIME type hint, i.e. 'application/json', 'text/plain' etc. - * @param others other features of the same type + * @param typeHint MIME type hint, i.e. 'application/json', 'text/plain' etc. + * @param others other features of the same type * @return result feature of type vector */ def vectorize( @@ -646,10 +683,10 @@ trait RichTextFeature { * Converts a sequence of [[PickList]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. * - * @param others Other [[PickList]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[PickList]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -672,10 +709,10 @@ trait RichTextFeature { * Converts a sequence of [[ComboBox]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. * - * @param others Other [[ComboBox]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[ComboBox]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -698,10 +735,10 @@ trait RichTextFeature { * Converts a sequence of [[ID]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. 
* - * @param others Other [[ID]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[ID]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala index f10bc18bce..84bb9cbe81 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala @@ -34,7 +34,7 @@ package com.salesforce.op.dsl import com.salesforce.op.UID import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForest} +import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForestClassifier} import com.salesforce.op.stages.impl.feature.{DropIndicesByTransformer, OpLDA} import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} @@ -86,8 +86,8 @@ trait RichVectorFeature { impurity: Impurity = Impurity.Entropy, seed: Long = util.Random.nextLong, thresholds: Array[Double] = Array.empty - ): (FeatureLike[RealNN], FeatureLike[OPVector], FeatureLike[OPVector]) = { - val OpRF = new OpRandomForest().setInput(label, f) + ): (FeatureLike[Prediction]) = { + val OpRF = new OpRandomForestClassifier().setInput(label, f) if (thresholds.nonEmpty) OpRF.setThresholds(thresholds) OpRF.setMaxDepth(maxDepth) @@ -96,7 +96,7 @@ trait RichVectorFeature { .setMinInfoGain(minInfoGain) .setSubsamplingRate(subSamplingRate) .setNumTrees(numTrees) - .setImpurity(impurity) + .setImpurity(impurity.sparkName) .setSeed(seed) .getOutput() } diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala index fb8c0ba3e2..378cef5634 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala @@ -33,8 +33,12 @@ package com.salesforce.op.evaluators import com.salesforce.op.UID import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator} +import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.sql.Dataset +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics => SparkMLBinaryClassificationMetrics} +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.types.DoubleType import org.slf4j.LoggerFactory /** @@ -52,7 +56,8 @@ private[op] class OpBinaryClassificationEvaluator ( override val name: String = OpEvaluatorNames.binary, override val isLargerBetter: Boolean = true, - override val uid: String = UID[OpBinaryClassificationEvaluator] + override val uid: String = UID[OpBinaryClassificationEvaluator], + val numBins: Int = 100 ) extends 
OpBinaryClassificationEvaluatorBase[BinaryClassificationMetrics](uid = uid) { @transient private lazy val log = LoggerFactory.getLogger(this.getClass) @@ -60,22 +65,21 @@ private[op] class OpBinaryClassificationEvaluator def getDefaultMetric: BinaryClassificationMetrics => Double = _.AuROC override def evaluateAll(data: Dataset[_]): BinaryClassificationMetrics = { - val (labelColName, rawPredictionColName, predictionColName) = (getLabelCol, getRawPredictionCol, getPredictionCol) + val (labelColName, rawPredictionColName, predictionColName, probabilityColName) = + (getLabelCol, getRawPredictionCol, getPredictionCol, getProbabilityCol) log.debug( - "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n", - labelColName, rawPredictionColName, predictionColName + "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n probability : {}\n", + labelColName, rawPredictionColName, predictionColName, probabilityColName ) - val Array(aUROC, aUPR) = - Array(BinaryClassEvalMetrics.AuROC, BinaryClassEvalMetrics.AuPR).map(getBinaryEvaluatorMetric(_, data)) - import data.sparkSession.implicits._ val rdd = data.select(predictionColName, labelColName).as[(Double, Double)].rdd if (rdd.isEmpty()) { log.error("The dataset is empty") - BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + Seq(), Seq(), Seq(), Seq()) } else { val multiclassMetrics = new MulticlassMetrics(rdd) val labels = multiclassMetrics.labels @@ -94,11 +98,23 @@ private[op] class OpBinaryClassificationEvaluator val f1 = if (precision + recall == 0.0) 0.0 else 2 * precision * recall / (precision + recall) val error = if (tp + fp + tn + fn == 0.0) 0.0 else (fp + fn) / (tp + fp + tn + fn) + val scoreAndLabels = + data.select(col(probabilityColName), col(labelColName).cast(DoubleType)).rdd.map { + case Row(prob: Vector, label: Double) => (prob(1), label) + case Row(prob: Double, label: Double) => (prob, label) + } + val sparkMLMetrics = new SparkMLBinaryClassificationMetrics(scoreAndLabels = scoreAndLabels, numBins = numBins) + val thresholds = sparkMLMetrics.thresholds().collect() + val precisionByThreshold = sparkMLMetrics.precisionByThreshold().collect().map(_._2) + val recallByThreshold = sparkMLMetrics.recallByThreshold().collect().map(_._2) + val falsePositiveRateByThreshold = sparkMLMetrics.roc().collect().map(_._1).slice(1, thresholds.length + 1) + val aUROC = sparkMLMetrics.areaUnderROC() + val aUPR = sparkMLMetrics.areaUnderPR() val metrics = BinaryClassificationMetrics( Precision = precision, Recall = recall, F1 = f1, AuROC = aUROC, - AuPR = aUPR, Error = error, TP = tp, TN = tn, FP = fp, FN = fn + AuPR = aUPR, Error = error, TP = tp, TN = tn, FP = fp, FN = fn, + thresholds, precisionByThreshold, recallByThreshold, falsePositiveRateByThreshold ) - log.info("Evaluated metrics: {}", metrics.toString) metrics } @@ -147,5 +163,9 @@ case class BinaryClassificationMetrics TP: Double, TN: Double, FP: Double, - FN: Double + FN: Double, + thresholds: Seq[Double], + precisionByThreshold: Seq[Double], + recallByThreshold: Seq[Double], + falsePositiveRateByThreshold: Seq[Double] ) extends EvaluationMetrics diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala index 1ce904bccd..9fbe841ecd 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala +++ 
b/core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala @@ -118,7 +118,6 @@ trait EvaluationMetrics extends JsonLike { * @return metadata */ def toMetadata: Metadata = this.toMap.toMetadata - } @@ -205,6 +204,10 @@ sealed abstract class ClassificationEvalMetric(val sparkEntryName: String) exten */ object BinaryClassEvalMetrics extends Enum[ClassificationEvalMetric] { val values = findValues + case object Precision extends ClassificationEvalMetric("precision") + case object Recall extends ClassificationEvalMetric("recall") + case object F1 extends ClassificationEvalMetric("f1") + case object Error extends ClassificationEvalMetric("accuracy") case object AuROC extends ClassificationEvalMetric("areaUnderROC") case object AuPR extends ClassificationEvalMetric("areaUnderPR") } @@ -218,8 +221,10 @@ object MultiClassEvalMetrics extends Enum[ClassificationEvalMetric] { case object Recall extends ClassificationEvalMetric("weightedRecall") case object F1 extends ClassificationEvalMetric("f1") case object Error extends ClassificationEvalMetric("accuracy") + case object ThresholdMetrics extends ClassificationEvalMetric("thresholdMetrics") } + /** * Contains the names of metrics used in logging */ diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala index 6f7a60c9dc..07cfb0a92b 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala @@ -32,21 +32,24 @@ package com.salesforce.op.evaluators import com.salesforce.op.UID -import com.salesforce.op.features.types._ +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.param.{DoubleArrayParam, IntArrayParam} import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset import org.slf4j.LoggerFactory -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator /** - * * Instance to evaluate Multi Classification metrics * The metrics are Precision, Recall, F1 and Error Rate * Default evaluation returns F1 score * - * @param name name of default metric + * @param name name of default metric * @param isLargerBetter is metric better if larger - * @param uid uid for instance + * @param uid uid for instance */ private[op] class OpMultiClassificationEvaluator ( @@ -59,12 +62,33 @@ private[op] class OpMultiClassificationEvaluator def getDefaultMetric: MultiClassificationMetrics => Double = _.F1 + final val topNs = new IntArrayParam( + parent = this, + name = "topNs", + doc = "sequence of topN values to use for threshold metrics", + isValid = _.forall(_ > 0) + ) + setDefault(topNs, Array(1, 3)) + + def setTopNs(v: Array[Int]): this.type = set(topNs, v) + + final val thresholds = new DoubleArrayParam( + parent = this, + name = "thresholds", + doc = "sequence of threshold values (must be in [0.0, 1.0]) to use for threshold metrics", + isValid = _.forall(x => x >= 0.0 && x <= 1.0) + ) + setDefault(thresholds, (0 to 100).map(_ / 100.0).toArray) + + def setThresholds(v: Array[Double]): this.type = set(thresholds, v) + override def evaluateAll(data: Dataset[_]): MultiClassificationMetrics = { - val (labelColName, predictionColName, rawPredictionColName) 
= (getLabelCol, getPredictionCol, getRawPredictionCol)
+ val (labelColName, predictionColName, rawPredictionColName, probabilityColName) = (getLabelCol, getPredictionCol,
+ getRawPredictionCol, getProbabilityCol)
log.debug(
- "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n",
- labelColName, rawPredictionColName, predictionColName
+ "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n probability : {}\n",
+ labelColName, rawPredictionColName, predictionColName, probabilityColName
)
import data.sparkSession.implicits._
@@ -76,13 +100,132 @@ private[op] class OpMultiClassificationEvaluator
val recall = multiclassMetrics.weightedRecall
val f1 = if (precision + recall == 0.0) 0.0 else 2 * precision * recall / (precision + recall)
- val metrics = MultiClassificationMetrics(Precision = precision, Recall = recall, F1 = f1, Error = error)
+ val thresholdMetrics = calculateThresholdMetrics(
+ data = data.select(probabilityColName, labelColName).rdd.map(r => (r.getAs[Vector](0).toArray, r.getDouble(1))),
+ topNs = $(topNs),
+ thresholds = $(thresholds)
+ )
+
+ val metrics = MultiClassificationMetrics(
+ Precision = precision,
+ Recall = recall,
+ F1 = f1,
+ Error = error,
+ ThresholdMetrics = thresholdMetrics
+ )
log.info("Evaluated metrics: {}", metrics.toString)
metrics
}
+ /**
+ * Function that calculates a set of threshold metrics for different topN values given an RDD of scores & labels,
+ * a list of topN values to consider, and a list of thresholds to use.
+ *
+ * Output: ThresholdMetrics object, containing thresholds used, topN values used, and maps from topN value to
+ * arrays of correct, incorrect, and no prediction counts at each threshold. Summing all three of these arrays
+ * together should give an array where each entry is the total number of rows in the input RDD.
+ *
+ * @param data Input RDD consisting of (vector of score probabilities, label), where label corresponds to the
+ * index of the true class and the score vector consists of probabilities for each class
+ * @param topNs Sequence of topN values to calculate threshold metrics for.
+ * For example, if topN is Seq(1, 3, 10) then threshold metrics are calculated by considering if
+ * the score of the true class is in the top 1, top 3, and top 10 scores, respectively. 
If a topN
+ value is greater than the number of total classes,
+ then it will still be applied, but will have the same results as if that topN value = num classes
+ * @param thresholds Sequence of threshold values applied to predicted probabilities, therefore they must be in the
+ * range [0.0, 1.0]
+ */
+ def calculateThresholdMetrics(
+ data: RDD[(Array[Double], Double)],
+ topNs: Seq[Int],
+ thresholds: Seq[Double]
+ ): ThresholdMetrics = {
+ require(thresholds.nonEmpty, "thresholds sequence cannot be empty")
+ require(thresholds.forall(x => x >= 0 && x <= 1.0), "thresholds sequence elements must be in the range [0, 1]")
+ require(topNs.nonEmpty, "topN sequence cannot be empty")
+ require(topNs.forall(_ > 0), "topN sequence can only contain positive integers")
+
+ type Label = Int
+ type CorrIncorr = (Array[Long], Array[Long])
+ type MetricsMap = Map[Label, CorrIncorr]
+
+ val nThresholds = thresholds.length
+
+ /**
+ * Allocates an array of longs and fills it with a specified value from start until end
+ */
+ def arrayFill(size: Int)(start: Int, end: Int, value: Long) = {
+ val res = new Array[Long](size)
+ var i = start
+ while (i < end) {
+ res(i) = value
+ i += 1
+ }
+ res
+ }
+
+ /**
+ * First aggregation step turns an array of scores (as probabilities) and a single label (index of correct class)
+ * into two arrays, correct and incorrect counts by threshold. Each array index corresponds to whether
+ * the score counts as correct or incorrect at the threshold corresponding to that index.
+ */
+ def computeMetrics(scoresAndLabels: (Array[Double], Double)): MetricsMap = {
+ val scores: Array[Double] = scoresAndLabels._1
+ val label: Label = scoresAndLabels._2.toInt
+ val trueClassScore: Double = scores(label)
+ val topNsAndScores: Map[Label, Array[(Double, Int)]] = topNs.map(t => t -> scores.zipWithIndex.sortBy(-_._1)
+ .take(t)).toMap
+ val topNScores: Map[Label, Array[Double]] = topNsAndScores.mapValues(_.map(_._1))
+ // Doesn't matter which key you use since the scores are sorted
+ val topScore: Double = topNScores.head._2.head
+ val topNIndices: Map[Label, Array[Int]] = topNsAndScores.mapValues(_.map(_._2))
+
+ // To calculate correct / incorrect counts per threshold, we just need to find the array index where the
+ // true label score and the top score are no longer >= threshold. 
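As a concrete illustration of the cutoff-index trick described in the comment above, here is a minimal, self-contained sketch (hypothetical standalone Scala, not part of the patch; the threshold values are made up):

// Sketch of the cutoff-index computation: with thresholds sorted ascending,
// a score is >= threshold exactly for the prefix of indices before the first
// threshold that exceeds it, so one indexWhere call replaces a per-threshold loop.
object CutoffIndexExample extends App {
  val thresholds = Seq(0.0, 0.25, 0.5, 0.75, 1.0)
  val nThresholds = thresholds.length

  def cutoffIndex(score: Double): Int = {
    val idx = thresholds.indexWhere(_ > score)
    if (idx < 0) nThresholds else idx
  }

  assert(cutoffIndex(0.6) == 3) // 0.6 >= thresholds at indices 0, 1, 2 only
  assert(cutoffIndex(1.0) == nThresholds) // a top score of 1.0 clears every threshold
}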
+ val trueScoreCutoffIndex: Int = { + val idx = thresholds.indexWhere(_ > trueClassScore) + if (idx < 0) nThresholds else idx + } + val maxScoreCutoffIndex: Int = { + val idx = thresholds.indexWhere(_ > topScore) + if (idx < 0) nThresholds else idx + } + topNs.view.map { t => + val correctCounts = if (topNIndices(t).contains(label)) { + arrayFill(nThresholds)(start = 0, end = trueScoreCutoffIndex, value = 1L) + } else new Array[Long](nThresholds) + + val incorrectCounts = if (topNIndices(t).contains(label)) { + arrayFill(nThresholds)(start = trueScoreCutoffIndex, end = maxScoreCutoffIndex, value = 1L) + } else arrayFill(nThresholds)(start = 0, end = maxScoreCutoffIndex, value = 1L) + + t -> (correctCounts, incorrectCounts) + }.toMap[Label, CorrIncorr] + } + + val zeroValue: MetricsMap = + topNs + .map(_ -> (new Array[Long](nThresholds), new Array[Long](nThresholds))) + .toMap[Label, CorrIncorr] + + val agg: MetricsMap = + data.treeAggregate[MetricsMap](zeroValue)(combOp = _ + _, seqOp = _ + computeMetrics(_)) + + val nRows = data.count() + ThresholdMetrics( + topNs = topNs, + thresholds = thresholds, + correctCounts = agg.mapValues { case (cor, _) => cor.toSeq }, + incorrectCounts = agg.mapValues { case (_, incor) => incor.toSeq }, + noPredictionCounts = agg.mapValues { case (cor, incor) => + (Array.fill(nThresholds)(nRows) + cor.map(-_) + incor.map(-_)).toSeq + } + ) + } + + final private[op] def getMultiEvaluatorMetric(metricName: ClassificationEvalMetric, dataset: Dataset[_]): Double = { new MulticlassClassificationEvaluator() .setLabelCol(getLabelCol) @@ -101,6 +244,39 @@ private[op] class OpMultiClassificationEvaluator * @param Recall * @param F1 * @param Error + * @param ThresholdMetrics */ -case class MultiClassificationMetrics(Precision: Double, Recall: Double, F1: Double, Error: Double) - extends EvaluationMetrics +case class MultiClassificationMetrics +( + Precision: Double, + Recall: Double, + F1: Double, + Error: Double, + ThresholdMetrics: ThresholdMetrics +) extends EvaluationMetrics + +/** + * Threshold-based metrics for multiclass classification + * + * Classifications being correct, incorrect, or no classification are defined in terms of the topN and score threshold + * to be: + * Correct - score of the true label is in the top N scores AND the score of the true label is >= threshold + * Incorrect - score of top predicted label >= threshold AND + * (true label NOT in top N predicted labels OR score of true label < threshold) + * No prediction - otherwise (score of top predicted label < threshold) + * + * @param topNs list of topN values (used as keys for the count maps) + * @param thresholds list of threshold values (correspond to thresholds at the indices + * of the arrays in the count maps) + * @param correctCounts map from topN value to an array of counts of correct classifications at each threshold + * @param incorrectCounts map from topN value to an array of counts of incorrect classifications at each threshold + * @param noPredictionCounts map from topN value to an array of counts of no prediction at each threshold + */ +case class ThresholdMetrics +( + topNs: Seq[Int], + thresholds: Seq[Double], + correctCounts: Map[Int, Seq[Long]], + incorrectCounts: Map[Int, Seq[Long]], + noPredictionCounts: Map[Int, Seq[Long]] +) extends EvaluationMetrics diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala b/core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala similarity index 61% rename from 
core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala rename to core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala index e039dbd37d..ddb7846a2e 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala +++ b/core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala @@ -29,25 +29,23 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package org.apache.spark.ml.classification +package com.salesforce.op.filters -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.tree.Node - -import scala.reflect.runtime.universe.TypeTag - -class OpDecisionTreeClassificationModel -( - rootNode: Node, - numFeatures: Int, - numClasses: Int, - uid: String = UID[OpDecisionTreeClassificationModel], - val operationName: String = "opDTC" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends DecisionTreeClassificationModel(uid = uid, rootNode = rootNode, numFeatures = numFeatures, - numClasses = numClasses) with OpClassifierModelBase +/** + * Contains all feature distribution summaries and null label-leakage correlations used to + * determine dropped features in [[RawFeatureFilter]]. + * + * @param responseSummaries response summaries + * @param responseDistributions response distributions + * @param predictorSummaries predictor summaries + * @param predictorDistributions predictor distributions + * @param correlationInfo null label-leakage correlation map + * 1st level keys correspond to response keys + * 2nd level keys correspond to predictor keys with values being null-label leakage corr. value + */ +private[op] case class AllFeatureInformation( + responseSummaries: Map[FeatureKey, Summary], + responseDistributions: Array[FeatureDistribution], + predictorSummaries: Map[FeatureKey, Summary], + predictorDistributions: Array[FeatureDistribution], + correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]]) diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala new file mode 100644 index 0000000000..2c0ac16179 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.filters
+
+import com.salesforce.op.features.TransientFeature
+import com.salesforce.op.stages.impl.feature.{Inclusion, NumericBucketizer}
+import com.twitter.algebird.Semigroup
+import com.twitter.algebird.Monoid._
+import com.twitter.algebird.Operators._
+import org.apache.spark.mllib.feature.HashingTF
+
+/**
+ * Class containing summary information for a feature
+ *
+ * @param name name of the feature
+ * @param key map key associated with distribution (when the feature is a map)
+ * @param count total count of feature seen
+ * @param nulls number of empties seen in feature
+ * @param distribution binned counts of feature values (hashed for strings, evenly spaced bins for numerics)
+ * @param summaryInfo either min and max number of tokens for text data,
+ * or number of splits used for bins for numeric data
+ */
+case class FeatureDistribution
+(
+ name: String,
+ key: Option[String],
+ count: Long,
+ nulls: Long,
+ distribution: Array[Double],
+ summaryInfo: Array[Double]
+) {
+
+ /**
+ * Get feature key associated to this distribution
+ */
+ def featureKey: FeatureKey = (name, key)
+
+ /**
+ * Check that feature distributions belong to the same feature and key.
+ *
+ * @param fd distribution to compare to
+ */
+ def checkMatch(fd: FeatureDistribution): Unit =
+ assert(name == fd.name && key == fd.key, "Name and key must match to compare or combine FeatureDistribution")
+
+ /**
+ * Get fill rate of feature
+ *
+ * @return fraction of data that is non empty
+ */
+ def fillRate(): Double = if (count == 0L) 0.0 else (count - nulls) / count.toDouble
+
+ /**
+ * Combine feature distributions
+ *
+ * @param fd other feature distribution (from the same feature)
+ * @return summed distribution information
+ */
+ def reduce(fd: FeatureDistribution): FeatureDistribution = {
+ checkMatch(fd)
+ val combinedDist = distribution + fd.distribution
+ // summary info can be empty or min max if hist is empty but should otherwise match so take the longest info
+ val combinedSummary = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo
+ FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, combinedSummary)
+ }
+
+ /**
+ * Ratio of fill rates between the two distributions, symmetric with the larger value on top
+ *
+ * @param fd feature distribution to compare to
+ * @return ratio of fill rates
+ */
+ def relativeFillRatio(fd: FeatureDistribution): Double = {
+ checkMatch(fd)
+ val (thisFill, thatFill) = (fillRate(), fd.fillRate())
+ val (small, large) = if (thisFill < thatFill) (thisFill, thatFill) else (thatFill, thisFill)
+ if (small == 0.0) Double.PositiveInfinity else large / small
+ }
+
+ /**
+ * Absolute difference in empty rates
+ *
+ * @param fd feature distribution to compare to
+ * @return absolute difference of rates
+ */
+ def relativeFillRate(fd: FeatureDistribution): Double = {
+ checkMatch(fd)
+ math.abs(fillRate() - fd.fillRate())
+ }
+
+ /**
+ * Jensen-Shannon divergence from this distribution to the other distribution fed in
+ *
+ * @param fd other feature distribution
+ * @return the JS divergence
+ */
+ def jsDivergence(fd: FeatureDistribution): Double = {
+ checkMatch(fd)
+ val combinedCounts = distribution.zip(fd.distribution).filterNot{ case (a, b) => a == 0.0 && b == 0.0 }
+ val (thisCount, thatCount) = combinedCounts
+ .fold[(Double, Double)]( (0, 0)){ case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) }
+ val probs = combinedCounts.map{ case (a, b) => a / thisCount -> b / thatCount }
+ val meanProb = probs.map{ case (a, b) => (a + b) / 2}
+ def log2(x: Double) = math.log10(x) / math.log10(2.0)
+ def klDivergence(a: Double, b: Double) = if (a == 0.0) 0.0 else a * log2(a / b)
+ probs.zip(meanProb).map{ case ((a, b), m) => 0.5 * klDivergence(a, m) + 0.5 * klDivergence(b, m) }.sum
+ }
+
+ override def toString(): String = {
+ s"Name=$name, Key=$key, Count=$count, Nulls=$nulls, Histogram=${distribution.toList}, BinInfo=${summaryInfo.toList}"
+ }
+}
+
+private[op] object FeatureDistribution {
+
+ val MaxBins = 100000
+
+ implicit val semigroup: Semigroup[FeatureDistribution] = new Semigroup[FeatureDistribution] {
+ override def plus(l: FeatureDistribution, r: FeatureDistribution) = l.reduce(r)
+ }
+
+ /**
+ * Facilitates feature distribution retrieval from computed feature summaries
+ *
+ * @param featureKey feature key
+ * @param summary feature summary
+ * @param value optional processed sequence
+ * @param bins number of histogram bins
+ * @param hasher hashing method to use for text and categorical features
+ * @return feature distribution given the provided information
+ */
+ def apply(
+ featureKey: FeatureKey,
+ summary: Summary,
+ value: Option[ProcessedSeq],
+ bins: Int,
+ hasher: HashingTF
+ ): FeatureDistribution = {
+ val (nullCount, (summaryInfo, distribution)): (Int, (Array[Double], Array[Double])) =
+ value.map(seq => 0 -> histValues(seq, summary, bins, hasher))
+ .getOrElse(1 -> (Array(summary.min, summary.max) -> Array.fill(bins)(0.0)))
+
+ FeatureDistribution(
+ name = featureKey._1,
+ key = featureKey._2,
+ count = 1,
+ nulls = nullCount,
+ summaryInfo = summaryInfo,
+ distribution = distribution)
+ }
+
+ /**
+ * Function to put data into histogram of counts
+ * @param values values to bin
+ * @param sum summary info for feature (max and min)
+ * @param bins number of bins to produce
+ * @param hasher hashing function to use for text
+ * @return the bin information and the binned counts
+ */
+ // TODO avoid wrapping and unwrapping?? 
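Before the implementation, here is a minimal sketch of the numeric binning contract documented above (hypothetical standalone code, not part of the patch), assuming min < max and bins > 2 as in the evenly spaced branch:

// Sketch: evenly spaced numeric splits as used for numeric feature distributions.
// Two of the `bins` total bins are reserved (one edge bin, one "other" bin),
// so the step divides the [min, max] range into bins - 2 intervals.
object NumericBinsExample extends App {
  val (min, max, bins) = (0.0, 10.0, 7)
  val step = (max - min) / (bins - 2.0) // 2.0
  val splits = (0 until bins).map(b => min + step * b)
  println(splits.mkString(", ")) // 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0
}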
+ private def histValues( + values: ProcessedSeq, + sum: Summary, + bins: Int, + hasher: HashingTF + ): (Array[Double], Array[Double]) = { + values match { + case Left(seq) => Array(sum.min, sum.max) -> hasher.transform(seq).toArray // TODO use summary info to pick hashes + case Right(seq) => // TODO use kernel fit instead of histogram + if (sum == Summary.empty) { + Array(sum.min, sum.max) -> seq.toArray // the seq will always be empty in this case + } else if (sum.min < sum.max) { + val step = (sum.max - sum.min) / (bins - 2.0) // total number of bins includes one for edge and one for other + val splits = (0 until bins).map(b => sum.min + step * b).toArray + val binned = seq.map { v => + NumericBucketizer.bucketize( + splits = splits, trackNulls = false, trackInvalid = true, + splitInclusion = Inclusion.Left, input = Option(v) + ).toArray + } + val hist = binned.fold(new Array[Double](bins))(_ + _) + splits -> hist + } else { + val same = seq.map(v => if (v == sum.max) 1.0 else 0.0).sum + val other = seq.map(v => if (v != sum.max) 1.0 else 0.0).sum + Array(sum.min, sum.max) -> Array(same, other) + } + } + } +} diff --git a/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala b/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala new file mode 100644 index 0000000000..412a947f38 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op.filters
+
+import com.salesforce.op.features.TransientFeature
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.impl.feature.TextTokenizer
+import com.salesforce.op.utils.spark.RichRow._
+import com.salesforce.op.utils.text.Language
+import org.apache.spark.mllib.feature.HashingTF
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
+import org.apache.spark.sql.Row
+
+/**
+ * Class representing processed responses and predictors keyed by their respective feature key
+ *
+ * @param responses prepared responses
+ * @param predictors prepared predictors
+ */
+private[filters] case class PreparedFeatures(
+ responses: Map[FeatureKey, ProcessedSeq],
+ predictors: Map[FeatureKey, ProcessedSeq]) {
+
+ /**
+ * Computes summaries keyed by feature keys for this observation.
+ *
+ * @return pair consisting of response and predictor summaries (in this order)
+ */
+ def summaries: (Map[FeatureKey, Summary], Map[FeatureKey, Summary]) =
+ responses.mapValues(Summary(_)) -> predictors.mapValues(Summary(_))
+
+ /**
+ * Computes a vector of size responseKeys.length + predictorKeys.length. The first responseKeys.length
+ * values are the actual response values (nulls replaced with 0.0). Its (i + responseKeys.length)-th value
+ * is 1 iff the predictor associated with the i-th feature key is null, for i >= 0.
+ *
+ * @param responseKeys response feature keys
+ * @param predictorKeys set of all predictor keys needed for constructing binary vector
+ * @return null label-leakage correlation vector
+ */
+ def getNullLabelLeakageVector(responseKeys: Array[FeatureKey], predictorKeys: Array[FeatureKey]): Vector = {
+ val responseValues = responseKeys.map(responses.get(_).collect {
+ case Right(Seq(d)) => d
+ }.getOrElse(0.0))
+ val predictorNullIndicatorValues = predictorKeys.map(predictors.get(_).map(_ => 0.0).getOrElse(1.0))
+
+ Vectors.dense(responseValues ++ predictorNullIndicatorValues)
+ }
+
+ /**
+ * Generates a pair of feature distribution arrays. The first element is associated with responses,
+ * and the second with predictors.
+ *
+ * @param responseSummaries response feature summaries (feature key to summary statistics)
+ * @param predictorSummaries predictor feature summaries (feature key to summary statistics)
+ * @param bins number of bins to put numerics into
+ * @param hasher hash function to use on strings
+ * @return a pair consisting of response and predictor feature distributions (in this order)
+ */
+ def getFeatureDistributions(
+ responseSummaries: Array[(FeatureKey, Summary)],
+ predictorSummaries: Array[(FeatureKey, Summary)],
+ bins: Int,
+ hasher: HashingTF
+ ): (Array[FeatureDistribution], Array[FeatureDistribution]) = {
+ val responseFeatureDistributions: Array[FeatureDistribution] =
+ getFeatureDistributions(responses, responseSummaries, bins, hasher)
+ val predictorFeatureDistributions: Array[FeatureDistribution] =
+ getFeatureDistributions(predictors, predictorSummaries, bins, hasher)
+
+ responseFeatureDistributions -> predictorFeatureDistributions
+ }
+
+ private def getFeatureDistributions(
+ features: Map[FeatureKey, ProcessedSeq],
+ summaries: Array[(FeatureKey, Summary)],
+ bins: Int,
+ hasher: HashingTF
+ ): Array[FeatureDistribution] = summaries.map { case (featureKey, summary) =>
+ FeatureDistribution(
+ featureKey = featureKey,
+ summary = summary,
+ value = features.get(featureKey),
+ bins = bins,
+ hasher = hasher)
+ }
+}
+
+private[filters] object PreparedFeatures {
+
+ /**
+ * Retrieve prepared features from a given data frame row and transient features partitioned
+ * into responses and predictors.
+ *
+ * @param row data frame row
+ * @param responses transient features derived from responses
+ * @param predictors transient features derived from predictors
+ * @return set of prepared features
+ */
+ def apply(row: Row, responses: Array[TransientFeature], predictors: Array[TransientFeature]): PreparedFeatures = {
+ val empty: Map[FeatureKey, ProcessedSeq] = Map()
+ val preparedResponses = responses.foldLeft(empty) { case (map, feature) =>
+ val converter = FeatureTypeSparkConverter.fromFeatureTypeName(feature.typeName)
+ map ++ prepareFeature(feature.name, row.getFeatureType(feature)(converter))
+ }
+ val preparedPredictors = predictors.foldLeft(empty) { case (map, feature) =>
+ val converter = FeatureTypeSparkConverter.fromFeatureTypeName(feature.typeName)
+ map ++ prepareFeature(feature.name, row.getFeatureType(feature)(converter))
+ }
+
+ PreparedFeatures(responses = preparedResponses, predictors = preparedPredictors)
+ }
+
+ /**
+ * Turn features into a sequence that will have stats computed on it based on the type of the feature
+ *
+ * @param name feature name
+ * @param value feature value
+ * @tparam T type of the feature
+ * @return map from feature key to a processed sequence of either doubles or strings
+ * (an empty map when the feature value is empty)
+ */
+ private def prepareFeature[T <: FeatureType](name: String, value: T): Map[FeatureKey, ProcessedSeq] =
+ value match {
+ case v: Text => v.value
+ .map(s => Map[FeatureKey, ProcessedSeq]((name, None) -> Left(tokenize(s))))
+ .getOrElse(Map())
+ case v: OPNumeric[_] => v.toDouble
+ .map(d => Map[FeatureKey, ProcessedSeq]((name, None) -> Right(Seq(d))))
+ .getOrElse(Map())
+ case ft@SomeValue(v: DenseVector) => Map((name, None) -> Right(v.toArray.toSeq))
+ case ft@SomeValue(v: SparseVector) => Map((name, None) -> Right(v.indices.map(_.toDouble).toSeq))
+ case ft@SomeValue(_) => ft match {
+ case v: Geolocation => Map((name, None) -> Right(v.value))
+ case v: TextList => Map((name, None) -> Left(v.value))
+ case v: DateList => Map((name, None) -> Right(v.value.map(_.toDouble)))
+ case v: 
MultiPickList => Map((name, None) -> Left(v.value.toSeq)) + case v: MultiPickListMap => v.value.map { case (k, e) => (name, Option(k)) -> Left(e.toSeq) } + case v: GeolocationMap => v.value.map{ case (k, e) => (name, Option(k)) -> Right(e) } + case v: OPMap[_] => v.value.map { case (k, e) => e match { + case d: Double => (name, Option(k)) -> Right(Seq(d)) + // Do not need to distinguish between string map types, all text is tokenized for distribution calculation + case s: String => (name, Option(k)) -> Left(tokenize(s)) + case l: Long => (name, Option(k)) -> Right(Seq(l.toDouble)) + case b: Boolean => (name, Option(k)) -> Right(Seq(if (b) 1.0 else 0.0)) + }} + case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") + } + case _ => Map() + } + + /** + * Tokenizes an input string. + * + * @param s input string + * @return array of string tokens + */ + private def tokenize(s: String) = TextTokenizer.Analyzer.analyze(s, Language.Unknown) +} diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 42ff002dc8..34c9bff908 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -31,19 +31,23 @@ package com.salesforce.op.filters +import scala.math.{abs, min} + import com.salesforce.op.OpParams import com.salesforce.op.features.types._ import com.salesforce.op.features.{OPFeature, TransientFeature} -import com.salesforce.op.filters.FeatureDistrib.ProcessedSeq import com.salesforce.op.readers.{DataFrameFieldNames, Reader} import com.salesforce.op.stages.impl.feature.{HashAlgorithm, Inclusion, NumericBucketizer, TextTokenizer} +import com.salesforce.op.stages.impl.preparators.CorrelationType import com.salesforce.op.utils.spark.RichRow._ -import com.salesforce.op.utils.text.Language import com.twitter.algebird.Monoid import com.twitter.algebird.Semigroup import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ import org.apache.spark.mllib.feature.HashingTF +import org.apache.spark.mllib.linalg.{Matrix, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.slf4j.LoggerFactory @@ -59,6 +63,9 @@ import org.slf4j.LoggerFactory * @param maxFillDifference maximum acceptable fill rate difference between training and scoring data to be kept * @param maxFillRatioDiff maximum acceptable fill ratio between training and scoring (larger / smaller) * @param maxJSDivergence maximum Jensen-Shannon divergence between training and scoring distributions to be kept + * @param maxCorrelation maximum absolute correlation allowed between raw predictor null indicator and label + * @param correlationType type of correlation metric to use + * @param jsDivergenceProtectedFeatures features that are protected from removal by JS divergence check * @param protectedFeatures features that are protected from removal * @tparam T datatype of the reader */ @@ -71,11 +78,14 @@ class RawFeatureFilter[T] val maxFillDifference: Double, val maxFillRatioDiff: Double, val maxJSDivergence: Double, + val maxCorrelation: Double, + val correlationType: CorrelationType = CorrelationType.Pearson, + val jsDivergenceProtectedFeatures: Set[String] = Set.empty, val protectedFeatures: Set[String] = Set.empty ) extends Serializable { - 
assert(bins > 1 && bins <= FeatureDistrib.MaxBins, s"Invalid bin size $bins," + - s" bins must be between 1 and ${FeatureDistrib.MaxBins}") + assert(bins > 1 && bins <= FeatureDistribution.MaxBins, s"Invalid bin size $bins," + + s" bins must be between 1 and ${FeatureDistribution.MaxBins}") assert(minFill >= 0.0 && minFill <= 1.0, s"Invalid minFill size $minFill, minFill must be between 0 and 1") assert(maxFillDifference >= 0.0 && maxFillDifference <= 1.0, s"Invalid maxFillDifference size $maxFillDifference," + s" maxFillDifference must be between 0 and 1") @@ -90,80 +100,69 @@ class RawFeatureFilter[T] .setBinary(false) .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - private def tokenize(s: String) = TextTokenizer.Analyzer.analyze(s, Language.Unknown) - - /** - * Turn features into a sequence that will have stats computed on it based on the type of the feature - * @param value feature value - * @tparam T type of the feature - * @return a tuple containing whether the feature was empty and a sequence of either doubles or strings - */ - private def prepareFeatures[T <: FeatureType](value: T): (Boolean, ProcessedSeq) = { - value match { - case v: Text => v.isEmpty -> Left(v.value.map(tokenize).getOrElse(Seq.empty)) // TODO are empty strings == nulls - case v: OPNumeric[_] => v.isEmpty -> Right(v.toDouble.toSeq) - case v: OPVector => v.isEmpty -> Right(v.value.toArray.toSeq) - case v: Geolocation => v.isEmpty -> Right(v.value) - case v: TextList => v.isEmpty -> Left(v.value) - case v: DateList => v.isEmpty -> Right(v.value.map(_.toDouble)) - case v: MultiPickList => v.isEmpty -> Left(v.value.toSeq) - case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") - } - } - - - /** - * Turn map features into a map of sequences that will have stats computed on it based on the type of the feature - * @param value feature value - * @tparam T type of the map feature - * @return a map from the keys to a sequence of either doubles or strings - */ - private def prepareMapFeatures[T <: FeatureType](value: T): Map[String, ProcessedSeq] = { - value match { - case v: MultiPickListMap => v.value.map{ case (k, e) => k -> Left(e.toSeq) } - case v: GeolocationMap => v.value.map{ case (k, e) => k -> Right(e) } - case v: OPMap[_] => v.value.map { case (k, e) => e match { - case d: Double => k -> Right(Seq(d)) - case s: String => k -> Left(tokenize(s)) - case l: Long => k -> Right(Seq(l.toDouble)) - case b: Boolean => k -> Right(Seq(if (b) 1.0 else 0.0)) - }} - case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") - } - } /** * Get binned counts of the feature distribution and empty count for each raw feature * @param data data frame to compute counts on - * @param rawFeatures list of raw features contained in the dataframe + * @param features list of raw, non-protected, features contained in the dataframe + * @param allFeatureInfo existing feature info to use * @return a sequence of distribution summaries for each raw feature */ - // TODO do these computations on a per label basis?? 
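The rewritten computeFeatureStats below derives null label-leakage correlations from row vectors of response values followed by predictor null indicators. A minimal sketch of that idea (assuming a local SparkSession; all names here are illustrative, not part of the patch):

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object NullLeakageCorrExample extends App {
  val spark = SparkSession.builder().master("local[2]").appName("corr-sketch").getOrCreate()
  // Each vector: (response value, null indicator of one predictor).
  // Here the predictor is null exactly when the label is 1.0, so the
  // null indicator perfectly leaks the label.
  val rows: RDD[Vector] = spark.sparkContext.parallelize(Seq(
    Vectors.dense(1.0, 1.0),
    Vectors.dense(0.0, 0.0),
    Vectors.dense(1.0, 1.0),
    Vectors.dense(0.0, 0.0)
  ))
  val corrMatrix = Statistics.corr(rows, "pearson") // matrix of pairwise column correlations
  println(corrMatrix(0, 1)) // 1.0 -> such a predictor would exceed any sane maxCorrelation
  spark.stop()
}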
- private[op] def computeFeatureStats(data: DataFrame, rawFeatures: Array[OPFeature],
- featureSummaries: Option[AllFeatureInformation] = None): AllFeatureInformation = {
- val (mapTranFeatures, tranFeatures) = rawFeatures
- .map(f => TransientFeature(f) -> FeatureTypeSparkConverter()(f.wtt))
- .partition(_._1.getFeature().isSubtypeOf[OPMap[_]])
+ private[op] def computeFeatureStats(
+ data: DataFrame,
+ features: Array[OPFeature],
+ allFeatureInfo: Option[AllFeatureInformation] = None): AllFeatureInformation = {
+ val (responses, predictors): (Array[TransientFeature], Array[TransientFeature]) = {
+ val (allResponses, allPredictors) = features.partition(_.isResponse)
+ val respOut = allResponses.map(TransientFeature(_)).flatMap {
+ case f if f.getFeature().isSubtypeOf[OPNumeric[_]] =>
+ log.info("Using numeric response: {}", f.name)
+ Option(f)
+ case f =>
+ log.info("Not using non-numeric response in raw feature filter: {}", f.name)
+ None
+ }
+ val predOut = allPredictors.map(TransientFeature(_))
- val preparedFeatures = data.rdd.map{ row =>
- tranFeatures.map(f => prepareFeatures(row.getFeatureType(f._1)(f._2))) ->
- mapTranFeatures.map(mf => prepareMapFeatures(row.getFeatureType(mf._1)(mf._2)))
+ (respOut, predOut)
}
-
- val (summaryFeatures, summaryMapFeatures) = // Have to use the training summaries do process scoring for comparison
- featureSummaries.map{ fs => fs.featureSummaries -> fs.mapFeatureSummaries }.getOrElse{
- preparedFeatures.map { case (features, mapFeatures) =>
- features.map(f => Summary(f._2)) -> mapFeatures.map(mf => mf.map { case (k, v) => k -> Summary(v) })
- }.reduce(_ + _)
+ val preparedFeatures: RDD[PreparedFeatures] =
+ data.rdd.map(PreparedFeatures(_, responses, predictors))
+ // Have to use the training summaries to process scoring for comparison
+ val (responseSummaries, predictorSummaries): (Map[FeatureKey, Summary], Map[FeatureKey, Summary]) =
+ allFeatureInfo.map(info => info.responseSummaries -> info.predictorSummaries)
+ .getOrElse(preparedFeatures.map(_.summaries).reduce(_ + _))
+ val (responseSummariesArr, predictorSummariesArr): (Array[(FeatureKey, Summary)], Array[(FeatureKey, Summary)]) =
+ (responseSummaries.toArray, predictorSummaries.toArray)
+ val (responseDistributions, predictorDistributions): (Array[FeatureDistribution], Array[FeatureDistribution]) =
+ preparedFeatures
+ .map(_.getFeatureDistributions(
+ responseSummaries = responseSummariesArr,
+ predictorSummaries = predictorSummariesArr,
+ bins = bins,
+ hasher = hasher))
+ .reduce(_ + _) // NOTE: resolved semigroup is IndexedSeqSemigroup
+ val correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]] =
+ allFeatureInfo.map(_.correlationInfo).getOrElse {
+ val emptyCorr: Map[FeatureKey, Map[FeatureKey, Double]] = Map()
+ val responseKeys: Array[FeatureKey] = responseSummariesArr.map(_._1)
+ val predictorKeys: Array[FeatureKey] = predictorSummariesArr.map(_._1)
+ val corrRDD: RDD[Vector] = preparedFeatures.map(_.getNullLabelLeakageVector(responseKeys, predictorKeys))
+ val corrMatrix: Matrix = Statistics.corr(corrRDD, correlationType.sparkName)
+
+ responseKeys.zipWithIndex.map { case (responseKey, i) =>
+ responseKey -> predictorKeys.zipWithIndex.map { case (predictorKey, j) =>
+ predictorKey -> min(abs(corrMatrix(i, j + responseKeys.length)), 1.0)
+ }.toMap
+ }.toMap
}
- val featureDistrib = preparedFeatures
- .map{ case (features, mapFeatures) =>
- FeatureDistrib.getDistributions(tranFeatures.map(_._1), features, summaryFeatures, bins, hasher) ++
- 
FeatureDistrib.getMapDistributions(mapTranFeatures.map(_._1), mapFeatures, summaryMapFeatures, bins, hasher) }
- .reduce(_ + _)
-
- AllFeatureInformation(summaryFeatures, summaryMapFeatures, featureDistrib)
+ AllFeatureInformation(
+ responseSummaries = responseSummaries,
+ responseDistributions = responseDistributions,
+ predictorSummaries = predictorSummaries,
+ predictorDistributions = predictorDistributions,
+ correlationInfo = correlationInfo)
}
/**
@@ -171,12 +170,14 @@ class RawFeatureFilter[T]
* features should be dropped (including maps with all keys dropped) and which map keys need to be dropped
* @param trainingDistribs summary of distributions for training data features
* @param scoringDistribs summary of distributions for scoring data features (may be an empty seq)
+ * @param correlationInfo info needed to determine features to drop based on null label-leakage correlation
* @return a list of feature names that should be dropped and a map of map keys that should be dropped
* Map(featureName -> key)
*/
private[op] def getFeaturesToExclude(
- trainingDistribs: Seq[FeatureDistrib],
- scoringDistribs: Seq[FeatureDistrib]
+ trainingDistribs: Seq[FeatureDistribution],
+ scoringDistribs: Seq[FeatureDistribution],
+ correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]]
): (Seq[String], Map[String, Set[String]]) = {
def logExcluded(excluded: Seq[Boolean], message: String): Unit = {
@@ -185,14 +186,29 @@
- val featureSize = trainingDistribs.size
+ val featureSize = trainingDistribs.length
val trainingUnfilled = trainingDistribs.map(_.fillRate() < minFill)
logExcluded(trainingUnfilled, s"Features excluded because training fill rate did not meet min required ($minFill)")
+ val trainingNullLabelLeakers = {
+ if (correlationInfo.isEmpty) Seq.fill(featureSize)(false)
+ else {
+ val absoluteCorrs = correlationInfo.map(_._2)
+ for {distrib <- trainingDistribs} yield {
+ // Only filter if feature absolute null-label leakage correlation is greater than allowed correlation
+ val nullLabelLeakerIndicators = absoluteCorrs.map(_.get(distrib.featureKey).exists(_ > maxCorrelation))
+ nullLabelLeakerIndicators.exists(identity(_))
+ }
+ }
+ }
+ logExcluded(
+ trainingNullLabelLeakers,
+ s"Features excluded because null indicator correlation (absolute) exceeded max allowed ($maxCorrelation)")
+
val scoringUnfilled = if (scoringDistribs.nonEmpty) {
- assert(scoringDistribs.length == trainingDistribs.length, "scoring and training features must match")
+ assert(scoringDistribs.length == featureSize, "scoring and training features must match")
val su = scoringDistribs.map(_.fillRate() < minFill)
logExcluded(su, s"Features excluded because scoring fill rate did not meet min required ($minFill)")
su
@@ -206,7 +222,9 @@
log.info(combined.map { case (t, s) => s"\n$t\n$s\nTrain Fill=${t.fillRate()}, Score Fill=${s.fillRate()}, " + s"JS Divergence=${t.jsDivergence(s)}, Fill Rate Difference=${t.relativeFillRate(s)}, " + s"Fill Ratio Difference=${t.relativeFillRatio(s)}" }.mkString("\n"))
- val kl = combined.map { case (t, s) => t.jsDivergence(s) > maxJSDivergence }
+ val kl = combined.map { case (t, s) =>
+ !jsDivergenceProtectedFeatures.contains(t.name) && t.jsDivergence(s) > maxJSDivergence
+ }
logExcluded(kl, s"Features excluded because JS Divergence exceeded max allowed ($maxJSDivergence)")
val mf = combined.map { case (t, s) => t.relativeFillRate(s) > maxFillDifference }
logExcluded(mf, s"Features 
excluded because fill rate difference exceeded max allowed ($maxFillDifference)") @@ -217,8 +235,8 @@ class RawFeatureFilter[T] Seq.fill(featureSize)(false) } - val allExcludeReasons = trainingUnfilled.zip(scoringUnfilled).zip(distribMismatches) - .map{ case ((t, s), d) => t || s || d } + val allExcludeReasons = trainingUnfilled.zip(scoringUnfilled).zip(distribMismatches).zip(trainingNullLabelLeakers) + .map{ case (((t, s), d), n) => t || s || d || n } val (toDrop, toKeep) = trainingDistribs.zip(allExcludeReasons).partition(_._2) @@ -243,14 +261,12 @@ class RawFeatureFilter[T] def generateFilteredRaw(rawFeatures: Array[OPFeature], parameters: OpParams) (implicit spark: SparkSession): (DataFrame, Array[OPFeature]) = { - val (_, predictorFeatures) = rawFeatures.partition(f => f.isResponse || protectedFeatures.contains(f.name) ) - val trainData = trainingReader.generateDataFrame(rawFeatures, parameters).persist() log.info("Loaded training data") assert(trainData.count() > 0, "RawFeatureFilter cannot work with empty training data") - val trainingSummary = computeFeatureStats(trainData, predictorFeatures) // TODO also response summaries?? + val trainingSummary = computeFeatureStats(trainData, rawFeatures) // TODO also response summaries?? log.info("Computed summary stats for training features") - log.debug(trainingSummary.featureDistributions.mkString("\n")) + log.debug(trainingSummary.predictorDistributions.mkString("\n")) val scoreData = scoreReader.flatMap{ s => val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() @@ -263,16 +279,16 @@ class RawFeatureFilter[T] } val scoringSummary = scoreData.map{ sd => - val ss = computeFeatureStats(sd, predictorFeatures, Some(trainingSummary)) // TODO also response summaries?? + val ss = computeFeatureStats(sd, rawFeatures, Some(trainingSummary)) // TODO also response summaries?? 
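Stepping back from the diff for a moment: the exclusion logic above reduces every check (fill rate, distribution mismatch, null-label leakage) to one Boolean per feature and ORs them together. A tiny standalone sketch of that combination step (the flags are invented for illustration):

// Sketch: combining per-feature exclusion reasons, mirroring getFeaturesToExclude.
object ExcludeReasonsExample extends App {
  val trainingUnfilled         = Seq(true, false, false)
  val scoringUnfilled          = Seq(false, false, false)
  val distribMismatches        = Seq(false, true, false)
  val trainingNullLabelLeakers = Seq(false, false, false)

  val allExcludeReasons = trainingUnfilled.zip(scoringUnfilled).zip(distribMismatches)
    .zip(trainingNullLabelLeakers)
    .map { case (((t, s), d), n) => t || s || d || n }

  println(allExcludeReasons) // List(true, true, false): features 0 and 1 dropped, 2 kept
}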
log.info("Computed summary stats for scoring features") - log.debug(ss.featureDistributions.mkString("\n")) + log.debug(ss.predictorDistributions.mkString("\n")) ss } val (featuresToDropNames, mapKeysToDrop) = getFeaturesToExclude( - trainingSummary.featureDistributions, - scoringSummary.toSeq.flatMap(_.featureDistributions) - ) + trainingSummary.predictorDistributions.filterNot(d => protectedFeatures.contains(d.name)), + scoringSummary.toSeq.flatMap(_.predictorDistributions.filterNot(d => protectedFeatures.contains(d.name))), + trainingSummary.correlationInfo) val (featuresToDrop, featuresToKeep) = rawFeatures.partition(rf => featuresToDropNames.contains(rf.name)) val featuresToKeepNames = Array(DataFrameFieldNames.KeyFieldName) ++ featuresToKeep.map(_.name) @@ -297,227 +313,3 @@ class RawFeatureFilter[T] cleanedData -> featuresToDrop } } - -private[op] case class AllFeatureInformation -( - featureSummaries: Array[Summary], - mapFeatureSummaries: Array[Map[String, Summary]], - featureDistributions: Array[FeatureDistrib] -) - -/** - * Class used to get summaries of prepped features so know how to bin it for distributions - * @param min minimum value seen - * @param max maximum value seen - */ -private[op] case class Summary(min: Double, max: Double) - -private[op] case object Summary { - - val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity) - - implicit val monoid: Monoid[Summary] = new Monoid[Summary] { - override def zero = empty - override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max)) - } - - def apply(preppedFeature: ProcessedSeq): Summary = { - preppedFeature match { - case Left(v) => Summary(v.size, v.size) - case Right(v) => monoid.sum(v.map(d => Summary(d, d))) - } - } -} - - -/** - * Class containing summary information for a feature - * @param name name of the feature - * @param key map key associated with distribution (when the feature is a map) - * @param count total count of feature seen - * @param nulls number of empties seen in feature - * @param distribution binned counts of feature values (hashed for strings, evently spaced bins for numerics) - * @param summaryInfo either min and max of data (for text data) or splits used for bins for numeric data - */ -case class FeatureDistrib -( - name: String, - key: Option[String], - count: Long, - nulls: Long, - distribution: Array[Double], - summaryInfo: Array[Double] -) { - - /** - * Check that feature distributions below to the same feature and key - * @param fd distribution to compare to - */ - def checkMatch(fd: FeatureDistrib): Unit = - assert(name == fd.name && key == fd.key, "Name and key must match to compare or combine FeatureDistrib") - - /** - * Get fill rate of feature - * @return fraction of data that is non empty - */ - def fillRate(): Double = if (count == 0L) 0.0 else (count - nulls) / count.toDouble - - /** - * Combine feature distributions - * @param fd other feature distribution (from the same feature) - * @return summed distribution information - */ - def reduce(fd: FeatureDistrib): FeatureDistrib = { - checkMatch(fd) - val combinedDist = distribution + fd.distribution - // summary info can be empty or min max if hist is empty but should otherwise match so take the longest info - val combinedSummary = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo - FeatureDistrib(name, key, count + fd.count, nulls + fd.nulls, combinedDist, combinedSummary) - } - - /** - * Ratio of fill rates between the two distributions 
symetric with larger value on the top - * @param fd feature distribution to compare to - * @return ratio of fill rates - */ - def relativeFillRatio(fd: FeatureDistrib): Double = { - checkMatch(fd) - val (thisFill, thatFill) = (fillRate(), fd.fillRate()) - val (small, large) = if (thisFill < thatFill) (thisFill, thatFill) else (thatFill, thisFill) - if (small == 0.0) Double.PositiveInfinity else large / small - } - - /** - * Absolute difference in empty rates - * @param fd feature distribution to compare to - * @return fill rate ratio with larger fill rate on the bottom - */ - def relativeFillRate(fd: FeatureDistrib): Double = { - checkMatch(fd) - math.abs(fillRate() - fd.fillRate()) - } - - /** - * Jensen-Shannon divergence from this distribution to the other distribution fed in - * @param fd other feature distribution - * @return the KL divergence - */ - def jsDivergence(fd: FeatureDistrib): Double = { - checkMatch(fd) - val combinedCounts = distribution.zip(fd.distribution).filterNot{ case (a, b) => a == 0.0 && b == 0.0 } - val (thisCount, thatCount) = combinedCounts - .fold[(Double, Double)]( (0, 0)){ case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) } - val probs = combinedCounts.map{ case (a, b) => a / thisCount -> b / thatCount } - val meanProb = probs.map{ case (a, b) => (a + b) / 2} - def log2(x: Double) = math.log10(x) / math.log10(2.0) - def klDivergence(a: Double, b: Double) = if (a == 0.0) 0.0 else a * log2(a / b) - probs.zip(meanProb).map{ case ((a, b), m) => 0.5 * klDivergence(a, m) + 0.5 * klDivergence(b, m) }.sum - } - - override def toString(): String = { - s"Name=$name, Key=$key, Count=$count, Nulls=$nulls, Histogram=${distribution.toList}, BinInfo=${summaryInfo.toList}" - } -} - -private[op] case object FeatureDistrib { - - type ProcessedSeq = Either[Seq[String], Seq[Double]] - - val MaxBins = 100000 - - implicit val semigroup: Semigroup[FeatureDistrib] = new Semigroup[FeatureDistrib] { - override def plus(l: FeatureDistrib, r: FeatureDistrib) = l.reduce(r) - } - - /** - * Function to put data into histogram of counts - * @param values values to bin - * @param sum summary info for feature (max and min) - * @param bins number of bins to produce - * @param hasher hasing function to use for text - * @return the bin information and the binned counts - */ - // TODO avoid wrapping and unwrapping?? 
- private def histValues( - values: ProcessedSeq, - sum: Summary, - bins: Int, - hasher: HashingTF - ): (Array[Double], Array[Double]) = { - values match { - case Left(seq) => Array(sum.min, sum.max) -> hasher.transform(seq).toArray // TODO use summary info to pick hashes - case Right(seq) => // TODO use kernel fit instead of histogram - if (sum == Summary.empty) { - Array(sum.min, sum.max) -> seq.toArray // the seq will always be empty in this case - } else if (sum.min < sum.max) { - val step = (sum.max - sum.min) / (bins - 2.0) // total number of bins includes one for edge and one for other - val splits = (0 until bins).map(b => sum.min + step * b).toArray - val binned = seq.map { v => - NumericBucketizer.bucketize( - splits = splits, trackNulls = false, trackInvalid = true, - splitInclusion = Inclusion.Left, input = Option(v) - ).toArray - } - val hist = binned.fold(new Array[Double](bins))(_ + _) - splits -> hist - } else { - val same = seq.map(v => if (v == sum.max) 1.0 else 0.0).sum - val other = seq.map(v => if (v != sum.max) 1.0 else 0.0).sum - Array(sum.min, sum.max) -> Array(same, other) - } - } - } - - /** - * Create the distributions for regular features - * @param features list of transient features - * @param values values of the features processed into a sequence of either doubles or strings with boolean - * indicating if original feature was empty - * @param summary summary statistics about feature - * @param bins number of bins to put numerics into - * @param hasher hash function to use on strings - * @return feature distribution for single feature value to be aggregated - */ - def getDistributions( - features: Array[TransientFeature], - values: Array[(Boolean, ProcessedSeq)], - summary: Array[Summary], - bins: Int, - hasher: HashingTF - ): Array[FeatureDistrib] = { - features.zip(values).zip(summary).map{ - case ((tf, (isNull, seq)), sum) => - val isNullCount = if (isNull) 1 else 0 - val (info, histogram) = histValues(seq, sum, bins, hasher) - FeatureDistrib(tf.name, None, 1, isNullCount, histogram, info) - } - } - - /** - * Create the distributions for map features - * @param features list of transient map features - * @param values values of the features processed into a map from key to sequence of either doubles or strings - * @param summary map from key to summary statistics about feature - * @param bins number of bins to put numerics into - * @param hasher hash function to use on strings - * @return feature distribution for single feature and key value to be aggregated - */ - def getMapDistributions( - features: Array[TransientFeature], - values: Array[Map[String, ProcessedSeq]], - summary: Array[Map[String, Summary]], - bins: Int, - hasher: HashingTF - ): Array[FeatureDistrib] = { - features.zip(values).zip(summary).flatMap { - case ((tf, map), sum) => sum.map { case (key, seq) => - val isNullCount = if (map.contains(key)) 0 else 1 - val (info, histogram) = map.get(key) - .map(seq => histValues(seq, sum(key), bins, hasher)) - .getOrElse(Array(sum(key).min, sum(key).max), Array.fill(bins)(0.0)) - FeatureDistrib(tf.name, Some(key), 1, isNullCount, histogram, info) - } - } - } - -} diff --git a/core/src/main/scala/com/salesforce/op/filters/Summary.scala b/core/src/main/scala/com/salesforce/op/filters/Summary.scala new file mode 100644 index 0000000000..810f642b3d --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/Summary.scala @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.filters + +import com.twitter.algebird.Monoid + +/** + * Class used to get summaries of prepared features to determine distribution binning strategy + * + * @param min minimum value seen + * @param max maximum value seen + */ +private[op] case class Summary(min: Double, max: Double) + +private[op] case object Summary { + + val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity) + + implicit val monoid: Monoid[Summary] = new Monoid[Summary] { + override def zero = empty + override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max)) + } + + /** + * @param preppedFeature processed feature + * @return feature summary derived from processed feature + */ + def apply(preppedFeature: ProcessedSeq): Summary = { + preppedFeature match { + case Left(v) => Summary(v.size, v.size) + case Right(v) => monoid.sum(v.map(d => Summary(d, d))) + } + } +} diff --git a/core/src/main/scala/com/salesforce/op/filters/package.scala b/core/src/main/scala/com/salesforce/op/filters/package.scala new file mode 100644 index 0000000000..7e5971bb29 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/package.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op + +// scalastyle:off ensure.single.space.after.token +package object filters { + private[filters] type FeatureKey = (String, Option[String]) + private[filters] type ProcessedSeq = Either[Seq[String], Seq[Double]] +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala new file mode 100644 index 0000000000..e4da69bd8a --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, OpDecisionTreeClassifierParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark decision tree classifier [[org.apache.spark.ml.classification.DecisionTreeClassifier]] + * @param uid stage uid + */ +class OpDecisionTreeClassifier(uid: String = UID[OpDecisionTreeClassifier]) + extends OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( + predictor = new DecisionTreeClassifier(), + uid = uid + ) with OpDecisionTreeClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + +} + + +/** + * Class that takes in a spark DecisionTreeClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpDecisionTreeClassificationModel +( + sparkModel: DecisionTreeClassificationModel, + uid: String = UID[OpDecisionTreeClassificationModel], + operationName: String = classOf[DecisionTreeClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[DecisionTreeClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala new file mode 100644 index 0000000000..e8f299010c --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
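// A minimal usage sketch (not part of this patch) for the OpDecisionTreeClassifier
// defined above; `label` and `features` are assumed, pre-built OP features.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpDecisionTreeClassifier

def treePrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpDecisionTreeClassifier()
    .setMaxDepth(5)        // cap the tree depth
    .setMinInfoGain(0.01)  // discard splits with little information gain
    .setInput(label, features)
    .getOutput()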
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier, OpGBTClassifierParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark GBT classifier [[org.apache.spark.ml.classification.GBTClassifier]] + * @param uid stage uid + */ +class OpGBTClassifier(uid: String = UID[OpGBTClassifier]) + extends OpPredictorWrapper[GBTClassifier, GBTClassificationModel]( + predictor = new GBTClassifier(), + uid = uid + ) with OpGBTClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. + * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** + * The impurity setting is ignored for GBT models. + * Individual trees are built using impurity "Variance." 
+ * + * @group setParam + */ + override def setImpurity(value: String): this.type = { + logWarning("GBTClassifier.setImpurity should NOT be used") + this + } + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from GBTParams: + + /** @group setParam */ + override def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ + override def setStepSize(value: Double): this.type = set(stepSize, value) + + // Parameters from GBTClassifierParams: + + /** @group setParam */ + def setLossType(value: String): this.type = set(lossType, value) +} + + + +/** + * Class that takes in a spark GBTClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGBTClassificationModel +( + sparkModel: GBTClassificationModel, + uid: String = UID[OpGBTClassificationModel], + operationName: String = classOf[GBTClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[GBTClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala new file mode 100644 index 0000000000..0fe69ef3e8 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
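// A minimal usage sketch (not part of this patch) for the OpGBTClassifier above,
// with the same assumed `label`/`features` inputs; note that setImpurity is a
// deliberate no-op for GBT.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpGBTClassifier

def gbtPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpGBTClassifier()
    .setMaxIter(50)          // number of boosting iterations, i.e. trees
    .setStepSize(0.1)        // learning rate (shrinkage)
    .setLossType("logistic") // the only loss GBTClassifier currently supports
    .setInput(label, features)
    .getOutput()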
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel, OpLinearSVCParams} +import org.apache.spark.ml.linalg.Vector + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Linear SVC [[org.apache.spark.ml.classification.LinearSVC]] + * @param uid stage uid + */ +class OpLinearSVC(uid: String = UID[OpLinearSVC]) + extends OpPredictorWrapper[LinearSVC, LinearSVCModel]( + predictor = new LinearSVC(), + uid = uid + ) with OpLinearSVCParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Set the regularization parameter. + * Default is 0.0. + * + * @group setParam + */ + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) + + /** + * Set the maximum number of iterations. + * Default is 100. + * + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) + + /** + * Whether to fit an intercept term. + * Default is true. + * + * @group setParam + */ + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) + + /** + * Set the convergence tolerance of iterations. + * Smaller values will lead to higher accuracy at the cost of more iterations. + * Default is 1E-6. + * + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) + + /** + * Whether to standardize the training features before fitting the model. + * Default is true. + * + * @group setParam + */ + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) + + /** + * Set the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + + /** + * Set threshold in binary classification. + * + * @group setParam + */ + def setThreshold(value: Double): this.type = set(threshold, value) + setDefault(threshold -> 0.0) + + /** + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. 
+ * + * @group expertSetParam + */ + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) +} + + +/** + * Class that takes in a spark LinearSVCModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLinearSVCModel +( + sparkModel: LinearSVCModel, + uid: String = UID[OpLinearSVCModel], + operationName: String = classOf[LinearSVC].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictorWrapperModel[LinearSVCModel](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + @transient private lazy val predictRaw = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient private lazy val predict = reflectMethod(getSparkMlStage().get, "predict") + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictRaw.apply(features.value).asInstanceOf[Vector] + val pred = predict.apply(features.value).asInstanceOf[Double] + + Prediction(rawPrediction = raw, prediction = pred) + } +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala index 61766580f1..1b37316735 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala @@ -32,60 +32,52 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpProbabilisticClassifierWrapper -import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, OpLogisticRegressionParams} +import org.apache.spark.ml.linalg.{Matrix, Vector} + +import scala.reflect.runtime.universe.TypeTag /** - * Wrapper around spark ml logistic regression for use with OP pipelines + * Wrapper around spark ml logistic regression [[org.apache.spark.ml.classification.LogisticRegression]] */ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) - extends OpProbabilisticClassifierWrapper[LogisticRegression, LogisticRegressionModel]( - new LogisticRegression(), + extends OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]( + predictor = new LogisticRegression(), uid = uid - ) { + ) with OpLogisticRegressionParams { override protected def onSetInput(): Unit = { super.onSetInput() CheckIsResponseValues(in1, in2) } - /** - * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values >= 0. 
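// A minimal usage sketch (not part of this patch) for the OpLinearSVC wrapper above.
// Since LinearSVC produces no class probabilities, the resulting Prediction carries
// only rawPrediction and prediction, as its transformFn shows.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpLinearSVC

def svcPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpLinearSVC()
    .setRegParam(0.1)
    .setMaxIter(50)
    .setInput(label, features)
    .getOutput()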
- * The class with largest value p/t is predicted, where p is the original probability of that - * class and t is the class' threshold. - * - * @group setParam - */ - def setThresholds(value: Array[Double]): this.type = { - getSparkStage.setThresholds(value) - this - } - /** * Set the regularization parameter. * Default is 0.0. * * @group setParam */ - def setRegParam(value: Double): this.type = { - getSparkStage.setRegParam(value) - this - } + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) /** * Set the ElasticNet mixing parameter. - * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * For alpha = 0, the penalty is an L2 penalty. + * For alpha = 1, it is an L1 penalty. + * For alpha in (0,1), the penalty is a combination of L1 and L2. * Default is 0.0 which is an L2 penalty. * + * Note: Fitting under bound constrained optimization only supports L2 regularization, + * so throws exception if this param is non-zero value. + * * @group setParam */ - def setElasticNetParam(value: Double): this.type = { - getSparkStage.setElasticNetParam(value) - this - } + def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) + setDefault(elasticNetParam -> 0.0) /** * Set the maximum number of iterations. @@ -93,22 +85,18 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) * * @group setParam */ - def setMaxIter(value: Int): this.type = { - getSparkStage.setMaxIter(value) - this - } + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) /** * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. + * Smaller value will lead to higher accuracy at the cost of more iterations. * Default is 1E-6. * * @group setParam */ - def setTol(value: Double): this.type = { - getSparkStage.setTol(value) - this - } + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) /** * Whether to fit an intercept term. @@ -116,10 +104,17 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) * * @group setParam */ - def setFitIntercept(value: Boolean): this.type = { - getSparkStage.setFitIntercept(value) - this - } + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) + + /** + * Sets the value of param [[family]]. + * Default is "auto". + * + * @group setParam + */ + def setFamily(value: String): this.type = set(family, value) + setDefault(family -> "auto") /** * Whether to standardize the training features before fitting the model. @@ -131,21 +126,89 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) * * @group setParam */ - def setStandardization(value: Boolean): this.type = { - getSparkStage.setStandardization(value) - this - } + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) + + override def setThreshold(value: Double): this.type = super.setThreshold(value) + /** - * Whether to over-/under-sample training instances according to the given weights in weightCol. - * If not set or empty String, all instances are treated equally (weight 1.0). + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. * Default is not set, so all instances have weight one. 
* * @group setParam */ - def setWeightCol(value: String): this.type = { - getSparkStage.setWeightCol(value) - this - } + def setWeightCol(value: String): this.type = set(weightCol, value) + + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) + + /** + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. + * + * @group expertSetParam + */ + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) + /** + * Set the lower bounds on coefficients if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setLowerBoundsOnCoefficients(value: Matrix): this.type = set(lowerBoundsOnCoefficients, value) + + /** + * Set the upper bounds on coefficients if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setUpperBoundsOnCoefficients(value: Matrix): this.type = set(upperBoundsOnCoefficients, value) + + /** + * Set the lower bounds on intercepts if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setLowerBoundsOnIntercepts(value: Vector): this.type = set(lowerBoundsOnIntercepts, value) + + /** + * Set the upper bounds on intercepts if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setUpperBoundsOnIntercepts(value: Vector): this.type = set(upperBoundsOnIntercepts, value) + +} + + +/** + * Class that takes in a spark LogisticRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLogisticRegressionModel +( + sparkModel: LogisticRegressionModel, + operationName: String = classOf[LogisticRegression].getSimpleName, + uid: String = UID[OpLogisticRegressionModel] +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[LogisticRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") } + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala new file mode 100644 index 0000000000..1de8560b3b --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
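// A minimal usage sketch (not part of this patch) of the rewritten OpLogisticRegression:
// params are now set on the OP stage itself rather than pushed to the underlying
// spark stage; `label` and `features` are assumed, pre-built OP features.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpLogisticRegression

def lrPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpLogisticRegression()
    .setRegParam(0.01)
    .setElasticNetParam(0.5) // blend of L1 and L2 regularization
    .setMaxIter(200)
    .setInput(label, features)
    .getOutput()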
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.classification
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
+import com.salesforce.op.stages.impl.CheckIsResponseValues
+import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
+import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
+import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier, OpMultilayerPerceptronClassifierParams}
+import org.apache.spark.ml.linalg.Vector
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Wrapper for spark MultilayerPerceptronClassifier
+ * [[org.apache.spark.ml.classification.MultilayerPerceptronClassifier]]
+ * @param uid stage uid
+ */
+class OpMultilayerPerceptronClassifier(uid: String = UID[OpMultilayerPerceptronClassifier])
+  extends OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel](
+    predictor = new MultilayerPerceptronClassifier(),
+    uid = uid
+  ) with OpMultilayerPerceptronClassifierParams {
+
+  override protected def onSetInput(): Unit = {
+    super.onSetInput()
+    CheckIsResponseValues(in1, in2)
+  }
+
+  /**
+   * Sets the value of param [[layers]].
+   *
+   * @group setParam
+   */
+  def setLayers(value: Array[Int]): this.type = set(layers, value)
+
+  /**
+   * Sets the value of param [[blockSize]].
+   * Default is 128.
+   *
+   * @group expertSetParam
+   */
+  def setBlockSize(value: Int): this.type = set(blockSize, value)
+
+  /**
+   * Sets the value of param [[solver]].
+   * Default is "l-bfgs".
+   *
+   * @group expertSetParam
+   */
+  def setSolver(value: String): this.type = set(solver, value)
+
+  /**
+   * Set the maximum number of iterations.
+   * Default is 100.
+   *
+   * @group setParam
+   */
+  def setMaxIter(value: Int): this.type = set(maxIter, value)
+
+  /**
+   * Set the convergence tolerance of iterations.
+   * Smaller value will lead to higher accuracy at the cost of more iterations.
+   * Default is 1E-6.
+   *
+   * @group setParam
+   */
+  def setTol(value: Double): this.type = set(tol, value)
+
+  /**
+   * Set the seed for weights initialization if weights are not set.
+   *
+   * @group setParam
+   */
+  def setSeed(value: Long): this.type = set(seed, value)
+
+  /**
+   * Sets the value of param [[initialWeights]].
+   *
+   * @group expertSetParam
+   */
+  def setInitialWeights(value: Vector): this.type = set(initialWeights, value)
+
+  /**
+   * Sets the value of param [[stepSize]] (applicable only for solver "gd").
+   * Default is 0.03.
+   *
+   * @group setParam
+   */
+  def setStepSize(value: Double): this.type = set(stepSize, value)
+}
+
+
+/**
+ * Class that takes in a spark MultilayerPerceptronClassificationModel and wraps it into an OP model which returns a
+ * Prediction feature
+ *
+ * @param sparkModel model to wrap
+ * @param uid uid to give stage
+ * @param operationName unique name of the operation this stage performs
+ */
+// TODO in next release of spark this will be probabilistic classifier
+class OpMultilayerPerceptronClassificationModel
+(
+  sparkModel: MultilayerPerceptronClassificationModel,
+  uid: String = UID[OpMultilayerPerceptronClassificationModel],
+  operationName: String = classOf[MultilayerPerceptronClassifier].getSimpleName
+)(
+  implicit tti1: TypeTag[RealNN],
+  tti2: TypeTag[OPVector],
+  tto: TypeTag[Prediction],
+  ttov: TypeTag[Prediction#Value]
+) extends OpPredictionModel[MultilayerPerceptronClassificationModel](
+  sparkModel = sparkModel, uid = uid, operationName = operationName
+) {
+  @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
+}
+
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala
new file mode 100644
index 0000000000..c935c3b6b4
--- /dev/null
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
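// A minimal usage sketch (not part of this patch) for the OpMultilayerPerceptronClassifier
// defined above. The first layer size must match the feature vector dimension and the
// last the number of classes; the sizes here are illustrative only.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpMultilayerPerceptronClassifier

def mlpPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpMultilayerPerceptronClassifier()
    .setLayers(Array(10, 5, 2)) // 10 inputs, one hidden layer of 5, 2 output classes
    .setMaxIter(100)
    .setSeed(42L)
    .setInput(label, features)
    .getOutput()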
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel, OpNaiveBayesParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Naive Bayes [[org.apache.spark.ml.classification.NaiveBayesModel]] + * @param uid stage uid + */ +class OpNaiveBayes(uid: String = UID[OpNaiveBayes]) + extends OpPredictorWrapper[NaiveBayes, NaiveBayesModel]( + predictor = new NaiveBayes(), + uid = uid + ) with OpNaiveBayesParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Set the smoothing parameter. + * Default is 1.0. + * @group setParam + */ + def setSmoothing(value: Double): this.type = set(smoothing, value) + setDefault(smoothing -> 1.0) + + /** + * Set the model type using a string (case-sensitive). + * Supported options: "multinomial" and "bernoulli". + * Default is "multinomial" + * @group setParam + */ + def setModelType(value: String): this.type = set(modelType, value) + setDefault(modelType -> "multinomial") + + /** + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) +} + + +/** + * Class that takes in a spark NaiveBayesModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpNaiveBayesModel +( + sparkModel: NaiveBayesModel, + uid: String = UID[OpNaiveBayesModel], + operationName: String = classOf[NaiveBayes].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[NaiveBayesModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala deleted file mode 100644 index ec9c61e525..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
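// A minimal usage sketch (not part of this patch) for the OpNaiveBayes wrapper above.
// The "bernoulli" model type expects binary (0/1) feature values; "multinomial" is
// the default and handles count-like features.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.OpNaiveBayes

def nbPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpNaiveBayes()
    .setSmoothing(0.5)
    .setModelType("bernoulli")
    .setInput(label, features)
    .getOutput()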
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.impl.classification - -import com.salesforce.op.UID -import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpProbabilisticClassifierWrapper -import enumeratum.{Enum, EnumEntry} -import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} - -sealed abstract class Impurity(val sparkName: String) extends EnumEntry with Serializable - -object Impurity extends Enum[Impurity] { - val values: Seq[Impurity] = findValues - - case object Entropy extends Impurity("entropy") - case object Gini extends Impurity("gini") - case object Variance extends Impurity("variance") -} - - -class OpRandomForest(uid: String = UID[OpRandomForest]) - extends OpProbabilisticClassifierWrapper[RandomForestClassifier, RandomForestClassificationModel]( - probClassifier = new RandomForestClassifier, - uid = uid - ) -{ - - override protected def onSetInput(): Unit = { - super.onSetInput() - CheckIsResponseValues(in1, in2) - } - - /** - * Set maximum depth of the tree (>= 0). - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * (default = 5) - * - * @group setParam - */ - def setMaxDepth(value: Int): this.type = { - getSparkStage.setMaxDepth(value) - this - } - - /** - * Set maximum number of bins used for discretizing continuous features and for choosing how to split - * on features at each node. More bins give higher granularity. - * Must be >= 2 and >= number of categories in any categorical feature. - * (default = 32) - * - * @group setParam - */ - def setMaxBins(value: Int): this.type = { - getSparkStage.setMaxBins(value) - this - } - - /** - * Set minimum number of instances each child must have after split. - * If a split causes the left or right child to have fewer than minInstancesPerNode, - * the split will be discarded as invalid. - * Should be >= 1. - * (default = 1) - * - * @group setParam - */ - def setMinInstancesPerNode(value: Int): this.type = { - getSparkStage.setMinInstancesPerNode(value) - this - } - - /** - * Set minimum information gain for a split to be considered at a tree node. 
- * (default = 0.0) - * - * @group setParam - */ - def setMinInfoGain(value: Double): this.type = { - getSparkStage.setMinInfoGain(value) - this - } - - /** - * Set fraction of the training data used for learning each decision tree, in range (0, 1]. - * (default = 1.0) - * - * @group setParam - */ - def setSubsamplingRate(value: Double): this.type = { - getSparkStage.setSubsamplingRate(value) - this - } - - /** - * Set number of trees to train (>= 1). - * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. - * (default = 20) - * - * @group setParam - */ - def setNumTrees(value: Int): this.type = { - getSparkStage.setNumTrees(value) - this - } - - /** - * Set criterion used for information gain calculation (case-insensitive). - * Supported: "entropy" and "gini". - * (default = gini) - * - * @group setParam - */ - def setImpurity(value: Impurity): this.type = { - getSparkStage.setImpurity(value.sparkName) - this - } - - /** - * Set param for random seed. - * - * @group setParam - */ - def setSeed(value: Long): this.type = { - getSparkStage.setSeed(value) - this - } - - /** - * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values >= 0. - * The class with largest value p/t is predicted, where p is the original probability of that - * class and t is the class' threshold. - * - * @group setParam - */ - def setThresholds(value: Array[Double]): this.type = { - getSparkStage.setThresholds(value) - this - } -} - diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala new file mode 100644 index 0000000000..74215dba51 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import enumeratum.{Enum, EnumEntry} +import org.apache.spark.ml.classification.{OpRandomForestClassifierParams, RandomForestClassificationModel, RandomForestClassifier} + +import scala.reflect.runtime.universe.TypeTag + +sealed abstract class Impurity(val sparkName: String) extends EnumEntry with Serializable + +object Impurity extends Enum[Impurity] { + val values: Seq[Impurity] = findValues + + case object Entropy extends Impurity("entropy") + case object Gini extends Impurity("gini") + case object Variance extends Impurity("variance") +} + + +/** + * Wrapper for spark Random Forest Classifier [[org.apache.spark.ml.classification.RandomForestClassifier]] + * @param uid stage uid + */ +class OpRandomForestClassifier(uid: String = UID[OpRandomForestClassifier]) + extends OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]( + predictor = new RandomForestClassifier(), + uid = uid + ) with OpRandomForestClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeClassifierParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from RandomForestParams: + + /** @group setParam */ + override def setNumTrees(value: Int): this.type = set(numTrees, value) + + /** @group setParam */ + override def setFeatureSubsetStrategy(value: String): this.type = + set(featureSubsetStrategy, value) + + /** + * Param for Thresholds in multi-class classification to adjust the probability of predicting each class. + * Array must have length equal to the number of classes, with values > 0 excepting that at most one value + * may be 0. The class with largest value p/t is predicted, where p is the original probability of that class + * and t is the class's threshold. + * @group param + */ + def setThresholds(value: Array[Double]): this.type = set(thresholds, value) + +} + + +/** + * Class that takes in a spark RandomForestClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpRandomForestClassificationModel +( + sparkModel: RandomForestClassificationModel, + uid: String = UID[OpRandomForestClassificationModel], + operationName: String = classOf[RandomForestClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[RandomForestClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala index 995b481c2b..168e2bf6a6 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala @@ -78,23 +78,24 @@ class DecisionTreeNumericMapBucketizer[N, I2 <: OPMap[N]] label -> filterKeys[N](map, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues) }.persist() - require(!ds.isEmpty, "Dataset is empty, buckets cannot be computed.") - - // Collect all unique map keys and sort them - val uniqueKeys: Seq[String] = - ds.map { case (_, map) => map.keys.toSeq } - .reduce((l, r) => (l ++ r).distinct) - .distinct.sorted - - // Compute splits for each collected key in parallel - val computedSplits: Array[(String, Splits)] = + val computedSplits: Array[(String, Splits)] = if (ds.isEmpty) { + log.info("Skip bucketizing empty numeric map '{}' feature", in2.name) + Array.empty[(String, Splits)] + } else { + // Collect all 
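// A minimal usage sketch (not part of this patch) for the OpRandomForestClassifier
// defined above. Unlike the deleted OpRandomForest, setImpurity now takes spark's
// string name, so the Impurity enum's sparkName is passed through explicitly.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForestClassifier}

def rfPrediction(label: FeatureLike[RealNN], features: FeatureLike[OPVector]): FeatureLike[Prediction] =
  new OpRandomForestClassifier()
    .setNumTrees(100)
    .setImpurity(Impurity.Gini.sparkName) // "gini"
    .setSubsamplingRate(0.8)
    .setSeed(42L)
    .setInput(label, features)
    .getOutput()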
unique map keys and sort them + val uniqueKeys: Seq[String] = + ds.map { case (_, map) => map.keys.toSeq } + .reduce((l, r) => (l ++ r).distinct) + .distinct.sorted + + // Compute splits for each collected key in parallel uniqueKeys.par.map { k => val data: Dataset[(Double, Double)] = ds.filter(_._2.contains(k)) .map { case (label, map) => label.get -> nev.toDouble(map(k)) } k -> computeSplits(data, featureName = s"${in2.name}[$k]") }.toArray - + } ds.unpersist() val meta = makeMetadata(computedSplits) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala index d0979ecfd0..67f87c9109 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala @@ -38,7 +38,7 @@ import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer import org.apache.commons.io.input.BoundedInputStream -import org.apache.spark.ml.param.{Param, Params} +import org.apache.spark.ml.param.{LongParam, Param, Params} import org.apache.tika.detect.{DefaultDetector, Detector} import org.apache.tika.metadata.{HttpHeaders, Metadata} import org.apache.tika.mime.MediaType @@ -84,7 +84,7 @@ private[op] trait MimeTypeDetectorParams extends Params { ) def setTypeHint(value: String): this.type = set(typeHint, value) - final val maxBytesToParse = new Param[Long]( + final val maxBytesToParse = new LongParam( parent = this, name = "maxBytesToParse", doc = "maximum number of bytes to parse during detection", isValid = (v: Long) => v >= 0L ) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala new file mode 100644 index 0000000000..b3e5fd927c --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.unary.UnaryTransformer
+import com.salesforce.op.utils.text._
+import com.twitter.algebird.Operators._
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Named entity ([[NameEntityType]]) text recognizer.
+ *
+ * Note: when providing your own analyzer/splitter/tagger, make sure they can work together;
+ * for instance, OpenNLP models require their own analyzers to be provided when tokenizing.
+ * The returned feature type is a [[MultiPickListMap]] which contains sets of entities for all the tokens.
+ *
+ * @param languageDetector a language detector instance (defaults to [[OptimaizeLanguageDetector]])
+ * @param analyzer a text analyzer instance (defaults to an [[OpenNLPAnalyzer]])
+ * @param sentenceSplitter a sentence splitter instance (defaults to an [[OpenNLPSentenceSplitter]])
+ * @param tagger named entity tagger (defaults to [[OpenNLPNameEntityTagger]])
+ * @param uid uid for instance
+ * @param tti type tag for input feature type
+ * @tparam T text feature type
+ */
+class NameEntityRecognizer[T <: Text]
+(
+  val languageDetector: LanguageDetector = NameEntityRecognizer.LanguageDetector,
+  val analyzer: TextAnalyzer = NameEntityRecognizer.Analyzer,
+  val sentenceSplitter: SentenceSplitter = NameEntityRecognizer.Splitter,
+  val tagger: NameEntityTagger[_ <: TaggerResult] = NameEntityRecognizer.Tagger,
+  uid: String = UID[NameEntityRecognizer[_]]
+)(implicit tti: TypeTag[T])
+  extends UnaryTransformer[T, MultiPickListMap](uid = uid, operationName = "nameEntityRec")
+    with LanguageDetectionParams {
+
+  setDefault(
+    autoDetectLanguage -> NameEntityRecognizer.AutoDetectLanguage,
+    autoDetectThreshold -> NameEntityRecognizer.AutoDetectThreshold,
+    defaultLanguage -> NameEntityRecognizer.DefaultLanguage.entryName
+  )
+
+  def transformFn: T => MultiPickListMap = text => {
+    val res = TextTokenizer.tokenize(
+      text = text,
+      languageDetector = languageDetector,
+      analyzer = analyzer,
+      sentenceSplitter = Option(sentenceSplitter),
+      autoDetectLanguage = getAutoDetectLanguage,
+      autoDetectThreshold = getAutoDetectThreshold,
+      defaultLanguage = getDefaultLanguage,
+      toLowercase = false
+    )
+    val sentenceTags = res.sentences.view.map { sentence =>
+      val tags = tagger.tag(sentence.value, res.language, NameEntityType.values)
+      tags.tokenTags.mapValues(_.map(_.toString))
+    }
+    sentenceTags.foldLeft(Map.empty[String, Set[String]])(_ + _).toMultiPickListMap
+  }
+
+}
+
+object NameEntityRecognizer {
+  val Analyzer: TextAnalyzer = new OpenNLPAnalyzer()
+  val LanguageDetector: LanguageDetector = new OptimaizeLanguageDetector()
+  val Tagger: NameEntityTagger[_ <: TaggerResult] = new OpenNLPNameEntityTagger()
+  val Splitter: SentenceSplitter = new OpenNLPSentenceSplitter()
+  val AutoDetectLanguage = false
+  val AutoDetectThreshold = 0.99
+  val DefaultLanguage: Language = Language.English
+}
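// A minimal usage sketch (not part of this patch): `description` is an assumed,
// pre-built Text feature. With all defaults, the OpenNLP components are used and
// the output maps each recognized token to its set of entity tags.
import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{MultiPickListMap, Text}
import com.salesforce.op.stages.impl.feature.NameEntityRecognizer

def entityTags(description: FeatureLike[Text]): FeatureLike[MultiPickListMap] =
  new NameEntityRecognizer[Text]().setInput(description).getOutput()

diff --git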
a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala index 172857267d..28310f5c3a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala @@ -36,7 +36,9 @@ import com.salesforce.op.features.TransientFeature import com.salesforce.op.features.types._ import com.salesforce.op.stages.OpPipelineStageBase import com.salesforce.op.stages.base.sequence.SequenceTransformer +import com.salesforce.op.stages.impl.feature.HashSpaceStrategy.findValues import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import enumeratum.{Enum, EnumEntry} import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.ml.param._ import org.apache.spark.mllib.feature.HashingTF @@ -97,6 +99,16 @@ class OPCollectionHashingVectorizer[T <: OPCollection](uid: String = UID[OPColle } } +sealed trait HashSpaceStrategy extends EnumEntry with Serializable + +object HashSpaceStrategy extends Enum[HashSpaceStrategy] { + val values: Seq[HashSpaceStrategy] = findValues + + case object Shared extends HashSpaceStrategy + case object Separate extends HashSpaceStrategy + case object Auto extends HashSpaceStrategy +} + private[op] trait HashingVectorizerParams extends Params { final val numFeatures = new IntParam( parent = this, name = "numFeatures", @@ -115,11 +127,22 @@ private[op] trait HashingVectorizerParams extends Params { ) def setHashWithIndex(v: Boolean): this.type = set(hashWithIndex, v) + @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0") final val forceSharedHashSpace = new BooleanParam( parent = this, name = "forceSharedHashSpace", doc = s"if true, then force the hash space to be shared among all included features" ) + @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0") def setForceSharedHashSpace(v: Boolean): this.type = set(forceSharedHashSpace, v) + @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0") + def getForceSharedHashSpace: Boolean = $(forceSharedHashSpace) + + final val hashSpaceStrategy: Param[String] = new Param[String](this, "hashSpaceStrategy", + "Strategy to determine whether to use shared or separate hash space for input text features", + (value: String) => HashSpaceStrategy.withNameInsensitiveOption(value).isDefined + ) + def setHashSpaceStrategy(v: HashSpaceStrategy): this.type = set(hashSpaceStrategy, v.entryName) + def getHashSpaceStrategy: HashSpaceStrategy = HashSpaceStrategy.withNameInsensitive($(hashSpaceStrategy)) final val prependFeatureName = new BooleanParam( parent = this, name = "prependFeatureName", @@ -132,6 +155,7 @@ private[op] trait HashingVectorizerParams extends Params { isValid = (s: String) => HashAlgorithm.withNameInsensitiveOption(s).isDefined ) def setHashAlgorithm(h: HashAlgorithm): this.type = set(hashAlgorithm, h.toString.toLowerCase) + def getHashAlgorithm: HashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm)) final val binaryFreq = new BooleanParam( parent = this, name = "binaryFreq", @@ -145,7 +169,8 @@ private[op] trait HashingVectorizerParams extends Params { forceSharedHashSpace -> false, prependFeatureName -> TransmogrifierDefaults.PrependFeatureName, hashAlgorithm -> TransmogrifierDefaults.HashAlgorithm.toString.toLowerCase, - binaryFreq -> TransmogrifierDefaults.BinaryFreq + 
binaryFreq -> TransmogrifierDefaults.BinaryFreq, + hashSpaceStrategy -> HashSpaceStrategy.Auto.toString ) } @@ -162,6 +187,7 @@ private[op] trait HashingVectorizerParams extends Params { * @param binaryFreq if true, term frequency vector will be binary such that non-zero term counts * will be set to 1.0 * @param hashAlgorithm hash algorithm to use + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features */ case class HashingFunctionParams ( @@ -172,7 +198,8 @@ case class HashingFunctionParams maxNumOfFeatures: Int, forceSharedHashSpace: Boolean, binaryFreq: Boolean, - hashAlgorithm: HashAlgorithm + hashAlgorithm: HashAlgorithm, + hashSpaceStrategy: HashSpaceStrategy = HashSpaceStrategy.Auto ) /** @@ -189,7 +216,12 @@ private[op] trait HashingFun { protected def isSharedHashSpace(p: HashingFunctionParams, numFeatures: Option[Int] = None): Boolean = { val numHashes = p.numFeatures val numOfFeatures = numFeatures.getOrElse(p.numInputs) - (numHashes * numOfFeatures) > p.maxNumOfFeatures || p.forceSharedHashSpace + import HashSpaceStrategy._ + p.hashSpaceStrategy match { + case s if p.forceSharedHashSpace || s.equals(Shared) => true + case Separate => false + case Auto => (numHashes * numOfFeatures) > p.maxNumOfFeatures + } } /**
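A short usage sketch of the strategy introduced above (hypothetical, not part of the patch; it assumes OPCollectionHashingVectorizer mixes in HashingVectorizerParams, as the hunks above indicate):

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.feature.{HashSpaceStrategy, OPCollectionHashingVectorizer}

// before (deprecated): vectorizer.setForceSharedHashSpace(true)
// after: choose the hash space strategy explicitly
val vectorizer = new OPCollectionHashingVectorizer[TextList]()
  .setHashSpaceStrategy(HashSpaceStrategy.Shared)

// HashSpaceStrategy.Auto (the new default) falls back to a shared space
// only when numFeatures * numInputs exceeds maxNumOfFeatures

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala index 8c3cce7aed..d83ea2dddc 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala @@ -451,7 +451,7 @@ final class TextMapHashingVectorizerModel[T <: OPMap[String]] private[op] val keys = args.allKeys(i) val cleaned = cleanMap(map.v, shouldCleanKey = args.shouldCleanKeys, shouldCleanValue = args.shouldCleanValues) val mapValues = cleaned.map { case (k, v) => v.toText } - mapValues.map(tokenize(_)._2).toSeq + mapValues.map(tokenize(_).tokens).toSeq } val allTokens = tokenSeq.flatMap(_.value).toTextList diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala index f8a4492c05..11f16ae80b 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala @@ -50,7 +50,7 @@ class RealNNVectorizer /** * Function used to convert input to output */ - override def transformFn: (Seq[RealNN]) => OPVector = in => { + override def transformFn: Seq[RealNN] => OPVector = in => { val ins = in.map(_.value.get) // assumes a non nullable real (RealNN) Vectors.dense(ins.toArray).toOPVector } } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 58600cec34..cada837d5d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -40,8 +40,8 @@ import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ -import com.twitter.algebird.macros.caseclass import com.twitter.algebird.Semigroup +import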
com.twitter.algebird.macros.caseclass import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.{Dataset, Encoder} @@ -84,9 +84,10 @@ class SmartTextMapVectorizer[T <: OPMap[String]] numFeatures = $(numFeatures), numInputs = inN.length, maxNumOfFeatures = TransmogrifierDefaults.MaxNumOfFeatures, - forceSharedHashSpace = $(forceSharedHashSpace), + forceSharedHashSpace = getForceSharedHashSpace, binaryFreq = $(binaryFreq), - hashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm)) + hashAlgorithm = getHashAlgorithm, + hashSpaceStrategy = getHashSpaceStrategy ) private def makeVectorMetadata(args: SmartTextMapVectorizerModelArgs): OpVectorMetadata = { @@ -244,14 +245,14 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] private def partitionRow(row: Seq[OPMap[String]]): (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = { val (rowCategorical, keysCategorical) = - row.view.zip(args.categoricalKeys).collect{ case (elements, keys) if keys.nonEmpty => - val filtered = elements.value.filter{ case (k, v) => keys.contains(k) } + row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty => + val filtered = elements.value.filter { case (k, v) => keys.contains(k) } (TextMap(filtered), keys) }.unzip val (rowText, keysText) = - row.view.zip(args.textKeys).collect{ case (elements, keys) if keys.nonEmpty => - val filtered = elements.value.filter{ case (k, v) => keys.contains(k) } + row.view.zip(args.textKeys).collect { case (elements, keys) if keys.nonEmpty => + val filtered = elements.value.filter { case (k, v) => keys.contains(k) } (TextMap(filtered), keys) }.unzip @@ -261,17 +262,17 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] def transformFn: Seq[T] => OPVector = row => { val (rowCategorical, keysCategorical, rowText, keysText) = partitionRow(row) val categoricalVector = categoricalPivotFn(rowCategorical) - val rowTextTokenized = rowText.map( m => m.value.map{ case (k, v) => k -> tokenize(v.toText)._2 } ) + val rowTextTokenized = rowText.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens }) val textVector = hash(rowTextTokenized, keysText, args.hashingParams) val textNullIndicatorsVector = - if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(keysText, rowText)) else Nil + if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(keysText, rowTextTokenized)) else Nil VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector) } - private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[OPMap[String]]): OPVector = { + private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[Map[String, TextList]]): OPVector = { val nullIndicators = keysSeq.zip(inputs).flatMap{ case (keys, input) => keys.map{ k => - val nullVal = if (input.value.contains(k)) 0.0 else 1.0 + val nullVal = if (input.get(k).forall(_.isEmpty)) 1.0 else 0.0 Seq(0 -> nullVal) } } @@ -280,3 +281,4 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] vector.toOPVector } } + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index 647882a937..9c847cdc0a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -72,9 +72,10 @@ class 
SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( numFeatures = $(numFeatures), numInputs = inN.length, maxNumOfFeatures = TransmogrifierDefaults.MaxNumOfFeatures, - forceSharedHashSpace = $(forceSharedHashSpace), + forceSharedHashSpace = getForceSharedHashSpace, binaryFreq = $(binaryFreq), - hashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm)) + hashAlgorithm = getHashAlgorithm, + hashSpaceStrategy = getHashSpaceStrategy ) def fitFn(dataset: Dataset[Seq[T#Value]]): SequenceModel[T, OPVector] = { @@ -213,9 +214,9 @@ final class SmartTextVectorizerModel[T <: Text] private[op] (row: Seq[Text]) => { val (rowCategorical, rowText) = SmartTextVectorizer.partition[Text](row.toArray, args.isCategorical) val categoricalVector: OPVector = categoricalPivotFn(rowCategorical) - val textTokens: Seq[TextList] = rowText.map(tokenize(_)._2) + val textTokens: Seq[TextList] = rowText.map(tokenize(_).tokens) val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams) - val textNullIndicatorsVector = if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(rowText)) else Seq.empty + val textNullIndicatorsVector = if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(textTokens)) else Seq.empty VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector) } @@ -224,11 +225,9 @@ final class SmartTextVectorizerModel[T <: Text] private[op] private def getTextTransientFeatures: Array[TransientFeature] = SmartTextVectorizer.partition[TransientFeature](getTransientFeatures(), args.isCategorical)._2 - private def getNullIndicatorsVector(features: Seq[Text]): OPVector = { - val nullIndicators = features.map { f => - val theseCat = convertToSet(f) - .groupBy(v => cleanTextFn(v.toString, args.shouldCleanText)).map { case (k, v) => k -> v.size } - val nullVal = if (theseCat.isEmpty) 1.0 else 0.0 + private def getNullIndicatorsVector(textTokens: Seq[TextList]): OPVector = { + val nullIndicators = textTokens.map { tokens => + val nullVal = if (tokens.isEmpty) 1.0 else 0.0 Seq(0 -> nullVal) } val reindexed = reindex(nullIndicators) @@ -243,7 +242,7 @@ trait MaxCardinalityParams extends Params { doc = "max number of distinct values a categorical feature can have", isValid = ParamValidators.inRange(lowerBound = 1, upperBound = SmartTextVectorizer.MaxCardinality) ) + final def setMaxCardinality(v: Int): this.type = set(maxCardinality, v) + final def getMaxCardinality: Int = $(maxCardinality) setDefault(maxCardinality -> SmartTextVectorizer.MaxCardinality) - def setMaxCardinality(v: Int): this.type = set(maxCardinality, v) - def getMaxCardinality: Int = $(maxCardinality) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala index 44a7dbeb8c..a360fb3132 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala @@ -92,15 +92,17 @@ final class TextMapNullModel[T <: OPMap[String]] private[op] uid: String )(implicit tti: TypeTag[T]) extends SequenceModel[T, OPVector](operationName = operationName, uid = uid) - with VectorizerDefaults with CleanTextMapFun { + with VectorizerDefaults with CleanTextMapFun with TextTokenizerParams { def transformFn: Seq[T] => OPVector = row => { row.zipWithIndex.flatMap { case (map, i) => val keys = allKeys(i) val cleaned = cleanMap(map.v, 
shouldCleanKey = cleanKeys, shouldCleanValue = cleanValues) + val tokenMap = cleaned.mapValues { v => v.toText }.mapValues(tokenize(_).tokens) - keys.map(k => if (cleaned.contains(k)) 0.0 else 1.0) + // Need to check if key is present, and also that our tokenizer will not remove the value + keys.map(k => if (cleaned.contains(k) && tokenMap(k).nonEmpty) 0.0 else 1.0) }.toOPVector } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala index fbe114dcba..c7c5147934 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala @@ -34,12 +34,13 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer -import com.salesforce.op.utils.text._ +import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult +import com.salesforce.op.utils.text.{Language, _} import org.apache.spark.ml.param._ import scala.reflect.runtime.universe.TypeTag -trait TextTokenizerParams extends Params { +trait LanguageDetectionParams extends Params { /** * Indicates whether to attempt language detection. @@ -55,6 +56,7 @@ trait TextTokenizerParams extends Params { final val autoDetectThreshold = new DoubleParam(this, "autoDetectThreshold", "language detection threshold", ParamValidators.inRange(0.0, 1.0, true, true)) + def setAutoDetectThreshold(value: Double): this.type = set(autoDetectThreshold, value) def getAutoDetectThreshold: Double = $(autoDetectThreshold) @@ -65,6 +67,11 @@ trait TextTokenizerParams extends Params { def setDefaultLanguage(value: Language): this.type = set(defaultLanguage, value.entryName) def getDefaultLanguage: Language = Language.withName($(defaultLanguage)) +} + + +trait TextTokenizerParams extends LanguageDetectionParams { + /** * Minimum token length, >= 1. 
*/ @@ -82,18 +89,18 @@ trait TextTokenizerParams extends Params { def getToLowercase: Boolean = $(toLowercase) setDefault( + minTokenLength -> TextTokenizer.MinTokenLength, + toLowercase -> TextTokenizer.ToLowercase, autoDetectLanguage -> TextTokenizer.AutoDetectLanguage, autoDetectThreshold -> TextTokenizer.AutoDetectThreshold, - defaultLanguage -> TextTokenizer.DefaultLanguage.entryName, - minTokenLength -> TextTokenizer.MinTokenLength, - toLowercase -> TextTokenizer.ToLowercase + defaultLanguage -> TextTokenizer.DefaultLanguage.entryName ) def tokenize( text: Text, languageDetector: LanguageDetector = TextTokenizer.LanguageDetector, analyzer: TextAnalyzer = TextTokenizer.Analyzer - ): (Language, TextList) = TextTokenizer.tokenize( + ): TextTokenizerResult = TextTokenizer.tokenize( text = text, languageDetector = languageDetector, analyzer = analyzer, @@ -120,7 +127,7 @@ class TextTokenizer[T <: Text] uid: String = UID[TextTokenizer[_]] )(implicit tti: TypeTag[T]) extends UnaryTransformer[T, TextList](operationName = "textToken", uid = uid) with TextTokenizerParams { - def transformFn: T => TextList = text => tokenize(text, languageDetector, analyzer)._2 + def transformFn: T => TextList = text => tokenize(text, languageDetector, analyzer).tokens } object TextTokenizer { @@ -135,41 +142,64 @@ object TextTokenizer { val StripHtml = false /** - * Language wise text tokenization + * Language wise sentence tokenization * * @param text text to tokenize * @param languageDetector language detector instance * @param analyzer text analyzer instance + * @param sentenceSplitter sentence splitter instance * @param autoDetectLanguage whether to attempt language detection * @param defaultLanguage default language * @param autoDetectThreshold language detection threshold * @param toLowercase whether to convert all characters to lowercase before tokenizing * @param minTokenLength minimum token length - * @return detected language and tokens + * @return detected language and sentence tokens */ def tokenize( text: Text, languageDetector: LanguageDetector = LanguageDetector, analyzer: TextAnalyzer = Analyzer, + sentenceSplitter: Option[SentenceSplitter] = None, autoDetectLanguage: Boolean = AutoDetectLanguage, defaultLanguage: Language = DefaultLanguage, autoDetectThreshold: Double = AutoDetectThreshold, toLowercase: Boolean = ToLowercase, minTokenLength: Int = MinTokenLength - ): (Language, TextList) = text match { - case SomeValue(Some(txt)) => - val language = - if (!autoDetectLanguage) defaultLanguage - else { - languageDetector - .detectLanguages(txt) - .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang } - .getOrElse(defaultLanguage) - } - val lowerTxt = if (toLowercase) txt.toLowerCase else txt - val tokens = analyzer.analyze(lowerTxt, language) - language -> tokens.filter(_.length >= minTokenLength).toTextList - case _ => - defaultLanguage -> TextList.empty + ): TextTokenizerResult = { + text match { + case SomeValue(Some(txt)) => + val language = + if (!autoDetectLanguage) defaultLanguage + else { + languageDetector + .detectLanguages(txt) + .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang } + .getOrElse(defaultLanguage) + } + val lowerTxt = if (toLowercase) txt.toLowerCase else txt + + val sentences = sentenceSplitter.map(_.getSentences(lowerTxt, language)) + .getOrElse(Seq(lowerTxt)) + .map { sentence => + val tokens = analyzer.analyze(sentence, language) + tokens.filter(_.length >= minTokenLength).toTextList + } + 
TextTokenizerResult(language, sentences) + case _ => + TextTokenizerResult(defaultLanguage, Seq(TextList.empty)) + } + } + + /** + * Text tokenization result + * + * @param language detected language + * @param sentences sentence tokens + */ + case class TextTokenizerResult(language: Language, sentences: Seq[TextList]) { + /** + * All sentence tokens flattened together + */ + def tokens: TextList = sentences.flatMap(_.value).toTextList } }
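Since tokenize now returns a TextTokenizerResult rather than the old (Language, TextList) pair, callers recover the flattened tokens via .tokens. A small sketch of the new shape (illustrative only, not part of the patch; the input string is invented):

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.feature.TextTokenizer
import com.salesforce.op.utils.text.OpenNLPSentenceSplitter

val result = TextTokenizer.tokenize(
  text = "Hello world. Goodbye world.".toText,
  sentenceSplitter = Some(new OpenNLPSentenceSplitter())
)
val language = result.language     // detected (or default) language
val bySentence = result.sentences  // Seq[TextList]: tokens grouped per sentence
val flat = result.tokens           // TextList: all sentence tokens flattened

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala index 08628b9d10..cab57f5174 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala @@ -64,6 +64,7 @@ private[op] trait TransmogrifierDefaults { val HashWithIndex: Boolean = false val PrependFeatureName: Boolean = true val ForceSharedHashSpace: Boolean = true + val HashSpaceStrategy: HashSpaceStrategy = com.salesforce.op.stages.impl.feature.HashSpaceStrategy.Auto val CleanText: Boolean = true val CleanKeys: Boolean = false val HashAlgorithm: HashAlgorithm = com.salesforce.op.stages.impl.feature.HashAlgorithm.MurMur3 @@ -75,6 +76,7 @@ private[op] trait TransmogrifierDefaults { val MinDocFrequency: Int = 0 // Default is to fill missing Geolocations with the mean, but if fillWithConstant is chosen, use this val DefaultGeolocation: Geolocation = Geolocation(0.0, 0.0, GeolocationAccuracy.Unknown) + val MinInfoGain: Double = DecisionTreeNumericBucketizer.MinInfoGain } private[op] object TransmogrifierDefaults extends TransmogrifierDefaults @@ -86,10 +88,12 @@ private[op] case object Transmogrifier { * * @param features input features * @param defaults transmogrifier defaults (allows params injection) + * @param label optional label feature to be passed into stages that require the label column * @return vectorized features grouped by type */ def transmogrify( - features: Seq[FeatureLike[_]] + features: Seq[FeatureLike[_]], + label: Option[FeatureLike[RealNN]] = None )(implicit defaults: TransmogrifierDefaults): Iterable[FeatureLike[OPVector]] = { import defaults._ def castSeqAs[U <: FeatureType](f: Seq[FeatureLike[_]]) = f.map(_.asInstanceOf[FeatureLike[U]]) @@ -141,7 +145,7 @@ private[op] case object Transmogrifier { case t if t =:= weakTypeOf[CurrencyMap] => val (f, other) = castAs[CurrencyMap](g) f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other, - trackNulls = TrackNulls) + trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label) case t if t =:= weakTypeOf[DateMap] => val (f, other) = castAs[DateMap](g) // TODO make better default f.vectorize(defaultValue = FillValue, cleanKeys = CleanKeys, others = other, trackNulls = TrackNulls) @@ -159,7 +163,7 @@ private[op] case object Transmogrifier { case t if t =:= weakTypeOf[IntegralMap] => val (f, other) = castAs[IntegralMap](g) f.vectorize(defaultValue = FillValue, fillWithMode = FillWithMode, cleanKeys = CleanKeys, others = other, - trackNulls = TrackNulls) + trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label) case t if t =:= weakTypeOf[MultiPickListMap] => val (f, other) = castAs[MultiPickListMap](g) f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, cleanKeys = CleanKeys, @@ -167,7 +171,7 @@ private[op] case object Transmogrifier { case t if t =:=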
weakTypeOf[PercentMap] => val (f, other) = castAs[PercentMap](g) f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other, - trackNulls = TrackNulls) + trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label) case t if t =:= weakTypeOf[PhoneMap] => val (f, other) = castAs[PhoneMap](g) // TODO make better default f.vectorize(defaultRegion = PhoneNumberParser.DefaultRegion, others = other, trackNulls = TrackNulls) @@ -178,7 +182,7 @@ private[op] case object Transmogrifier { case t if t =:= weakTypeOf[RealMap] => val (f, other) = castAs[RealMap](g) f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other, - trackNulls = TrackNulls) + trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label) case t if t =:= weakTypeOf[TextAreaMap] => val (f, other) = castAs[TextAreaMap](g) // Explicitly set cleanText to false here in order to match behavior of Text vectorization @@ -223,7 +227,8 @@ private[op] case object Transmogrifier { f.vectorize(fillValue = BinaryFillValue, trackNulls = TrackNulls, others = other) case t if t =:= weakTypeOf[Currency] => val (f, other) = castAs[Currency](g) - f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other) + f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, + trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label) case t if t =:= weakTypeOf[Date] => val (f, other) = castAs[Date](g) f.vectorize(dateListPivot = DateListDefault, referenceDate = ReferenceDate, others = other) @@ -232,13 +237,16 @@ private[op] case object Transmogrifier { f.vectorize(dateListPivot = DateListDefault, referenceDate = ReferenceDate, others = other) case t if t =:= weakTypeOf[Integral] => val (f, other) = castAs[Integral](g) - f.vectorize(fillValue = FillValue, fillWithMode = FillWithMode, trackNulls = TrackNulls, others = other) + f.vectorize(fillValue = FillValue, fillWithMode = FillWithMode, trackNulls = TrackNulls, + trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label) case t if t =:= weakTypeOf[Percent] => val (f, other) = castAs[Percent](g) - f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other) + f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, + trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label) case t if t =:= weakTypeOf[Real] => val (f, other) = castAs[Real](g) - f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other) + f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, + trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label) case t if t =:= weakTypeOf[RealNN] => val (f, other) = castAs[RealNN](g) f.vectorize(other) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala b/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala index a0f9642372..24aaf78150 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala @@ -34,12 +34,13 @@ package com.salesforce.op.stages.impl.insights import com.salesforce.op.UID import 
com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer -import org.apache.spark.ml.Transformer -import org.apache.spark.ml.SparkModelConverter._ -import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._ import com.salesforce.op.utils.spark.OpVectorMetadata -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.Model +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.IntParam + import scala.collection.mutable.PriorityQueue /** @@ -48,7 +49,8 @@ import scala.collection.mutable.PriorityQueue * @param model model instance that you wish to explain * @param uid uid for instance */ -class RecordInsightsLOCO[T <: SparkWrapperParams[_]] +@Experimental +class RecordInsightsLOCO[T <: Model[T]] ( val model: T, uid: String = UID[RecordInsightsLOCO[_]] @@ -62,7 +64,7 @@ class RecordInsightsLOCO[T <: SparkWrapperParams[_]] def getTopK: Int = $(topK) setDefault(topK -> 20) - private val modelApply = toOP(model.getSparkMlStage().map(_.asInstanceOf[Transformer])).transformFn + private val modelApply = toOPUnchecked(model).transformFn private val labelDummy = RealNN(0.0) private lazy val featureInfo = OpVectorMetadata(getInputSchema()(in1.name)).getColumnHistory().map(_.toJson(false)) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala index 9ceafd60fd..5e0a23cff8 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala @@ -162,6 +162,14 @@ trait SanityCheckerParams extends Params { def setRemoveFeatureGroup(value: Boolean): this.type = set(removeFeatureGroup, value) def getRemoveFeatureGroup: Boolean = $(removeFeatureGroup) + final val protectTextSharedHash = new BooleanParam( + parent = this, name = "protectTextSharedHash", + doc = "If true, an individual hash is dropped/kept independently of related null indicators and" + + " other hashes in the same shared hash space." + ) + def setProtectTextSharedHash(value: Boolean): this.type = set(protectTextSharedHash, value) + def getProtectTextSharedHash: Boolean = $(protectTextSharedHash) + final val maxRuleConfidence = new DoubleParam( parent = this, name = "maxRuleConfidence", doc = "Maximum allowed confidence of association rules in categorical variables. 
A categorical variable will be " + @@ -193,6 +201,7 @@ trait SanityCheckerParams extends Params { maxCramersV -> SanityChecker.MaxCramersV, removeBadFeatures -> SanityChecker.RemoveBadFeatures, removeFeatureGroup -> SanityChecker.RemoveFeatureGroup, + protectTextSharedHash -> SanityChecker.ProtectTextSharedHash, correlationType -> SanityChecker.CorrelationType, maxRuleConfidence -> SanityChecker.MaxRuleConfidence, minRequiredRuleSupport -> SanityChecker.MinRequiredRuleSupport @@ -240,10 +249,10 @@ class SanityChecker(uid: String = UID[SanityChecker]) indicatorGroup <- col.indicatorGroup } yield (indicatorGroup, (col, col.index)) - nullGroups.groupBy(_._1).foreach { - case (group, cols) => - require(cols.length == 1, s"Vector column $group has multiple null indicator fields: $cols") - } + nullGroups.groupBy(_._1).foreach { + case (group, cols) => + require(cols.length == 1, s"Vector column $group has multiple null indicator fields: $cols") + } def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map{ case(k, v) => // Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation @@ -332,6 +341,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) val maxRuleConf = $(maxRuleConfidence) val minReqRuleSupport = $(minRequiredRuleSupport) val removeFromParent = $(removeFeatureGroup) + val textSharedHashProtected = $(protectTextSharedHash) // Calculate groups to remove separately. This is for more complicated checks where you can't determine whether // to remove a feature from a single column stats (eg. associate rule confidence/support check) @@ -357,6 +367,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) maxRuleConfidence = maxRuleConf, minRequiredRuleSupport = minReqRuleSupport, removeFeatureGroup = removeFromParent, + protectTextSharedHash = textSharedHashProtected, removedGroups = ruleConfGroupsToDrop ) if reasons.nonEmpty @@ -374,9 +385,8 @@ class SanityChecker(uid: String = UID[SanityChecker]) ): Array[CategoricalGroupStats] = { // Figure out which columns correspond to MultiPickList values so that we can make the "OTHER" columns at most 1 so // that we can still use contingency matrices to calculate Cramer's V values - val multiPickList = FeatureType.shortTypeName[MultiPickList] val multiPickListIndices = columnMeta.zipWithIndex.collect { - case (col, index) if col.hasParentOfType(multiPickList) => index + case (col, index) if col.hasParentOfSubType[MultiPickList] => index }.toSet // Group by label and then add in a 1.0 so we can get the total occurrences for each label in one reduction @@ -410,7 +420,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) .groupBy(_._1) // Keep track of the group, column name, column index, and whether the parent was a MultiPickList or not .map { case (group, cols) => (group, cols.map(_._2.makeColName()), cols.map(_._2.index), - cols.exists(_._2.hasParentOfType(multiPickList))) + cols.exists(_._2.hasParentOfSubType[MultiPickList])) } colIndicesByIndicatorGroup.map { @@ -624,6 +634,7 @@ object SanityChecker { val MaxCramersV = 0.95 val RemoveBadFeatures = false val RemoveFeatureGroup = true + val ProtectTextSharedHash = false val CorrelationType = Pearson // These settings will make the maxRuleConfidence check off by default val MaxRuleConfidence = 1.0 @@ -667,6 +678,8 @@ private[op] case class ColumnStatistics * @param maxCramersV Maximum Cramer's V value * @param maxRuleConfidence Minimum association rule confidence between * @param minRequiredRuleSupport Minimum required support to throw 
away a group + * @param removeFeatureGroup Whether to remove entire feature group when any group value is flagged for removal + * @param protectTextSharedHash Whether to protect text shared hash from related null indicator and other hashes * @param removedGroups Pre-determined feature groups to remove (eg. via maxRuleConfidence) * @return List[String] if reason to remove, nil otherwise */ @@ -678,6 +691,7 @@ private[op] case class ColumnStatistics maxRuleConfidence: Double, minRequiredRuleSupport: Double, removeFeatureGroup: Boolean, + protectTextSharedHash: Boolean, removedGroups: Seq[String] ): List[String] = { if (isLabel) List() // never remove the label! @@ -707,18 +721,31 @@ private[op] case class ColumnStatistics ).flatten val parentExclusionReasons = - if (removeFeatureGroup) List( - parentCramersV.filter(_ > maxCramersV).map(cv => - s"Cramer's V $cv for something in parent feature set higher than max Cramer's V $maxCramersV"), - parentCorr.filter(_ > maxCorrelation).map(corr => - s"correlation $corr for something in parent feature set higher than max correlation $maxCorrelation") - ).flatten - else List.empty[String] + if (removeFeatureGroup && (!column.forall(isTextSharedHash) || !protectTextSharedHash)) { + List( + parentCramersV.filter(_ > maxCramersV).map(cv => + s"Cramer's V $cv for something in parent feature set higher than max Cramer's V $maxCramersV"), + parentCorr.filter(_ > maxCorrelation).map(corr => + s"correlation $corr for something in parent feature set higher than max correlation $maxCorrelation") + ).flatten + } else List.empty[String] exclusionReasons ++ parentExclusionReasons } } + /** + * Is the column a shared hash feature derived from Text, TextArea, TextMap, or TextAreaMap + * + * @param metadata metadata of column + * @return true if the column is a text-derived shared hash feature (no indicator group or value) + */ + def isTextSharedHash(metadata: OpVectorColumnMetadata): Boolean = { + val isDerivedFromText = metadata.hasParentOfType[Text] || metadata.hasParentOfType[TextArea] || + metadata.hasParentOfType[TextMap] || metadata.hasParentOfType[TextAreaMap] + isDerivedFromText && metadata.indicatorGroup.isEmpty && metadata.indicatorValue.isEmpty + } + override def toString: String = { val description = if (isLabel) "Label" else s"Feature" s"$description $name has: " + @@ -752,3 +779,4 @@ object CorrelationType extends Enum[CorrelationType] { */ case object Spearman extends CorrelationType("spearman") } +
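A sketch of how the new flag is expected to be used (hypothetical, not part of the patch; label and featureVector are stand-ins declared with ??? for illustration):

import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types.{OPVector, RealNN}
import com.salesforce.op.stages.impl.preparators.SanityChecker

val label: FeatureLike[RealNN] = ???           // assumed existing response feature
val featureVector: FeatureLike[OPVector] = ??? // assumed existing vectorized features

// keep/drop each text hash column on its own statistics instead of
// removing the whole shared hash space as one feature group
val checked = new SanityChecker()
  .setProtectTextSharedHash(true)
  .setInput(label, featureVector)
  .getOutput()

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala new file mode 100644 index 0000000000..4ce76dd324 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3.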
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Decision Tree Regressor [[org.apache.spark.ml.regression.DecisionTreeRegressor]] + * @param uid stage uid + */ +class OpDecisionTreeRegressor(uid: String = UID[OpDecisionTreeRegressor]) + extends OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]( + predictor = new DecisionTreeRegressor(), + uid = uid + ) with OpDecisionTreeRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + /** @group setParam */ + def setVarianceCol(value: String): this.type = set(varianceCol, value) + +} + +/** + * Class that takes in a spark DecisionTreeRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpDecisionTreeRegressionModel +( + sparkModel: DecisionTreeRegressionModel, + uid: String = UID[OpDecisionTreeRegressionModel], + operationName: String = classOf[DecisionTreeRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[DecisionTreeRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala new file mode 100644 index 0000000000..8083b67250 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, OpGBTRegressorParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark GBT Regressor [[org.apache.spark.ml.regression.GBTRegressor]] + * @param uid stage uid + */ +class OpGBTRegressor(uid: String = UID[OpGBTRegressor]) + extends OpPredictorWrapper[GBTRegressor, GBTRegressionModel]( + predictor = new GBTRegressor(), + uid = uid + ) with OpGBTRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeRegressorParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. + * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** + * The impurity setting is ignored for GBT models. + * Individual trees are built using impurity "Variance." 
+ * + * @group setParam + */ + override def setImpurity(value: String): this.type = { + logWarning("GBTRegressor.setImpurity should NOT be used") + this + } + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from GBTParams: + + /** @group setParam */ + override def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ + override def setStepSize(value: Double): this.type = set(stepSize, value) + + // Parameters from GBTRegressorParams: + + /** @group setParam */ + def setLossType(value: String): this.type = set(lossType, value) +} + + +/** + * Class that takes in a spark GBTRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGBTRegressionModel +( + sparkModel: GBTRegressionModel, + uid: String = UID[OpGBTRegressionModel], + operationName: String = classOf[GBTRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[GBTRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala new file mode 100644 index 0000000000..5f13d5bd45 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{GeneralizedLinearRegression, GeneralizedLinearRegressionModel, OpGeneralizedLinearRegressionParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Generalized Linear Regression [[org.apache.spark.ml.regression.GeneralizedLinearRegression]] + * @param uid stage uid + */ +class OpGeneralizedLinearRegression(uid: String = UID[OpGeneralizedLinearRegression]) + extends OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]( + predictor = new GeneralizedLinearRegression(), + uid = uid + ) with OpGeneralizedLinearRegressionParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Sets the value of param [[family]]. + * Default is "gaussian". + * + * @group setParam + */ + def setFamily(value: String): this.type = set(family, value) + setDefault(family -> "gaussian") + + /** + * Sets the value of param [[variancePower]]. + * Used only when family is "tweedie". + * Default is 0.0, which corresponds to the "gaussian" family. + * + * @group setParam + */ + def setVariancePower(value: Double): this.type = set(variancePower, value) + setDefault(variancePower -> 0.0) + + /** + * Sets the value of param [[linkPower]]. + * Used only when family is "tweedie". + * + * @group setParam + */ + def setLinkPower(value: Double): this.type = set(linkPower, value) + + /** + * Sets the value of param [[link]]. + * Used only when family is not "tweedie". + * + * @group setParam + */ + def setLink(value: String): this.type = set(link, value) + + /** + * Sets if we should fit the intercept. + * Default is true. + * + * @group setParam + */ + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + + /** + * Sets the maximum number of iterations (applicable for solver "irls"). + * Default is 25. + * + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 25) + + /** + * Sets the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. + * + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) + + /** + * Sets the regularization parameter for L2 regularization. + * The regularization term is + *
+ * $$ + * 0.5 * regParam * L2norm(coefficients)^2 + * $$ + *
+ * Default is 0.0. + * + * @group setParam + */ + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) + + /** + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * In the Binomial family, weights correspond to number of trials and should be integer. + * Non-integer weights are rounded to integer in AIC calculation. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + + /** + * Sets the solver algorithm used for optimization. + * Currently only supports "irls" which is also the default solver. + * + * @group setParam + */ + def setSolver(value: String): this.type = set(solver, value) + setDefault(solver -> "irls") + + /** + * Sets the link prediction (linear predictor) column name. + * + * @group setParam + */ + def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) + +} + + + +/** + * Class that takes in a spark GeneralizedLinearRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGeneralizedLinearRegressionModel +( + sparkModel: GeneralizedLinearRegressionModel, + uid: String = UID[GeneralizedLinearRegressionModel], + operationName: String = classOf[GeneralizedLinearRegression].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictorWrapperModel[GeneralizedLinearRegressionModel](uid = uid, operationName = operationName, + sparkModel = sparkModel) { + + @transient lazy private val predictLink = reflectMethod(getSparkMlStage().get, "predictLink") + @transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict") + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictLink.apply(features.value).asInstanceOf[Double] + val pred = predict.apply(features.value).asInstanceOf[Double] + Prediction(prediction = pred, rawPrediction = raw) + } +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala index e05bd15075..23fc7d888c 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala @@ -32,19 +32,23 @@ package com.salesforce.op.stages.impl.regression import com.salesforce.op._ -import com.salesforce.op.features.types._ +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpPredictorWrapper -import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel, OpLinearRegressionParams} + +import scala.reflect.runtime.universe.TypeTag + /** - * Wrapper around spark ml linear regression for use 
with OP pipelines + * Wrapper around spark ml linear regression [[org.apache.spark.ml.regression.LinearRegression]] */ class OpLinearRegression(uid: String = UID[OpLinearRegression]) - extends OpPredictorWrapper[RealNN, RealNN, LinearRegression, LinearRegressionModel]( + extends OpPredictorWrapper[LinearRegression, LinearRegressionModel]( predictor = new LinearRegression(), uid = uid -){ +) with OpLinearRegressionParams { override protected def onSetInput(): Unit = { super.onSetInput() @@ -57,161 +61,127 @@ class OpLinearRegression(uid: String = UID[OpLinearRegression]) * * @group setParam */ - def setRegParam(value: Double): this.type = { - getSparkStage.setRegParam(value) - this - } + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) /** - * Set the ElasticNet mixing parameter. - * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. - * Default is 0.0 which is an L2 penalty. + * Set if we should fit the intercept. + * Default is true. * * @group setParam */ - def setElasticNetParam(value: Double): this.type = { - getSparkStage.setElasticNetParam(value) - this - } + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) /** - * Set the maximum number of iterations. - * Default is 100. + * Whether to standardize the training features before fitting the model. + * The coefficients of models will be always returned on the original scale, + * so it will be transparent for users. + * Default is true. + * + * @note With/without standardization, the models should be always converged + * to the same solution when no regularization is applied. In R's GLMNET package, + * the default behavior is true as well. * * @group setParam */ - def setMaxIter(value: Int): this.type = { - getSparkStage.setMaxIter(value) - this - } + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) /** - * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-6. + * Set the ElasticNet mixing parameter. + * For alpha = 0, the penalty is an L2 penalty. + * For alpha = 1, it is an L1 penalty. + * For alpha in (0,1), the penalty is a combination of L1 and L2. + * Default is 0.0 which is an L2 penalty. * * @group setParam */ - def setTol(value: Double): this.type = { - getSparkStage.setTol(value) - this - } + def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) + setDefault(elasticNetParam -> 0.0) /** - * Whether to fit an intercept term. - * Default is true. + * Set the maximum number of iterations. + * Default is 100. * * @group setParam */ - def setFitIntercept(value: Boolean): this.type = { - getSparkStage.setFitIntercept(value) - this - } + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) /** - * Whether to standardize the training features before fitting the model. - * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that with/without standardization, - * the models should be always converged to the same solution when no regularization - * is applied. In R's GLMNET package, the default behavior is true as well. - * Default is true. + * Set the convergence tolerance of iterations. 
+ * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. * * @group setParam */ - def setStandardization(value: Boolean): this.type = { - getSparkStage.setStandardization(value) - this - } + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) /** * Whether to over-/under-sample training instances according to the given weights in weightCol. - * If not set or empty String, all instances are treated equally (weight 1.0). + * If not set or empty, all instances are treated equally (weight 1.0). * Default is not set, so all instances have weight one. * * @group setParam */ - def setWeightCol(value: String): this.type = { - getSparkStage.setWeightCol(value) - this - } + def setWeightCol(value: String): this.type = set(weightCol, value) /** - * Set the solver algorithm used for optimization. In case of linear regression, this can be "l-bfgs", "normal" and - * "auto". - * "l-bfgs": Limited-memory BFGS which is a limited-memory quasi-Newton optimization method. - * "normal": Normal Equation as an analytical solution to the linear regression problem. - * "auto" (default): solver algorithm is selected automatically. The Normal Equations solver will be used when - * possible, but this will automatically fall back to iterative optimization methods when needed. + * Set the solver algorithm used for optimization. + * In case of linear regression, this can be "l-bfgs", "normal" and "auto". + * - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton + * optimization method. + * - "normal" denotes using Normal Equation as an analytical solution to the linear regression + * problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`. + * - "auto" (default) means that the solver algorithm is selected automatically. + * The Normal Equations solver will be used when possible, but this will automatically fall + * back to iterative optimization methods when needed. * * @group setParam */ def setSolver(value: String): this.type = { - getSparkStage.setSolver(value) - this + require(Set("auto", "l-bfgs", "normal").contains(value), + s"Solver $value was not supported. Supported options: auto, l-bfgs, normal") + set(solver, value) } + setDefault(solver -> "auto") /** - * Get the regularization parameter. + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. * + * @group expertSetParam */ - def getRegParam: Double = { - getSparkStage.getRegParam - } + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) - /** - * Get the ElasticNet mixing parameter. - * - */ - def getElasticNetParam: Double = { - getSparkStage.getElasticNetParam - } - - /** - * Get the maximum number of iterations. - * - */ - def getMaxIter: Int = { - getSparkStage.getMaxIter - } - - /** - * Get the convergence tolerance of iterations. 
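
A quick sketch of how the require guard on the solver param behaves at configuration time (values are illustrative):

    val lr = new OpLinearRegression()
    lr.setSolver("normal")   // accepted: analytic Normal Equations solver
    lr.setSolver("l-bfgs")   // accepted: iterative quasi-Newton optimization
    // lr.setSolver("sgd")   // rejected: require(...) throws IllegalArgumentException
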
- * - */ - def getTol: Double = { - getSparkStage.getTol - } - - /** - * Get the fit intercept boolean parameter - * - */ - def getFitIntercept: Boolean = { - getSparkStage.getFitIntercept - } +} - /** - * Get the standardization boolean parameter - * - */ - def getStandardization: Boolean = { - getSparkStage.getStandardization - } - /** - * Get the weights in weightCol defining whether to over-/under-sample training instances - * - */ - def getWeightCol: String = { - getSparkStage.getWeightCol - } - - /** - * Get the solver algorithm used for optimization - * - */ - def getSolver: String = { - getSparkStage.getSolver - } +/** + * Class that takes in a spark LinearRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLinearRegressionModel +( + sparkModel: LinearRegressionModel, + uid: String = UID[OpLinearRegressionModel], + operationName: String = classOf[LinearRegression].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[LinearRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") } + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala new file mode 100644 index 0000000000..7f92aaa478 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
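
Conceptually, the predictMirror in the model wrapper above is a cached reflective handle to Spark's protected predict(features: Vector): Double; a minimal sketch of the per-row call it enables (the actual transformFn wiring lives in OpPredictionModel):

    // Look up the protected Spark method once, then invoke it per row.
    val raw = predictMirror.apply(features.value)
    Prediction(prediction = raw.asInstanceOf[Double])
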
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{OpRandomForestRegressorParams, RandomForestRegressionModel, RandomForestRegressor} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper around spark Random Forest Regressor [[org.apache.spark.ml.regression.RandomForestRegressor]] + * @param uid stage uid + */ +class OpRandomForestRegressor(uid: String = UID[OpRandomForestRegressor]) + extends OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]( + predictor = new RandomForestRegressor(), + uid = uid + ) with OpRandomForestRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeRegressorParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
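
As a usage sketch of this checkpointing knob, assuming an in-scope SparkSession named spark and an illustrative checkpoint path:

    spark.sparkContext.setCheckpointDir("/tmp/op-checkpoints")  // hypothetical path
    val rf = new OpRandomForestRegressor()
      .setCacheNodeIds(true)       // required for checkpointInterval to take effect
      .setCheckpointInterval(10)   // checkpoint cached node IDs every 10 iterations
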
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from RandomForestParams: + + /** @group setParam */ + override def setNumTrees(value: Int): this.type = set(numTrees, value) + + /** @group setParam */ + override def setFeatureSubsetStrategy(value: String): this.type = + set(featureSubsetStrategy, value) + +} + +/** + * Class that takes in a spark RandomForestRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpRandomForestRegressionModel +( + sparkModel: RandomForestRegressionModel, + uid: String = UID[OpRandomForestRegressionModel], + operationName: String = classOf[RandomForestRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[RandomForestRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala b/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala index 5344f769ad..94a6899954 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala @@ -31,6 +31,7 @@ package com.salesforce.op.stages.impl.selector +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.salesforce.op.UID import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.evaluators.{EvaluationMetrics, _} @@ -39,15 +40,14 @@ import com.salesforce.op.features.types._ import com.salesforce.op.readers.DataFrameFieldNames import com.salesforce.op.stages._ import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import com.salesforce.op.utils.spark.RichMetadata._ import org.apache.spark.ml.param._ import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.MetadataBuilder -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types.{MetadataBuilder, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import scala.reflect.runtime.universe._ import scala.util.Try @@ -191,6 +191,34 @@ private[op] abstract class ModelSelectorBase[M <: Model[_], E <: Estimator[_]] */ protected def getModelInfo: Seq[ModelInfo[E]] + /** + * Get the list of all the models and their parameters for comparison + * @return value + */ + protected[op] def getUsedModels: Seq[ModelInfo[E]] = getModelInfo.filter(m => $(m.useModel)) + + /** + * Find best estimator with validation on a workflow 
level. Executed when workflow level Cross Validation is on + * (see [[com.salesforce.op.OpWorkflow.withWorkflowCV]]) + * + * @param data data to validate + * @param dag DAG executed inside the Cross-validation/Train-validation split + * @param persistEveryKStages frequency of persisting the DAG's stages + * @param spark Spark Session + * @return Unit: sets the best estimator, along with its best paramMap, on this Model Selector + */ + protected[op] def findBestEstimator(data: Dataset[_], dag: StagesDAG, persistEveryKStages: Int = 0) + (implicit spark: SparkSession): Unit = { + + val theBestEstimator = validator.validate(modelInfo = getUsedModels, dataset = data, + label = in1.name, features = in2.name, dag = Option(dag), splitter = splitter, + stratifyCondition = validator.isClassification + ) + + bestEstimator = Option(theBestEstimator) + } + + + // Map (name of param, value of param) of output column names def outputsColNamesMap: Map[String, String] = { val defaultNames = getOutputsColNamesMap(in1, in2) @@ -216,16 +244,15 @@ private[op] abstract class ModelSelectorBase[M <: Model[_], E <: Estimator[_]] */ final override def fit(dataset: Dataset[_]): SelectedModel = { - import dataset.sparkSession.implicits._ + implicit val spark = dataset.sparkSession + import spark.implicits._ val datasetWithID = if (dataset.columns.contains(DataFrameFieldNames.KeyFieldName)) { dataset.select(in1.name, in2.name, DataFrameFieldNames.KeyFieldName) - .as[LabelFeaturesKey].persist() } else { dataset.select(in1.name, in2.name) .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) - .as[LabelFeaturesKey].persist() } require(!datasetWithID.isEmpty, "Dataset cannot be empty") @@ -234,13 +261,19 @@ private[op] abstract class ModelSelectorBase[M <: Model[_], E <: Estimator[_]] case None => new ModelData(datasetWithID, new MetadataBuilder()) } - - val bestModel = bestEstimator.map { case BestEstimator(name, estimator, meta) => - new BestModel(name = name, model = estimator.fit(trainData).asInstanceOf[M], metadata = Option(meta)) - }.getOrElse { + val BestEstimator(name, estimator, meta) = bestEstimator.getOrElse{ setInputSchema(dataset.schema).transformSchema(dataset.schema) - validator.validate(getModelInfo.filter(m => $(m.useModel)), trainData, in1.name, in2.name) + val best = validator + .validate(modelInfo = getUsedModels, dataset = trainData, label = in1.name, features = in2.name) + bestEstimator = Some(best) + best } + + val bestModel = new BestModel( + name = name, + model = estimator.fit(trainData).asInstanceOf[M], + metadata = Option(meta) + ) bestModel.metadata.foreach(meta => setMetadata(meta.build)) val bestClassifier = bestModel.model.parent log.info(s"Selected model : ${bestClassifier.getClass.getSimpleName}") diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala index db6cf874c8..8a1b68f84a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala @@ -33,9 +33,8 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.MetadataBuilder import 
org.slf4j.LoggerFactory @@ -74,6 +73,7 @@ case object DataBalancer { class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) with DataBalancerParams { @transient private lazy val log = LoggerFactory.getLogger(this.getClass) + @transient private[op] val metadataBuilder = new MetadataBuilder() /** * Computes the upSample and downSample proportions. @@ -120,44 +120,23 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) /** * Split into a training set and a test set and balance the training set * - * @param data to prepare for model training + * @param data to prepare for model training. first column must be the label as a double * @return balanced training set and a test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = { + def prepare(data: Dataset[Row]): ModelData = { val ds = data.persist() - val Array(negativeData, positiveData) = Array(0.0, 1.0).map(label => ds.filter(_._1 == label).persist()) - val metadataBuilder = new MetadataBuilder() + val Array(negativeData, positiveData) = Array(0.0, 1.0).map(label => ds.filter(_.getDouble(0) == label).persist()) val balancerSeed = getSeed - // If these conditions are met, that means that we have enough information to balance the data : upSample, - // downSample and which class is in minority - if (isSet(isPositiveSmall) && isSet(downSampleFraction) && isSet(upSampleFraction)) { - val (down, up) = ($(downSampleFraction), $(upSampleFraction)) - log.info(s"Fractions are already known : downSample of ${down}, upSample of ${up}") - val (smallData, bigData) = if ($(isPositiveSmall)) (positiveData, negativeData) else (negativeData, positiveData) - new ModelData(rebalance(smallData, up, bigData, down, balancerSeed), metadataBuilder) - // If this condition is met, that means that the data is already balanced, but need to be sampled - } else if (isSet(alreadyBalancedFraction)) { - val f = $(alreadyBalancedFraction) - log.info(s"Data is already balanced, yet it will be sampled by a fraction of $f") - new ModelData(sampleBalancedData( - fraction = f, - seed = balancerSeed, - data = data, - positiveData = positiveData, - negativeData = negativeData), - metadataBuilder - ) - // Usual estimation by computing the sizes of the data - } else estimateAndBalance( - data = data, + prepareData( + data = ds, positiveData = positiveData, negativeData = negativeData, - metadataBuilder = metadataBuilder, seed = balancerSeed ) + } override def copy(extra: ParamMap): DataBalancer = { @@ -165,67 +144,55 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) copyValues(copy, extra) } + + /** - * Estimate if data needs to be balanced or not. If so, computes sample fractions and balance data + * Estimate if data needs to be balanced or not. 
If so, computes sample fractions and sets the appropriate params * * @param data input data * @param positiveData data with positives only * @param negativeData data with negatives only - * @param metadataBuilder metadata * @param seed seed * @return balanced data */ - private[op] def estimateAndBalance( - data: Dataset[LabelFeaturesKey], - positiveData: Dataset[LabelFeaturesKey], - negativeData: Dataset[LabelFeaturesKey], - metadataBuilder: MetadataBuilder, + private[op] def estimate[T]( + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T], seed: Long - ): ModelData = { + ): Unit = { val positiveCount = positiveData.count() val negativeCount = negativeData.count() val totalCount = positiveCount + negativeCount + val sampleF = getSampleFraction // feed metadata with counts and sample fraction metadataBuilder.putLong(ModelSelectorBaseNames.Positive, positiveCount) metadataBuilder.putLong(ModelSelectorBaseNames.Negative, negativeCount) - metadataBuilder.putDouble(ModelSelectorBaseNames.Desired, $(sampleFraction)) + metadataBuilder.putDouble(ModelSelectorBaseNames.Desired, sampleF) log.info(s"Data has $positiveCount positive and $negativeCount negative.") - val (smallCount, smallData, bigCount, bigData) = { + val (smallCount, bigCount) = { val isPosSmall = positiveCount < negativeCount setIsPositiveSmall(isPosSmall) - if (isPosSmall) (positiveCount, positiveData, negativeCount, negativeData) - else (negativeCount, negativeData, positiveCount, positiveData) + if (isPosSmall) (positiveCount, negativeCount) + else (negativeCount, positiveCount) } val maxTrainSample = getMaxTrainingSample if (smallCount < 100 || (smallCount + bigCount) < 500) { log.warn("!!!Attention!!! - there is not enough data to build a good model!") } - val sampleF = getSampleFraction - // if the current fraction is superior than the one expected if (smallCount.toDouble / totalCount.toDouble >= sampleF) { log.info( s"Not resampling data: $smallCount small count and $bigCount big count is greater than" + s" requested ${sampleF}" ) - // if data is too big downsample val fraction = if (maxTrainSample < totalCount) maxTrainSample / totalCount.toDouble else 1.0 - setAlreadyBalancedFraction(fraction) - // sample - new ModelData(sampleBalancedData( - fraction = fraction, - seed = seed, - data = data, - positiveData = positiveData, - negativeData = negativeData - ), - metadataBuilder) } else { log.info(s"Sampling data to get $sampleF split versus $smallCount small and $bigCount big") val (downSample, upSample) = getProportions(smallCount, bigCount, sampleF, maxTrainSample) @@ -257,7 +224,42 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) s"To make upsampling happen, please increase the max training sample size '${maxTrainingSample.name}'") } - new ModelData(rebalance(smallData, upSample, bigData, downSample, seed), metadataBuilder) + } + } + /** + * Preparing data + * + * @param data input data + * @param positiveData data with positives only + * @param negativeData data with negatives only + * @param seed seed + * @return balanced data + */ + private[op] def prepareData[T]( + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T], + seed: Long + ): ModelData = { + + if (!(isSet(isPositiveSmall) || isSet(downSampleFraction) || + isSet(upSampleFraction) || isSet(alreadyBalancedFraction))) { + estimate(data = data, positiveData = positiveData, negativeData = negativeData, seed = seed) + } + + // If these conditions are met, that means that we have enough 
information to balance the data: upSample, + // downSample and which class is in minority + if (isSet(isPositiveSmall) && isSet(downSampleFraction) && isSet(upSampleFraction)) { + val (down, up) = ($(downSampleFraction), $(upSampleFraction)) + log.info(s"Sample fractions: downSample of ${down}, upSample of ${up}") + val (smallData, bigData) = if ($(isPositiveSmall)) (positiveData, negativeData) else (negativeData, positiveData) + new ModelData(rebalance(smallData, up, bigData, down, seed).toDF(), metadataBuilder) + } else { // Data is already balanced, but needs to be sampled + val fraction = $(alreadyBalancedFraction) + log.info(s"Data is already balanced, yet it will be sampled by a fraction of $fraction") + val balanced = sampleBalancedData(fraction = fraction, seed = seed, + data = data, positiveData = positiveData, negativeData = negativeData).toDF() + new ModelData(balanced, metadataBuilder) } } @@ -272,13 +274,13 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) * @return balanced small and big data split into training and test sets * with downSample and upSample proportions */ - private[op] def rebalance( - smallData: Dataset[_], + private[op] def rebalance[T]( + smallData: Dataset[T], upSampleFraction: Double, - bigData: Dataset[_], + bigData: Dataset[T], downSampleFraction: Double, seed: Long - ): Dataset[LabelFeaturesKey] = { + ): Dataset[T] = { import smallData.sparkSession.implicits._ val bigDataTrain = bigData.sample(withReplacement = false, downSampleFraction, seed = seed) @@ -288,27 +290,26 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) case u => smallData.sample(withReplacement = false, u, seed = seed) // downsample instead } - smallDataTrain.as[LabelFeaturesKey].union(bigDataTrain.as[LabelFeaturesKey]) - + smallDataTrain.union(bigDataTrain) } /** * Sample already balanced data * - * @param fraction - * @param seed - * @param data - * @param positiveData - * @param negativeData + * @param fraction subsample to take + * @param seed seed to use in sampling + * @param data full dataset in case no sampling is needed + * @param positiveData positive data for stratified sampling + * @param negativeData negative data for stratified sampling * @return */ - private[op] def sampleBalancedData( + private[op] def sampleBalancedData[T]( fraction: Double, seed: Long, - data: Dataset[LabelFeaturesKey], - positiveData: Dataset[LabelFeaturesKey], - negativeData: Dataset[LabelFeaturesKey] - ): Dataset[LabelFeaturesKey] = { + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T] + ): Dataset[T] = { fraction match { case 1.0 => data // we don't sample // stratified sampling diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala index 5762fa0277..6c9fb97b06 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala @@ -33,11 +33,10 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder} -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import 
org.slf4j.LoggerFactory case object DataCutter { @@ -81,15 +80,15 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with * function to use to prepare the dataset for modeling * eg - do data balancing or dropping based on the labels * - * @param data + * @param data first column must be the label as a double * @return Training set test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = { + def prepare(data: Dataset[Row]): ModelData = { import data.sparkSession.implicits._ val keep = if (!isSet(labelsToKeep) || !isSet(labelsToDrop)) { - val labels = data.map(r => r._1 -> 1L) + val labels = data.map(r => r.getDouble(0) -> 1L) val labelCounts = labels.groupBy(labels.columns(0)).sum(labels.columns(1)).persist() val (resKeep, resDrop) = estimate(labelCounts) labelCounts.unpersist() @@ -97,7 +96,7 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with resKeep } else getLabelsToKeep.toSet - val dataUse = data.filter(r => keep.contains(r._1)) + val dataUse = data.filter(r => keep.contains(r.getDouble(0))) val labelsMeta = new MetadataBuilder() .putDoubleArray(ModelSelectorBaseNames.LabelsKept, getLabelsToKeep) @@ -127,7 +126,7 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with val labelSet = labelsKeep.toSet val labelsDropped = labelCounts.filter(r => !labelSet.contains(r.getDouble(0))).collect().map(_.getDouble(0)).toSet - if (labelSet.size > 1) { + if (labelSet.nonEmpty) { log.info(s"DataCutter is keeping labels: $labelSet and dropping labels: $labelsDropped") } else { throw new RuntimeException(s"DataCutter dropped all labels with param settings:" + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala index 6d64b7e858..5666451f7e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala @@ -32,9 +32,8 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.MetadataBuilder case object DataSplitter { @@ -70,7 +69,7 @@ class DataSplitter(uid: String = UID[DataSplitter]) extends Splitter(uid = uid) * @param data * @return Training set test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = + def prepare(data: Dataset[Row]): ModelData = new ModelData(data, new MetadataBuilder()) override def copy(extra: ParamMap): DataSplitter = { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala index 72fae3647f..f062ff0b1d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala @@ -22,22 +22,17 @@ package com.salesforce.op.stages.impl.tuning import com.github.fommil.netlib.BLAS import com.salesforce.op.evaluators.OpEvaluatorBase -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.types.StructType -import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} -import 
com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey -import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.sql.{Dataset, Row} +import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames} +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.{Dataset, Row, SparkSession} -import scala.collection.parallel.mutable.ParArray - -private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] +private[op] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] ( val numFolds: Int = ValidatorParamDefaults.NumFolds, val seed: Long = ValidatorParamDefaults.Seed, @@ -49,11 +44,11 @@ private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] private val blas = BLAS.getInstance() private def findBestModel( - folds: ParArray[(E, Array[Double], Array[ParamMap])] + folds: Seq[ValidatedModel[E]] ): ValidatedModel[E] = { - val metrics = folds.map(_._2).reduce(_ + _) + val metrics = folds.map(_.metrics).reduce(_ + _) blas.dscal(metrics.length, 1.0 / numFolds, metrics, 1) - val (est, _, grid) = folds.head + val ValidatedModel(est, _, _, grid) = folds.head log.info(s"Average cross-validation for $est metrics: {}", metrics.toSeq.mkString(",")) val (bestMetric, bestIndex) = if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) @@ -64,111 +59,106 @@ private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] } // TODO use futures to parallelize https://github.com/apache/spark/commit/16c4c03c71394ab30c8edaf4418973e1a2c5ebfe - private[op] def validate( + private[op] override def validate[T]( modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String - ): BestModel[M] = { - - // get param that stores the label column - val labelCol = evaluator.getParam(ValidatorParamDefaults.labelCol) - evaluator.set(labelCol, label) - - val sparkSession = dataset.sparkSession - import sparkSession.implicits._ - val rdd = dataset.as[LabelFeaturesKey].rdd.persist() + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] = { + dataset.persist() + val schema = dataset.schema // creating k train/validation data - val splits: Array[(RDD[Row], RDD[Row])] = createTrainValidationSplits(rdd) - - - val schema = dataset.schema - val newSchema = StructType(schema.dropRight(1)) // dropping key - - val modelWithGrid = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) - - val fitSummary = splits.zipWithIndex.par.flatMap { - case ((training, validation), splitIndex) => - - log.info(s"Cross Validation $splitIndex with multiple sets of parameters.") - val trainingDataset = sparkSession.createDataFrame(training, newSchema).persist() - val validationDataset = sparkSession.createDataFrame(validation, newSchema).persist() - - val summary = modelWithGrid.map { - case (estimator, paramGrids, name) => - val pi1 = estimator.getParam(StageParamNames.inputParam1Name) - val pi2 = estimator.getParam(StageParamNames.inputParam2Name) - estimator.set(pi1, label).set(pi2, features) - - val numModels = paramGrids.length - val metrics = new Array[Double](paramGrids.length) - - // multi-model 
training - val models = estimator.fit(trainingDataset, paramGrids).asInstanceOf[Seq[M]] - var i = 0 - while (i < numModels) { - val metric = evaluator.evaluate(models(i).transform(validationDataset, paramGrids(i))) - log.debug(s"Got metric $metric for $name trained with ${paramGrids(i)}.") - metrics(i) = metric - i += 1 - } - (estimator, metrics, paramGrids) + val splits: Array[(RDD[Row], RDD[Row])] = createTrainValidationSplits( + stratifyCondition = stratifyCondition, + dataset = dataset, + label = label, + splitter = splitter + ) + + val modelsWithGrids = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) + + // TODO use futures to parallelize https://github.com/apache/spark/commit/16c4c03c71394ab30c8edaf4418973e1a2c5ebfe + val groupedSummary = suppressLoggingForFun() { + splits.zipWithIndex.flatMap { + case ((training, validation), splitIndex) => { + log.info(s"Cross Validation $splitIndex with multiple sets of parameters.") + val trainingDataset = spark.createDataFrame(training, schema) + val validationDataset = spark.createDataFrame(validation, schema) + val (newTrain, newTest) = dag.map(theDAG => + // If there is a CV DAG, then run it + applyDAG( + dag = theDAG, + training = trainingDataset, + validation = validationDataset, + label = label, + features = features, + splitter = splitter + ) + ).getOrElse(trainingDataset, validationDataset) + getSummary(modelsWithGrids = modelsWithGrids, label = label, features = features, + train = newTrain, test = newTest) } - trainingDataset.unpersist() - validationDataset.unpersist() - summary + }.groupBy(_.model).map{ case (_, folds) => findBestModel(folds) }.toArray } - rdd.unpersist() - - val groupedSummary = fitSummary.groupBy(_._1).map { case (_, folds) => findBestModel(folds) }.toArray + dataset.unpersist() - val model = - if (evaluator.isLargerBetter) groupedSummary.maxBy(_.bestMetric) - else groupedSummary.minBy(_.bestMetric) - - val bestModel = model.model.fit(dataset, model.bestGrid).asInstanceOf[M] - wrapBestModel(groupedSummary, bestModel, s"$numFolds folds") + val model = getValidatedModel(groupedSummary) + wrapBestEstimator(groupedSummary, model.model.copy(model.bestGrid).asInstanceOf[E], s"$numFolds folds") } // TODO : Implement our own kFold method for better performance in a separate PR /** * Creates Train Validation Splits For CV - * @param rdd + * + * @param stratifyCondition condition to do stratify cv + * @param dataset dataset to split + * @param label name of label in data + * @param splitter used to estimate splitter params prior to cv * @return Array((TrainRDD, ValidationRDD), Index) */ - private[op] override def createTrainValidationSplits( - rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] = { - - if (stratify && isClassification) { - log.info(s"Creating $numFolds stratified folds") - val classes = rdd.map(_._1).distinct().collect() - // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) - val rddByClass = classes.map(label => rdd.filter(_._1 == label) - .map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _)) - - // Cross Validation's Train/Validation data for each class - val foldsByClass = rddByClass.map { case rdd: RDD[(String, Seq[Row])] => { - MLUtils.kFold(rdd, numFolds, seed) - .map { case (rdd1, rdd2) => (rdd1.values.flatMap(identity), rdd2.values.flatMap(identity)) } - } - }.toSeq - - if (foldsByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") - // Merging Train/Validation data one by one - 
foldsByClass.reduce[Array[(RDD[Row], RDD[Row])]] { - // cv1 and cv2 are arrays of train/validation data - case (cv1: Array[(RDD[Row], RDD[Row])], cv2: Array[(RDD[Row], RDD[Row])]) => - (cv1 zip cv2).map { // zip the two arrays and merge the tuples one by one - case ((train1: RDD[Row], test1: RDD[Row]), (train2: RDD[Row], test2: RDD[Row])) => - (train1.union(train2), test1.union(test2)) - } - } + private[op] override def createTrainValidationSplits[T](stratifyCondition: Boolean, + dataset: Dataset[T], label: String, splitter: Option[Splitter] = None): Array[(RDD[Row], RDD[Row])] = { + + // get param that stores the label column + val labelCol = evaluator.getParam(ValidatorParamDefaults.LabelCol) + evaluator.set(labelCol, label) + + // creating k train/validation data + if (stratifyCondition) { + val rddsByClass = prepareStratification( + dataset = dataset, + message = s"Creating $numFolds stratified folds", + label = label, + splitter = splitter + ) + stratifyKFolds(rddsByClass) } else { - val rddRow = rdd.map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _) + val rddRow = dataset.toDF().rdd MLUtils.kFold(rddRow, numFolds, seed) - .map { case (rdd1, rdd2) => (rdd1.values.flatMap(identity), rdd2.values.flatMap(identity)) } + } + } + + + private def stratifyKFolds(rddsByClass: Array[RDD[Row]]): Array[(RDD[Row], RDD[Row])] = { + // Cross Validation's Train/Validation data for each class + val foldsByClass = rddsByClass.map(rdd => MLUtils.kFold(rdd, numFolds, seed)).toSeq + + if (foldsByClass.isEmpty) { + throw new RuntimeException("Dataset is too small for the number of CV folds selected: some empty folds were created") + } + // Merging Train/Validation data one by one + foldsByClass.reduce[Array[(RDD[Row], RDD[Row])]] { + // cv1 and cv2 are arrays of train/validation data + case (cv1: Array[(RDD[Row], RDD[Row])], cv2: Array[(RDD[Row], RDD[Row])]) => + (cv1 zip cv2).map { // zip the two arrays and merge the tuples one by one + case ((train1: RDD[Row], test1: RDD[Row]), (train2: RDD[Row], test2: RDD[Row])) => + (train1.union(train2), test1.union(test2)) + } + } } } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala index 6481276fdc..7d79fefddc 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala @@ -20,17 +20,16 @@ package com.salesforce.op.stages.impl.tuning -import com.salesforce.op.evaluators.{OpBinaryClassificationEvaluatorBase, OpEvaluatorBase, OpMultiClassificationEvaluatorBase} +import com.salesforce.op.evaluators.OpEvaluatorBase import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey -import org.apache.spark.ml.linalg.Vector +import com.salesforce.op.utils.stages.FitStagesUtil._ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Dataset, Row, SparkSession} -private[impl] class OpTrainValidationSplit[M <: Model[_], E <: Estimator[_]] +private[op] class OpTrainValidationSplit[M <: Model[_], E <: Estimator[_]] ( val trainRatio: Double = ValidatorParamDefaults.TrainRatio, val seed: Long = ValidatorParamDefaults.Seed, @@ -40,104 +39,100 @@ 
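
To make the zip-and-union shape of stratifyKFolds above concrete, here is a tiny self-contained sketch with plain Scala collections standing in for RDDs (data is illustrative):

    // Per class: k (train, validation) pairs, here k = 3.
    val foldsByClass: Seq[Array[(Set[Int], Set[Int])]] = Seq(
      Array((Set(1, 2), Set(3)), (Set(2, 3), Set(1)), (Set(1, 3), Set(2))),         // class 0
      Array((Set(10, 20), Set(30)), (Set(20, 30), Set(10)), (Set(10, 30), Set(20))) // class 1
    )
    // Merge fold i of every class into one stratified fold i.
    val merged = foldsByClass.reduce { (cv1, cv2) =>
      (cv1 zip cv2).map { case ((tr1, va1), (tr2, va2)) => (tr1 ++ tr2, va1 ++ va2) }
    }
    // merged(0) == (Set(1, 2, 10, 20), Set(3, 30)): each fold keeps the class mix.
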
private[impl] class OpTrainValidationSplit[M <: Model[_], E <: Estimator[_]] val validationName: String = ModelSelectorBaseNames.TrainValSplitResults - private[op] def validate( + private[op] override def validate[T]( modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String - ): BestModel[M] = { - // get param that stores the label column - val labelCol = evaluator.getParam(ValidatorParamDefaults.labelCol) - evaluator.set(labelCol, label) + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] = { + dataset.persist() val schema = dataset.schema - import dataset.sparkSession.implicits._ - val rdd = dataset.as[LabelFeaturesKey].rdd.persist() - - val (trainingRDD, validationRDD) = createTrainValidationSplits(rdd).head - val sparkSession = dataset.sparkSession - val newSchema = StructType(schema.dropRight(1)) // dropping key - val trainingDataset = sparkSession.createDataFrame(trainingRDD, newSchema).persist() - val validationDataset = sparkSession.createDataFrame(validationRDD, newSchema).persist() - - // multi-model training - val modelWithGrid = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) - val groupedSummary = modelWithGrid.par.map { - case (estimator, paramGrids, name) => - val pi1 = estimator.getParam(StageParamNames.inputParam1Name) - val pi2 = estimator.getParam(StageParamNames.inputParam2Name) - estimator.set(pi1, label).set(pi2, features) - - val numModels = paramGrids.length - val metrics = new Array[Double](paramGrids.length) - - log.info(s"Train split with multiple sets of parameters.") - val models = estimator.fit(trainingDataset, paramGrids).asInstanceOf[Seq[M]] - var i = 0 - while (i < numModels) { - val metric = evaluator.evaluate(models(i).transform(validationDataset, paramGrids(i))) - log.info(s"Got metric $metric for model $name trained with ${paramGrids(i)}.") - metrics(i) = metric - i += 1 - } - log.info(s"Train validation split for $name metrics: {}", metrics.toSeq.mkString(",")) - val (bestMetric, bestIndex) = - if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) - else metrics.zipWithIndex.minBy(_._1) - log.info(s"Best set of parameters:\n${paramGrids(bestIndex)} for $name") - log.info(s"Best train validation split metric: $bestMetric.") - - ValidatedModel(estimator, bestIndex, metrics, paramGrids) + val (training, validation) = createTrainValidationSplits( + stratifyCondition = stratifyCondition, + dataset = dataset, + label = label, + splitter = splitter + ).head + + val trainingDataset = dataset.sparkSession.createDataFrame(training, schema) + val validationDataset = dataset.sparkSession.createDataFrame(validation, schema) + + // If there is a TS DAG, then run it + val (newTrain, newTest) = suppressLoggingForFun() { + dag.map(theDAG => applyDAG( + dag = theDAG, + training = trainingDataset, + validation = validationDataset, + label = label, + features = features, + splitter = splitter + )).getOrElse(trainingDataset, validationDataset) } - trainingDataset.unpersist() - validationDataset.unpersist() - rdd.unpersist() + // multi-model training + val modelsWithGrids = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) + + val groupedSummary = getSummary( + modelsWithGrids = modelsWithGrids, label = label, features = features, + train = newTrain, test = newTest + ) - val model = - if (evaluator.isLargerBetter) 
groupedSummary.maxBy(_.bestMetric) - else groupedSummary.minBy(_.bestMetric) + dataset.unpersist() - val bestModel = model.model.fit(dataset, model.bestGrid).asInstanceOf[M] - wrapBestModel(groupedSummary.toArray, bestModel, s"$trainRatio training split") + val model = getValidatedModel(groupedSummary) + wrapBestEstimator(groupedSummary, model.model.copy(model.bestGrid).asInstanceOf[E], s"$trainRatio training split") } - // TODO : Implement our own startified split method for better performance in a separate PR /** * Creates Train Validation Splits For TS - * @param rdd - * @return + * + * @param stratifyCondition condition to do stratify ts + * @param dataset dataset to split + * @param label name of label in dataset + * @param splitter used to estimate splitter params prior to ts + * @return Array[(Train, Test)] */ - private[op] override def createTrainValidationSplits( - rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] = { - - val Array(trainData, validateData) = { - if (stratify && isClassification) { - log.info(s"Creating stratified train/validation with training ratio of $trainRatio") - - val classes = rdd.map(_._1).distinct().collect() - // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) - val rddByClass = classes.map(label => rdd.filter(_._1 == label) - .map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _)) - - // Train/Validation data for each class - val splitByClass = rddByClass.map(_.randomSplit(Array(trainRatio, 1 - trainRatio), seed) - .map(_.values.flatMap(identity))) - - if (splitByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") - // Merging Train/Validation data one by one - splitByClass.reduce[Array[RDD[Row]]] { - case (Array(train1: RDD[Row], validate1: RDD[Row]), Array(train2: RDD[Row], validate2: RDD[Row])) => - Array(train1.union(train2), validate1.union(validate2)) - } - - } else { - rdd.map { case (label, features, key) => key -> Seq(Row(label, features)) } - .reduceByKey(_ ++ _) - .randomSplit(Array(trainRatio, 1 - trainRatio), seed) - .map(_.values.flatMap(identity)) - } + private[op] override def createTrainValidationSplits[T]( + stratifyCondition: Boolean, + dataset: Dataset[T], + label: String, + splitter: Option[Splitter] = None + ): Array[(RDD[Row], RDD[Row])] = { + + // get param that stores the label column + val labelCol = evaluator.getParam(ValidatorParamDefaults.LabelCol) + evaluator.set(labelCol, label) + + val Array(train, test) = if (stratifyCondition) { + val rddsByClass = prepareStratification( + dataset = dataset, + message = s"Creating stratified train/validation with training ratio of $trainRatio", + label = label, + splitter = splitter + ) + stratifyTrainValidationSplit(rddsByClass) + } else { + val rddRow = dataset.toDF().rdd + rddRow.randomSplit(Array(trainRatio, 1 - trainRatio), seed) } - Array((trainData, validateData)) + Array((train, test)) } + + private def stratifyTrainValidationSplit(rddsByClass: Array[RDD[Row]]): Array[RDD[Row]] = { + // Train/Validation data for each class + val splitByClass = rddsByClass.map(_.randomSplit(Array(trainRatio, 1 - trainRatio), seed)) + + if (splitByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") + // Merging Train/Validation data one by one + splitByClass.reduce[Array[RDD[Row]]] { + case (Array(train1: RDD[Row], validate1: RDD[Row]), Array(train2: RDD[Row], validate2: RDD[Row])) => + Array(train1.union(train2), validate1.union(validate2)) + } + } + } + diff --git 
a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala index 0bde386e32..5a5f284f8e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala @@ -31,16 +31,20 @@ package com.salesforce.op.stages.impl.tuning +import com.salesforce.op.utils.stages.FitStagesUtil._ +import com.salesforce.op.utils.stages.FitStagesUtil import com.salesforce.op.evaluators.{OpBinaryClassificationEvaluatorBase, OpEvaluatorBase, OpMultiClassificationEvaluatorBase} -import com.salesforce.op.stages.impl.selector.ModelInfo -import org.apache.spark.ml.linalg.Vector +import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} +import org.apache.log4j.{Level, LogManager} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Row} -import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.types.{MetadataBuilder, StructType} +import org.apache.spark.sql.{Dataset, Row, SparkSession, functions} import org.slf4j.{Logger, LoggerFactory} + /** * Best Model container * @@ -54,9 +58,9 @@ case class BestModel[M <: Model[_]](name: String, model: M, metadata: Option[Met /** * Best Estimator container * - * @param name the name of the best model - * @param estimator best estimator - * @param metadata optional metadata + * @param name the name of the best model + * @param estimator best estimator + * @param metadata optional metadata * @tparam E model type */ case class BestEstimator[E <: Estimator[_]](name: String, estimator: E, metadata: MetadataBuilder = new MetadataBuilder) @@ -80,6 +84,7 @@ private[tuning] case class ValidatedModel[E <: Estimator[_]] * Best metric (metric at bestIndex) */ def bestMetric: Double = metrics(bestIndex) + /** * Best grid (param grid at bestIndex) */ @@ -94,9 +99,14 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial @transient protected lazy val log: Logger = LoggerFactory.getLogger(this.getClass) + type ModelWithGrids = Seq[(E, Array[ParamMap], String)] + def seed: Long + def evaluator: OpEvaluatorBase[_] + def validationName: String + def stratify: Boolean private[op] final def isClassification = evaluator match { @@ -105,35 +115,44 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial case _ => false } + /** * Function that performs the model selection * - * @param modelInfo estimators and grids to validate + * @param modelInfo + * @param dataset * @param label * @param features - * @param dataset - * @return estimator + * @param dag + * @param splitter + * @param stratifyCondition Condition to stratify CV/TS + * @param spark + * @return */ - private[op] def validate( + private[op] def validate[T]( modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String - ): BestModel[M] + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] + /** * Get the best model and the metadata with the validator params * - * @param modelsFit info from validation - * @param bestModel best fit model - * @param splitInfo split info 
for logging + * @param modelsFit info from validation + * @param bestEstimator best fit model + * @param splitInfo split info for logging * @return best model */ - private[op] def wrapBestModel( + private[op] def wrapBestEstimator( modelsFit: Array[ValidatedModel[E]], - bestModel: M, + bestEstimator: E, splitInfo: String - ): BestModel[M] = { + ): BestEstimator[E] = { log.info( "Model Selection over {} with {} with {} and the {} metric", modelsFit.map(_.model.getClass.getSimpleName).mkString(","), validationName, splitInfo, evaluator.name @@ -143,16 +162,18 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial val newMeta = new MetadataBuilder().putMetadata(validationName, meta.build()) val (bestModelName, _) = if (evaluator.isLargerBetter) cvFittedModels.maxBy(_._2) else cvFittedModels.minBy(_._2) - BestModel(name = bestModelName, model = bestModel, metadata = Option(newMeta)) + BestEstimator(name = bestModelName, estimator = bestEstimator, metadata = newMeta) } /** * Update metadata during model selection and return best model name + * * @return best model name */ private[op] def updateBestModelMetadata(metadataBuilder: MetadataBuilder, v: ValidatedModel[E]): String = { val ValidatedModel(model, bestIndex, metrics, grids) = v val modelParams = model.extractParamMap() + def makeModelName(index: Int) = s"${model.uid}_$index" for {((paramGrid, met), ind) <- grids.zip(metrics).zipWithIndex} { @@ -167,17 +188,140 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial makeModelName(bestIndex) } + /** * Creates Train Validation Splits - * @param rdd - * @return Train Validation Splits + * + * @param stratifyCondition condition to stratify splits + * @param dataset + * @param label + * @param splitter used to estimate splitter params prior to splits + * @return + */ + private[op] def createTrainValidationSplits[T](stratifyCondition: Boolean, + dataset: Dataset[T], label: String, splitter: Option[Splitter] = None): Array[(RDD[Row], RDD[Row])] + + + protected def prepareStratification[T]( + dataset: Dataset[T], + message: String, + label: String, + splitter: Option[Splitter] = None + ): Array[RDD[Row]] = { + log.info(message) + import dataset.sqlContext.implicits._ + val classes = dataset.select(label).as[Double].distinct().collect().sorted + val datasetsByClass = classes.map(theClass => dataset.filter(functions.col(label) === theClass)) + + splitter.map { + case d: DataBalancer => { + val Array(negative, positive) = datasetsByClass + d.estimate( + data = dataset, + positiveData = positive, + negativeData = negative, + seed = d.getSeed + ) + } + case c: DataCutter => { + val labelCounts = dataset.sparkSession.createDataFrame(classes zip datasetsByClass.map(_.count())).persist + c.estimate(labelCounts) + labelCounts.unpersist + } + case _ => + } + // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) + datasetsByClass.map(_.toDF().rdd) + } + + protected def applyDAG( + dag: StagesDAG, + training: Dataset[Row], + validation: Dataset[Row], + label: String, + features: String, + splitter: Option[Splitter] + )(implicit sparkSession: SparkSession): (Dataset[Row], Dataset[Row]) = { + import sparkSession.implicits._ + + val FittedDAG(newTrain, newTest, _) = FitStagesUtil.fitAndTransformDAG( + dag = dag, + train = training, + test = validation, + hasTest = true, + indexOfLastEstimator = Some(-1) + ) + val selectTrain = newTrain.select(label, features) + .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) + + val selectTest 
= newTest.select(label, features) + .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) + + val (balancedTrain, balancedTest) = splitter.map(s => ( + s.prepare(selectTrain).train, + s.prepare(selectTest).train) + ).getOrElse((selectTrain, selectTest)) + + (balancedTrain, balancedTest) + } + + /** + * Suppress logging to a specified level when executing method `f`. */ - private[op] def createTrainValidationSplits(rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] + protected def suppressLoggingForFun[Result](level: Level = Level.ERROR)(f: => Result): Result = { + val opLog = LogManager.getLogger("com.salesforce.op") + val originalLevel = opLog.getLevel + opLog.setLevel(level) + val result = f + opLog.setLevel(originalLevel) // Reset log level back to normal + result + } + + protected def getValidatedModel(groupedSummary: Array[ValidatedModel[E]]): ValidatedModel[E] = { + if (evaluator.isLargerBetter) groupedSummary.maxBy(_.bestMetric) else groupedSummary.minBy(_.bestMetric) + } + + protected def getSummary[T]( + modelsWithGrids: ModelWithGrids, label: String, features: String, train: Dataset[T], test: Dataset[T] + ): Array[ValidatedModel[E]] = { + train.persist() + test.persist() + val summary = modelsWithGrids.par.map { + case (estimator, paramGrids, name) => + val pi1 = estimator.getParam(StageParamNames.inputParam1Name) + val pi2 = estimator.getParam(StageParamNames.inputParam2Name) + estimator.set(pi1, label).set(pi2, features) + + val numModels = paramGrids.length + val metrics = new Array[Double](paramGrids.length) + + log.info(s"Train split with multiple sets of parameters.") + val models = estimator.fit(train, paramGrids).asInstanceOf[Seq[M]] + var i = 0 + while (i < numModels) { + val metric = evaluator.evaluate(models(i).transform(test, paramGrids(i))) + log.info(s"Got metric $metric for model $name trained with ${paramGrids(i)}.") + metrics(i) = metric + i += 1 + } + val (bestMetric, bestIndex) = + if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) + else metrics.zipWithIndex.minBy(_._1) + log.info(s"Best set of parameters:\n${paramGrids(bestIndex)} for $name") + log.info(s"Best train validation split metric: $bestMetric.") + + ValidatedModel(estimator, bestIndex, metrics, paramGrids) + }.toArray + train.unpersist() + test.unpersist() + summary + } + } object ValidatorParamDefaults { def Seed: Long = util.Random.nextLong // scalastyle:off method.name - val labelCol = "labelCol" + val LabelCol = "labelCol" val NumFolds = 3 val TrainRatio = 0.75 val Stratify = false diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala index ef35eabe9c..1bdface8b9 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala @@ -31,17 +31,10 @@ package com.salesforce.op.stages.impl.tuning -import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.{Metadata, MetadataBuilder} -/** - * Case class of data used in model selectors for data prep and cross validation - */ -case object SelectorData { - type LabelFeaturesKey = (Double, Vector, String) -} /** * Case class for Training & test sets @@ -49,8 +42,8 @@ case object SelectorData { * @param train training set is persisted at construction * @param metadata metadata 
built at construction */ -case class ModelData private(train: Dataset[_], metadata: Metadata) { - def this(train: Dataset[_], metadata: MetadataBuilder) = +case class ModelData private(train: Dataset[Row], metadata: Metadata) { + def this(train: Dataset[Row], metadata: MetadataBuilder) = this(train.persist(), metadata.build()) } @@ -65,7 +58,7 @@ abstract class Splitter(val uid: String) extends SplitterParams { * @param data * @return (dataTrain, dataTest) */ - def split(data: Dataset[_]): (Dataset[_], Dataset[_]) = { + def split[T](data: Dataset[T]): (Dataset[T], Dataset[T]) = { val fraction = 1.0 - getReserveTestFraction val Array(dataTrain, dataTest) = data.randomSplit(Array(fraction, 1.0 - fraction), seed = $(seed)) dataTrain -> dataTest @@ -78,7 +71,7 @@ abstract class Splitter(val uid: String) extends SplitterParams { * @param data * @return Training set test set */ - def prepare(data: Dataset[SelectorData.LabelFeaturesKey]): ModelData + def prepare(data: Dataset[Row]): ModelData } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala deleted file mode 100644 index bf76a6d848..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
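[Editor's note — usage sketch, not part of this patch: the reworked Splitter API above carves off a test set by reserve fraction and then prepares the training set. The concrete splitter name and its apply parameters below are assumptions based on this codebase's conventions.]

    // reserveTestFraction = 0.2 -> data.randomSplit(Array(0.8, 0.2), seed)
    val splitter = DataSplitter(reserveTestFraction = 0.2)
    val (train, test) = splitter.split(rawData) // Dataset[Row] in, Dataset[Row] out
    val modelData = splitter.prepare(train)     // persists train and builds metadata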
- */ - -package com.salesforce.op.stages.sparkwrappers.generic - -import com.salesforce.op.UID -import com.salesforce.op.features.FeatureLike -import com.salesforce.op.features.types.FeatureType -import com.salesforce.op.stages.{OpPipelineStage2to3, _} -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.sql._ - -import scala.reflect.runtime.universe.TypeTag - -/** - * Generic wrapper for any spark estimator which has two inputs and three outputs - * - * @param inputParam1Name name of spark parameter that sets the first input column - * @param inputParam2Name name of spark parameter that sets the second input column - * @param outputParam1Name name of spark parameter that sets the first output column - * @param outputParam2Name name of spark parameter that sets the second output column - * @param outputParam3Name name of spark parameter that sets the third output column - * @param stage1OperationName unique name of the operation first stage performs - * @param stage2OperationName unique name of the operation second stage performs - * @param stage3OperationName unique name of the operation third stage performs - * @param sparkMlStageIn instance of spark estimator to wrap - * @param uid stage uid - * @param i1ttag type tag for first input - * @param i2ttag type tag for second input - * @param o1ttag type tag for first output - * @param o2ttag type tag for second output - * @param o3ttag type tag for third output - * @param i1ttiv type tag for first input value - * @param i2ttiv type tag for second input value - * @param o1ttov type tag for first output value - * @param o2ttov type tag for second output value - * @param o3ttov type tag for third output value - * @tparam I1 input feature type 1 - * @tparam I2 input feature type 2 - * @tparam O1 first output feature type - * @tparam O2 second output feature type - * @tparam O3 third output feature type - * @tparam M spark model type returned by spark estimator wrapped - * @tparam E spark estimator to wrap - */ -class SwThreeStageBinaryEstimator[I1 <: FeatureType, I2 <: FeatureType, O1 <: FeatureType, O2 <: FeatureType, -O3 <: FeatureType, M <: Model[M], E <: Estimator[M]] -( - val inputParam1Name: String, - val inputParam2Name: String, - val outputParam1Name: String, - val outputParam2Name: String, - val outputParam3Name: String, - val stage1OperationName: String, - val stage2OperationName: String, - val stage3OperationName: String, - private val sparkMlStageIn: Option[E], - val uid: String = UID[SwThreeStageBinaryEstimator[I1, I2, O1, O2, O3, M, E]] -)( - implicit val i1ttag: TypeTag[I1], - val i2ttag: TypeTag[I2], - val o1ttag: TypeTag[O1], - val o2ttag: TypeTag[O2], - val o3ttag: TypeTag[O3], - val i1ttiv: TypeTag[I1#Value], - val i2ttiv: TypeTag[I2#Value], - val o1ttov: TypeTag[O1#Value], - val o2ttov: TypeTag[O2#Value], - val o3ttov: TypeTag[O3#Value] -) extends Estimator[SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]] - with OpPipelineStage2to3[I1, I2, O1, O2, O3] with SparkWrapperParams[E] { - - setSparkMlStage(sparkMlStageIn) - set(sparkInputColParamNames, Array(inputParam1Name, inputParam2Name)) - set(sparkOutputColParamNames, Array(outputParam1Name, outputParam2Name, outputParam3Name)) - - private lazy val stage1uid = UID[SwBinaryEstimator[I1, I2, O1, M, E]] - private lazy val stage2uid = UID[SwTernaryTransformer[I1, I2, O1, O2, M]] - private lazy val stage3uid = UID[SwQuaternaryTransformer[I1, I2, O1, O2, O3, M]] - - private lazy val outputName1 = makeOutputNameFromStageId[O1](stage1uid, Seq(in1, in2)) - private 
lazy val outputName2 = makeOutputNameFromStageId[O2](stage2uid, Seq(in1, in2), 2) - private lazy val outputName3 = makeOutputNameFromStageId[O3](stage3uid, Seq(in1, in2), 3) - - // put together parameter names and values - private lazy val outputs = $(sparkOutputColParamNames).zip( - Array(outputName1, outputName2, outputName3)) - - private[op] lazy val stage1 = new SwBinaryEstimatorSpecial[I1, I2, O1, M, E]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - outputParamName = $(sparkOutputColParamNames)(0), - operationName = stage1OperationName, - sparkMlStageIn = getSparkMlStage().map { spk => // set all the outputs for this stage - outputs.foldLeft(spk) { case (s, (pname, pvalue)) => s.set(s.getParam(pname), pvalue) } - }, - uid = stage1uid, - outputs - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2]) - - private[op] lazy val stage2 = new SwTernaryTransformer[I1, I2, O1, O2, M]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - inputParam3Name = stage1OperationName, - outputParamName = $(sparkOutputColParamNames)(1), - operationName = stage2OperationName, - sparkMlStageIn = None, - uid = stage2uid - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2], stage1.getOutput()) - - private[op] lazy val stage3 = new SwQuaternaryTransformer[I1, I2, O1, O2, O3, M]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - inputParam3Name = stage1OperationName, - inputParam4Name = stage2OperationName, - outputParamName = $(sparkOutputColParamNames)(2), - operationName = stage3OperationName, - sparkMlStageIn = None, - uid = stage3uid - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2], stage1.getOutput(), stage2.getOutput()) - - /** - * Output features that will be created by the transformation - * - * @return features of type O1, O2 and O3 - */ - final override def getOutput(): (FeatureLike[O1], FeatureLike[O2], FeatureLike[O3]) = { - (stage1.getOutput(), stage2.getOutput(), stage3.getOutput()) - } - - override def fit(dataset: Dataset[_]): SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M] = { - val model = stage1.fit(dataset) - - new SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]( - inputParam1Name, - inputParam2Name, - outputParam1Name, - outputParam2Name, - outputParam3Name, - stage1OperationName, - stage2OperationName, - stage3OperationName, - model, - stage2, - stage3, - uid - ).setParent(this).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2]) - - } -} - -/** - * Generic wrapper for any model returned by an estimator which has two inputs and three outputs - * - * @param inputParam1Name name of spark parameter that sets the first input column - * @param inputParam2Name name of spark parameter that sets the second input column - * @param outputParam1Name name of spark parameter that sets the first output column - * @param outputParam2Name name of spark parameter that sets the second output column - * @param outputParam3Name name of spark parameter that sets the third output column - * @param stage1OperationName unique name of the operation first stage performs - * @param stage2OperationName unique name of the operation second stage performs - * @param stage3OperationName unique name of the operation third stage performs - * @param stage1 first wrapping stage for output one (this is the only stage that actually does anything) - * @param stage2 second stage - dummy for generating second output - * @param stage3 third stage - 
dummy for generating third output - * @param uid stage uid - * @tparam I1 input feature type 1 - * @tparam I2 input feature type 2 - * @tparam O1 first output feature type - * @tparam O2 second output feature type - * @tparam O3 third output feature type - * @tparam M - */ -private[stages] final class SwThreeStageBinaryModel[I1 <: FeatureType, I2 <: FeatureType, O1 <: FeatureType, -O2 <: FeatureType, O3 <: FeatureType, M <: Model[M]] -( - val inputParam1Name: String, - val inputParam2Name: String, - val outputParam1Name: String, - val outputParam2Name: String, - val outputParam3Name: String, - val stage1OperationName: String, - val stage2OperationName: String, - val stage3OperationName: String, - val stage1: SwBinaryModel[I1, I2, O1, M], - val stage2: SwTernaryTransformer[I1, I2, O1, O2, M], - val stage3: SwQuaternaryTransformer[I1, I2, O1, O2, O3, M], - val uid: String -) extends Model[SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]] - with OpPipelineStage2to3[I1, I2, O1, O2, O3] with SparkWrapperParams[M] { - - setSparkMlStage(stage1.getSparkMlStage()) - set(sparkInputColParamNames, Array(inputParam1Name, inputParam2Name)) - set(sparkOutputColParamNames, Array(outputParam1Name, outputParam2Name, outputParam3Name)) - - override def transform(dataset: Dataset[_]): DataFrame = stage1.transform(dataset) - - override def getOutput(): (FeatureLike[O1], FeatureLike[O2], FeatureLike[O3]) = - (stage1.getOutput(), stage2.getOutput(), stage3.getOutput()) -} - -/** - * Wrapper for any spark estimator that has two inputs and three outputs (for use in three stage wrapper) - */ -private[op] class SwBinaryEstimatorSpecial[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType, -M <: Model[M], E <: Estimator[M]] -( - inputParam1Name: String, - inputParam2Name: String, - outputParamName: String, - operationName: String, - private val sparkMlStageIn: Option[E], - uid: String = UID[SwBinaryEstimator[I1, I2, O, M, E]], - val outputNames: Array[(String, String)] -)( - implicit tti1: TypeTag[I1], - tti2: TypeTag[I2], - tto: TypeTag[O], - ttov: TypeTag[O#Value] -) extends SwBinaryEstimator[I1, I2, O, M, E] (inputParam1Name = inputParam1Name, inputParam2Name = inputParam2Name, - outputParamName = outputParamName, operationName = operationName, sparkMlStageIn = sparkMlStageIn, - uid = uid)(tti1 = tti1, tti2 = tti2, tto = tto, ttov = ttov){ - - override def setOutputFeatureName(m: String): this.type = { - getSparkMlStage().map { spk => // set all the outputs for this stage - outputNames.zipWithIndex.foldLeft(spk) { case (s, ((pname, pvalue), i)) => - val newName = updateOutputName(m, pvalue, i) - s.set(s.getParam(pname), newName) - }} - set(outputFeatureName, m) - } -} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala index 67b4bc069f..59b69551a9 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala @@ -41,7 +41,7 @@ import scala.reflect.runtime.universe.TypeTag /** * Wraps a spark ML estimator. This wrapper is meant for Estimators not already covered by more specific - * wrappers such as: [[OpProbabilisticClassifierWrapper]] and [[OpPredictorWrapper]]. + * wrappers such as: [[OpPredictorWrapper]]. 
* Examples of estimators meant to be wrapped with OpEstimatorWrapper include MinMaxScaler, IDF, VectorIndexer, * CountVectorizer, QuantileDiscretizer, StandardScaler, PCA, MaxAbsScaler, Word2Vec, etc. * Their defining characteristic is that they output a Model which takes in one column as input and output diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala similarity index 60% rename from core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala rename to core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala index 7408c99194..d24f8b852d 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala @@ -29,28 +29,38 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.sparkwrappers.specific -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.linalg.{Matrix, Vector} +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import org.apache.spark.ml.PredictionModel +import org.apache.spark.ml.linalg.Vector -import scala.reflect.runtime.universe.TypeTag +import scala.reflect.runtime.universe._ -class OpNaiveBayesModel +/** + * Class that takes in a spark PredictionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + * @tparam T type of the model to wrap + */ +abstract class OpPredictionModel[T <: PredictionModel[Vector, T]] ( - pi: Vector, - theta: Matrix, - val oldLabelsIn: Array[Double], - val modelTypeIn: String, - uid: String = UID[OpNaiveBayesModel], - val operationName: String = "opNB" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends NaiveBayesModel(uid = uid, pi = pi, theta = theta) with OpClassifierModelBase { - this.oldLabels = oldLabelsIn - set(modelType, modelTypeIn) + sparkModel: T, + uid: String, + operationName: String +) extends OpPredictorWrapperModel[T](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + protected def predictMirror: MethodMirror + + protected def predict(features: Vector): Double = predictMirror.apply(features).asInstanceOf[Double] + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => + Prediction(prediction = predict(features.value)) + } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala index e2cef41c4a..06b4c4a8de 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala @@ -29,56 +29,95 @@ * POSSIBILITY OF SUCH DAMAGE. 
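[Editor's note — illustrative sketch, not part of this patch: a concrete subclass of the OpPredictionModel defined above must supply predictMirror, a reflection handle onto the wrapped spark model's protected predict method. One plausible shape, with the reflection plumbing assumed rather than taken from this diff:]

    import org.apache.spark.ml.regression.LinearRegressionModel
    import scala.reflect.runtime.universe._

    class OpLinearRegressionModel(sparkModel: LinearRegressionModel, uid: String)
      extends OpPredictionModel[LinearRegressionModel](
        sparkModel = sparkModel, uid = uid, operationName = "opLR") {
      // reflect the protected predict(features: Vector): Double on the wrapped model
      protected lazy val predictMirror: MethodMirror = {
        val mirror = runtimeMirror(sparkModel.getClass.getClassLoader)
        val im = mirror.reflect(sparkModel)
        im.reflectMethod(im.symbol.typeSignature.member(TermName("predict")).asMethod)
      }
    }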
 */ -// scalastyle:off package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.UID -import com.salesforce.op.features.types.{FeatureType, OPVector} -import com.salesforce.op.stages.sparkwrappers.generic.SwBinaryEstimator +import com.salesforce.op.features.types.{FeatureType, OPVector, Prediction, RealNN} +import com.salesforce.op.stages.{OpPipelineStage2, SparkStageParam} +import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel, OpTransformer2} +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import org.apache.spark.ml._ import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.{PredictionModel, Predictor, SparkMLSharedParamConstants} +import org.apache.spark.sql.Dataset import scala.reflect.runtime.universe.TypeTag /** * Wraps a spark ML predictor. Predictors represent supervised learning algorithms (regression and classification) in - * spark ML that inherit from [[Predictor]], examples of which include: + * spark ML that inherit from [[Predictor]]; the supported models are: + * [[org.apache.spark.ml.classification.LogisticRegression]] + * [[org.apache.spark.ml.regression.LinearRegression]], + * [[org.apache.spark.ml.classification.RandomForestClassifier]], * [[org.apache.spark.ml.regression.RandomForestRegressor]], - * [[org.apache.spark.ml.regression.GBTRegressor]], [[org.apache.spark.ml.classification.GBTClassifier]], + * [[org.apache.spark.ml.classification.NaiveBayesModel]], + * [[org.apache.spark.ml.classification.GBTClassifier]], + * [[org.apache.spark.ml.regression.GBTRegressor]], + * [[org.apache.spark.ml.classification.DecisionTreeClassifier]] * [[org.apache.spark.ml.regression.DecisionTreeRegressor]], + * [[org.apache.spark.ml.classification.LinearSVC]] * [[org.apache.spark.ml.classification.MultilayerPerceptronClassifier]], - * [[org.apache.spark.ml.regression.LinearRegression]], - * and [[org.apache.spark.ml.regression.GeneralizedLinearRegression]]. + * [[org.apache.spark.ml.regression.GeneralizedLinearRegression]]. * Their defining characteristic is that they output a model which takes in 2 columns as input (labels and features) - * and output one column as result. - * NOTE: Probabilistic classifiers contain additional output information, and so there is a specific wrapper - * for that kind of classifier see: [[OpProbabilisticClassifierWrapper]] + * and output one to three columns as result. 
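[Editor's note — usage sketch, not part of this patch: with the reworked wrapper, any supported Predictor is wrapped once and emits a single Prediction feature. The feature handles below (label: FeatureLike[RealNN], features: FeatureLike[OPVector]) are assumed to exist in the calling workflow.]

    val pred: FeatureLike[Prediction] =
      new OpPredictorWrapper(new LogisticRegression().setMaxIter(50))
        .setInput(label, features)
        .getOutput()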
* * @param predictor the predictor to wrap * @param uid stage uid - * @tparam I the type of the transformation input feature - * @tparam O the type of the transformation output feature - * @tparam E spark estimator to wrap - * @tparam M spark model type returned by spark estimator wrapped + * @tparam E spark estimator to wrap + * @tparam M spark model returned */ -class OpPredictorWrapper[I <: FeatureType, O <: FeatureType, E <: Predictor[Vector, E, M], -M <: PredictionModel[Vector, M]] +class OpPredictorWrapper[E <: Predictor[Vector, E, M], M <: PredictionModel[Vector, M]] ( val predictor: E, - uid: String = UID[OpPredictorWrapper[I, O, E, M]] + val uid: String = UID[OpPredictorWrapper[_, _]] +)( + implicit val tti1: TypeTag[RealNN], + val tti2: TypeTag[OPVector], + val tto: TypeTag[Prediction], + val ttov: TypeTag[Prediction#Value] +) extends Estimator[OpPredictorWrapperModel[M]] with OpPipelineStage2[RealNN, OPVector, Prediction] + with SparkWrapperParams[E] { + + val operationName = predictor.getClass.getSimpleName + val inputParam1Name = SparkMLSharedParamConstants.LabelColName + val inputParam2Name = SparkMLSharedParamConstants.FeaturesColName + val outputParamName = SparkMLSharedParamConstants.PredictionColName + setDefault(sparkMlStage, Option(predictor)) + + /** + * Function that fits the binary model + */ + override def fit(dataset: Dataset[_]): OpPredictorWrapperModel[M] = { + setInputSchema(dataset.schema).transformSchema(dataset.schema) + copyValues(predictor) // when params are shared with wrapping class this will pass them into the model + + val p1 = predictor.getParam(inputParam1Name) + val p2 = predictor.getParam(inputParam2Name) + val po = predictor.getParam(outputParamName) + val model: M = predictor + .set(p1, in1.name) + .set(p2, in2.name) + .set(po, getOutputFeatureName) + .fit(dataset) + + SparkModelConverter.toOP(model, uid) + .setParent(this) + .setInput(in1.asFeatureLike[RealNN], in2.asFeatureLike[OPVector]) + .setMetadata(getMetadata()) + .setOutputFeatureName(getOutputFeatureName) + } +} + +abstract class OpPredictorWrapperModel[M <: PredictionModel[Vector, M]] +( + val operationName: String, + val uid: String, + val sparkModel: M )( - implicit tti1: TypeTag[I], - tto: TypeTag[O], - ttov: TypeTag[O#Value] -) extends SwBinaryEstimator[I, OPVector, O, M, E]( - inputParam1Name = SparkMLSharedParamConstants.LabelColName, - inputParam2Name = SparkMLSharedParamConstants.FeaturesColName, - outputParamName = SparkMLSharedParamConstants.PredictionColName, - operationName = predictor.getClass.getSimpleName, - // cloning below to prevent parameter changes to the underlying classifier outside the wrapper - sparkMlStageIn = Option(predictor).map(_.copy(ParamMap.empty)), - uid = uid -) { - final protected def getSparkStage: E = getSparkMlStage().get + implicit val tti1: TypeTag[RealNN], + val tti2: TypeTag[OPVector], + val tto: TypeTag[Prediction], + val ttov: TypeTag[Prediction#Value] +) extends Model[OpPredictorWrapperModel[M]] with OpTransformer2[RealNN, OPVector, Prediction] + with SparkWrapperParams[M] { + setDefault(sparkMlStage, Option(sparkModel)) } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala new file mode 100644 index 0000000000..291dc0bc50 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala @@ -0,0 +1,76 @@ +/* + * 
Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.sparkwrappers.specific + +import com.salesforce.op.features.types._ +import org.apache.spark.ml.classification.ProbabilisticClassificationModel +import org.apache.spark.ml.linalg.Vector + +import scala.reflect.runtime.universe._ + +/** + * Class that takes in a spark ProbabilisticClassifierModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + * @tparam T type of the model to wrap + */ +abstract class OpProbabilisticClassifierModel[T <: ProbabilisticClassificationModel[Vector, T]] +( + sparkModel: T, + uid: String, + operationName: String +) extends OpPredictorWrapperModel[T](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + protected def predictRawMirror: MethodMirror + protected def raw2probabilityMirror: MethodMirror + protected def probability2predictionMirror: MethodMirror + + protected def predictRaw(features: Vector): Vector = predictRawMirror.apply(features).asInstanceOf[Vector] + protected def raw2probability(raw: Vector): Vector = raw2probabilityMirror.apply(raw).asInstanceOf[Vector] + protected def probability2prediction(prob: Vector): Double = + probability2predictionMirror.apply(prob).asInstanceOf[Double] + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictRaw(features.value) + val prob = raw2probability(raw) + val pred = probability2prediction(prob) + + Prediction(rawPrediction = raw, probability = prob, prediction = pred) + } + +} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala deleted file mode 100644 
index cec3d62d15..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.sparkwrappers.specific - -import com.salesforce.op.UID -import com.salesforce.op.features.types._ -import com.salesforce.op.stages.sparkwrappers.generic.SwThreeStageBinaryEstimator -import org.apache.spark.ml.SparkMLSharedParamConstants -import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} -import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.param.ParamMap - -/** - * Wraps a spark ML probabilistic classifier. In SparkML, a probabilistic classifier is anything that inherits - * from [[ProbabilisticClassifier]]. Examples of these probabilistic classifiers - * include: RandomForestClassifier, NaiveBayes, LogisticRegression, and DecisionTreeClassifier. - * These classifiers in spark ML output not a single column, but 3: (1) the raw unnormalized scores for each class, - * (2) the probabilistic classification (normalized raw scores), and - * (3) the labels of the output (e.g. max unnormalized score). - * The defining characteristic of classifiers intended to be wrapped by this class is that they output a model which - * takes in 2 columns as input (label and features) and output 3 columns as result. 
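[Editor's note — the deleted three-output wrapper described here is superseded by the OpProbabilisticClassifierModel added earlier in this diff, which folds all three columns into one map-backed Prediction value. A comment-only walk-through of its transformFn, restated from the added code:]

    // transformFn above effectively does:
    //   raw  = predictRaw(features)          // unnormalized class scores
    //   prob = raw2probability(raw)          // normalized scores
    //   pred = probability2prediction(prob)  // winning label
    // and packs them as Prediction(rawPrediction = raw, probability = prob, prediction = pred)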
- * - * @param probClassifier the probabilistic classifier to wrap - * @param uid stage uid - * @tparam E spark estimator to wrap - * @tparam M spark model type returned by spark estimator wrapped - */ -class OpProbabilisticClassifierWrapper[E <: ProbabilisticClassifier[Vector, E, M], -M <: ProbabilisticClassificationModel[Vector, M]] -( - val probClassifier: E, - uid: String = UID[OpProbabilisticClassifierWrapper[E, M]] -) extends SwThreeStageBinaryEstimator[RealNN, OPVector, RealNN, OPVector, OPVector, M, E]( - inputParam1Name = SparkMLSharedParamConstants.LabelColName, - inputParam2Name = SparkMLSharedParamConstants.FeaturesColName, - outputParam1Name = SparkMLSharedParamConstants.PredictionColName, - outputParam2Name = SparkMLSharedParamConstants.RawPredictionColName, - outputParam3Name = SparkMLSharedParamConstants.ProbabilityColName, - stage1OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.PredictionColName , - stage2OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.RawPredictionColName, - stage3OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.ProbabilityColName, - // cloning below to prevent parameter changes to the underlying classifier outside the wrapper - sparkMlStageIn = Option(probClassifier).map(_.copy(ParamMap.empty)), - uid = uid -) { - final protected def getSparkStage: E = getSparkMlStage().get -} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala new file mode 100644 index 0000000000..f77d0d1dfa --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.sparkwrappers.specific + +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.base.binary.OpTransformer2 +import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.regression._ +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.regression._ +import org.apache.spark.ml.{Model, PredictionModel} + +/** + * Allows conversion from spark models to models that follow the OP convention of having a + * transformFn that can be called on a single row rather than the whole dataframe + */ +object SparkModelConverter { + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @param uid uid to give converted model + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + def toOP[T <: PredictionModel[Vector, T]]( + model: T, + uid: String + ): OpPredictorWrapperModel[T] = { + toOPUnchecked(model, uid).asInstanceOf[OpPredictorWrapperModel[T]] + } + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + // TODO remove when loco and model selector are updated + def toOPUnchecked[T <: Model[_]](model: T): OpTransformer2[RealNN, OPVector, Prediction] = + toOPUnchecked(model, model.uid) + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @param uid uid to give converted model + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + // TODO remove when loco and model selector are updated + def toOPUnchecked[T <: Model[_]]( + model: T, + uid: String + ): OpTransformer2[RealNN, OPVector, Prediction] = { + model match { + case m: LogisticRegressionModel => new OpLogisticRegressionModel(m, uid = uid) + case m: RandomForestClassificationModel => new OpRandomForestClassificationModel(m, uid = uid) + case m: NaiveBayesModel => new OpNaiveBayesModel(m, uid) + case m: DecisionTreeClassificationModel => new OpDecisionTreeClassificationModel(m, uid = uid) + case m: GBTClassificationModel => new OpGBTClassificationModel(m, uid = uid) + case m: LinearSVCModel => new OpLinearSVCModel(m, uid = uid) + case m: MultilayerPerceptronClassificationModel => new OpMultilayerPerceptronClassificationModel(m, uid = uid) + case m: LinearRegressionModel => new OpLinearRegressionModel(m, uid = uid) + case m: RandomForestRegressionModel => new OpRandomForestRegressionModel(m, uid = uid) + case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid) + case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid) + case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid) + case m => throw new RuntimeException(s"model conversion not implemented for model $m") + } + } + +} diff --git a/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala b/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala index 6f1fda93a6..b43c4f644c 100644 --- a/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala +++ 
b/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala @@ -31,25 +31,70 @@ package com.salesforce.op.utils.stages -import com.salesforce.op.OpWorkflowModel +import com.salesforce.op.features.OPFeature +import com.salesforce.op.stages.impl.selector.{HasTestEval, ModelSelectorBase} import com.salesforce.op.stages.{OPStage, OpTransformer} -import com.salesforce.op.stages.impl.selector.HasTestEval -import org.apache.spark.ml.{Estimator, Transformer} +import com.salesforce.op.{OpWorkflow, OpWorkflowModel} +import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.slf4j.Logger -import com.salesforce.op.utils.spark.RichDataset._ +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.slf4j.LoggerFactory +import scala.collection.mutable.ListBuffer + +/** + * Functionality for manipulating stages DAG and fitting stages + * + * NOTE: this should be kept private to OP, because we do not want users to mess with + * the internal mechanisms of our workflows. + */ private[op] case object FitStagesUtil { + /** + * DAG layer - stages with their distance pairs + */ + type Layer = Array[(OPStage, Int)] + + /** + * Stages DAG - unique stages layered by distance (desc order) + */ + type StagesDAG = Array[Layer] + + /** + * Model Selector type + */ + type MS = ModelSelectorBase[_ <: Model[_], _ <: Estimator[_]] + + /** + * Fitted DAG together with its training & test data + * + * @param trainData train data + * @param testData test data + * @param transformers fitted transformers + */ + case class FittedDAG(trainData: Dataset[Row], testData: Dataset[Row], transformers: Array[OPStage]) + + /** + * Extracted Model Selector and the DAG split into + * + * @param modelSelector the model selector (if any) + * @param before DAG before CV/TS + * @param during DAG during CV/TS + * @param after DAG after CV/TS + */ + case class CutDAG(modelSelector: Option[(MS, Int)], before: StagesDAG, during: StagesDAG, after: StagesDAG) + + private val log = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) + /** * Efficiently apply all op stages - * @param opStages list of op stages to apply - * @param df dataframe to apply them too + * + * @param opStages list of op stages to apply + * @param df dataframe to apply them to + * @return new data frame containing columns with output for all stages fed in */ - def applyOpTransformations(opStages: Array[_ <:OPStage with OpTransformer], df: DataFrame) - (implicit spark: SparkSession, log: Logger): DataFrame = { + def applyOpTransformations(opStages: Array[_ <: OPStage with OpTransformer], df: Dataset[Row]) + (implicit spark: SparkSession): Dataset[Row] = { if (opStages.isEmpty) df else { log.info("Applying {} OP stage(s): {}", opStages.length, opStages.map(_.uid).mkString(",")) @@ -87,8 +132,8 @@ private[op] case object FitStagesUtil { * @return Dataframe transformed data */ def applySparkTransformations( - data: DataFrame, transformers: Array[Transformer], persistEveryKStages: Int - )(implicit spark: SparkSession, log: Logger): DataFrame = { + data: Dataset[Row], transformers: Array[Transformer], persistEveryKStages: Int + )(implicit spark: SparkSession): Dataset[Row] = { // you have more than 5 stages and are not persisting at least once if (transformers.length > 5 && persistEveryKStages > transformers.length) { @@ -119,34 +164,110 @@ private[op] case object FitStagesUtil { transformedData } + /** + * Computes stages DAG + * 
+ * @param features array of features in the workflow + * @return unique stages layered by distance (desc order) + */ + def computeDAG(features: Array[OPFeature]): StagesDAG = { + val (failures, parents) = features.map(_.parentStages()).partition(_.isFailure) + + if (failures.nonEmpty) { + throw new IllegalArgumentException("Failed to compute stages DAG", failures.head.failed.get) + } + + // Stages sorted by distance + val sortedByDistance: Array[(OPStage, Int)] = parents.flatMap(_.get) + + // Stages layered by distance + val layeredByDistance: StagesDAG = + sortedByDistance.groupBy(_._2).toArray + .map(_._2.sortBy(_._1.getOutputFeatureName)) + .sortBy(s => -s.head._2) + + // Unique stages layered by distance + layeredByDistance + .foldLeft(Set.empty[OPStage], Array.empty[Array[(OPStage, Int)]]) { + case ((seen, filtered), uncleaned) => + // filter out any seen stages. also add distinct to filter out any duplicate stages in layer + val unseen = uncleaned.filterNot(v => seen.contains(v._1)).distinct + val nowSeen = seen ++ unseen.map(_._1) + (nowSeen, filtered :+ unseen) + }._2 + } + + /** + * Fit DAG and apply transformations on data up to the last estimator stage + * + * @param dag DAG to fit + * @param train training dataset + * @param test test dataset + * @param hasTest whether a non-empty test dataset is available + * @param indexOfLastEstimator Optional index of the last estimator + * @param persistEveryKStages frequency of persisting stages + * @param fittedTransformers list of already fitted transformers + * @param spark Spark session + * @return Fitted and transformed train/test before the last estimator, with fitted transformers + */ + def fitAndTransformDAG( + dag: StagesDAG, + train: Dataset[Row], + test: Dataset[Row], + hasTest: Boolean, + indexOfLastEstimator: Option[Int], + persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages, + fittedTransformers: Seq[OPStage] = Seq.empty + )(implicit spark: SparkSession): FittedDAG = { + val alreadyFitted: ListBuffer[OPStage] = ListBuffer(fittedTransformers: _*) + + val (newTrain, newTest) = + dag.foldLeft(train -> test) { case ((currTrain, currTest), stagesLayer) => + val index = stagesLayer.head._2 + val FittedDAG(newTrain, newTest, justFitted) = fitAndTransformLayer( + stagesLayer = stagesLayer, + train = currTrain, + test = currTest, + hasTest = hasTest, + transformData = indexOfLastEstimator.exists(_ < index), // only need to update for fit before last estimator + persistEveryKStages = persistEveryKStages + ) + alreadyFitted ++= justFitted + newTrain -> newTest + } + + FittedDAG(newTrain, newTest, alreadyFitted.toArray) + } /** * Fit a sequence of stages and transform a training and test dataset for use this function assumes all * stages fed in are on the same level of the dag - * @param train training dataset for estimators - * @param test test dataset for evaluation - * @param stages stages to fix - * @param transformData should the imput data be transformed or only used for fitting + * + * @param train training dataset for estimators + * @param test test dataset for evaluation + * @param hasTest whether a non-empty test dataset is available + * @param stagesLayer stages to fit + * @param transformData should the input data be transformed or only used for fitting * @param persistEveryKStages persist data at this frequency during transformations - * @param doTest test data is nonempty * @return dataframes for train and test as well as the fitted stages */ - def fitAndTransform( - train: DataFrame, - test: DataFrame, - stages: 
Array[(OPStage)], + private def fitAndTransformLayer( + stagesLayer: Layer, + train: Dataset[Row], + test: Dataset[Row], + hasTest: Boolean, transformData: Boolean, - persistEveryKStages: Int, - doTest: Option[Boolean] = None - )(implicit spark: SparkSession, log: Logger): (DataFrame, DataFrame, Array[OPStage]) = { - - val testExists = doTest.getOrElse(!test.isEmpty) - val (estimators, noFit) = stages.partition( _.isInstanceOf[Estimator[_]] ) + persistEveryKStages: Int + )(implicit spark: SparkSession): FittedDAG = { + val stages = stagesLayer.map(_._1) + val (estimators, noFit) = stages.partition(_.isInstanceOf[Estimator[_]]) val fitEstimators = estimators.map { case e: Estimator[_] => e.fit(train) match { - case m: HasTestEval if testExists => m.evaluateModel(test) + case m: HasTestEval if hasTest => + m.evaluateModel(test) + m.asInstanceOf[OPStage] + case m => m.asInstanceOf[OPStage] - case m => m.asInstanceOf[OPStage] } } val transformers = noFit ++ fitEstimators @@ -160,16 +281,92 @@ private[op] case object FitStagesUtil { val withOPTrain = applyOpTransformations(opTransformers, train) val withAllTrain = applySparkTransformations(withOPTrain, sparkTransformers, persistEveryKStages) - val withAllTest = if (testExists) { + val withAllTest = if (hasTest) { val withOPTest = applyOpTransformations(opTransformers, test) applySparkTransformations(withOPTest, sparkTransformers, persistEveryKStages) } else test - (withAllTrain, withAllTest, transformers) + FittedDAG(trainData = withAllTrain, testData = withAllTest, transformers = transformers) } else { - (train, test, transformers) + FittedDAG(trainData = train, testData = test, transformers = transformers) + } + } + + /** + * Method that cuts the DAG in order to perform proper CV/TS. + * Extracts the Model Selector and splits the DAG into + * 1. DAG before CV/TS + * 2. DAG during CV/TS + * 3. DAG after CV/TS + * + * @param dag DAG in the workflow to be cut + * @return (Model Selector, nonCVTS DAG to be applied outside of CV/TS, CVTS DAG to apply inside the CV/TS) + */ + def cutDAG(dag: StagesDAG): CutDAG = { + if (dag.isEmpty) CutDAG(None, Array(), Array(), Array()) + else { + // creates Array containing every Model Selector in the DAG + val modelSelectorArrays = dag.flatten.collect { case (ms: MS, dist: Int) => (ms, dist) } + val modelSelector = modelSelectorArrays.toList match { + case Nil => None + case List(ms) => Option(ms) + case modelSelectors => throw new IllegalArgumentException( + s"OpWorkflow can contain at most 1 Model Selector. Found ${modelSelectors.length} Model Selectors:" + + s" ${modelSelectors.map(_._1).mkString(",")}") + } + + // nonCVTS and CVTS DAGs + val (nonCVTSDAG: StagesDAG, inCVTSDAG: StagesDAG, afterCVTSDAG: StagesDAG) = + modelSelector.map { case (ms, dist) => + // Optimize the DAG by removing stages unrelated to ModelSelector + + // Create the DAG after Model Selector. + val (afterCVTSDAG, beforeCVDAG) = dag.partition(_.exists(_._2 < dist)) + + val modelSelectorDAG = computeDAG(Array(ms.getOutput())) + .dropRight(1) + .map(_.map{ case (stage, dist) => (stage, dist + afterCVTSDAG.length) }) + + // Create the DAG without Model Selector. It will be used to compute the final nonCVTS DAG. 
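+          // Editor's note (illustrative walk-through): for layers at distances [3, 2, 1, 0]
+          // with the Model Selector at distance 1, the partition above sends the distance-0
+          // layer to afterCVTSDAG (it depends on the selector's output), keeps distances
+          // 3..1 in beforeCVDAG, and modelSelectorDAG re-indexes the selector's ancestry
+          // by afterCVTSDAG.length so distances stay aligned across the cut.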
+ val nonMSDAG: StagesDAG = beforeCVDAG.map(_.filterNot(_._1.isInstanceOf[MS])).filter(_.nonEmpty) + + // Index of first CVTS stage in ModelSelector DAG + val firstCVTSIndex = modelSelectorDAG.indexWhere(_.exists(stage => { + val inputs = stage._1.getTransientFeatures() + inputs.exists(_.isResponse) && inputs.exists(!_.isResponse) + })) + + // If no CVTS stages, the whole DAG is not in the CV/TS + if (firstCVTSIndex == -1) (nonMSDAG, Array.empty[Layer], afterCVTSDAG) else { + + val cVTSDAG = modelSelectorDAG.drop(firstCVTSIndex) + + // nonCVTSDAG is the complementary DAG + // The rule is "nonCVTSDAG = nonMSDAG - CVTSDAG" + val nonCVTSDAG = { + val flattenedCVTSDAG = cVTSDAG.flatten.map(_._1) + nonMSDAG.map(_.filterNot { case (stage: OPStage, _) => flattenedCVTSDAG.contains(stage) }) + .filter(_.nonEmpty) // Remove empty layers + } + + (nonCVTSDAG, cVTSDAG, afterCVTSDAG) + } + }.getOrElse((Array.empty[Layer], Array.empty[Layer], Array.empty[Layer])) + + CutDAG(modelSelector, before = nonCVTSDAG, during = inCVTSDAG, after = afterCVTSDAG) } } + /** + * Method that cuts the DAG in order to perform proper CV/TS. + * Extracts the Model Selector and splits the DAG into + * 1. DAG before CV/TS + * 2. DAG during CV/TS + * 3. DAG after CV/TS + * + * @param wf workflow to be cut + * @return (Model Selector, nonCVTS DAG to be applied outside of CV/TS, CVTS DAG to apply inside the CV/TS) + */ + def cutDAG(wf: OpWorkflow): CutDAG = cutDAG(computeDAG(wf.getResultFeatures())) } diff --git a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala index 4d4a76800c..57207290b7 100644 --- a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala @@ -32,13 +32,18 @@ package com.salesforce.op.utils.text import java.io.Reader +import java.nio.charset.StandardCharsets import com.salesforce.op.utils.text.Language._ +import org.apache.lucene.analysis._ import org.apache.lucene.analysis.ar.ArabicAnalyzer import org.apache.lucene.analysis.bg.BulgarianAnalyzer +import org.apache.lucene.analysis.bn.BengaliAnalyzer +import org.apache.lucene.analysis.br.BrazilianAnalyzer import org.apache.lucene.analysis.ca.CatalanAnalyzer import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter import org.apache.lucene.analysis.cjk.CJKAnalyzer +import org.apache.lucene.analysis.ckb.SoraniAnalyzer import org.apache.lucene.analysis.cz.CzechAnalyzer import org.apache.lucene.analysis.da.DanishAnalyzer import org.apache.lucene.analysis.de.GermanAnalyzer @@ -63,12 +68,13 @@ import org.apache.lucene.analysis.no.NorwegianAnalyzer import org.apache.lucene.analysis.pt.PortugueseAnalyzer import org.apache.lucene.analysis.ro.RomanianAnalyzer import org.apache.lucene.analysis.ru.RussianAnalyzer +import org.apache.lucene.analysis.snowball.SnowballFilter import org.apache.lucene.analysis.standard.StandardAnalyzer import org.apache.lucene.analysis.sv.SwedishAnalyzer import org.apache.lucene.analysis.th.ThaiAnalyzer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.analysis.tr.TurkishAnalyzer -import org.apache.lucene.analysis.{Analyzer, AnalyzerWrapper, TokenStream} +import org.apache.lucene.util.IOUtils import scala.collection.mutable.ArrayBuffer @@ -118,21 +124,28 @@ class LuceneTextAnalyzer */ object LuceneTextAnalyzer { + private val englishStopwords = WordlistLoader.getSnowballWordSet( + 
IOUtils.getDecodingReader(classOf[SnowballFilter], "english_stop.txt", StandardCharsets.UTF_8) + ) + /** * Default analyzer to use if a language specific one is not present */ - val DefaultAnalyzer: Analyzer = new StandardAnalyzer() + val DefaultAnalyzer: Analyzer = new StandardAnalyzer(englishStopwords) // TODO we should add specific analyzers per each language if possible private val analyzers: Map[Language, Analyzer] = Map( Arabic -> new ArabicAnalyzer(), - Catalan -> new CatalanAnalyzer(), Bulgarian -> new BulgarianAnalyzer(), + Bengali -> new BengaliAnalyzer(), + Brazilian -> new BrazilianAnalyzer(), + Catalan -> new CatalanAnalyzer(), + Sorani -> new SoraniAnalyzer(), Czech -> new CzechAnalyzer(), Danish -> new DanishAnalyzer(), German -> new GermanAnalyzer(), Greek -> new GreekAnalyzer(), - English -> new EnglishAnalyzer(), + English -> new EnglishAnalyzer(englishStopwords), Spanish -> new SpanishAnalyzer(), Basque -> new BasqueAnalyzer(), Persian -> new PersianAnalyzer(), @@ -145,7 +158,7 @@ object LuceneTextAnalyzer { Indonesian -> new IndonesianAnalyzer(), Italian -> new ItalianAnalyzer(), Japanese -> new JapaneseAnalyzer(), - Korean -> new CJKAnalyzer(), + Korean -> new CJKAnalyzer(englishStopwords), Lithuanian -> new LithuanianAnalyzer(), Latvian -> new LatvianAnalyzer(), Dutch -> new DutchAnalyzer(), @@ -156,8 +169,8 @@ object LuceneTextAnalyzer { Swedish -> new SwedishAnalyzer(), Thai -> new ThaiAnalyzer(), Turkish -> new TurkishAnalyzer(), - SimplifiedChinese -> new CJKAnalyzer(), - TraditionalChinese -> new CJKAnalyzer() + SimplifiedChinese -> new CJKAnalyzer(englishStopwords), + TraditionalChinese -> new CJKAnalyzer(englishStopwords) ) private val defaultAnalyzerHtmlStrip = stripHtml(DefaultAnalyzer) diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala similarity index 70% rename from core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala rename to core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala index 298002ed45..a1a8d31418 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala @@ -29,24 +29,22 @@ * POSSIBILITY OF SUCH DAMAGE. 
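[Editor's note — usage sketch, not part of this patch: the registry above now seeds the default, English, and CJK analyzers with the Snowball English stop set. The no-argument constructor of LuceneTextAnalyzer is assumed from context, not shown in this diff.]

    val analyzer = new LuceneTextAnalyzer()
    analyzer.analyze("The quick brown fox", Language.English)
    // English stopwords such as "the" are dropped by the stop set loaded above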
 */ -package org.apache.spark.ml.regression +package com.salesforce.op.utils.text -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} +import opennlp.tools.namefind.TokenNameFinderModel +import opennlp.tools.tokenize.TokenizerME -import scala.reflect.runtime.universe.TypeTag +/** + * OpenNLP text analyzer that tokenizes text, e.g. prior to applying an OpenNLP + * [[TokenNameFinderModel]] + */ +class OpenNLPAnalyzer extends TextAnalyzer { + def analyze(s: String, language: Language): Seq[String] = { + OpenNLPModels.getTokenizerModel(language) match { + case Some(tokenizerModel) => new TokenizerME(tokenizerModel).tokenize(s) + case _ => Seq(s) + } + } -class OpRandomForestRegressionModel -( - val treesIn: Array[DecisionTreeRegressionModel], - numFeatures: Int, - uid: String = UID[OpRandomForestRegressionModel], - val operationName: String = "opRFR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends RandomForestRegressionModel(uid = uid, _trees = treesIn, numFeatures = numFeatures) - with OpPredictionModelBase +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala new file mode 100644 index 0000000000..50ffccc7f8 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
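[Editor's note — usage sketch of the analyzer added above, not part of this patch:]

    val tokens = new OpenNLPAnalyzer().analyze("Salesforce was founded in 1999.", Language.English)
    // with en-token.bin on the classpath this yields word tokens, roughly
    // Seq("Salesforce", "was", "founded", "in", "1999", ".");
    // for a language with no tokenizer model the input comes back whole as Seq(s)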
+ */ + +package com.salesforce.op.utils.text + +import java.io.InputStream + +import com.salesforce.op.utils.text.Language._ +import com.salesforce.op.utils.text.NameEntityType._ +import opennlp.tools.namefind.TokenNameFinderModel +import opennlp.tools.sentdetect.SentenceModel +import opennlp.tools.tokenize.TokenizerModel + +/** + * A factory to get/create OpenNLP models + */ +object OpenNLPModels { + // Assumes that models are stored as a resource + private val modelsPath = "/OpenNLP" + + private lazy val tokenNameModels: Map[(Language, NameEntityType), TokenNameFinderModel] = Map( + (English, Date) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-date.bin"), + (English, Location) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-location.bin"), + (English, Money) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-money.bin"), + (English, Organization) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-organization.bin"), + (English, Percentage) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-percentage.bin"), + (English, Person) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-person.bin"), + (English, Time) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-time.bin"), + + (Spanish, Location) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-location.bin"), + (Spanish, Organization) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-organization.bin"), + (Spanish, Person) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-person.bin"), + (Spanish, Misc) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-misc.bin"), + + (Dutch, Location) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-location.bin"), + (Dutch, Organization) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-organization.bin"), + (Dutch, Person) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-person.bin"), + (Dutch, Misc) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-misc.bin") + ) + + private lazy val sentenceModels: Map[Language, SentenceModel] = Map( + Danish -> loadSentenceModel(s"$modelsPath/da-sent.bin"), + English -> loadSentenceModel(s"$modelsPath/en-sent.bin"), + German -> loadSentenceModel(s"$modelsPath/de-sent.bin"), + Dutch -> loadSentenceModel(s"$modelsPath/nl-sent.bin"), + Portuguese -> loadSentenceModel(s"$modelsPath/pt-sent.bin"), + Sami -> loadSentenceModel(s"$modelsPath/se-sent.bin") + ) + + private lazy val tokenizerModels: Map[Language, TokenizerModel] = Map( + Danish -> loadTokenizerModel(s"$modelsPath/da-token.bin"), + German -> loadTokenizerModel(s"$modelsPath/de-token.bin"), + English -> loadTokenizerModel(s"$modelsPath/en-token.bin"), + Dutch -> loadTokenizerModel(s"$modelsPath/nl-token.bin"), + Portuguese -> loadTokenizerModel(s"$modelsPath/pt-token.bin"), + Sami -> loadTokenizerModel(s"$modelsPath/se-token.bin") + ) + + /** + * Factory to get [[TokenNameFinderModel]] for a given language & entity type if it exists + * + * @return some [[TokenNameFinderModel]] instance or None + */ + def getTokenNameFinderModel(language: Language, entity: NameEntityType): Option[TokenNameFinderModel] = + tokenNameModels.get(language -> entity) + + /** + * Factory to get [[SentenceModel]] for a given language + * + * @return some [[SentenceModel]] instance or None + */ + def getSentenceModel(language: Language): Option[SentenceModel] = + sentenceModels.get(language) + + /** + * Factory to get [[TokenizerModel]] for a given language + * + * @return some [[TokenizerModel]] instance or None + */ + def getTokenizerModel(language: Language): Option[TokenizerModel] = + tokenizerModels.get(language) + + private def 
loadTokenNameFinderModel(resourcePath: String): TokenNameFinderModel = { + val modelStream = loadFromResource(resourcePath) + new TokenNameFinderModel(modelStream) + } + + private def loadSentenceModel(resourcePath: String): SentenceModel = { + val modelStream = loadFromResource(resourcePath) + new SentenceModel(modelStream) + } + + private def loadTokenizerModel(resourcePath: String): TokenizerModel = { + val modelStream = loadFromResource(resourcePath) + new TokenizerModel(modelStream) + } + + private def loadFromResource(resourcePath: String): InputStream = + try { + // getResourceAsStream returns null rather than throwing when the resource is missing, + // so check explicitly to surface the helpful error message below + val stream = getClass.getResourceAsStream(resourcePath) + if (stream == null) throw new NullPointerException(s"Resource '$resourcePath' not found") + stream + } catch { + case e: Exception => throw new RuntimeException( + s"Failed to load OpenNLP model from resource '$resourcePath'. " + + "Make sure to include OP 'models' dependency jar in your application classpath.", e + ) + } + +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala new file mode 100644 index 0000000000..11c2a51023 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
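A short sketch of the lookup behavior this factory provides; hedged: the (language, entity) availability is taken from the maps above, and the require calls are only for illustration.

import com.salesforce.op.utils.text.{Language, NameEntityType, OpenNLPModels}

// Registered combination: the model is resolved from /OpenNLP on the classpath
// (hence the new `testRuntime project(':models')` dependency in core/build.gradle)
val enPerson = OpenNLPModels.getTokenNameFinderModel(Language.English, NameEntityType.Person)
require(enPerson.isDefined)

// Unregistered combination: the factory returns None instead of throwing
val esTime = OpenNLPModels.getTokenNameFinderModel(Language.Spanish, NameEntityType.Time)
require(esTime.isEmpty)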
+ */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.utils.text.NameEntityType._ +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import opennlp.tools.namefind.NameFinderME +import opennlp.tools.util.Span + +/** + * OpenNLP implementation of [[NameEntityTagger]] + */ +class OpenNLPNameEntityTagger extends NameEntityTagger[OpenNLPTagResult] { + + /** + * Apply the name entity recognition model on the sentence tokens to retrieve information + * + * @param tokens sentence tokens + * @param language language + * @param entitiesToTag entities to tag if found + * @return map of entity and corresponding tokens + */ + def tag( + tokens: Seq[String], + language: Language, + entitiesToTag: Seq[NameEntityType] + ): OpenNLPTagResult = { + val tokensArr = tokens.toArray + val empty = Map.empty[String, Set[NameEntityType]] + val tags = entitiesToTag.foldLeft(empty) { (acc, entityToTag) => + OpenNLPModels.getTokenNameFinderModel(language, entityToTag) match { + case None => acc + case Some(model) => + val finder = new NameFinderME(model) + val spans = finder.find(tokensArr) + val res = convertSpansToMap(spans, tokensArr) + acc + res + } + } + OpenNLPTagResult(tags) + } + + /** + * Retrieve information from the model output + * + * @param spans open nlp name entity finder model output + * @param tokens sentence tokens + * @return map of token and its tag set + */ + private[op] def convertSpansToMap(spans: Seq[Span], tokens: Array[String]): Map[String, Set[NameEntityType]] = { + // span objects provide an exclusive end index + val pairSeq = for { + span <- spans + entity = Seq(nameEntityType(span.getType.toLowerCase)) + token <- tokens.slice(span.getStart, span.getEnd) + } yield token -> entity + + // aggregate results by token and convert the output to a map + pairSeq + .groupBy { case (token, _) => token } + .map { case (token, entities) => + token -> entities.flatMap(_._2).toSet + } + } + + private def nameEntityType: String => NameEntityType = { + case "date" => Date + case "location" => Location + case "money" => Money + case "organization" => Organization + case "percentage" => Percentage + case "person" => Person + case "time" => Time + case "misc" => Misc + case _ => Other + } +} + + +/** + * OpenNLP implementation of [[TaggerResult]] + * + * @param tokenTags token tags map, where keys are tokens and values are the entities matching each token + */ +case class OpenNLPTagResult(tokenTags: Map[String, Set[NameEntityType]]) extends TaggerResult diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala similarity index 70% rename from core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala rename to core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala index 1c79a2d0bf..d9d9bf68c7 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala @@ -29,24 +29,20 @@ * POSSIBILITY OF SUCH DAMAGE. 
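To make the span handling concrete, a worked example of convertSpansToMap (the token and span values are illustrative; the method is private[op], so this only compiles inside the com.salesforce.op package):

import com.salesforce.op.utils.text.OpenNLPNameEntityTagger
import opennlp.tools.util.Span

val tokens = Array("Salesforce", "hired", "John", "Smith")
// Span end indices are exclusive, so Span(2, 4, "person") covers tokens 2 and 3
val spans = Seq(new Span(0, 1, "organization"), new Span(2, 4, "person"))

new OpenNLPNameEntityTagger().convertSpansToMap(spans, tokens)
// Map("Salesforce" -> Set(Organization), "John" -> Set(Person), "Smith" -> Set(Person))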
*/ -package org.apache.spark.ml.regression +package com.salesforce.op.utils.text -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.linalg.Vector +import opennlp.tools.sentdetect.SentenceDetectorME -import scala.reflect.runtime.universe.TypeTag +/** + * Implementation of [[SentenceSplitter]] using the OpenNLP sentence detector + */ +class OpenNLPSentenceSplitter extends SentenceSplitter { + + def getSentences(input: String, language: Language): Seq[String] = { + OpenNLPModels.getSentenceModel(language) match { + case Some(sentenceModel) => new SentenceDetectorME(sentenceModel).sentDetect(input) + case None => Seq(input) // no model for this language: return the original input + } + } -class OpLinearPredictionModel -( - coefficients: Vector, - intercept: Double, - uid: String = UID[OpLinearPredictionModel], - val operationName: String = "opLP" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends LinearRegressionModel(uid = uid, coefficients = coefficients, intercept = intercept) - with OpPredictionModelBase +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala b/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala index 40ade0815a..f680dba202 100644 --- a/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala @@ -31,10 +31,10 @@ package com.salesforce.op.utils.text +import com.optimaize.langdetect.LanguageDetectorBuilder import com.optimaize.langdetect.i18n.LdLocale import com.optimaize.langdetect.ngram.NgramExtractors import com.optimaize.langdetect.profiles.LanguageProfileReader -import com.optimaize.langdetect.{LanguageDetectorBuilder, LanguageDetector => OLanguageDetector} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ diff --git a/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala b/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala deleted file mode 100644 index fa76f238d3..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
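A matching usage sketch for the sentence splitter (hedged: Spanish having no entry in the sentenceModels map above is what triggers the pass-through branch; the strings are made up):

import com.salesforce.op.utils.text.{Language, OpenNLPSentenceSplitter}

val splitter = new OpenNLPSentenceSplitter()

splitter.getSentences("This is a sentence. Here is another one.", Language.English)
// Seq("This is a sentence.", "Here is another one.")

splitter.getSentences("Hola. Como estas.", Language.Spanish)
// Seq("Hola. Como estas.") (no Spanish sentence model; input returned whole)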
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package org.apache.spark.ml - -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.classification._ -import org.apache.spark.ml.regression._ - -/** - * Allows conversion from spark models to models that follow the OP convention of having a - * transformFn that can be called on a single row rather than the whole dataframe - */ -object SparkModelConverter { - - def toOP[T <: Transformer]( - model: Option[T], - isMultinomial: Boolean = false - ): OpTransformer2[RealNN, OPVector, Prediction] = { - model match { - case None => throw new RuntimeException("no model found") - case Some(m: LogisticRegressionModel) => - new OpLogisticRegressionModel(m.coefficientMatrix, m.interceptVector, m.numClasses, isMultinomial) - case Some(m: RandomForestClassificationModel) => - new OpRandomForestClassificationModel(m.trees, m.numFeatures, m.numClasses) - case Some(m: NaiveBayesModel) => - new OpNaiveBayesModel(m.pi, m.theta, m.oldLabels, if (isMultinomial) "multinomial" else "bernoulli") - case Some(m: DecisionTreeClassificationModel) => - new OpDecisionTreeClassificationModel(m.rootNode, m.numFeatures, m.numClasses) - case Some(m: LinearRegressionModel) => - new OpLinearPredictionModel(m.coefficients, m.intercept) - case Some(m: RandomForestRegressionModel) => - new OpRandomForestRegressionModel(m.trees, m.numFeatures) - case Some(m: GBTRegressionModel) => - new OpGBTRegressionModel(m.trees, m.treeWeights, m.numFeatures) - case Some(m: DecisionTreeRegressionModel) => - new OpDecisionTreeRegressionModel(m.rootNode, m.numFeatures) - case m => throw new RuntimeException(s"model conversion not implemented for model $m") - } - } -} diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala b/core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala similarity index 71% rename from core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala rename to core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala index e363b07868..2434571eea 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala +++ b/core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala @@ -31,25 +31,19 @@ package org.apache.spark.ml.classification -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.tree.{DecisionTreeClassifierParams, GBTClassifierParams, RandomForestClassifierParams} -trait OpClassifierModelBase extends OpTransformer2[RealNN, OPVector, Prediction] { +trait OpDecisionTreeClassifierParams extends DecisionTreeClassifierParams - self: ProbabilisticClassificationModel[Vector, _] => +trait OpGBTClassifierParams extends GBTClassifierParams +trait OpLinearSVCParams extends 
LinearSVCParams - /** - * Function used to convert input to output - */ - override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val raw = predictRaw(features.value) - val prob = raw2probability(raw) - val pred = probability2prediction(prob) +trait OpLogisticRegressionParams extends LogisticRegressionParams - Prediction(rawPrediction = raw, probability = prob, prediction = pred) - } +trait OpMultilayerPerceptronClassifierParams extends MultilayerPerceptronParams -} +trait OpNaiveBayesParams extends NaiveBayesParams + +trait OpRandomForestClassifierParams extends RandomForestClassifierParams with ProbabilisticClassifierParams diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala b/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala deleted file mode 100644 index e2f22dd2a4..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package org.apache.spark.ml.classification - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} - -import scala.reflect.runtime.universe.TypeTag - -class OpRandomForestClassificationModel -( - val treesIn: Array[DecisionTreeClassificationModel], - numFeatures: Int, - numClasses: Int, - uid: String = UID[OpRandomForestClassificationModel], - val operationName: String = "opRF" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends RandomForestClassificationModel(uid = uid, _trees = treesIn, numFeatures = numFeatures, - numClasses = numClasses) with OpClassifierModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala b/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala deleted file mode 100644 index 61f60ebcf0..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package org.apache.spark.ml.regression - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.tree.Node - -import scala.reflect.runtime.universe.TypeTag - -class OpDecisionTreeRegressionModel -( - rootNode: Node, - numFeatures: Int, - uid: String = UID[OpDecisionTreeRegressionModel], - val operationName: String = "opDTR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends DecisionTreeRegressionModel(uid = uid, rootNode = rootNode, numFeatures = numFeatures) - with OpPredictionModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala b/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala deleted file mode 100644 index 2a9ae09d51..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package org.apache.spark.ml.regression - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} - -import scala.reflect.runtime.universe.TypeTag - -class OpGBTRegressionModel -( - val treesIn: Array[DecisionTreeRegressionModel], - val treeWeightsIn: Array[Double], - numFeatures: Int, - uid: String = UID[OpGBTRegressionModel], - val operationName: String = "opGBTR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends GBTRegressionModel(uid = uid, _trees = treesIn, _treeWeights = treeWeightsIn, numFeatures = numFeatures) - with OpPredictionModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala b/core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala similarity index 75% rename from core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala rename to core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala index 5ec106d757..6f08a7fdf9 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala +++ b/core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala @@ -31,18 +31,15 @@ package org.apache.spark.ml.regression -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.PredictionModel -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.tree.{DecisionTreeRegressorParams, GBTRegressorParams, RandomForestRegressorParams} -trait OpPredictionModelBase extends OpTransformer2[RealNN, OPVector, Prediction] { - self: PredictionModel[Vector, _] => +trait OpDecisionTreeRegressorParams extends DecisionTreeRegressorParams - /** - * Function used to convert input to output - */ - override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => - Prediction(prediction = predict(features.value)) +trait OpLinearRegressionParams extends LinearRegressionParams + +trait OpGBTRegressorParams extends GBTRegressorParams + +trait OpGeneralizedLinearRegressionParams extends GeneralizedLinearRegressionBase + +trait OpRandomForestRegressorParams extends RandomForestRegressorParams -} diff --git a/core/src/test/avro/PassengerDataAll.avsc b/core/src/test/avro/PassengerDataAll.avsc new file mode 100644 index 0000000000..031dd116e6 --- /dev/null +++ b/core/src/test/avro/PassengerDataAll.avsc @@ -0,0 +1,43 @@ +{ + "type" : "record", + "name" : "PassengerDataAll", + "namespace" : "com.salesforce.app.schema", + "fields" : [ { + "name" : "PassengerId", + "type" : [ "int", "null" ] + }, { + "name" : "Survived", + "type" : "int", + "default": 0 + }, { + "name" : "Pclass", + "type" : [ "int", "null" ] + }, { + "name" : "Name", + "type" : [ "string", "null" ] + }, { + "name" : "Sex", + "type" : [ "string", "null" ] + }, { + "name" : "Age", + "type" : [ "double", "null" ] + }, { + "name" : "SibSp", + "type" : [ "int", "null" ] + }, { + "name" : "Parch", + "type" : [ "int", "null" ] + }, { + "name" : "Ticket", + "type" : [ "string", "null" ] + }, { + "name" : "Fare", + "type" : [ "double", "null" ] + }, { + "name" : "Cabin", + "type" : [ "string", "null" ] + }, { + "name" : "Embarked", + "type" : [ "string", "null" ] + } ] +} diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index f178bf8434..a2162e650f 100644 --- 
a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -19,3 +19,9 @@ log4j.logger.com.databricks.spark.avro=WARN # Optimus Prime logging log4j.logger.com.salesforce.op=ERROR log4j.logger.com.salesforce.op.utils.spark.OpSparkListener=OFF + +# Breeze +log4j.logger.breeze.optimize=ERROR + +# BLAS & LAPACK +log4j.logger.com.github.fommil.netlib=ERROR diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala new file mode 100644 index 0000000000..6f7ee25684 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
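The new OpWorkflowCVTest that follows verifies that hoisting cross-validation to the workflow level (withWorkflowCV) selects the same winning model as running validation inside the model selector alone. A distilled sketch of the two wirings being compared; label and featureVector are placeholder features, and BinaryClassificationModelSelector() is the same default constructor the sibling tests use:

import com.salesforce.op.OpWorkflow
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelSelector

// `label` and `featureVector` are assumed, pre-built features
val (pred, raw, prob) = BinaryClassificationModelSelector()
  .setInput(label, featureVector)
  .getOutput()

// Workflow-level CV: leakage-prone stages (e.g. SanityChecker) are re-fit per fold
val wfCV = new OpWorkflow().withWorkflowCV.setResultFeatures(pred, prob)
wfCV.isWorkflowCV // true

// Default: validation happens entirely inside the model selector stage
val wfPlain = new OpWorkflow().setResultFeatures(pred, prob)
wfPlain.isWorkflowCV // false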
+ */ + +package com.salesforce.op + +import com.salesforce.app.schema.PassengerDataAll +import com.salesforce.op.evaluators._ +import com.salesforce.op.features._ +import com.salesforce.op.features.types._ +import com.salesforce.op.readers._ +import com.salesforce.op.stages.base.binary.BinaryTransformer +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry._ +import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.preparators.SanityChecker +import com.salesforce.op.stages.impl.regression.{LossType, RegressionModelSelector, RegressionModelsToTry} +import com.salesforce.op.stages.impl.selector.{ModelSelectorBase, ModelSelectorBaseNames} +import com.salesforce.op.stages.impl.tuning._ +import com.salesforce.op.test.PassengerSparkFixtureTest +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.DataFrame +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner +import org.slf4j.LoggerFactory + +@RunWith(classOf[JUnitRunner]) +class OpWorkflowCVTest extends FlatSpec with PassengerSparkFixtureTest { + + val log = LoggerFactory.getLogger(this.getClass) + + trait PassenserCSVforCV { + val simplePassengerForCV = DataReaders.Simple.csv[PassengerDataAll]( + path = Some(s"$testDataPath/PassengerDataAll.csv"), + schema = PassengerDataAll.getClassSchema.toString, + key = _.getPassengerId.toString + ) + val age = FeatureBuilder.Real[PassengerDataAll].extract(_.getAge.toReal).asPredictor + val sex = FeatureBuilder.PickList[PassengerDataAll].extract(_.getSex.toPickList).asPredictor + val fair = FeatureBuilder.Real[PassengerDataAll].extract(p => Option(p.getFare).map(_.toDouble).toReal).asPredictor + val pClass = FeatureBuilder.PickList[PassengerDataAll].extract(_.getPclass.toString.toPickList).asPredictor + val cabin = FeatureBuilder.PickList[PassengerDataAll].extract(_.getCabin.toPickList).asPredictor + val survived = FeatureBuilder.Binary[PassengerDataAll].extract(p => p.getSurvived.intValue.toBinary).asResponse + val survivedPred = FeatureBuilder.Binary[PassengerDataAll].extract(p => p.getSurvived.intValue.toBinary).asPredictor + val survivedNum = survived.occurs() + } + + + Spec[OpWorkflow] should + "return a binary classification model that runs cv at the workflow level" in new PassenserCSVforCV { + val fv = Seq(age, sex, fair, pClass, cabin).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val (pred1, _, prob1) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression, RandomForest) + .setLogisticRegressionRegParam(10000) + .setLogisticRegressionElasticNetParam(0.01, 0.5) + .setRandomForestMaxBins(10) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1, prob1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val (pred2, _, prob2) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, 
reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression, RandomForest) + .setLogisticRegressionRegParam(10000) + .setLogisticRegressionElasticNetParam(0.01, 0.5) + .setRandomForestMaxBins(10) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2, prob2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "return a multi classification model that runs ts at the workflow level" in new PassenserCSVforCV { + val fv = Seq(age, sex, fair, pClass, cabin).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val (pred1, _, prob1) = new MultiClassificationModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.MultiClassification.error()), + splitter = Option(DataCutter(reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpMultiClassificationEvaluator()) + ).setModelsToTry(LogisticRegression, DecisionTree) + .setLogisticRegressionMaxIter(10) + .setLogisticRegressionRegParam(0.1) + .setDecisionTreeMaxDepth(5, 10) + .setDecisionTreeMinInfoGain(100000) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1, prob1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + + val (pred2, _, prob2) = new MultiClassificationModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.MultiClassification.error()), + splitter = Option(DataCutter(reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpMultiClassificationEvaluator()) + ).setModelsToTry(LogisticRegression, DecisionTree) + .setLogisticRegressionMaxIter(10) + .setLogisticRegressionRegParam(0.1) + .setDecisionTreeMaxDepth(5, 10) + .setDecisionTreeMinInfoGain(100000) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2, prob2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + + } + + it should "return a regression model that runs cv at the workflow level" in new PassenserCSVforCV { + val fv = Seq(sex, fair, pClass, cabin, age).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val pred1 = new RegressionModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.Regression.r2()), + dataSplitter = None, + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.RandomForestRegression) + .setLinearRegressionElasticNetParam(0.01) + .setRandomForestMinInfoGain(10000) + 
.setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val pred2 = new RegressionModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.Regression.r2()), + dataSplitter = None, + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.RandomForestRegression) + .setLinearRegressionElasticNetParam(0.01) + .setRandomForestMinInfoGain(10000) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "return a regression model that runs ts at the workflow level" in new PassenserCSVforCV { + val fv = Seq(sex, fair, pClass, cabin, age).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val pred1 = new RegressionModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.Regression.r2()), + dataSplitter = Option(DataSplitter(seed = 0L)), + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.GBTRegression) + .setLinearRegressionRegParam(100000) + .setGradientBoostedTreeLossType(LossType.Absolute) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val pred2 = new RegressionModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.Regression.r2()), + dataSplitter = Option(DataSplitter(seed = 0L)), + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.GBTRegression) + .setLinearRegressionRegParam(100000) + .setGradientBoostedTreeLossType(LossType.Absolute) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "avoid adding label leakage when feature engineering would introduce it" in new PassenserCSVforCV { + + val fairLeaker = fair.autoBucketize(survivedNum, trackNulls = false) + val ageLeaker = age.autoBucketize(survivedNum, trackNulls = false) + val fv = Seq(age, sex, ageLeaker, fairLeaker, pClass, cabin) + .transmogrify() + + val (pred1, _, _) = new BinaryClassificationModelSelector( + validator = new 
OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0.0, 0.001, 0.1) + .setInput(survivedNum, fv) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = true) + + val (pred2, _, _) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0.0, 0.001, 0.1) + .setInput(survivedNum, fv) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = true) + + // CV + model1.summary().contains(""""area under PR" : "0.802""") shouldBe true + model1.summary().contains(""""area under PR" : "0.81""") shouldBe false + model2.summary().contains(""""area under PR" : "0.81""") shouldBe true + } + + def compare(data1: DataFrame, data2: DataFrame, f1: FeatureLike[_], f2: FeatureLike[_]): Unit = { + + val winner1 = f1.originStage.asInstanceOf[ModelSelectorBase[_, _]].bestEstimator.get + val winner2 = f2.originStage.asInstanceOf[ModelSelectorBase[_, _]].bestEstimator.get + winner1.estimator.getClass shouldEqual winner2.estimator.getClass + winner1.estimator.asInstanceOf[PipelineStage].extractParamMap.toSeq.sortBy(_.param.name).map(_.value) should + contain theSameElementsAs + winner2.estimator.asInstanceOf[PipelineStage].extractParamMap.toSeq.sortBy(_.param.name).map(_.value) + + val d1s = data1.collect().sortBy(_.getAs[String]("key")) + val d2s = data2.collect().sortBy(_.getAs[String]("key")) + d1s.zip(d2s).foreach{ + case (r1, r2) => + math.abs(r1.getDouble(0) - r2.getDouble(0)) < 0.5 shouldBe true + if (r1.size > 2) math.abs(r1.getAs[Vector](1)(0) - r2.getAs[Vector](1)(0) ) < 0.5 shouldBe true + } + } + +} + +class Leaker(uid: String = UID[BinaryTransformer[_, _, _]]) extends + BinaryTransformer[Real, RealNN, RealNN](operationName = "makeLeaker", uid = uid) { + override def transformFn: (Real, RealNN) => RealNN = + (f: Real, l: RealNN) => if (l.v.exists(_ > 0)) 1.0.toRealNN else 0.0.toRealNN + override def outputIsResponse: Boolean = false +} diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala index 48b5eb843f..7e3107bf9a 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala @@ -32,7 +32,7 @@ package com.salesforce.op -import com.salesforce.op.DAG._ +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} @@ -43,8 +43,8 @@ import com.salesforce.op.test.{TestFeatureBuilder, 
TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomReal, RandomVector} import org.apache.spark.ml.{Estimator, Model} import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) @@ -74,13 +74,15 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { // Workflow val wf = new OpWorkflow() - Spec[OpWorkflowCore] should "handle empty DAG" in { assert( res = cutDAG(wf), - modelSelector = None, - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = None, + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -90,9 +92,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -104,9 +109,31 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2))), - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2))), + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) + ) + } + + it should "cut simple DAG with nonCVTS and cVTS stage and stages after CV" in { + val ldaFeatures = lda.setInput(features).getOutput() + val checkedFeatures = sanityChecker.setInput(label, ldaFeatures).getOutput() + val ms = BinaryClassificationModelSelector() + val (pred, _, _) = ms.setInput(label, checkedFeatures).getOutput() + val zNormalize = new OpScalarStandardScaler() + val realPred = zNormalize.setInput(pred).getOutput() + + assert( + res = cutDAG(wf.setResultFeatures(realPred)), + expected = CutDAG( + modelSelector = Option((ms.stage1, 1)), + before = Array(Array((lda, 3))), + during = Array(Array((sanityChecker, 2))), + after = Array(Array((zNormalize, 0))) + ) ) } @@ -117,9 +144,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array.empty[Layer], + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) ) } @@ -130,9 +160,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 1))), - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 1))), + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -142,9 +175,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(checkedFeatures)), - modelSelector = None, - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = None, + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -176,13 +212,16 @@ class OpWorkflowCoreTest 
extends FlatSpec with TestSparkContext { val ldaFeatures = lda.setInput(features).getOutput() val checkedFeatures = sanityChecker.setInput(label2, ldaFeatures).getOutput() val (pred, _, _) = ms.setInput(label, features).getOutput() - val (predLogReg, _, _) = logReg.setInput(label2, checkedFeatures).getOutput() + val predLogReg = logReg.setInput(label2, checkedFeatures).getOutput() assert( res = cutDAG(wf.setResultFeatures(pred, predLogReg)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2)), Array((sanityChecker, 1)), Array((logReg.stage1, 0))), - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2)), Array((sanityChecker, 1)), Array((logReg, 0))), + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -196,34 +235,27 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2), (zNormalize, 2))), - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2), (zNormalize, 2))), + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) ) } - /** - * Shortcut function to cut DAG - * - * @param wf Workflow - * @return Cut DAG - */ - private def cutDAG(wf: OpWorkflow): (Option[MS], StagesDAG, StagesDAG) = { - wf.cutDAG(DAG.compute(wf.getResultFeatures())) - } - /** * Compare Actual and expected cut DAGs * - * @param res Actual results - * @param modelSelector Expected Model Selector - * @param nonCVTSDAG Expected nonCVTS DAG - * @param cVTSDAG Expected cVTS DAG + * @param res actual cut + * @param expected expected cut */ - private def assert(res: (Option[MS], StagesDAG, StagesDAG), - modelSelector: Option[MS], nonCVTSDAG: StagesDAG, cVTSDAG: StagesDAG): Unit = { - res._1 shouldBe modelSelector - res._2 shouldBe nonCVTSDAG - res._3 shouldBe cVTSDAG + private def assert(res: CutDAG, expected: CutDAG): Unit = { + res.modelSelector shouldBe expected.modelSelector + res.before should contain theSameElementsInOrderAs expected.before + res.during should contain theSameElementsInOrderAs expected.during + res.after should contain theSameElementsInOrderAs expected.after } } + + diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala index c3f60759b8..132be1ee0a 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala @@ -51,7 +51,8 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtureTest with BeforeAndAfterEach { +class OpWorkflowModelReaderWriterTest + extends FlatSpec with UIDReset with PassengerSparkFixtureTest with BeforeAndAfterEach { implicit val jsonFormats: Formats = DefaultFormats val log = LoggerFactory.getLogger(this.getClass) @@ -71,7 +72,6 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur saveModelPath = tempDir + "/op-rw-wf-model-test-" + DateTime.now().getMillis } - override def afterAll: Unit = { super.afterAll deleteRecursively(new File(saveFlowPathStable)) @@ -237,8 +237,7 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur compareWorkflowModels(model, wfMR) } - trait 
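For reference, the shape cutDAG now returns, restated schematically from the "stages after CV" case above (stage names and layer distances are taken directly from that test; this only describes the new CutDAG fields, not new behavior):

import com.salesforce.op.utils.stages.FitStagesUtil.CutDAG

// For the result DAG  lda -> sanityChecker -> modelSelector -> zNormalize,
// where the Int in each pair is the stage's distance from the result features:
CutDAG(
  modelSelector = Option((ms.stage1, 1)),    // the selector and its layer index
  before = Array(Array((lda, 3))),           // safe to fit once, outside validation
  during = Array(Array((sanityChecker, 2))), // must be re-fit inside each fold
  after = Array(Array((zNormalize, 0)))      // fit only after validation completes
)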
VectorizedFlow { - UID.reset() + trait VectorizedFlow extends UIDReset { val cat = Seq(gender, boarded, height, age, description).transmogrify() val catHead = cat.map[Real](v => Real(v.value.toArray.headOption)) val wf = new OpWorkflow() @@ -324,3 +323,7 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur p1.customParams shouldBe p2.customParams } } + +trait UIDReset { + UID.reset() +} diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala index f735b2ec80..2397316236 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala @@ -37,7 +37,8 @@ import com.salesforce.op.OpWorkflowRunType._ import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} import com.salesforce.op.features.types._ import com.salesforce.op.readers.DataFrameFieldNames._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestSparkStreamingContext} import com.salesforce.op.utils.spark.AppMetrics import com.salesforce.op.utils.spark.RichDataset._ @@ -68,10 +69,15 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec private val features = Seq(height, weight, gender, description, age).transmogrify() private val survivedNum = survived.occurs() - val (pred, raw, prob) = new OpLogisticRegression().setInput(survivedNum, features).getOutput() + // TODO put back LR when evaluators work with prediction features + val (pred, raw, prob) = BinaryClassificationModelSelector.withTrainValidationSplit(None) + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(survivedNum, features).getOutput() private val workflow = new OpWorkflow().setResultFeatures(pred, raw, survivedNum).setReader(dataReader) private val evaluator = Evaluators.BinaryClassification().setLabelCol(survivedNum).setPredictionCol(pred).setRawPredictionCol(raw) + .setProbabilityCol(prob) val metricsPromise = Promise[AppMetrics]() @@ -138,7 +144,7 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec metricsLocation = Some(modelMetricsLocation.toString) ) val res = doRun[TrainResult](runConfig, modelLocation, modelMetricsLocation) - res.modelSummary shouldBe "{ }" + res.modelSummary.nonEmpty shouldBe true } it should "score a dataset with a trained model" in { diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index f889f4c35c..9c5708ea15 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -31,26 +31,27 @@ package com.salesforce.op -import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} -import com.salesforce.op.features.OPFeature +import com.salesforce.op.evaluators._ +import com.salesforce.op.features._ import com.salesforce.op.features.types._ import com.salesforce.op.filters.RawFeatureFilter import com.salesforce.op.readers.DataFrameFieldNames._ import com.salesforce.op.readers._ import com.salesforce.op.stages.base.unary._ import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry._ -import 
com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, Stage1BinaryClassificationModelSelector} +import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.preparators.SanityChecker -import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.DataBalancer -import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest, TestFeatureBuilder} +import com.salesforce.op.stages.impl.regression.{LossType, RegressionModelSelector, RegressionModelsToTry} +import com.salesforce.op.stages.impl.selector.{ModelSelectorBaseNames, SelectedModel} +import com.salesforce.op.stages.impl.tuning._ +import com.salesforce.op.test.{Passenger, PassengerCSV, PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DoubleType, StringType} import org.apache.spark.sql.{Dataset, SparkSession} -import org.joda.time.DateTime +import org.joda.time.{DateTime, Duration} import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -158,7 +159,11 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { .setInput(survivedNum, checked).getOutput() val wf = new OpWorkflow() .setResultFeatures(whyNotNormed, prob) - .withRawFeatureFilter(Option(dataReader), None, minFillRate = 0.7) + .withRawFeatureFilter( + trainingReader = Option(dataReader), + scoringReader = None, + minFillRate = 0.7, + protectedFeatures = Array(height, weight)) val wfM = wf.train() val data = wfM.score() @@ -183,7 +188,8 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { it should "be able to compute a partial dataset in both workflow and workflow model" in { val fields = - List(KeyFieldName, height.name, weight.name, heightNormed.name, density.name, densityByHeightNormed.name) + List(KeyFieldName, height.name, weight.name, heightNormed.name, density.name, + densityByHeightNormed.name, whyNotNormed.name) val data = workflow.setReader(dataReader).computeDataUpTo(whyNotNormed) data.schema.fieldNames should contain theSameElementsAs fields @@ -439,6 +445,7 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { .setRawPredictionCol(rawPred) .setLabelCol(survivedNum) .setPredictionCol(pred) + .setProbabilityCol(prob) val scores1 = fittedWorkflow.score(keepIntermediateFeatures = true) val (scores2, metrics) = fittedWorkflow.scoreAndEvaluate(evaluator = evaluator, keepIntermediateFeatures = true) @@ -452,7 +459,12 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { scores1.schema.fields.map(_.metadata.toString()) should contain theSameElementsAs scores2.schema.fields.map(_.metadata.toString()) - metrics shouldBe BinaryClassificationMetrics(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 5.0, 0.0, 0.0) + val probs = scores2.collect(prob) + val thresholds = probs.map(_.value(1)).distinct.sorted.reverse + + metrics shouldBe BinaryClassificationMetrics(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 5.0, 0.0, 0.0, + thresholds.toSeq, Seq(1.0, 0.5, 0.25, 0.2, 1.0/6), Seq(1.0, 1.0, 1.0, 1.0, 1.0), + Seq(0.0, 0.2, 0.6, 0.8, 1.0)) } it should "return an empty data set if passed empty data for scoring" in { diff --git a/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala 
b/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala index 9689ff3c9a..909f254fa5 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala @@ -32,7 +32,8 @@ package com.salesforce.op.evaluators import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors @@ -80,7 +81,11 @@ class EvaluatorsTest extends FlatSpec with TestSparkContext { ) val test_label = test_rawLabel.copy(isResponse = true) - val testEstimator = new OpLogisticRegression().setInput(label, features) + // TODO put back LR when evaluators work with prediction features + val testEstimator = BinaryClassificationModelSelector() + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(label, features) val (pred, rawPred, prob) = testEstimator.getOutput() val model = testEstimator.fit(ds) val transformedData = model.setInput(test_label, test_features).transform(test_ds) diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala index 889041569a..a89009b8dd 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala @@ -32,9 +32,9 @@ package com.salesforce.op.evaluators import com.salesforce.op.evaluators.BinaryClassEvalMetrics._ -import com.salesforce.op.evaluators.MultiClassEvalMetrics._ import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.evaluation._ import org.apache.spark.ml.linalg.Vectors @@ -102,11 +102,16 @@ class OpBinaryClassificationEvaluatorTest extends FlatSpec with TestSparkContext ) val one_label = one_rawLabel.copy(isResponse = true) - val testEstimator = new OpLogisticRegression().setInput(label, features) - val (pred, rawPred, _) = testEstimator.getOutput() + // TODO put back LR when evaluators work with prediction features + val testEstimator = BinaryClassificationModelSelector() + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(label, features) + val (pred, rawPred, prob) = testEstimator.getOutput() val testEvaluator = new OpBinaryClassificationEvaluator().setLabelCol(label) .setPredictionCol(pred) .setRawPredictionCol(rawPred) + .setProbabilityCol(prob) val model = testEstimator.fit(ds) val sparkBinaryEvaluator = new BinaryClassificationEvaluator() val sparkMulticlassEvaluator = new MulticlassClassificationEvaluator() @@ -122,8 +127,9 @@ class OpBinaryClassificationEvaluatorTest extends FlatSpec with 
TestSparkContext sparkBinaryEvaluator.setLabelCol(label.name).setRawPredictionCol(rawPred.name) sparkMulticlassEvaluator.setLabelCol(label.name).setPredictionCol(pred.name) - metrics.AuROC shouldBe sparkBinaryEvaluator.setMetricName(AuROC.sparkEntryName).evaluate(transformedData) - metrics.AuPR shouldBe sparkBinaryEvaluator.setMetricName(AuPR.sparkEntryName).evaluate(transformedData) + // TODO: These are no longer the same since we now use probabilities as thresholds, and Spark uses rawPredictions + // metrics.AuROC shouldBe sparkBinaryEvaluator.setMetricName(AuROC.sparkEntryName).evaluate(transformedData) + // metrics.AuPR shouldBe sparkBinaryEvaluator.setMetricName(AuPR.sparkEntryName).evaluate(transformedData) val (tp, tn, fp, fn, precision, recall, f1) = getPosNegValues( transformedData.select(pred.name, test_label.name).rdd diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala new file mode 100644 index 0000000000..beea884623 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op.evaluators
+
+import com.salesforce.op.features.types._
+import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
+import org.apache.spark.ml.linalg.Vectors
+import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
+import org.scalatest.junit.JUnitRunner
+
+
+@RunWith(classOf[JUnitRunner])
+class OpMultiClassificationEvaluatorTest extends FlatSpec with TestSparkContext {
+
+  // loggingLevel(Level.INFO)
+
+  val numRows = 1000L
+  val (dsMulti, labelRawMulti, predMulti, rawPredMulti, probMulti) =
+    TestFeatureBuilder[RealNN, RealNN, OPVector, OPVector](Seq.fill(numRows.toInt)(
+      (1.0, 0.0, Vectors.dense(10.0, 5.0, 1.0, 0.0, 0.0), Vectors.dense(0.70, 0.25, 0.05, 0.0, 0.0))
+    ).map(v => (v._1.toRealNN, v._2.toRealNN, v._3.toOPVector, v._4.toOPVector))
+    )
+  val labelMulti = labelRawMulti.copy(isResponse = true)
+  val defaultThresholds = (0 to 100).map(_ / 100.0).toArray
+  val defaultTopNs = Array(1, 3)
+
+  Spec[OpMultiClassificationEvaluator] should "determine incorrect/correct counts from the thresholds" in {
+    val evaluatorMulti = new OpMultiClassificationEvaluator()
+      .setLabelCol(labelMulti)
+      .setPredictionCol(predMulti)
+      .setRawPredictionCol(rawPredMulti)
+      .setProbabilityCol(probMulti)
+
+    val metricsMulti = evaluatorMulti.evaluateAll(dsMulti)
+
+    // Predictions should never be correct for top1 (since correct class has 2nd highest probability).
+    // For top3, it should be correct up to a threshold of 0.25
+    val expectedCorrects = Map(
+      1 -> Seq.fill(defaultThresholds.length)(0L),
+      3 -> (Seq.fill(26)(numRows) ++ Seq.fill(defaultThresholds.length - 26)(0L))
+    )
+    // For top1, prediction is incorrect up to a threshold of 0.7, and then no prediction
+    // For top3, prediction is incorrect in the threshold range (0.25, 0.7], then no prediction
+    val expectedIncorrects = Map(
+      1 -> (Seq.fill(71)(numRows) ++ Seq.fill(defaultThresholds.length - 71)(0L)),
+      3 -> (Seq.fill(26)(0L) ++ Seq.fill(71 - 26)(numRows) ++ Seq.fill(defaultThresholds.length - 71)(0L))
+    )
+    val expectedNoPredictions = Map(
+      1 -> (Seq.fill(71)(0L) ++ Seq.fill(defaultThresholds.length - 71)(numRows)),
+      3 -> (Seq.fill(26)(0L) ++ Seq.fill(71 - 26)(0L) ++ Seq.fill(defaultThresholds.length - 71)(numRows))
+    )
+
+    metricsMulti.ThresholdMetrics shouldEqual ThresholdMetrics(
+      topNs = defaultTopNs,
+      thresholds = defaultThresholds,
+      correctCounts = expectedCorrects,
+      incorrectCounts = expectedIncorrects,
+      noPredictionCounts = expectedNoPredictions
+    )
+  }
+
+  it should "have settable thresholds and topNs" in {
+    val thresholds = Array(0.1, 0.2, 0.5, 0.8, 0.9, 1.0)
+    val topNs = Array(1, 4, 12)
+
+    val evaluatorMulti = new OpMultiClassificationEvaluator()
+      .setLabelCol(labelMulti)
+      .setPredictionCol(predMulti)
+      .setRawPredictionCol(rawPredMulti)
+      .setProbabilityCol(probMulti)
+      .setThresholds(thresholds)
+      .setTopNs(topNs)
+
+    val metricsMulti = evaluatorMulti.evaluateAll(dsMulti)
+
+    // Predictions should never be correct for top1 (since correct class has 2nd highest probability).
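+    // (The fixture probabilities are (0.70, 0.25, 0.05, 0.0, 0.0) while the label is class 1, so the
+    // correct class always holds the second-highest probability, 0.25.)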
+    // For top4 & top12, it should be correct up to a threshold of 0.25
+    val expectedCorrects = Map(
+      1 -> Seq(0L, 0L, 0L, 0L, 0L, 0L),
+      4 -> Seq(numRows, numRows, 0L, 0L, 0L, 0L),
+      12 -> Seq(numRows, numRows, 0L, 0L, 0L, 0L)
+    )
+    // For top1, prediction is incorrect up to a threshold of 0.7, and then no prediction
+    // For top4 & top12, prediction is incorrect in the threshold range (0.25, 0.7], then no prediction
+    val expectedIncorrects = Map(
+      1 -> Seq(numRows, numRows, numRows, 0L, 0L, 0L),
+      4 -> Seq(0L, 0L, numRows, 0L, 0L, 0L),
+      12 -> Seq(0L, 0L, numRows, 0L, 0L, 0L)
+    )
+    val expectedNoPredictions = Map(
+      1 -> Seq(0L, 0L, 0L, numRows, numRows, numRows),
+      4 -> Seq(0L, 0L, 0L, numRows, numRows, numRows),
+      12 -> Seq(0L, 0L, 0L, numRows, numRows, numRows)
+    )
+
+    metricsMulti.ThresholdMetrics shouldEqual ThresholdMetrics(
+      topNs = topNs,
+      thresholds = thresholds,
+      correctCounts = expectedCorrects,
+      incorrectCounts = expectedIncorrects,
+      noPredictionCounts = expectedNoPredictions
+    )
+  }
+
+  it should "not allow topNs to be negative or 0" in {
+    intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setTopNs(Array(0, 1, 3)))
+    intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setTopNs(Array(1, -4, 3)))
+  }
+
+  it should "not allow thresholds to be out of the range [0.0, 1.0]" in {
+    intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setThresholds(Array(-0.1, 0.4)))
+    intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setThresholds(Array(1.1, 0.4)))
+  }
+
+}
diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala
index 16cc8978b2..c75498f56f 100644
--- a/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala
+++ b/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala
@@ -32,7 +32,8 @@ package com.salesforce.op.evaluators
 
 import com.salesforce.op.features.types._
-import com.salesforce.op.stages.impl.regression.OpLinearRegression
+import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.LinearRegression
+import com.salesforce.op.stages.impl.regression.{OpLinearRegression, RegressionModelSelector}
 import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param.ParamMap
@@ -54,7 +55,11 @@ class OpRegressionEvaluatorTest extends FlatSpec with TestSparkContext {
   )
   val label = rawLabel.copy(isResponse = true)
 
-  val testEstimator = new OpLinearRegression().setInput(label, features)
+  // TODO put back LR when evaluators work with prediction features
+  val testEstimator = RegressionModelSelector.withTrainValidationSplit(dataSplitter = None, trainRatio = 0.5)
+    .setModelsToTry(LinearRegression)
+    .setLinearRegressionRegParam(0)
+    .setInput(label, features)
   val prediction = testEstimator.getOutput()
 
   val testEvaluator = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction)
diff --git a/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala b/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala
index de7fc287cb..f58735d317 100644
--- a/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala
+++ b/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala
@@ -76,6 +76,20 @@ class TransientFeatureTest extends FlatSpec with PassengerFeaturesTest with Test
     assertThrows[RuntimeException] { t.getFeature() }
   }
 
+  it should "be equal to self" in {
+    tf shouldBe tf
+    tf.equals(tf) shouldBe true
+  }
+
+  it should "not be equal to a different instance" in {
+    val other = TransientFeature(weight)
+    tf should not be other
+    tf.equals(other) shouldBe false
+  }
+
+  it should "have hash code of its uid" in {
+    tf.hashCode() shouldBe tf.uid.hashCode
+  }
 
   it should "cast back to FeatureLike" in {
     tf.asFeatureLike[Real] shouldBe height
diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala
new file mode 100644
index 0000000000..9a4f77c1b6
--- /dev/null
+++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.filters
+
+import com.salesforce.op.OpParams
+import com.salesforce.op.features.{OPFeature, TransientFeature}
+import com.salesforce.op.stages.impl.feature.HashAlgorithm
+import com.salesforce.op.test.PassengerSparkFixtureTest
+import com.salesforce.op.utils.spark.RichDataset._
+import org.apache.spark.mllib.feature.HashingTF
+import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData {
+
+  Spec[FeatureDistribution] should "be correctly created for features" in {
+    val features = Array(survived, age, gender, height, weight).map(TransientFeature.apply)
+    val values: Array[(Boolean, ProcessedSeq)] = Array(
+      (false, Right(Seq(1.0))), (true, Right(Seq.empty[Double])), (false, Left(Seq("male", "female"))),
+      (true, Left(Seq.empty[String])), (false, Right(Seq(1.0, 3.0, 5.0)))
+    )
+    val summary =
+      Array(Summary(0.0, 1.0), Summary(-1.6, 10.6), Summary(0.0, 3.0), Summary(0.0, 0.0), Summary(1.0, 5.0))
+    val bins = 10
+    val hasher: HashingTF = new HashingTF(numFeatures = bins)
+      .setBinary(false)
+      .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase)
+
+    val featureKeys: Array[FeatureKey] = features.map(f => (f.name, None))
+    val processedSeqs: Array[Option[ProcessedSeq]] = values.map { case (isEmpty, processed) =>
+      if (isEmpty) None else Option(processed)
+    }
+    val distribs = featureKeys.zip(summary).zip(processedSeqs).map { case ((key, summ), seq) =>
+      FeatureDistribution(key, summ, seq, bins, hasher)
+    }
+    distribs.foreach{ d =>
+      d.key shouldBe None
+      d.count shouldBe 1
+      d.distribution.length shouldBe bins
+    }
+    distribs(0).nulls shouldBe 0
+    distribs(1).nulls shouldBe 1
+    distribs(1).distribution.sum shouldBe 0
+    distribs(2).distribution.sum shouldBe 2
+    distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0)
+    distribs(3).distribution.sum shouldBe 0
+    distribs(4).distribution.sum shouldBe 3
+    distribs(4).summaryInfo.length shouldBe bins
+  }
+
+  it should "be correctly created for map features" in {
+    val features = Array(stringMap, numericMap, booleanMap).map(TransientFeature.apply)
+    val values: Array[Map[String, ProcessedSeq]] = Array(
+      Map("A" -> Left(Seq("male", "female"))),
+      Map("A" -> Right(Seq(1.0)), "B" -> Right(Seq(1.0))),
+      Map("B" -> Right(Seq(0.0))))
+    val summary = Array(
+      Map("A" -> Summary(0.0, 1.0), "B" -> Summary(0.0, 5.0)),
+      Map("A" -> Summary(-1.6, 10.6), "B" -> Summary(0.0, 3.0)),
+      Map("B" -> Summary(0.0, 0.0)))
+    val bins = 10
+    val hasher: HashingTF = new HashingTF(numFeatures = bins)
+      .setBinary(false)
+      .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase)
+    val distribs = features.map(_.name).zip(summary).zip(values).flatMap { case ((name, summaryMaps), valueMaps) =>
+      summaryMaps.map { case (key, summary) =>
+        val featureKey = (name, Option(key))
+        FeatureDistribution(featureKey, summary, valueMaps.get(key), bins, hasher)
+      }
+    }
+
+    distribs.length shouldBe 5
+    distribs.foreach{ d =>
+      d.key.contains("A") || d.key.contains("B") shouldBe true
+      d.count shouldBe 1
+      if (d.name != "booleanMap") d.distribution.length shouldBe bins
+      else d.distribution.length shouldBe 2
+    }
+    distribs(0).nulls shouldBe 0
+    distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 1.0)
+    distribs(1).nulls shouldBe 1
+    distribs(0).distribution.sum shouldBe 2
+    distribs(1).distribution.sum shouldBe 0
+    distribs(2).summaryInfo.length shouldBe bins
+    distribs(2).distribution.sum shouldBe 1
+    distribs(4).distribution(0) shouldBe 1
+    distribs(4).distribution(1) shouldBe 0
+    distribs(4).summaryInfo.length shouldBe 2
+  }
+
+  it should "correctly compare fill rates" in {
+    val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty)
+    val fd2 = FeatureDistribution("A", None, 20, 20, Array.empty, Array.empty)
+    fd1.relativeFillRate(fd2) shouldBe 0.9
+  }
+
+  it should "correctly compare relative fill rates" in {
+    val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty)
+    val fd2 = FeatureDistribution("A", None, 20, 19, Array.empty, Array.empty)
+    trainSummaries(0).relativeFillRatio(scoreSummaries(0)) shouldBe 4.5
+    trainSummaries(2).relativeFillRatio(scoreSummaries(2)) shouldBe 1.0
+    fd1.relativeFillRatio(fd2) shouldBe 18.0
+  }
+
+  it should "correctly compute the JS divergence" in {
+    val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
+    val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
+    fd1.jsDivergence(fd2) should be < eps
+
+    val fd3 = FeatureDistribution("A", None, 10, 1, Array(0, 0, 1000, 1000, 0), Array.empty)
+    fd3.jsDivergence(fd3) should be < eps
+    val fd4 = FeatureDistribution("A", None, 20, 20, Array(200, 800, 0, 0, 1200), Array.empty)
+    (fd3.jsDivergence(fd4) - 1.0) should be < eps
+  }
+}
diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
new file mode 100644
index 0000000000..ee4c835ae0
--- /dev/null
+++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +package com.salesforce.op.filters + +trait FiltersTestData { + + protected val eps = 1E-2 + + protected val trainSummaries = Seq( + FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("C", Some("1"), 10, 1, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("D", Some("1"), 10, 9, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("D", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty) + ) + + protected val scoreSummaries = Seq( + FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), Array.empty), + FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty), + FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty) + ) +} diff --git a/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala new file mode 100644 index 0000000000..0f80a75302 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.filters + +import scala.math.round + +import com.salesforce.op.stages.impl.preparators.CorrelationType +import com.salesforce.op.test.TestSparkContext +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import org.apache.spark.mllib.linalg.{Matrix, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class PreparedFeaturesTest extends FlatSpec with TestSparkContext { + + val responseKey1: FeatureKey = "Response1" -> None + val responseKey2: FeatureKey = "Response2" -> None + val predictorKey1: FeatureKey = "Predictor1" -> None + val predictorKey2A: FeatureKey = "Predictor2" -> Option("A") + val predictorKey2B: FeatureKey = "Predictor2" -> Option("B") + + val preparedFeatures1 = PreparedFeatures( + responses = Map(responseKey1 -> Right(Seq(1.0)), responseKey2 -> Right(Seq(0.5))), + predictors = Map( + predictorKey1 -> Right(Seq(0.0, 0.0)), + predictorKey2A -> Left(Seq("i", "ii")), + predictorKey2B -> Left(Seq("iii")))) + val preparedFeatures2 = PreparedFeatures( + responses = Map(responseKey1 -> Right(Seq(0.0))), + predictors = Map(predictorKey1 -> Right(Seq(0.4, 0.5)))) + val preparedFeatures3 = PreparedFeatures( + responses = Map(responseKey2 -> Right(Seq(-0.5))), + predictors = Map(predictorKey2A -> Left(Seq("iv")))) + val allPreparedFeatures = Seq(preparedFeatures1, preparedFeatures2, preparedFeatures3) + val (allResponseSummaries, allPredictorSummaries) = allPreparedFeatures.map(_.summaries).reduce(_ + _) + + val allResponseKeys1 = Array(responseKey1, responseKey2) + val allResponseKeys2 = Array(responseKey1) + val allPredictorKeys1 = Array(predictorKey1, predictorKey2A, predictorKey2B) + val allPredictorKeys2 = Array(predictorKey1) + + Spec[PreparedFeatures] should "produce correct summaries" in { + val (responseSummaries1, predictorSummaries1) = preparedFeatures1.summaries + val (responseSummaries2, predictorSummaries2) = preparedFeatures2.summaries + val (responseSummaries3, predictorSummaries3) = preparedFeatures3.summaries + + responseSummaries1 should contain theSameElementsAs + Seq(responseKey1 -> Summary(1.0, 1.0), responseKey2 -> Summary(0.5, 0.5)) + predictorSummaries1 should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.0, 0.0), predictorKey2A -> Summary(2.0, 2.0), predictorKey2B -> Summary(1.0, 1.0)) + responseSummaries2 should contain theSameElementsAs + Seq(responseKey1 -> Summary(0.0, 0.0)) + predictorSummaries2 should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.4, 0.5)) + responseSummaries3 should contain theSameElementsAs + Seq(responseKey2 -> Summary(-0.5, -0.5)) + predictorSummaries3 should contain theSameElementsAs + Seq(predictorKey2A -> Summary(1.0, 1.0)) + allResponseSummaries should contain theSameElementsAs + Seq(responseKey1 -> Summary(0.0, 1.0), responseKey2 -> Summary(-0.5, 0.5)) + allPredictorSummaries should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.0, 0.5), predictorKey2A -> Summary(1.0, 2.0), predictorKey2B -> Summary(1.0, 1.0)) + } + + it should "produce correct null-label leakage vector with single response" in { + preparedFeatures1.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(1.0, 0.0, 0.0, 0.0) + + preparedFeatures2.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(0.0, 0.0, 1.0, 1.0) + + 
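+    // (Each vector is the response value(s) for the given response keys, followed by a 0.0/1.0 null
+    // indicator per predictor key; e.g. preparedFeatures2 keeps Predictor1 but misses both Predictor2 keys.)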
preparedFeatures3.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(0.0, 1.0, 0.0, 1.0) + } + + it should "produce correct null-label leakage vector with multiple responses" in { + preparedFeatures1.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(1.0, 0.5, 0.0, 0.0, 0.0) + + preparedFeatures2.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(0.0, 0.0, 0.0, 1.0, 1.0) + + preparedFeatures3.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(0.0, -0.5, 1.0, 0.0, 1.0) + } + + it should "produce correct null-label leakage Pearson correlation matrix with multiple responses" in { + val expected = Seq( + Array(1.0, 0.87, -0.5, -0.5, -1.0), + Array(1.0, -0.87, 0.0, -0.87), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys1, CorrelationType.Pearson, expected) + } + + it should "produce correct null-label leakage Spearman correlation matrix with multiple responses" in { + val expected = Seq( + Array(1.0, 0.87, -0.5, -0.5, -1.0), + Array(1.0, -0.87, 0.0, -0.87), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys1, CorrelationType.Spearman, expected) + } + + it should "produce correct null-label leakage Pearson correlation matrix with single response" in { + val expected = Seq( + Array(1.0, -0.5, -0.5, -1.0), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys2, CorrelationType.Pearson, expected) + } + + it should "produce correct null-label leakage Spearman correlation matrix with single response" in { + val expected = Seq( + Array(1.0, -0.5, -0.5, -1.0), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys2, CorrelationType.Spearman, expected) + } + + def testCorrMatrix( + responseKeys: Array[FeatureKey], + correlationType: CorrelationType, + expectedResult: Seq[Array[Double]] + ): Unit = { + val corrRDD = + sc.parallelize(allPreparedFeatures.map(_.getNullLabelLeakageVector(responseKeys, allPredictorKeys1))) + val corrMatrix = Statistics.corr(corrRDD, correlationType.sparkName) + + corrMatrix.colIter.zipWithIndex.map { case(vec, idx) => + // It's symmetric, so can drop based on index + vec.toArray.drop(idx).map(BigDecimal(_).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) + }.toSeq should contain theSameElementsInOrderAs expectedResult + } +} diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index add784ec66..6930995887 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,164 +33,63 @@ package com.salesforce.op.filters import com.salesforce.op.OpParams import com.salesforce.op.features.{OPFeature, TransientFeature} +import com.salesforce.op.readers.DataFrameFieldNames import com.salesforce.op.stages.impl.feature.HashAlgorithm -import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest} import com.salesforce.op.utils.spark.RichDataset._ +import com.twitter.algebird.Operators._ import org.apache.spark.mllib.feature.HashingTF +import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner 
@RunWith(classOf[JUnitRunner]) -class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { - - private val eps = 1E-2 - - private val trainSummaries = Seq( - FeatureDistrib("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("C", Some("1"), 10, 1, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("D", Some("1"), 10, 9, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("D", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty) - ) - - private val scoreSummaries = Seq( - FeatureDistrib("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), Array.empty), - FeatureDistrib("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty), - FeatureDistrib("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty) - ) - - Spec[Summary] should "be correctly created from a sequence of features" in { - val f1 = Left(Seq("a", "b", "c")) - val f2 = Right(Seq(0.5, 1.0)) - val f1s = Summary(f1) - val f2s = Summary(f2) - f1s.min shouldBe 3 - f1s.max shouldBe 3 - f2s.min shouldBe 0.5 - f2s.max shouldBe 1.0 - } - - Spec[FeatureDistrib] should "be correctly created for features" in { - val features = Array(survived, age, gender, height, weight).map(TransientFeature.apply) - val values: Array[(Boolean, FeatureDistrib.ProcessedSeq)] = Array( - (false, Right(Seq(1.0))), (true, Right(Seq.empty[Double])), (false, Left(Seq("male", "female"))), - (true, Left(Seq.empty[String])), (false, Right(Seq(1.0, 3.0, 5.0))) - ) - val summary = Array(Summary(0.0, 1.0), Summary(-1.6, 10.6), Summary(0.0, 3.0), Summary(0.0, 0.0), Summary(1.0, 5.0)) - val bins = 10 - val hasher: HashingTF = new HashingTF(numFeatures = bins) - .setBinary(false) - .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - - val distribs = FeatureDistrib.getDistributions(features, values, summary, bins, hasher) - distribs.foreach{ d => - d.key shouldBe None - d.count shouldBe 1 - d.distribution.length shouldBe bins - } - distribs(0).nulls shouldBe 0 - distribs(1).nulls shouldBe 1 - distribs(1).distribution.sum shouldBe 0 - distribs(2).distribution.sum shouldBe 2 - distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0) - distribs(3).distribution.sum shouldBe 0 - distribs(4).distribution.sum shouldBe 3 - distribs(4).summaryInfo.length shouldBe bins - } - - it should "be correctly created for map features" in { - val features = Array(stringMap, numericMap, booleanMap).map(TransientFeature.apply) - val values: Array[Map[String, FeatureDistrib.ProcessedSeq]] = Array( - Map("A" -> Left(Seq("male", "female"))), - Map("A" -> Right(Seq(1.0)), "B" -> Right(Seq(1.0))), - Map("B" -> Right(Seq(0.0)))) - val summary = Array( - Map("A" -> Summary(0.0, 1.0), "B" -> Summary(0.0, 5.0)), - Map("A" -> Summary(-1.6, 10.6), "B" -> Summary(0.0, 3.0)), - Map("B" -> Summary(0.0, 0.0))) - val bins = 10 - val hasher: HashingTF = new HashingTF(numFeatures = bins) - .setBinary(false) - .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - - val distribs = FeatureDistrib.getMapDistributions(features, values, summary, bins, hasher) - distribs.length shouldBe 5 - distribs.foreach{ d => - d.key.contains("A") || d.key.contains("B") shouldBe true - d.count 
shouldBe 1 - if (d.name != "booleanMap") d.distribution.length shouldBe bins - else d.distribution.length shouldBe 2 - } - distribs(0).nulls shouldBe 0 - distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 1.0) - distribs(1).nulls shouldBe 1 - distribs(0).distribution.sum shouldBe 2 - distribs(1).distribution.sum shouldBe 0 - distribs(2).summaryInfo.length shouldBe bins - distribs(2).distribution.sum shouldBe 1 - distribs(4).distribution(0) shouldBe 1 - distribs(4).distribution(1) shouldBe 0 - distribs(4).summaryInfo.length shouldBe 2 - } - - it should "correctly compare fill rates" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array.empty, Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 20, Array.empty, Array.empty) - fd1.relativeFillRate(fd2) shouldBe 0.9 - } - - it should "correctly compare relative fill rates" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array.empty, Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 19, Array.empty, Array.empty) - trainSummaries(0).relativeFillRatio(scoreSummaries(0)) shouldBe 4.5 - trainSummaries(2).relativeFillRatio(scoreSummaries(2)) shouldBe 1.0 - fd1.relativeFillRatio(fd2) shouldBe 18.0 - } - - it should "correctly compute the DS divergence" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty) - fd1.jsDivergence(fd2) should be < eps - - val fd3 = FeatureDistrib("A", None, 10, 1, Array(0, 0, 1000, 1000, 0), Array.empty) - fd3.jsDivergence(fd3) should be < eps - val fd4 = FeatureDistrib("A", None, 20, 20, Array(200, 800, 0, 0, 1200), Array.empty) - (fd3.jsDivergence(fd4) - 1.0) should be < eps - } - +class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.8, Double.PositiveInfinity, 0.7) - val summaries = filter.computeFeatureStats(passengersDataSet, features) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.8, Double.PositiveInfinity, 0.7, 1.0) + val allFeatureInfo = filter.computeFeatureStats(passengersDataSet, features) - summaries.featureSummaries.size shouldBe 7 - summaries.mapFeatureSummaries.size shouldBe 3 - summaries.featureDistributions.size shouldBe 13 + allFeatureInfo.responseSummaries.size shouldBe 1 + allFeatureInfo.responseSummaries.headOption.map(_._2) shouldEqual Option(Summary(0, 1)) + allFeatureInfo.responseDistributions.size shouldBe 1 + allFeatureInfo.predictorSummaries.size shouldBe 12 + allFeatureInfo.predictorDistributions.size shouldBe 12 - val surv = summaries.featureDistributions(0) + val surv = allFeatureInfo.responseDistributions(0) surv.name shouldBe survived.name surv.key shouldBe None surv.count shouldBe 6 surv.nulls shouldBe 4 surv.distribution.sum shouldBe 2 - val strMapF = summaries.featureDistributions(7) + + val ageF = allFeatureInfo.predictorDistributions.filter(_.name == age.name)(0) + ageF.name shouldBe age.name + ageF.key shouldBe None + ageF.count shouldBe 6 + ageF.nulls shouldBe 2 + ageF.distribution.sum shouldBe 4 + + val strMapF = + allFeatureInfo.predictorDistributions.filter(d => d.name == stringMap.name && d.key == Option("Female"))(0) + strMapF.name shouldBe stringMap.name if 
(strMapF.key.contains("Female")) strMapF.nulls shouldBe 3 else strMapF.nulls shouldBe 4 - val strMapM = summaries.featureDistributions(8) + + val strMapM = + allFeatureInfo.predictorDistributions.filter(d => d.name == stringMap.name && d.key == Option("Male"))(0) + strMapM.name shouldBe stringMap.name if (strMapM.key.contains("Male")) strMapM.nulls shouldBe 4 else strMapM.nulls shouldBe 3 } it should "correctly determine which features to exclude based on the stats of training fill rate" in { // only fill rate matters - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0) - val (excludedTrainF, excludedTrainMK) = filter.getFeaturesToExclude(trainSummaries, Seq.empty) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val (excludedTrainF, excludedTrainMK) = + filter.getFeaturesToExclude(trainSummaries, Seq.empty, Map.empty) excludedTrainF.toSet shouldEqual Set("B", "D") excludedTrainMK.keySet shouldEqual Set("C") excludedTrainMK.head._2 shouldEqual Set("2") @@ -199,8 +98,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on the stats of training and scoring fill rate" in { // only fill rate matters - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0) - val (excludedBothF, excludedBothMK) = filter.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val (excludedBothF, excludedBothMK) = + filter.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothF.toSet shouldEqual Set("B", "D") excludedBothMK.keySet shouldEqual Set("C") excludedBothMK.head._2 shouldEqual Set("2") @@ -208,24 +108,27 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on the stats of relative fill rate" in { // relative fill rate matters - val filter2 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 0.5, Double.PositiveInfinity, 1.0) - val (excludedBothRelF, excludedBothRelMK) = filter2.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter2 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 0.5, Double.PositiveInfinity, 1.0, 1.0) + val (excludedBothRelF, excludedBothRelMK) = + filter2.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothRelF.toSet shouldEqual Set("A") excludedBothRelMK.isEmpty shouldBe true } it should "correctly determine which features to exclude based on the stats of fill rate ratio" in { // relative fill ratio matters - val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 2.0, 1.0) - val (excludedBothRelFR, excludedBothRelMKR) = filter4.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 2.0, 1.0, 1.0) + val (excludedBothRelFR, excludedBothRelMKR) = + filter4.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothRelFR.toSet shouldEqual Set("D", "A", "B") excludedBothRelMKR.isEmpty shouldBe true } it should "correctly determine which features to exclude based on the stats of js distance" in { // js distance - val filter3 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 
Double.PositiveInfinity, 0.5) - val (excludedBothDistF, excludedBothDistMK) = filter3.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter3 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, Double.PositiveInfinity, 0.5, 1.0) + val (excludedBothDistF, excludedBothDistMK) = + filter3.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothDistF.isEmpty shouldEqual true excludedBothDistMK.keySet shouldEqual Set("C") excludedBothDistMK.head._2 shouldEqual Set("1") @@ -233,8 +136,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on all the stats" in { // all - val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.5, Double.PositiveInfinity, 0.5) - val (excludedBothAllF, excludedBothAllMK) = filter4.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.5, Double.PositiveInfinity, 0.5, 1.0) + val (excludedBothAllF, excludedBothAllMK) = + filter4.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothAllF.toSet shouldEqual Set("A", "B", "C", "D") excludedBothAllMK.isEmpty shouldBe true } @@ -244,12 +148,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val survPred = survived.copy(isResponse = false) val features: Array[OPFeature] = Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.isEmpty shouldBe true df.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0) + val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) val (df1, toDrop1) = filter1.generateFilteredRaw(features, params) toDrop1 should contain theSameElementsAs Array(survPred) df1.schema.fields.exists(_.name == survPred.name) shouldBe false @@ -260,7 +164,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.isEmpty shouldBe true df.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields @@ -272,14 +176,95 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.toSet shouldEqual Set(age, gender, 
height, weight, description, boarded) - df.schema.fields.map(_.name) should contain theSameElementsAs Array("key", "survived") + df.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, Set("age", "gender")) + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, + protectedFeatures = Set(age.name, gender.name)) val (df2, toDrop2) = filter2.generateFilteredRaw(features, params) toDrop2.toSet shouldEqual Set(height, weight, description, boarded) - df2.schema.fields.map(_.name) should contain theSameElementsAs Array("key", "survived", "age", "gender") + df2.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) + } + + it should "not drop JS divergence-protected features based on JS divergence check" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) + val filter = new RawFeatureFilter( + trainingReader = dataReader, + scoreReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 0.0, + maxCorrelation = 1.0, + jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name)) + + val (df, toDrop) = filter.generateFilteredRaw(features, params) + toDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + df.schema.fields.map(_.name) should contain theSameElementsAs + Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { + val expectedDropped = Seq(boarded, weight, gender) + val expectedMapKeys = Seq("Female", "Male") + nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { + val expectedDropped = Seq(boarded, weight, gender, age) + val expectedMapKeys = Seq("Female", "Male") + nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { + val expectedDropped = Seq(boarded, weight, gender, age, description) + val expectedMapKeys = Seq("Male") + nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { + val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) + nullLabelCorrelationTest(0.3, expectedDropped, Seq()) + } + + private def nullLabelCorrelationTest( + maxCorrelation: Double, + expectedDropped: Seq[OPFeature], + expectedMapKeys: Seq[String] + ): Unit = { + def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( + trainingReader = dataReader, + scoreReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = maxCorrelation) + + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, 
booleanMap) + val (df, dropped) = getFilter(maxCorrelation).generateFilteredRaw(features, params) + + dropped should contain theSameElementsAs expectedDropped.toSeq + df.schema.fields.map(_.name) should contain theSameElementsAs + DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) + if (expectedMapKeys.nonEmpty) { + df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + } else { + intercept[IllegalArgumentException] { df.collect(booleanMap) } + intercept[IllegalArgumentException] { df.collect(numericMap) } + intercept[IllegalArgumentException] { df.collect(stringMap) } + } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala b/core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala similarity index 76% rename from core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala rename to core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala index ba364d439d..4bbb47b5ee 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala @@ -29,21 +29,23 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.ternary +package com.salesforce.op.filters -import com.salesforce.op.features.types.Text -import com.salesforce.op.test._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith +import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TernaryTransformerTest extends FlatSpec with TestCommon { - - Spec[TernaryLambdaTransformer[_, _, _, _]] should "copy successfully" in { - val tr = new TernaryLambdaTransformer[Text, Text, Text, Text](operationName = "foo", transformFn = (x, y, z) => x) - tr.copy(new ParamMap()).uid shouldBe tr.uid +class SummaryTest extends FlatSpec with TestCommon { + Spec[Summary] should "be correctly created from a sequence of features" in { + val f1 = Left(Seq("a", "b", "c")) + val f2 = Right(Seq(0.5, 1.0)) + val f1s = Summary(f1) + val f2s = Summary(f2) + f1s.min shouldBe 3 + f1s.max shouldBe 3 + f2s.min shouldBe 0.5 + f2s.max shouldBe 1.0 } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala deleted file mode 100644 index 51faecf92c..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.base.ternary - -import breeze.numerics.abs -import com.salesforce.op.UID -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types._ -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - - -@RunWith(classOf[JUnitRunner]) -class TernaryEstimatorTest extends FlatSpec with PassengerSparkFixtureTest { - - val testEstimator: TernaryEstimator[MultiPickList, Binary, RealMap, Real] = new TripleInteractionsEstimator() - - Spec[TernaryEstimator[_, _, _, _]] should "error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(gender, survived, numericMap).getOutput() - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = true, - parents = Array(gender, survived, numericMap) - ) - } - - it should "return a TernaryModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(gender, survived, numericMap).fit(passengersDataSet) - - testModel.parent shouldBe testEstimator - abs( - testModel.transformFn(Seq("male").toMultiPickList, false.toBinary, Map("male" -> 1.2).toRealMap).value.get - 0.0 - ) should be < 0.000000002 - - testModel.transformFn(Seq("male").toMultiPickList, true.toBinary, Map("male" -> 1.2).toRealMap).value shouldBe None - abs( - testModel.transformFn(Seq("male").toMultiPickList, false.toBinary, Map("male" -> 2.2).toRealMap).value.get - 1.0 - ) should be < 0.000000002 - } - - it should "create a TernaryModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(gender, survived, numericMap).fit(passengersDataSet) - val testDataTransformed = testModel.setInput(gender, survived, numericMap) - .transform(passengersDataSet.select(gender.name, survived.name, numericMap.name)) - - testDataTransformed.schema shouldEqual StructType( - Seq(StructField(gender.name, ArrayType(StringType, true), true), - StructField(survived.name, BooleanType, true), - StructField(numericMap.name, MapType(StringType, DoubleType, 
true), true), - StructField(testEstimator.getOutputFeatureName, DoubleType, true))) - - testDataTransformed.collect(gender, survived, numericMap, testModel.getOutput()) shouldEqual Array( - (Set("Male").toMultiPickList, false.toBinary, new RealMap(Map("Male" -> 2.0)), 0.8.toReal), - (Seq().toMultiPickList, true.toBinary, new RealMap(Map()), new Real(None)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Male").toMultiPickList, new Binary(None), Map("Male" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)) - ) - } - - it should "copy itself and the model successfully" in { - val est = new TripleInteractionsEstimator() - val mod = new TripleInteractionsModel(0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } - -} - - -class TripleInteractionsEstimator(uid: String = UID[TripleInteractionsEstimator]) - extends TernaryEstimator[MultiPickList, Binary, RealMap, Real](operationName = "tripleInteractions", uid = uid) - with TripleInteractions { - - // scalastyle:off line.size.limit - def fitFn(dataset: Dataset[(MultiPickList#Value, Binary#Value, RealMap#Value)]): TernaryModel[MultiPickList, Binary, RealMap, Real] = { - import dataset.sparkSession.implicits._ - val mean = { - dataset.map { case (gndr, srvvd, nmrcMp) => - if (survivedAndMatches(gndr, srvvd, nmrcMp)) nmrcMp(gndr.head) else 0.0 - }.filter(_ != 0.0).groupBy().mean().first().getDouble(0) - } - new TripleInteractionsModel(mean = mean, operationName = operationName, uid = uid) - } - // scalastyle:on - -} - -final class TripleInteractionsModel private[op](val mean: Double, operationName: String, uid: String) - extends TernaryModel[MultiPickList, Binary, RealMap, Real](operationName = operationName, uid = uid) - with TripleInteractions { - - def transformFn: (MultiPickList, Binary, RealMap) => Real = (g: MultiPickList, s: Binary, nm: RealMap) => new Real( - if (!survivedAndMatches(g.value, s.value, nm.value)) None - else Some(nm.value(g.value.head) - mean) - ) - -} - -sealed trait TripleInteractions { - def survivedAndMatches(g: MultiPickList#Value, s: Binary#Value, nm: RealMap#Value): Boolean = - !s.getOrElse(false) && g.nonEmpty && nm.contains(g.head) -} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala deleted file mode 100644 index fd14ddac66..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.base.unary - -import com.salesforce.op.UID -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - - -@RunWith(classOf[JUnitRunner]) -class UnaryEstimatorTest extends FlatSpec with TestSparkContext { - - val (ds, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0, 2.0, 6.0).toReal) - - val testEstimator: UnaryEstimator[Real, Real] = new MinMaxNormEstimator() - - Spec[UnaryEstimator[_, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a copy with the same uid" in { - val newData = new MetadataBuilder().putLong("myKey", 100).build() - val copyWithValues = testEstimator.copy( - new ParamMap().put(testEstimator.outputMetadata, newData) - ) - - copyWithValues.isInstanceOf[UnaryEstimator[_, _]] - copyWithValues.uid shouldBe testEstimator.uid - copyWithValues.getMetadata() shouldBe newData - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(f1).getOutput() - - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(f1) - ) - } - - it should "return a UnaryModel with the estimator as the parent, a working copy method and the same uid" + - " and the correct function" in { - val testModel = testEstimator.setInput(f1).fit(ds) - - testModel.parent shouldBe testEstimator - testModel.transformFn(1.0.toReal) shouldBe 0.0.toReal - testModel.copy(new ParamMap()).uid shouldBe testEstimator.uid - } - - it should "create a UnaryModel transformer when it is fit" in { - val testModel = testEstimator.setInput(f1).fit(ds) - val testDataTransformed = testModel.setInput(f1).transform(ds) - val outputFeatures = testModel.getOutput() - val transformedValues = testDataTransformed.collect(f1, outputFeatures) - - val expectedTypes = - StructType(Seq(StructField(f1.name, DoubleType, true), - StructField(outputFeatures.name, DoubleType, true))) - - testDataTransformed.schema shouldEqual 
expectedTypes - transformedValues shouldEqual - Array((1.0, 0.0), (5.0, 0.8), (3.0, 0.4), (2.0, 0.2), (6.0, 1.0)).map(v => v._1.toReal -> v._2.toReal) - } - - it should "copy itself and the model successfully" in { - val est = new MinMaxNormEstimator() - val mod = new MinMaxNormEstimatorModel(0.0, 0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } - -} - -class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) - extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { - - def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { - val grouped = dataset.groupBy() - val maxVal = grouped.max().first().getDouble(0) - val minVal = grouped.min().first().getDouble(0) - new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) - } -} - -final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) - extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { - def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal -} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala deleted file mode 100644 index 8548be79d1..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
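For reference, the expected pairs in the deleted UnaryEstimatorTest above follow directly from the min-max formula that its MinMaxNormEstimator implements: scaled = (v - min) / (max - min). A minimal plain-Scala sketch of that arithmetic (no Spark; an editorial illustration, not part of the diff):

    // Min-max normalization over the fixture values used by the deleted test
    val values = Seq(1.0, 5.0, 3.0, 2.0, 6.0)
    val (lo, hi) = (values.min, values.max)            // lo = 1.0, hi = 6.0
    val scaled = values.map(v => (v - lo) / (hi - lo)) // Seq(0.0, 0.8, 0.4, 0.2, 1.0)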
- */ - -package com.salesforce.op.stages.base.unary - -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ -import org.apache.spark.ml.param.ParamMap -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} - -@RunWith(classOf[JUnitRunner]) -class UnaryTransformerTest extends FlatSpec with PassengerSparkFixtureTest { - - val scaleBy2 = new UnaryLambdaTransformer[Real, Real](operationName = "unary", - transformFn = r => r.v.map(_ * 2.0).toReal - ) - - val toCat = new UnaryLambdaTransformer[Real, MultiPickList](operationName = "cat", - transformFn = value => Set(value.v.getOrElse(0.0).toString).toMultiPickList - ) - - Spec[UnaryLambdaTransformer[_, _]] should "return single properly formed Feature" in { - scaleBy2.setInput(weight) - val feats = scaleBy2.getOutput() - - feats shouldBe new Feature[Real]( - name = scaleBy2.getOutputFeatureName, - originStage = scaleBy2, - isResponse = false, - parents = Array(weight) - ) - } - - it should "add column to DataFrame when transformed" in { - scaleBy2.setInput(weight) - val transformedData = scaleBy2.transform(passengersDataSet) - val output = scaleBy2.getOutput() - val answer = passengersArray.map(r => scaleBy2.transformFn(r.getFeatureType[Real](weight))) - transformedData.columns.contains(scaleBy2.getOutputFeatureName) shouldBe true - transformedData.collect(output) shouldBe answer - } - - it should "work when returning a MultiPickList feature" in { - toCat.setInput(weight) - val transformedData = toCat.transform(passengersDataSet) - val output = toCat.getOutput() - val answer = passengersArray.map(r => toCat.transformFn(r.getFeatureType[Real](weight))) - transformedData.columns.contains(toCat.getOutputFeatureName) shouldBe true - transformedData.collect(output) shouldBe answer - } - - it should "copy successfully" in { - val copy = scaleBy2.copy(new ParamMap()) - copy shouldBe a[UnaryTransformer[_, _]] - copy.uid shouldBe scaleBy2.uid - } - -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala b/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala new file mode 100644 index 0000000000..5bfe525a6e --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl + +import com.salesforce.op.features.types.Prediction +import com.salesforce.op.test.OpEstimatorSpec +import org.scalactic.Equality + +trait PredictionEquality { + + self: OpEstimatorSpec[Prediction, _, _] => + + abstract override implicit val featureTypeEquality = new Equality[Prediction] { + def areEqual(a: Prediction, b: Any): Boolean = b match { + case s: Prediction => + val keyset = a.v.keySet.union(s.v.keySet) + keyset.forall(k => math.abs(a.v(k) - s.v(k)) < 0.01) + case _ => false + } + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala index 655dc29471..c375a58d5b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala @@ -460,7 +460,7 @@ class BinaryClassificationModelSelectorTest extends FlatSpec with TestSparkConte testEstimator.evaluators.foreach { case evaluator: OpBinaryClassificationEvaluator => { - MultiClassEvalMetrics.values.foreach(metric => + BinaryClassEvalMetrics.values.foreach(metric => Seq(trainMetaData, holdOutMetaData).foreach( metadata => assert(metadata.contains(s"(${OpEvaluatorNames.binary})_${metric.entryName}"), s"Metric ${metric.entryName} is not present in metadata: " + metadata.json) diff --git a/core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala similarity index 65% rename from core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index 0c72bf52af..0cdb9c5873 100644 --- a/core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -29,18 +29,18 @@ * POSSIBILITY OF SUCH DAMAGE. 
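The new PredictionEquality trait above is what lets the rewritten classifier specs compare Prediction values approximately: two predictions count as equal when every key in either score map agrees within 0.01. A self-contained sketch of the same comparison (plain Scala with an illustrative helper name; the trait itself plugs into OpEstimatorSpec through an implicit scalactic Equality, and adds a getOrElse guard the original does not need for shared key sets):

    // Tolerance-based comparison of two score maps, mirroring PredictionEquality
    def approxEqual(a: Map[String, Double], b: Map[String, Double], tol: Double = 0.01): Boolean = {
      val keys = a.keySet.union(b.keySet)
      // a key missing on one side yields NaN, and NaN comparisons are false => unequal
      keys.forall(k => math.abs(a.getOrElse(k, Double.NaN) - b.getOrElse(k, Double.NaN)) < tol)
    }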
*/ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.impl.classification -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.features.types.{Prediction, RealNN} +import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._ import com.salesforce.op.test._ import com.salesforce.op.testkit._ -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.apache.spark.ml.SparkModelConverter._ -import org.apache.spark.ml.linalg.Vector - @RunWith(classOf[JUnitRunner]) @@ -62,7 +62,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -70,22 +70,24 @@ Spec[OpLogisticRegressionModel] should "produce the same values as the spark version" in { val spk = new LogisticRegression() + .setFamily("multinomial") .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } Spec[OpNaiveBayesModel] should "produce the same values as the spark version" in { val spk = new NaiveBayes() + .setModelType("multinomial") .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk), isMultinomial = true).setInput(labelF, featureV) + val op = toOP(spk, uid = spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -96,11 +98,42 @@ .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) + } + + Spec[OpGBTClassificationModel] should "produce the same values as the spark version" in { + val spk = new GBTClassifier() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } + Spec[OpLinearSVCModel] should "produce the same values as the spark version" in { + val spk = new LinearSVC() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 3) + } + + Spec[OpMultilayerPerceptronClassificationModel] should "produce the same values as the spark version" in { + val spk = new MultilayerPerceptronClassifier() + .setLayers(Array(10, 5, 4, 2)) // hard to generalize: the input layer size must equal the number of features + // and the output layer size must equal the number of classes + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) + } + def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { def keysStartsWith(name: String, value: Map[String, Double]):
Array[Double] = { @@ -116,4 +149,13 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { r1.getAs[Vector](2).toArray shouldEqual keysStartsWith(Prediction.Keys.RawPredictionName, map) } } + + def compareOutputsPred(df1: DataFrame, df2: DataFrame, predIndex: Int): Unit = { + val sorted1 = df1.collect().sortBy(_.getAs[Double](predIndex)) + val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) + sorted1.zip(sorted2).foreach{ case (r1, r2) => + val map = r2.getAs[Map[String, Double]](2) + r1.getAs[Double](predIndex) shouldEqual map(Prediction.Keys.PredictionName) + } + } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala new file mode 100644 index 0000000000..7856f57585 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
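In the OpDecisionTreeClassifierTest that follows, the expected scores can be read off the fitted tree: rawPrediction holds the class counts at the predicted leaf and probability is those counts normalized (an assumption about Spark's DecisionTreeClassificationModel that the expected arrays in the spec bear out):

    // Leaf counts to scores, matching Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0))
    val counts = Array(0.0, 4.0)                        // a pure leaf holding 4 class-1 rows
    val probs = counts.map(_ / counts.sum)              // Array(0.0, 1.0)
    val prediction = probs.indexOf(probs.max).toDouble  // 1.0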
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[DecisionTreeClassificationModel], + OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpDecisionTreeClassifier().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala new file mode 100644 index 0000000000..49bba49cf0 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpGBTClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTClassificationModel], + OpPredictorWrapper[GBTClassifier, GBTClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpGBTClassifier().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala 
b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala new file mode 100644 index 0000000000..83e13e3784 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
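The expected probabilities in OpGBTClassifierTest above are the logistic transform of the ensemble margin; assuming Spark 2.x's GBT link of p = 1 / (1 + e^(-2m)) with rawPrediction = (-m, m), the numbers reproduce within the 0.01 tolerance of PredictionEquality:

    // GBT margin to probability, matching Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95))
    val margin = 1.54
    val p1 = 1.0 / (1.0 + math.exp(-2.0 * margin)) // ≈ 0.956 for class 1
    val p0 = 1.0 - p1                              // ≈ 0.044 for class 0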
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpLinearSVCTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearSVCModel], + OpPredictorWrapper[LinearSVC, LinearSVCModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpLinearSVC().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Vectors.dense(Array(-1.33, 1.33))), + Prediction(0.0, Vectors.dense(Array(1.04, -1.04))), + Prediction(0.0, Vectors.dense(Array(2.69, -2.69))), + Prediction(1.0, Vectors.dense(Array(-1.32, 1.32))), + Prediction(1.0, Vectors.dense(Array(-2.11, 2.11))), + Prediction(0.0, Vectors.dense(Array(4.41, -4.41))), + Prediction(1.0, Vectors.dense(Array(-1.46, 1.46))), + Prediction(0.0, Vectors.dense(Array(1.42, -1.42))) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setRegParam(0.1) + .setMaxIter(20) + .setTol(1E-4) + estimator.fit(inputData) + + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getMaxIter shouldBe 20 + estimator.predictor.getTol shouldBe 1E-4 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala index 1eac82c03b..d647a6ccb5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala @@ -32,19 +32,20 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.features.types._ -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.LogisticRegressionModel +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OpLogisticRegressionTest extends FlatSpec with 
TestSparkContext { +class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LogisticRegressionModel], + OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality { - val (testData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, @@ -57,102 +58,31 @@ class OpLogisticRegressionTest extends FlatSpec with TestSparkContext { ) ) val feature1 = rawFeature1.copy(isResponse = true) - val logReg = new OpLogisticRegression().setInput(feature1, feature2) - - Spec[OpLogisticRegression] should "have properly formed stage1" in { - assert(logReg.stage1.isInstanceOf[SwBinaryEstimator[_, _, _, _, _]]) - val inputNames = logReg.stage1.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(feature1.name, feature2.name) - logReg.stage1.getOutput().name shouldBe logReg.stage1.getOutputFeatureName - the[IllegalArgumentException] thrownBy { - logReg.setInput(feature1.copy(isResponse = true), feature2.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." - } - - it should "have properly formed stage2" in { - assert(logReg.stage2.isInstanceOf[SwTernaryTransformer[_, _, _, _, _]]) - val inputNames = logReg.stage2.getInputFeatures().map(_.name) - inputNames should have length 3 - inputNames shouldBe Array(feature1.name, feature2.name, logReg.stage1.getOutputFeatureName) - logReg.stage2.getOutput().name shouldBe logReg.stage2.getOutputFeatureName - - } - - it should "have properly formed stage3" in { - assert(logReg.stage3.isInstanceOf[SwQuaternaryTransformer[_, _, _, _, _, _]]) - val inputNames = logReg.stage3.getInputFeatures().map(_.name) - inputNames should have length 4 - inputNames shouldBe Array(feature1.name, feature2.name, logReg.stage1.getOutputFeatureName, - logReg.stage2.getOutputFeatureName) - - logReg.stage3.getOutput().name shouldBe logReg.stage3.getOutputFeatureName - } - - it should "have proper outputs corresponding to the stages" in { - val outputs = logReg.getOutput() - outputs._1.name shouldBe logReg.stage1.getOutput().name - outputs._2.name shouldBe logReg.stage2.getOutput().name - outputs._3.name shouldBe logReg.stage3.getOutput().name - - // as long as the parent stages are correct, we can also assume - // that the parent features are correct, since that should - // be verified in the unit tests for the transformers. 
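The rewritten OpLogisticRegressionTest (continuing below) replaces the old three-stage structural checks with concrete expected scores; for binary logistic regression those probabilities are simply the sigmoid of the raw margin, a standard link worth spelling out once:

    // Logistic-regression margin to probability, matching
    // Prediction(1.0, Array(-20.88, 20.88), Array(0.0, 1.0)) in the new spec
    val margin = 20.88
    val p1 = 1.0 / (1.0 + math.exp(-margin)) // ≈ 1.0 for class 1
    val p0 = 1.0 - p1                        // ≈ 0.0 for class 0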
- outputs._1.originStage shouldBe logReg.stage1 - outputs._2.originStage shouldBe logReg.stage2 - outputs._3.originStage shouldBe logReg.stage3 - } - - - it should "return a properly formed LogisticRegressionModel when fitted" in { - val model = logReg.setSparkParams("maxIter", 10).fit(testData) - - model shouldBe a[SwThreeStageBinaryModel[_, _, _, _, _, _]] - model.stage1 shouldBe a[SwBinaryModel[_, _, _, _]] - - val sparkStage = model.stage1.getSparkMlStage() - sparkStage.get.isInstanceOf[LogisticRegressionModel] - assert(model.stage2.getSparkMlStage().isEmpty) - assert(model.stage3.getSparkMlStage().isEmpty) - - model.stage1OperationName shouldBe "LogisticRegression_predictionCol" - model.stage2OperationName shouldBe "LogisticRegression_rawPredictionCol" - model.stage3OperationName shouldBe "LogisticRegression_probabilityCol" - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(feature1.name, feature2.name) - } + val estimator = new OpLogisticRegression().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-20.88, 20.88), Array(0.0, 1.0)), + Prediction(0.0, Array(16.70, -16.7), Array(1.0, 0.0)), + Prediction(0.0, Array(22.2, -22.2), Array(1.0, 0.0)), + Prediction(1.0, Array(-18.35, 18.35), Array(0.0, 1.0)), + Prediction(1.0, Array(-31.46, 31.46), Array(0.0, 1.0)), + Prediction(0.0, Array(24.67, -24.67), Array(1.0, 0.0)), + Prediction(1.0, Array(-22.07, 22.07), Array(0.0, 1.0)), + Prediction(0.0, Array(20.9, -20.9), Array(1.0, 0.0)) + ) it should "allow the user to set the desired spark parameters" in { - logReg.setSparkParams("maxIter", 10).setSparkParams("regParam", 0.1) - logReg.getSparkParams("maxIter") shouldBe Some(10) - logReg.getSparkParams("regParam") shouldBe Some(0.1) - - logReg.setThresholds(Array(0.03, 0.06)).setElasticNetParam(0.1) - logReg.getSparkParams("thresholds").get.asInstanceOf[Array[Double]] should contain theSameElementsAs - Array(0.03, 0.06) - logReg.getSparkParams("elasticNetParam") shouldBe Some(0.1) + estimator + .setRegParam(0.1) + .setElasticNetParam(0.1) + .setMaxIter(20) + estimator.fit(inputData) + + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getElasticNetParam shouldBe 0.1 + estimator.predictor.getMaxIter shouldBe 20 } - - // TODO: move this to OpWorkFlowTest - // it should "work in a workflow" in { - // val (prob, rawpred, pred) = logReg.getOutput() - // val workflow = new OpWorkflow().setResultFeatures(pred) - // - // val reader = DataReaders.Simple.custom[LRDataTest]( - // readFn = (s: Option[String], spk: SparkSession) => spk.sparkContext.parallelize(DataTest.input) - // ) - // - // val workflowModel = workflow.setReader(reader).train() - // val scores = workflowModel.score() - // val justScores = scores.select(s"(label)_(features)_((label)_(features)_${stageNames(0)})_" + - // s"((label)_(features)_((label)_(features)_${stageNames(0)})_${stageNames(1)})_${stageNames(2)}") - // .collect().map(_.getAs[Double](0)).toList - // justScores shouldEqual List(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0) - // } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala new file mode 100644 index 0000000000..19efea8605 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -0,0 +1,92 @@ +/* + * 
Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[MultilayerPerceptronClassificationModel], + OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpMultilayerPerceptronClassifier() + .setInput(feature1, feature2) + .setLayers(Array(3, 5, 4, 2)) + + + val expectedResult = Seq( + Prediction(1.0), + Prediction(0.0), + Prediction(0.0), + Prediction(1.0), + Prediction(1.0), + Prediction(0.0), + Prediction(1.0), + Prediction(0.0) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(50) + .setBlockSize(2) + .setSeed(42) 
+ estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 50 + estimator.predictor.getBlockSize shouldBe 2 + estimator.predictor.getSeed shouldBe 42 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala new file mode 100644 index 0000000000..a215cef91c --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
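The layer array in OpMultilayerPerceptronClassifierTest above encodes the constraint called out in OpClassifierModelTest: the first entry must equal the feature-vector length and the last the number of classes, while the hidden sizes in between are free choices:

    // Layer sizing for the MLP spec's 3-dimensional features and 2 classes
    val numFeatures = 3                               // Vectors.dense(12.0, 4.3, 1.3) has 3 elements
    val numClasses = 2                                // labels take values 0.0 and 1.0
    val layers = Array(numFeatures, 5, 4, numClasses) // Array(3, 5, 4, 2), as set on the estimator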
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpNaiveBayesTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[NaiveBayesModel], + OpPredictorWrapper[NaiveBayes, NaiveBayesModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpNaiveBayes().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-34.41, -14.85), Array(0.0, 1.0)), + Prediction(0.0, Array(-1.07, -1.42), Array(0.58, 0.41)), + Prediction(0.0, Array(-9.70, -17.99), Array(1.0, 0.0)), + Prediction(1.0, Array(-26.22, -8.33), Array(0.0, 1.0)), + Prediction(1.0, Array(-41.93, -16.49), Array(0.0, 1.0)), + Prediction(0.0, Array(-8.60, -27.31), Array(1.0, 0.0)), + Prediction(1.0, Array(-31.07, -11.44), Array(0.0, 1.0)), + Prediction(0.0, Array(-4.54, -6.32), Array(0.85, 0.14)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setSmoothing(2) + estimator.fit(inputData) + + estimator.predictor.getSmoothing shouldBe 2 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala new file mode 100644 index 0000000000..ed39afa3c5 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpRandomForestClassifierTest extends + OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestClassificationModel], + OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]] with PredictionEquality { + + lazy val (inputData, rawLabelMulti, featuresMulti) = + TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", + Seq( + (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), + (2.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), + (2.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), + (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), + (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector), + (2.0.toRealNN, Vectors.dense(1.0, 4.0, 4.5).toOPVector), + (2.0.toRealNN, Vectors.dense(10.0, 1.5, 1.0).toOPVector) + ) + ) + + val labelMulti = rawLabelMulti.copy(isResponse = true) + + val estimator = new OpRandomForestClassifier().setInput(labelMulti, featuresMulti) + + val expectedResult = Seq( + Prediction(1.0, Array(0.0, 17.0, 3.0), Array(0.0, 0.85, 0.15)), + Prediction(0.0, Array(19.0, 0.0, 1.0), Array(0.95, 0.0, 0.05)), + Prediction(2.0, Array(0.0, 1.0, 19.0), Array(0.0, 0.05, 0.95)), + Prediction(2.0, Array(1.0, 2.0, 17.0), Array(0.05, 0.1, 0.85)), + Prediction(1.0, Array(0.0, 17.0, 3.0), Array(0.0, 0.85, 0.15)), + Prediction(0.0, Array(16.0, 0.0, 4.0), Array(0.8, 0.0, 0.2)), + Prediction(1.0, Array(1.0, 17.0, 2.0), Array(0.05, 0.85, 0.1)), + Prediction(0.0, Array(17.0, 0.0, 3.0), Array(0.85, 0.0, 0.15)), + Prediction(2.0, Array(2.0, 1.0, 17.0), Array(0.1, 0.05, 0.85)), + Prediction(2.0, Array(1.0, 2.0, 17.0), Array(0.05, 0.1, 0.85)) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(10) + .setImpurity(Impurity.Gini.sparkName) + .setMaxBins(33) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.2) + .setSubsamplingRate(0.9) + .setNumTrees(21) + .setSeed(2L) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 10 + 
estimator.predictor.getMaxBins shouldBe 33 + estimator.predictor.getImpurity shouldBe Impurity.Gini.sparkName + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.2 + estimator.predictor.getSubsamplingRate shouldBe 0.9 + estimator.predictor.getNumTrees shouldBe 21 + estimator.predictor.getSeed shouldBe 2L + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala deleted file mode 100644 index 7c38c7474f..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
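In OpRandomForestClassifierTest above, each tree contributes its leaf's class distribution to rawPrediction (a clean 0/1 vote when leaves are pure), so with Spark's default of 20 trees the expected vote arrays sum to 20 and probability is the votes normalized (an assumption about RandomForestClassificationModel consistent with the expected values):

    // Forest votes to scores, matching Prediction(1.0, Array(0.0, 17.0, 3.0), Array(0.0, 0.85, 0.15))
    val votes = Array(0.0, 17.0, 3.0)                  // 20 trees voting across 3 classes
    val probs = votes.map(_ / votes.sum)               // Array(0.0, 0.85, 0.15)
    val prediction = probs.indexOf(probs.max).toDouble // 1.0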
- */ - -package com.salesforce.op.stages.impl.classification - -import com.salesforce.op._ -import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.Impurity.Gini -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.RandomForestClassificationModel -import org.apache.spark.ml.linalg.Vectors -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} - -@RunWith(classOf[JUnitRunner]) -class OpRandomForestTest extends FlatSpec with TestSparkContext { - - val stageNames = Array[String]("RandomForestClassifier_predictionCol", "RandomForestClassifier_rawPredictionCol", - "RandomForestClassifier_probabilityCol" - ) - - lazy val (testData, rawLabel, features) = TestFeatureBuilder[RealNN, OPVector]("label", "features", - Seq( - (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), - (0.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), - (1.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), - (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), - (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector) - ) - ) - - val label = rawLabel.copy(isResponse = true) - - lazy val (multiClassTestData, rawLabelMulti, featuresMulti) = - TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", - Seq( - (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), - (2.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), - (2.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), - (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), - (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector), - (2.0.toRealNN, Vectors.dense(1.0, 4.0, 4.5).toOPVector), - (2.0.toRealNN, Vectors.dense(10.0, 1.5, 1.0).toOPVector) - ) - ) - - val labelMulti = rawLabelMulti.copy(isResponse = true) - - val randomForest = new OpRandomForest().setInput(label, features) - val outputs = randomForest.getOutput() - val (predName, rawName, probName) = (outputs._1.name, outputs._2.name, outputs._3.name) - - val randomForestMulti = new OpRandomForest().setInput(labelMulti, featuresMulti) - val outputsMulti = randomForestMulti.getOutput() - val (predNameMulti, rawNameMulti, probNameMulti) = (outputsMulti._1.name, outputsMulti._2.name, outputsMulti._3.name) - - Spec[OpRandomForest] should "allow the user to set the desired spark parameters" in { - randomForest.setThresholds(Array(1.0, 1.0)) - .setMaxDepth(10) - .setImpurity(Impurity.Gini) - .setMaxBins(33) - .setMinInstancesPerNode(2) - .setMinInfoGain(0.2) - .setSubsamplingRate(0.9) - .setNumTrees(21) - .setSeed(2L) - - randomForest.getSparkParams("thresholds").get.asInstanceOf[Array[Double]] should - contain theSameElementsAs Array(1.0, 1.0) - randomForest.getSparkParams("maxDepth").get.asInstanceOf[Int] shouldBe 10 - randomForest.getSparkParams("maxBins").get.asInstanceOf[Int] shouldBe 33 - randomForest.getSparkParams("impurity").get.asInstanceOf[String] shouldBe Impurity.Gini.sparkName - randomForest.getSparkParams("minInstancesPerNode").get.asInstanceOf[Int] shouldBe 2 - 
randomForest.getSparkParams("minInfoGain").get.asInstanceOf[Double] shouldBe 0.2 - randomForest.getSparkParams("subsamplingRate").get.asInstanceOf[Double] shouldBe 0.9 - randomForest.getSparkParams("numTrees").get.asInstanceOf[Int] shouldBe 21 - randomForest.getSparkParams("seed").get.asInstanceOf[Long] shouldBe 2L - } - - it should "return a properly formed Random Forest when fitted" in { - the[IllegalArgumentException] thrownBy { - randomForest.setInput(label.copy(isResponse = true), features.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." - - val model = randomForest.fit(testData) - - model shouldBe a[SwThreeStageBinaryModel[_, _, _, _, _, _]] - model.stage1 shouldBe a[SwBinaryModel[_, _, _, _]] - - val sparkStage = model.stage1.getSparkMlStage() - assert(sparkStage.get.isInstanceOf[RandomForestClassificationModel]) - assert(model.stage2.getSparkMlStage().isEmpty) - assert(model.stage3.getSparkMlStage().isEmpty) - - model.stage1OperationName shouldBe stageNames(0) - model.stage2OperationName shouldBe stageNames(1) - model.stage3OperationName shouldBe stageNames(2) - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(label.name, features.name) - - val transformedData = model.transform(testData) - - val fields = transformedData.select(rawName, probName, predName).schema.fields - - fields.map(_.name).toList shouldBe List(rawName, probName, predName) - - fields.map(_.dataType.typeName).toList shouldBe List("vector", "vector", "double") - } - - it should "be implemented using shortcuts" in { - val (raw, prob, pred) = features.randomForest(label = label, impurity = Gini) - raw.name shouldBe raw.originStage.getOutputFeatureName - prob.name shouldBe prob.originStage.getOutputFeatureName - pred.name shouldBe pred.originStage.getOutputFeatureName - } - - it should "return a model for multiClassification problem" in { - the[IllegalArgumentException] thrownBy { - randomForestMulti.setInput(labelMulti.copy(isResponse = true), featuresMulti.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." 
- - val modelMulti = randomForestMulti.fit(multiClassTestData) - val transformedDataMulti = modelMulti.transform(multiClassTestData) - val fieldsMulti = transformedDataMulti.select(rawNameMulti, - probNameMulti, predNameMulti).schema.fields - fieldsMulti.map(_.name).toList shouldBe List(rawNameMulti, probNameMulti, predNameMulti) - fieldsMulti.map(_.dataType.typeName).toList shouldBe List("vector", "vector", "double") - } -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala index f02198bc53..3f15feea46 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala @@ -31,31 +31,42 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.stages.FeatureGeneratorStage +import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.binary.BinaryLambdaTransformer -import com.salesforce.op.test.PassengerSparkFixtureTest -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.tuples.RichTuple._ import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class AliasTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class AliasTransformerTest extends OpTransformerSpec[RealNN, AliasTransformer[RealNN]] { + val sample = Seq((RealNN(1.0), RealNN(2.0)), (RealNN(4.0), RealNN(4.0))) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + val transformer = new AliasTransformer(name = "feature").setInput(f1) + val expectedResult: Seq[RealNN] = sample.map(_._1) - Spec[AliasTransformer[_]] should "allow aliasing features" in { - val myFeature = (weight / height).alias - myFeature.name shouldBe "myFeature" - val all = myFeature.originStage.asInstanceOf[BinaryLambdaTransformer[_, _, _]] - - val transformed = all.transform(passengersDataSet) - transformed.columns.contains(myFeature.name) shouldBe true + it should "have a shortcut that changes feature name on a raw feature" in { + val feature = f1.alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[AliasTransformer[_]] + val origin = feature.originStage.asInstanceOf[AliasTransformer[RealNN]] + val transformed = origin.transform(inputData) + transformed.collect(feature) shouldEqual expectedResult } - - it should "copy successfully" in { - val myFeature = ((weight * 2) / height).alias - val copy = myFeature.originStage.copy(new ParamMap()) - copy.uid shouldBe myFeature.originStage.uid + it should "have a shortcut that changes feature name on a derived feature" in { + val feature = (f1 / f2).alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[BinaryLambdaTransformer[_, _, _]] + val origin = feature.originStage.asInstanceOf[BinaryLambdaTransformer[_, _, _]] + val transformed = origin.transform(inputData) + transformed.columns should contain (feature.name) + transformed.collect(feature) shouldEqual sample.map { case (v1, v2) => (v1.v -> v2.v).map(_ / _).toRealNN(0.0) } + } + it should "have a shortcut that changes feature name on a derived wrapped feature" in { + val feature = f1.toIsotonicCalibrated(label = f2).alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[AliasTransformer[_]] } - 
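The AliasTransformerTest rewrite above shows the pattern this change applies across the whole test suite: a spec extends OpTransformerSpec (or OpEstimatorSpec for stages that fit), declares inputData, the transformer under test, and expectedResult, and inherits the generic checks, leaving only scenario-specific tests in the body. A minimal sketch of that contract; the doubling transformer and its names are illustrative only, not code from this diff:

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer
import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class DoublingTransformerTest extends OpTransformerSpec[Real, UnaryLambdaTransformer[Real, Real]] {
  val sample = Seq(1.0.toReal, 2.5.toReal, Real.empty)
  val (inputData, f1) = TestFeatureBuilder(sample)
  val transformer = new UnaryLambdaTransformer[Real, Real](
    operationName = "double",
    transformFn = r => r.v.map(_ * 2.0).toReal
  ).setInput(f1)
  val expectedResult: Seq[Real] = sample.map(r => r.v.map(_ * 2.0).toReal)
  // output-feature formation, transform correctness, copy and serialization
  // checks all come from the base spec; only scenario-specific tests go here
}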
} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala index 85e5daa8d4..814a416eb2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala @@ -49,7 +49,8 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes val result = new OpWorkflow().setResultFeatures(vec).transform(randomData) result.collect(vec) should contain theSameElementsInOrderAs - OPVector(Vectors.dense(0.0, 0.0)) +: Array.fill(expectedRandom.length - 1)(OPVector(Vectors.dense(1.0, 0.0))) + OPVector(Vectors.dense(0.0, 0.0)) +: + Array.fill(expectedRandom.length - 1)(OPVector(Vectors.dense(1.0, 0.0))) } it should "vectorize some real binary content" in { val vec = realBase64.vectorize(topK = 10, minSupport = 0, cleanText = true) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index 4fc0e6a91b..8ff482ad48 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -32,77 +32,65 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, IndColWithGroup, IndVal} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.TestOpVectorColumnType.IndColWithGroup +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class BinaryMapVectorizerTest extends FlatSpec with TestSparkContext { +class BinaryMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] { - lazy val (data, m1, m2) = TestFeatureBuilder("m1", "m2", + val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( (Map("a" -> false, "b" -> true), Map("z" -> false)), (Map("c" -> false), Map("y" -> true, "x" -> true)), (Map.empty[String, Boolean], Map.empty[String, Boolean]) ).map(v => v._1.toBinaryMap -> v._2.toBinaryMap) ) - val vectorizer = new BinaryMapVectorizer().setInput(m1, m2).setCleanKeys(true) - /** - * Note that defaults and filters are tested in [[RealMapVectorizerTest]] - * as that code is shared between the two classes - */ - Spec[BinaryMapVectorizer[_]] should "take an array of features as input and return a single vector feature" in { - val vector = vectorizer.getOutput() + val estimator = new BinaryMapVectorizer().setTrackNulls(false).setCleanKeys(true).setInput(m1, m2) - vector.name shouldBe vectorizer.getOutputFeatureName - vector.parents should contain theSameElementsAs Array(m1, m2) - vector.originStage shouldBe vectorizer - vector.typeName shouldBe 
FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } + val expectedResult: Seq[OPVector] = Seq( + Vectors.sparse(6, Array(1), Array(1.0)), + Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)), + Vectors.sparse(6, Array(), Array()) + ).map(_.toOPVector) - it should "return a model that correctly transforms the data" in { - val transformed = vectorizer.setTrackNulls(false).fit(data).transform(data) - val vector = vectorizer.getOutput() - val expected = Array( - Vectors.sparse(6, Array(1), Array(1.0)), - Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)), - Vectors.sparse(6, Array(), Array()) - ).map(_.toOPVector) + it should "return a model that correctly transforms the data and produces metadata" in { + val transformed = model.transform(inputData) + val vector = estimator.getOutput() val expectedMeta = TestOpVectorMetadataBuilder( - vectorizer, - m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), - IndColWithGroup(None, "C")), + estimator, + m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) ) - transformed.collect(vector) shouldBe expected - val field = transformed.schema(vectorizer.getOutputFeatureName) + transformed.collect(vector) shouldBe expectedResult + val field = transformed.schema(estimator.getOutputFeatureName) OpVectorMetadata(field) shouldEqual expectedMeta - val vectorMetadata = vectorizer.getMetadata() + val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta } - it should " track nulls" in { - val transformed = vectorizer.setTrackNulls(true).fit(data).transform(data) - val vector = vectorizer.getOutput() + it should "return a model that correctly transforms the data and produces metadata with null tracking" in { + val transformed = estimator.setTrackNulls(true).fit(inputData).transform(inputData) + val vector = estimator.getOutput() val expected = Array( Vectors.sparse(12, Array(2, 5, 9, 11), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(12, Array(1, 3, 7, 8, 10), Array(1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) val expectedMeta = TestOpVectorMetadataBuilder( - vectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), @@ -112,10 +100,9 @@ class BinaryMapVectorizerTest extends FlatSpec with TestSparkContext { ) transformed.collect(vector) shouldBe expected - val field = transformed.schema(vectorizer.getOutputFeatureName) + val field = transformed.schema(estimator.getOutputFeatureName) OpVectorMetadata(field) shouldEqual expectedMeta - val vectorMetadata = vectorizer.getMetadata() + val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala index cc24908ddd..f7277ee642 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala @@ 
-33,19 +33,18 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, RootCol} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class BinaryVectorizerTest extends FlatSpec with TestSparkContext { +class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] { - val (ds, f1, f2) = TestFeatureBuilder( + val (inputData, f1, f2) = TestFeatureBuilder( Seq[(Binary, Binary)]( (Binary(false), Binary(false)), (Binary(false), Binary(true)), @@ -59,17 +58,23 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { ) ) - Spec[BinaryVectorizer] should "take an array of features as input and return a single vector feature" in { - val vectorizer = new BinaryVectorizer().setInput(f1, f2) - val vector = vectorizer.getOutput() - vector.name shouldBe vectorizer.getOutputFeatureName - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } + val transformer = new BinaryVectorizer().setInput(f1, f2) // default settings: trackNulls = true, setFillValue = false + + val expectedResult = Seq( + Array(0.0, 0.0, 0.0, 0.0), + Array(0.0, 0.0, 1.0, 0.0), + Array(1.0, 0.0, 0.0, 0.0), + Array(1.0, 0.0, 1.0, 0.0), + Array(0.0, 1.0, 0.0, 0.0), + Array(0.0, 1.0, 1.0, 0.0), + Array(0.0, 0.0, 0.0, 1.0), + Array(1.0, 0.0, 0.0, 1.0), + Array(0.0, 1.0, 0.0, 1.0) + ).map(Vectors.dense(_).toOPVector) it should "transform the data correctly [trackNulls=true,fillValue=false]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(true).setFillValue(false) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0, 0.0, 0.0), @@ -93,7 +98,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=true,fillValue=true]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(true).setFillValue(true) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0, 0.0, 0.0), @@ -117,7 +122,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=false,fillValue=false]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(false).setFillValue(false) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0), @@ -141,7 +146,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=false,fillValue=true]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(false).setFillValue(true) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val 
expected = Array( Array(0.0, 0.0), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 2706d20ba1..58436bba1e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -31,23 +31,21 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.feature.DateListPivot._ import com.salesforce.op.test.TestOpVectorColumnType.IndCol -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.Vectors import org.joda.time.{DateTime, DateTimeConstants} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateListVectorizerTest extends FlatSpec with TestSparkContext { +class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] { // Sunday July 12th 1998 at 22:45 val defaultDate = new DateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -94,23 +92,15 @@ class DateListVectorizerTest extends FlatSpec with TestSparkContext { val testVectorizer = new DateListVectorizer[DateList]() val outputName = "vecDateList" - Spec[DateListVectorizer[_]] should "have output name set correctly" in { - testVectorizer.operationName shouldBe outputName - } + // OpTransformer base tests + val inputData = testDataCurrent - it should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testVectorizer.getOutput()) - } + val transformer = new DateListVectorizer[DateList]().setInput(clicks, opens, purchases) - it should "return a single output feature of the correct type" in { - val output = testVectorizer.setInput(clicks, opens, purchases).getOutput() - output shouldBe new Feature[OPVector]( - name = testVectorizer.getOutputFeatureName, - originStage = testVectorizer, - isResponse = false, - parents = Array(clicks, opens, purchases) - ) - } + val expectedResult = Seq(Vectors.dense(0.0, 1.0, 0.0, 1.0, 0.0, 1.0).toOPVector, + Vectors.dense(3.0, 0.0, 0.0, 0.0, 0.0, 0.0).toOPVector, + Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0, 0.0).toOPVector, + Vectors.dense(2.0, 0.0, 1.0, 0.0, -1.0, 0.0).toOPVector) it should "vectorize with SinceFirst" in { val testModelTimeSinceFirst = testVectorizer.setInput(clicks, opens, purchases).setPivot(SinceFirst) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala index 486c32523b..d88eb87bba 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala @@ -34,19 +34,17 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import 
com.salesforce.op.stages.impl.feature.TimePeriod._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ - import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.Transformer import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { +class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( @@ -56,13 +54,12 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { new JDateTime(2017, 4, 17, 18, 0, 0, 0), new JDateTime(1918, 2, 13, 3, 0, 0, 0) ) - val expectedHourOfDayOutput = Array( - Array(1.0, 0.0), - Array(0.0, 1.0), - Array(-1.0, 0.0), - Array(0.0, -1.0), - Array(math.sqrt(2.0) / 2, math.sqrt(2.0) / 2) - ).map(Vectors.dense(_).toOPVector) + + val (inputData, f1) = TestFeatureBuilder(sampleDateTimes.map(x => Date(x.getMillis))) + + val transformer = new DateToUnitCircleTransformer().setInput(f1) + + val expectedResult: Seq[OPVector] = transformData(sampleDateTimes, HourOfDay) def transformData[T <: TimePeriod](data: Seq[JDateTime], timePeriod: T): Array[OPVector] = { val dataTimeStamps: Seq[Date] = data.map(x => Date(x.getMillis())) @@ -78,24 +75,13 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { .map(Vectors.dense(_).toOPVector) } - Spec[DateToUnitCircleTransformer[_]] should - "take an array of features as input and return a single vector feature" in { - val dataTimeStamps: Seq[Date] = sampleDateTimes.map(x => Date(x.getMillis())) - val (ds, f) = TestFeatureBuilder(dataTimeStamps) - val vectorizer = new DateToUnitCircleTransformer().setInput(f) - val vector = vectorizer.getOutput() - vector.name shouldBe vectorizer.getOutputFeatureName - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } - it should "work with its shortcut" in { val dataTimeStamps: Seq[Date] = sampleDateTimes.map(x => Date(x.getMillis())) val (ds, dateFeature) = TestFeatureBuilder(dataTimeStamps) val output = dateFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "work with its DateTime shortcut" in { @@ -104,7 +90,7 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { val output = dateTimeFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "store the proper meta data" in { @@ -137,7 +123,7 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly when the timePeriod is 
HourOfDay" in { val actual = transformData(sampleDateTimes, HourOfDay) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfYear" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index f0807b0d16..c2d59429a6 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -249,3 +249,4 @@ object DecisionTreeNumericBucketizerTestHelper extends Matchers { } } + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala index d3c2f3f4f7..5784a95021 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -35,7 +35,9 @@ import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomReal} +import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.RichMetadata._ import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec @@ -81,6 +83,17 @@ class DecisionTreeNumericMapBucketizerTest extends FlatSpec with TestSparkContex val expectedSplits = Array(Double.NegativeInfinity, 15, 26, 91, Double.PositiveInfinity) } + lazy val (data, target, currencyMap, realMap) = TestFeatureBuilder("target", "currencyMap", "realMap2", + Seq[(RealNN, CurrencyMap, RealMap)]( + (1.0.toRealNN, CurrencyMap(Map("c0" -> 10)), RealMap.empty), + (1.0.toRealNN, CurrencyMap(Map("c0" -> 10)), RealMap.empty), + (1.0.toRealNN, CurrencyMap(Map("c0" -> 8)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 5)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 3)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 0)), RealMap.empty) + ) + ) + Spec[DecisionTreeNumericMapBucketizer[_, _]] should "produce output that is never a response, " + "except the case where both inputs are" in new NormalData { Seq( @@ -143,6 +156,24 @@ class DecisionTreeNumericMapBucketizerTest extends FlatSpec with TestSparkContex ) } + it should "drop empty numeric map" in { + val targetResponse = target.copy(isResponse = true) + val currencyMapBkts = currencyMap.autoBucketize(label = targetResponse, trackNulls = false, minInfoGain = 0.1) + val realMapBkts = realMap.autoBucketize(label = targetResponse, trackNulls = false, minInfoGain = 0.1) + val featureVector = Seq(currencyMapBkts, realMapBkts).transmogrify(Some(targetResponse)) + + val transformed = new OpWorkflow().setResultFeatures(currencyMapBkts, realMapBkts, featureVector).transform(data) + + // featureVector should consist of bucketized features from currencyMap and no feature from realMap + val featureVectorMeta = 
OpVectorMetadata(transformed.schema(featureVector.name)) + featureVectorMeta.columns.length shouldBe 2 + featureVectorMeta.columns.foreach{ col => + col.parentFeatureName should contain theSameElementsAs Seq("currencyMap") + col.parentFeatureType should contain theSameElementsAs Seq("com.salesforce.op.features.types.CurrencyMap") + col.indicatorGroup shouldBe Some("c0") + } + } + private def assertBucketizer ( bucketizer: DecisionTreeNumericMapBucketizer[_, _ <: OPMap[_]], diff --git a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala similarity index 52% rename from core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala index d9f42ca61c..83ff87113f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala @@ -29,42 +29,59 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.sequence +package com.salesforce.op.stages.impl.feature -import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ -import org.apache.spark.ml.param.ParamMap import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class SequenceTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class FilterIntegralMapTest extends OpTransformerSpec[IntegralMap, FilterMap[IntegralMap]] { + + val (inputData, f1Int) = TestFeatureBuilder[IntegralMap]( + Seq( + IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap(Map("Knight" -> 5)) + ) + ) + val transformer = new FilterMap[IntegralMap]().setInput(f1Int) - val toMP = new SequenceLambdaTransformer[Real, MultiPickList](operationName = "MP", - transformFn = value => MultiPickList(value.map(_.v.getOrElse(0.0).toString).toSet) + val expectedResult: Seq[IntegralMap] = Seq( + IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap(Map("Knight" -> 5)) ) - Spec[SequenceLambdaTransformer[_, _]] should "work when returning a MultiPickList feature" in { - toMP.setInput(age, weight) - val transformedData = toMP.transform(passengersDataSet) - val columns = transformedData.columns - assert(columns.contains(toMP.getOutputFeatureName)) - val output = toMP.getOutput() - val answer = passengersArray.map(r => - toMP.transformFn(Seq(r.getFeatureType[Real](age), r.getFeatureType[Real](weight))) + it should "filter by whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + IntegralMap(Map("Arthur" -> 1)), + IntegralMap.empty, + IntegralMap(Map("Knight" -> 5)) ) - transformedData.collect(output) shouldBe answer + + filtered should contain theSameElementsAs dataExpected } - it should "copy successfully" in { - val tr = new SequenceLambdaTransformer[Text, Text]( - 
operationName = "foo", - transformFn = x => x.head + it should "filter by blacklisted keys" in { + transformer.setInput(f1Int) + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap.empty ) - tr.copy(new ParamMap()).uid shouldBe tr.uid + + filtered should contain theSameElementsAs dataExpected } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala deleted file mode 100644 index 06fcf23bbc..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package com.salesforce.op.stages.impl.feature - -import com.salesforce.op._ -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} - - -@RunWith(classOf[JUnitRunner]) -class FilterMapTest extends FlatSpec with TestSparkContext { - - val (ds, f1) = TestFeatureBuilder[TextMap]( - Seq( - TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), - TextMap(Map("Knight" -> "Ni")) - ) - ) - - val filter = new FilterMap[TextMap]().setInput(f1) - - - val (dsInt, f1Int) = TestFeatureBuilder[IntegralMap]( - Seq( - IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), - IntegralMap(Map("Knight" -> 5)) - ) - ) - val filterInt = new FilterMap[IntegralMap]().setInput(f1Int) - - - val (dsCat, f1Cat) = TestFeatureBuilder[MultiPickListMap]( - Seq( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), - "Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"))), - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"), - "Bedevere" -> Set("Wise", "Knight"))), - MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) - ) - ) - val filterCat = new FilterMap[MultiPickListMap]().setInput(f1Cat) - - - classOf[FilterMap[_]].getSimpleName should "return single properly formed feature" in { - val filtered = filter.getOutput() - - filtered.name shouldBe filter.getOutputFeatureName - filtered.originStage shouldBe filter - filtered.parents shouldBe Array(f1) - } - - it should "filter TextMap by whitelisted keys" in { - filter.setWhiteListKeys(Array("Arthur", "Knight")) - - val filtered = filter.transform(ds).collect(filter.getOutput) - val dataExpected = Array( - TextMap(Map("Arthur" -> "King")), - TextMap.empty, - TextMap(Map("Knight" -> "Ni")) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter TextMap by blacklisted keys" in { - filter.setInput(f1) - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - val filtered = filter.transform(ds).collect(filter.getOutput) - - val dataExpected = Array( - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure")), - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), - TextMap.empty - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter IntegralMap by whitelisted keys" in { - filterInt.setWhiteListKeys(Array("Arthur", "Knight")) - val filtered = filterInt.transform(dsInt).collect(filterInt.getOutput()) - - val dataExpected = Array( - IntegralMap(Map("Arthur" -> 1)), - IntegralMap.empty, - IntegralMap(Map("Knight" -> 5)) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter IntegralMap by blacklisted keys" in { - filterInt.setInput(f1Int) - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - val filtered = filterInt.transform(dsInt).collect(filterInt.getOutput()) - - val dataExpected = Array( - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3)), - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), - IntegralMap.empty - ) - - filtered should contain 
theSameElementsAs dataExpected - } - - it should "filter MultiPickListMap by whitelisted keys" in { - filterCat.setWhiteListKeys(Array("Arthur", "Knight")) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter MultiPickListMap by blacklisted keys" in { - filterCat - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"))), - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"), - "Bedevere" -> Set("Wise", "Knight"))), - MultiPickListMap.empty - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter correctly when using shortcut" in { - val filtered = f1.filter(whiteList = Seq("Arthur", "Knight"), blackList = Seq()) - - filtered.name shouldBe filtered.originStage.getOutputFeatureName - filtered.originStage shouldBe a[FilterMap[_]] - filtered.parents shouldBe Array(f1) - } - - it should "set cleanMapFlag correctly" in { - filter.setCleanText(false) - filter.get[Boolean](filter.cleanText).get shouldBe false - filter.setCleanKeys(false) - filter.get[Boolean](filter.cleanKeys).get shouldBe false - } - - it should "not clean map when flag set to false" in { - filterCat - .setCleanText(false) - .setCleanKeys(false) - .setWhiteListKeys(Array("Arthur", "Knight")) - .setBlackListKeys(Array()) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) - ) - filtered should contain theSameElementsAs dataExpected - } - - it should "clean map when flag set to true" in { - filterCat - .setCleanKeys(true) - .setCleanText(true) - .setWhiteListKeys(Array("Arthur", "Knight")) - .setBlackListKeys(Array()) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) - ) - filtered should contain theSameElementsAs dataExpected - } - -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala new file mode 100644 index 0000000000..b4a98f5e92 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class FilterMultiPickListMapTest extends OpTransformerSpec[MultiPickListMap, FilterMap[MultiPickListMap]] { + val (inputData, f1Cat) = TestFeatureBuilder[MultiPickListMap]( + Seq( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), + "Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) + ) + ) + val transformer = new FilterMap[MultiPickListMap]().setInput(f1Cat) + + val expectedResult = Seq( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), + "Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + + it should "filter whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "filter blacklisted keys" in { + transformer + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap.empty + ) + + filtered 
should contain theSameElementsAs dataExpected + } + + it should "not clean map when flag set to false" in { + transformer + .setCleanText(false) + .setCleanKeys(false) + .setWhiteListKeys(Array("Arthur", "Knight")) + .setBlackListKeys(Array()) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) + ) + filtered should contain theSameElementsAs dataExpected + } + + it should "clean map when flag set to true" in { + transformer + .setCleanKeys(true) + .setCleanText(true) + .setWhiteListKeys(Array("Arthur", "Knight")) + .setBlackListKeys(Array()) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + filtered should contain theSameElementsAs dataExpected + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala new file mode 100644 index 0000000000..3cdae3044f --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
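Besides key filtering, FilterMultiPickListMapTest above also fixes the semantics of the two cleaning flags: cleanKeys normalizes map keys, and cleanText normalizes the values themselves, so "Ekke Ekke Ekke Ekke Ptang Zoo Boing" collapses to "EkkeEkkeEkkeEkkePtangZooBoing" only when cleanText is on. A short sketch of the two configurations, using the test's own transformer:

// raw values preserved: whitespace inside picklist entries survives
transformer.setCleanText(false).setCleanKeys(false)

// normalized values: text cleaning strips spaces and special characters
// "Ekke Ekke Ekke Ekke Ptang Zoo Boing" -> "EkkeEkkeEkkeEkkePtangZooBoing"
transformer.setCleanText(true).setCleanKeys(true)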
+ */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op._ +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class FilterTextMapTest extends OpTransformerSpec[TextMap, FilterMap[TextMap]] { + val (inputData, f1) = TestFeatureBuilder[TextMap]( + Seq( + TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap(Map("Knight" -> "Ni")) + ) + ) + + val transformer = new FilterMap[TextMap]().setInput(f1) + + val expectedResult: Seq[TextMap] = Array( + TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap(Map("Knight" -> "Ni")) + ) + + it should "filter whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + + val filtered = transformer.transform(inputData).collect(transformer.getOutput) + val dataExpected = Array( + TextMap(Map("Arthur" -> "King")), + TextMap.empty, + TextMap(Map("Knight" -> "Ni")) + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "filter blacklisted keys" in { + transformer.setInput(f1) + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput) + + val dataExpected = Array( + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap.empty + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "set cleanMapFlag correctly" in { + transformer.setCleanText(false) + transformer.get[Boolean](transformer.cleanText).get shouldBe false + transformer.setCleanKeys(false) + transformer.get[Boolean](transformer.cleanKeys).get shouldBe false + } + + it should "filter correctly when using shortcut" in { + val filtered = f1.filter(whiteList = Seq("Arthur", "Knight"), blackList = Seq()) + + filtered.name shouldBe filtered.originStage.getOutputFeatureName + filtered.originStage shouldBe a[FilterMap[_]] + filtered.parents shouldBe Array(f1) + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala index dbc7ea26ff..2e301a2a93 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala @@ -33,17 +33,15 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class JaccardSimilarityTest extends FlatSpec with TestSparkContext { +class JaccardSimilarityTest extends OpTransformerSpec[RealNN, JaccardSimilarity] { - val (ds, f1, f2) = TestFeatureBuilder( + val (inputData, f1, f2) = TestFeatureBuilder( Seq( (Seq("Red", 
"Green"), Seq("Red")), (Seq("Red", "Green"), Seq("Yellow, Blue")), @@ -51,15 +49,10 @@ class JaccardSimilarityTest extends FlatSpec with TestSparkContext { ).map(v => v._1.toMultiPickList -> v._2.toMultiPickList) ) - val jacSimTrans = new JaccardSimilarity().setInput(f1, f2) + val transformer = new JaccardSimilarity().setInput(f1, f2) - classOf[JaccardSimilarity].getSimpleName should "return single properly formed feature" in { - val jaccard = jacSimTrans.getOutput() + val expectedResult: Seq[RealNN] = Seq(0.5, 0.0, 1.0).toRealNN - jaccard.name shouldBe jacSimTrans.getOutputFeatureName - jaccard.parents shouldBe Array(f1, f2) - jaccard.originStage shouldBe jacSimTrans - } it should "have a shortcut" in { val jaccard = f1.jaccardSimilarity(f2) @@ -70,32 +63,25 @@ class JaccardSimilarityTest extends FlatSpec with TestSparkContext { it should "return 1 when both vectors are empty" in { val set1 = Seq.empty[String].toMultiPickList val set2 = Seq.empty[String].toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe 1.0.toRealNN + transformer.transformFn(set1, set2) shouldBe 1.0.toRealNN } it should "return 1 when both vectors are the same" in { val set1 = Seq("Red", "Blue", "Green").toMultiPickList val set2 = Seq("Red", "Blue", "Green").toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe 1.0.toRealNN + transformer.transformFn(set1, set2) shouldBe 1.0.toRealNN } it should "calculate similarity correctly when vectors are different" in { val set1 = Seq("Red", "Green", "Blue").toMultiPickList val set2 = Seq("Red", "Blue").toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe (2.0 / 3.0).toRealNN + transformer.transformFn(set1, set2) shouldBe (2.0 / 3.0).toRealNN val set3 = Seq("Red").toMultiPickList val set4 = Seq("Blue").toMultiPickList - jacSimTrans.transformFn(set3, set4) shouldBe 0.0.toRealNN + transformer.transformFn(set3, set4) shouldBe 0.0.toRealNN val set5 = Seq("Red", "Yellow", "Green").toMultiPickList val set6 = Seq("Pink", "Green", "Blue").toMultiPickList - jacSimTrans.transformFn(set5, set6) shouldBe (1.0 / 5.0).toRealNN - } - - it should "calculate similarity correctly on a dataset" in { - val transformed = jacSimTrans.transform(ds) - val output = jacSimTrans.getOutput() - val actualOutput = transformed.collect(output) - actualOutput shouldBe Seq(0.5, 0.0, 1.0).toRealNN + transformer.transformFn(set5, set6) shouldBe (1.0 / 5.0).toRealNN } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala index 837174f9c5..f4c3cb360a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala @@ -32,20 +32,19 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.text.Language import org.apache.spark.ml.Transformer import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class LangDetectorTest extends FlatSpec with TestSparkContext { +class LangDetectorTest extends OpTransformerSpec[RealMap, LangDetector[Text]] { // scalastyle:off - val (ds, f1, f2, f3) = TestFeatureBuilder( + 
val (inputData, f1, f2, f3) = TestFeatureBuilder( Seq( ( "I've got a lovely bunch of coconuts".toText, @@ -65,37 +64,31 @@ class LangDetectorTest extends FlatSpec with TestSparkContext { ) ) // scalastyle:on - val langDetector = new LangDetector[Text]().setInput(f1) + val transformer = new LangDetector[Text]().setInput(f1) - classOf[LangDetector[_]].getSimpleName should "return single properly formed feature" in { - val output1 = langDetector.getOutput() + private val langMap = f1.detectLanguages() - output1.name shouldBe langDetector.getOutputFeatureName - output1.parents shouldBe Array(f1) - output1.originStage shouldBe langDetector - } + // English result + val expectedResult: Seq[RealMap] = Seq( + Map("en" -> 0.9999984360934321), + Map("en" -> 0.9999900853228016), + Map("en" -> 0.9999900116744931) + ).map(_.toRealMap) it should "return empty RealMap when input text is empty" in { - langDetector.transformFn(Text.empty) shouldBe RealMap.empty - } - - it should "detect English language" in { - assertDetectionResults( - results = langDetector.setInput(f1).transform(ds).collect(langDetector.getOutput()), - expectedLanguage = Language.English - ) + transformer.transformFn(Text.empty) shouldBe RealMap.empty } it should "detect Japanese language" in { assertDetectionResults( - results = langDetector.setInput(f2).transform(ds).collect(langDetector.getOutput()), + results = transformer.setInput(f2).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.Japanese ) } it should "detect French language" in { assertDetectionResults( - results = langDetector.setInput(f3).transform(ds).collect(langDetector.getOutput()), + results = transformer.setInput(f3).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.French ) } @@ -104,7 +97,7 @@ class LangDetectorTest extends FlatSpec with TestSparkContext { val tokenized = f1.detectLanguages() assertDetectionResults( - results = tokenized.originStage.asInstanceOf[Transformer].transform(ds).collect(tokenized), + results = tokenized.originStage.asInstanceOf[Transformer].transform(inputData).collect(tokenized), expectedLanguage = Language.English ) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala index cb26746d74..a0eb2a7c1c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala @@ -36,19 +36,21 @@ import java.io.FileInputStream import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.RandomText import com.salesforce.op.utils.spark.RichDataset._ import org.apache.commons.io.IOUtils import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class MimeTypeDetectorTest extends FlatSpec with TestSparkContext with Base64TestData { +class MimeTypeDetectorTest extends OpTransformerSpec[Text, MimeTypeDetector] with Base64TestData { + val inputData = randomData + val transformer = new MimeTypeDetector().setInput(randomBase64) + val expectedResult = expectedRandom - Spec[MimeTypeDetector] should "validate the type 
hint" in { + it should "validate the type hint" in { assertThrows[IllegalArgumentException](new MimeTypeDetector().setTypeHint("blarg")) } it should "validate the ma bytes to parse" in { @@ -73,9 +75,9 @@ class MimeTypeDetectorTest extends FlatSpec with TestSparkContext with Base64Tes result.collect(mime) should contain theSameElementsInOrderAs expectedMimeJson } - } + trait Base64TestData { self: TestSparkContext => diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala new file mode 100644 index 0000000000..d891ebe261 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.features.types._ +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.text.Language +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class NameEntityRecognizerTest extends OpTransformerSpec[MultiPickListMap, NameEntityRecognizer[Text]] { + + // Base tests + val (inputData, inputText) = TestFeatureBuilder(Seq( + ("Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is " + + "chairman of Elsevier N.V., the Dutch publishing group. 
+ val (inputData, inputText) = TestFeatureBuilder(Seq( + ("Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is " + + "chairman of Elsevier N.V., the Dutch publishing group. Rudolph Agnew, 55 years " + + "old and former chairman of Consolidated Gold Fields PLC, was named a director of this " + + "British industrial conglomerate.").toText)) + + val transformer = new NameEntityRecognizer[Text].setInput(inputText) + + val expectedResult: Seq[MultiPickListMap] = Seq( + Map("Rudolph" -> Set("Person"), + "Agnew" -> Set("Person"), + "Consolidated" -> Set("Organization"), + "Vinken" -> Set("Person"), + "Gold" -> Set("Organization"), + "PLC" -> Set("Organization"), + "Pierre" -> Set("Person"), + "Fields" -> Set("Organization") + ).toMultiPickListMap) + + it should "find the same set of named entities using the shortcut in RichTextFeatures" in { + val nameEntityRecognizer = inputText.recognizeEntities().originStage.asInstanceOf[NameEntityRecognizer[Text]] + .setInput(inputText) + val transformed = nameEntityRecognizer.transform(inputData) + val output = nameEntityRecognizer.getOutput() + transformed.collect(output) shouldEqual expectedResult + } + + it should "find named entities for Dutch text" in { + // scalastyle:off + val input = ("Pierre Vinken, 61 jaar oud, treedt toe tot het bestuur als een niet-uitvoerende " + + "directeur op Nov. 29. De heer Vinken is voorzitter van Elsevier N.V., de Nederlandse uitgeversgroep. " + + "Rudolph Agnew, 55 jaar oud en voormalig voorzitter van Consolidated Gold Fields PLC, werd benoemd tot " + + "bestuurder van dit Britse industriële conglomeraat.").toText + val expectedOutput = Map( + "Nederlandse" -> Set("Misc"), + "Nov." -> Set("Organization"), + "Consolidated" -> Set("Misc"), + "Vinken" -> Set("Person"), + "Pierre" -> Set("Person"), + "Britse" -> Set("Misc") + ).toMultiPickListMap + new NameEntityRecognizer[Text]().setDefaultLanguage(Language.Dutch).transformFn(input) shouldEqual expectedOutput + // scalastyle:on + } + + it should "return an empty map when there's no pre-trained named entity recognition model for the given language" in { + val input = ("Pierre Vinken, mwenye umri wa miaka 61, atajiunga na bodi hiyo kama mkurugenzi asiyetarajiwa " + + "Novemba 29. Mheshimiwa Vinken ni mwenyekiti wa Elsevier N.V., kundi la kuchapisha Kiholanzi. " + + "Rudolph Agnew, mwenye umri wa miaka 55 na mwenyekiti wa zamani wa Mkutano Mkuu wa Gold Fields, " + + "aliitwa mkurugenzi wa muungano huu wa viwanda wa Uingereza.").toText + val expectedOutput = Map.empty[String, Set[String]].toMultiPickListMap + new NameEntityRecognizer[Text]().setDefaultLanguage(Language.Swahili).transformFn(input) shouldEqual expectedOutput + } + + // TODO: add a test for Spanish NER after finding the Spanish tokenizer +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala new file mode 100644 index 0000000000..541043fc34 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op._ +import com.salesforce.op.OpWorkflow +import com.salesforce.op.features.Feature +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer +import com.salesforce.op.test.{FeatureTestBase, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.testkit.{RandomIntegral, RandomReal} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class NumericVectorizerTest extends FlatSpec with FeatureTestBase { + + val ageData: Seq[Real] = RandomReal.uniform[Real](maxValue = 80.0).limit(100) + val heightData: Seq[Real] = RandomReal.normal[Real](mean = 65.0, sigma = 8).limit(100) + val countData: Seq[Integral] = RandomIntegral.integrals(0, 10).limit(100) + val labelTransformer = new UnaryLambdaTransformer[Real, RealNN](operationName = "labelFunc", + transformFn = { + case SomeValue(Some(x)) if x > 30.0 => 1.0.toRealNN + case _ => 0.0.toRealNN + } + ) + + Spec[RichRealFeature[_]] should "vectorize a small sample of real values" in { + val inputData = Seq(-4, -3, -2, -1, 1, 2, 3, 4).map(_.toReal) + val labelData = Seq(0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0).map(_.toRealNN) + val generatedData = inputData.zip(labelData) + val (ds, input, label) = TestFeatureBuilder("input", "label", generatedData) + val autoBucketFeature = Seq(input).transmogrify(label = Some(label.copy(isResponse = true))) + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature).transform(ds) + // value col, null indicator col, bucket 0 indicator, bucket 1 indicator + val expected = Array( + Array(-4.0, 0.0, 1.0, 0.0), + Array(-3.0, 0.0, 1.0, 0.0), + Array(-2.0, 0.0, 1.0, 0.0), + Array(-1.0, 0.0, 1.0, 0.0), + Array(1.0, 0.0, 0.0, 1.0), + Array(2.0, 0.0, 0.0, 1.0), + Array(3.0, 0.0, 0.0, 1.0), + Array(4.0, 0.0, 0.0, 1.0) + ).map(Vectors.dense(_).toOPVector) + vectorized.collect(autoBucketFeature) should contain theSameElementsAs expected + }
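+ // The two checks below verify that transmogrify(label = Some(...)) is equivalent to manually combining vectorize(...) with autoBucketize(...): with a response supplied, a numeric feature is kept as its raw value (plus a null-indicator column) and is also bucketized against the label, giving the value / null-indicator / bucket-indicator layout asserted above.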
+ it should "vectorize single real feature with a label" in { + val (ds, age) = TestFeatureBuilder("age", ageData) + val labelData = age.transformWith(labelTransformer).asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(age).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq( + age.vectorize(fillValue = 0, fillWithMean = true, trackNulls = true), + age.autoBucketize(labelData, trackNulls = false) + ).combine() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray + } + } + it should "vectorize multiple real features with a label" in { + val generatedData: Seq[(Real, Real)] = ageData.zip(heightData) + val (ds, age, height) = TestFeatureBuilder("age", "height", generatedData) + val labelData = age.transformWith(labelTransformer).asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(age, height).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq( + age, age.autoBucketize(labelData, trackNulls = false), + height, height.autoBucketize(labelData, trackNulls = false) + ).transmogrify() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoVector, manualVector) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoVector.v.toArray should contain theSameElementsAs manualVector.v.toArray + } + } + Spec[RichIntegralFeature[_]] should "vectorize single integral feature with a label" in { + val (ds, count) = TestFeatureBuilder("count", countData) + val labelTransformer = new UnaryLambdaTransformer[Integral, RealNN](operationName = "labelFunc", + transformFn = { + case SomeValue(Some(x)) if x > 5 => 1.0.toRealNN + case _ => 0.0.toRealNN + } + ) + val labelData = labelTransformer.setInput(count).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(count).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq(count, count.autoBucketize(labelData, trackNulls = false)).transmogrify() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoCount, manualCount) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoCount.v.toArray should contain theSameElementsAs manualCount.v.toArray + } + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index 393cbc0266..57bb661453 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -463,3 +463,5 @@ } } + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala index 31da7287d0..252aa4c994 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala @@ -266,8 +266,7 @@ val vectorizedStage = untypedVectorizedStage.asInstanceOf[OpSetVectorizer[_]] val inputDF = TestOpWorkflowBuilder(df, vectorized).computeDataUpTo(vectorized) - val vectorizedDF = vectorizedStage.fit(inputDF).transform(inputDF) - val featArray = vectorizedDF.collect(vectorized) + val featArray = inputDF.collect(vectorized) featArray.foreach { opVec => opVec.value.size shouldBe 5 } } diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 78923d31f1..4812f2adae 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -151,14 +151,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} } - it should "detect two non categorical text features" in { + it should "use separate hash space for each text feature" in { val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setMaxCardinality(1).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) .setCleanKeys(false) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) .setInput(m1, m2).getOutput() val smartVectorized = new SmartTextVectorizer() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setMaxCardinality(1).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) .setInput(f1, f2).getOutput() val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) @@ -172,8 +174,76 @@ mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) - if (m.index < 4) m.indicatorGroup shouldBe Option(f1.name) - else m.indicatorGroup shouldBe Option(f2.name) + if (m.index < 4 || m.index == 8) m.indicatorGroup shouldBe Option(f1.name) + else if (m.index < 8 || m.index == 9) m.indicatorGroup shouldBe Option(f2.name) m.indicatorValue shouldBe f.indicatorValue + } + + result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + }
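+ // HashSpaceStrategy semantics as exercised by the tests below (a summary inferred from the assertions, not from the implementation): Separate gives every text feature its own hash space, Shared hashes all features into one common space, and Auto appears to fall back to a shared space once numFeatures reaches TransmogrifierDefaults.MaxNumOfFeatures; in every case the null indicators are appended after the hashed columns.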
+ + it should "use shared hash space for two text features" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setNumFeatures(4).setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(m1, m2).getOutput() + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setNumFeatures(4).setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(f1, f2).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val result = transformed.collect(smartMapVectorized, smartVectorized) + + val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + mapMeta.history.keys shouldBe Set(m1.name, m2.name) + mapMeta.columns.length shouldBe meta.columns.length + + mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + m.parentFeatureName shouldBe Array(m1.name) + m.parentFeatureType shouldBe Array(m1.typeName) + if (m.index == 4) { + assert(m.indicatorGroup === Option(f1.name), s"first null indicator should be from ${f1.name}") + } else if (m.index == 5) { + assert(m.indicatorGroup === Option(f2.name), s"second null indicator should be from ${f2.name}") + } + m.indicatorValue shouldBe f.indicatorValue + } + + result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + } + + it should "use shared hash space for two text features when the hash space strategy is Auto" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setNumFeatures(TransmogrifierDefaults.MaxNumOfFeatures).setHashSpaceStrategy(HashSpaceStrategy.Auto) + .setInput(m1, m2).getOutput() + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setNumFeatures(TransmogrifierDefaults.MaxNumOfFeatures).setHashSpaceStrategy(HashSpaceStrategy.Auto) + .setInput(f1, f2).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val result = transformed.collect(smartMapVectorized, smartVectorized) + + val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + mapMeta.history.keys shouldBe Set(m1.name, m2.name) + mapMeta.columns.length shouldBe meta.columns.length + + mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + m.parentFeatureName shouldBe Array(m1.name) + m.parentFeatureType shouldBe Array(m1.typeName) + if (m.index == TransmogrifierDefaults.MaxNumOfFeatures) { + assert(m.indicatorGroup === Option(f1.name), s"first null indicator should be from ${f1.name}") + } else if (m.index > TransmogrifierDefaults.MaxNumOfFeatures) { + assert(m.indicatorGroup === Option(f2.name), s"second null indicator should be from ${f2.name}") + } m.indicatorValue shouldBe f.indicatorValue } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala index a91a677428..b8961248d9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala @@ -47,8 +47,10 @@ val (ds, f1) = TestFeatureBuilder( Seq[(TextMap)]( - TextMap(Map("k1" -> "A giraffe drinks by the watering hole", "k2" -> "Cheese")), - TextMap(Map("k2" -> "French Fries")), + TextMap(Map("k1" -> "A giraffe drinks by the watering hole", "k2" -> "Cheese", "k3" -> "Hello", "k4" -> "Bye")), + // scalastyle:off + TextMap(Map("k2" -> "French Fries", "k4" -> "\uA7BC\u10C8\u2829\u29BA\u23E1")), + // scalastyle:on TextMap(Map("k3" -> "Hip-hop Pottamus")) ) ) @@ -68,9 +70,9 @@ val vector = vectorizer.getOutput() val expected = Array( - Array(0.0, 0.0, 1.0), - Array(1.0, 0.0, 1.0), - Array(1.0, 1.0, 0.0) + Array(0.0, 0.0, 0.0, 0.0), + Array(1.0, 0.0, 1.0, 1.0), + Array(1.0, 1.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) transformed.collect(vector) shouldBe expected @@ -80,7 +82,8 @@ f1 -> List( IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k1"), IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k2"), - IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k3") + IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k3"), + IndColWithGroup(name = 
Option(TransmogrifierDefaults.NullString), groupName = "k4") ) ) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala index 8bd3fcfa41..62c328c7b3 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala @@ -70,15 +70,15 @@ class TextTokenizerTest extends FlatSpec with TestSparkContext { trait English { val expected = Array( - List("i'v", "got", "love", "bunch", "coconut").toTextList, - List("all", "stand", "row").toTextList, - List("big", "on", "small", "on", "some", "big", "your", "head").toTextList, - List("bodi", "big", "on", "small", "h1", "on", "h1", "some", "big", "your", "head", "bodi").toTextList, + List("got", "love", "bunch", "coconut").toTextList, + List("stand", "row").toTextList, + List("big", "on", "small", "on", "big", "head").toTextList, + List("bodi", "big", "on", "small", "h1", "on", "h1", "big", "head", "bodi").toTextList, TextList.empty ) val expectedHtml = { val copy = expected.toList.toArray - copy(3) = List("big", "on", "small", "on", "some", "big", "your", "head").toTextList + copy(3) = List("big", "on", "small", "on", "big", "head").toTextList copy } } @@ -188,10 +188,10 @@ class TextTokenizerTest extends FlatSpec with TestSparkContext { input = english, tokenizer = tokenized.originStage.asInstanceOf[TextTokenizer[Text]], expected = Array( - List("i've", "got", "lovely", "bunch", "coconuts").toTextList, - List("all", "standing", "row").toTextList, - List("big", "ones", "small", "ones", "some", "big", "your", "head").toTextList, - List("body", "big", "ones", "small", "h1", "ones", "h1", "some", "big", "your", "head", "body").toTextList, + List("got", "lovely", "bunch", "coconuts").toTextList, + List("standing", "row").toTextList, + List("big", "ones", "small", "ones", "big", "head").toTextList, + List("body", "big", "ones", "small", "h1", "ones", "h1", "big", "head", "body").toTextList, TextList.empty ) ) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index c8c5ed7cc7..f777c9b0f8 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -32,19 +32,23 @@ package com.salesforce.op.stages.impl.insights import com.salesforce.op.FeatureHistory -import com.salesforce.op.stages.impl.classification.{OpLogisticRegression, OpRandomForest} -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.testkit.{RandomIntegral, RandomReal, RandomVector} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.classification.{OpLogisticRegression, OpRandomForestClassifier} import com.salesforce.op.stages.impl.preparators.SanityCheckDataTest import com.salesforce.op.stages.impl.regression.OpLinearRegression +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.testkit.{RandomIntegral, RandomReal, RandomVector} +import com.salesforce.op.utils.spark.RichDataset._ import 
com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import org.apache.spark.ml.classification.{LogisticRegressionModel, RandomForestClassificationModel} +import org.apache.spark.ml.regression.LinearRegressionModel import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + @RunWith(classOf[JUnitRunner]) class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { @@ -57,7 +61,9 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val dfWithMeta = addMetaData(df, "features", 40) val sparkModel = new OpLogisticRegression().setInput(l1r, f1).fit(df) - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LogisticRegressionModel] + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 20) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -71,9 +79,11 @@ val (df, f1, l1) = TestFeatureBuilder("features", "labels", features.zip(labels)) val l1r = l1.copy(isResponse = true) val dfWithMeta = addMetaData(df, "features", 40) - val sparkModel = new OpRandomForest().setInput(l1r, f1).fit(df) + val sparkModel = new OpRandomForestClassifier().setInput(l1r, f1).fit(df) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[RandomForestClassificationModel] - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1).setTopK(2) + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1).setTopK(2) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 2) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -93,8 +103,10 @@ val l1r = l1.copy(isResponse = true) val dfWithMeta = addMetaData(df, "features", 40) val sparkModel = new OpLinearRegression().setInput(l1r, f1).fit(df) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LinearRegressionModel] - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1) + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 20) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -155,7 +167,9 @@ val (testData, name, labelNoRes, featureVector) = TestFeatureBuilder("name", "label", "features", data) val label = labelNoRes.copy(isResponse = true) val testDataMeta = addMetaData(testData, "features", 5) - val model = new OpLogisticRegression().setInput(label, featureVector).fit(testData) + val sparkModel = new OpLogisticRegression().setInput(label, featureVector).fit(testData) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LogisticRegressionModel]
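+ // RecordInsightsLOCO now takes the underlying Spark ML model rather than the OP wrapper stage, hence the unwrap repeated throughout this spec: cast the fitted stage to SparkWrapperParams and pull the wrapped model out of getSparkMlStage().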
val transformer = new RecordInsightsLOCO(model).setInput(featureVector) @@ -168,4 +182,4 @@ parsed.foreach { case (_, in) => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } } -} \ No newline at end of file +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala index 9d0fb44f3b..c9d82b1d85 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala @@ -31,7 +31,7 @@ package com.salesforce.op.stages.impl.preparators -import com.salesforce.op.OpWorkflow +import com.salesforce.op.{OpWorkflow, UID} import com.salesforce.op.features.types._ import com.salesforce.op.features.{Feature, FeatureLike} import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer @@ -53,6 +53,11 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging { // loggingLevel(Level.INFO) + override def beforeAll: Unit = { + super.beforeAll + UID.reset() + } + Spec[SanityChecker] should "correctly identify label leakage in PickList features using the Cramer's V criterion" + " when the label corresponds to a binary classification problem" in { // First set up the raw features: diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala index 5995a5a402..9c1ed41b26 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala @@ -32,10 +32,11 @@ package com.salesforce.op.stages.impl.preparators import com.salesforce.op._ +import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.MetadataParam import com.salesforce.op.stages.base.binary.BinaryModel -import com.salesforce.op.stages.impl.feature.RealNNVectorizer +import com.salesforce.op.stages.impl.feature.{HashSpaceStrategy, RealNNVectorizer, SmartTextMapVectorizer} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichMetadata._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} @@ -60,9 +61,40 @@ case class SanityCheckDataTest case class SCDataTest(label: RealNN, features: OPVector) +case class TextRawData +( + id: String, + target: Double, + textMap: Map[String, String] +) + @RunWith(classOf[JUnitRunner]) class SanityCheckerTest extends FlatSpec with TestSparkContext { + private val textRawData = Seq( + TextRawData("0", 1.0, Map("color" -> "red", "fruit" -> "berry", "beverage" -> "tea")), + TextRawData("1", 1.0, Map("color" -> "orange", "fruit" -> "berry", "beverage" -> "coffee")), + TextRawData("2", 1.0, Map("color" -> "yello", "fruit" -> "berry", "beverage" -> "water")), + TextRawData("3", 1.0, Map("color" -> "green", "fruit" -> "berry")), + TextRawData("4", 1.0, Map("color" -> "blue", "fruit" -> "berry")), + TextRawData("5", 1.0, Map("color" -> "indigo", "fruit" -> "berry")), + TextRawData("6", 0.0, Map("fruit" -> "peach")), + TextRawData("7", 0.0, Map("fruit" -> "peach")), + TextRawData("8", 0.0, Map("fruit" -> "mango")), + TextRawData("9", 0.0, Map("beverage" -> "tea")), + TextRawData("10", 0.0, Map("beverage" -> 
"coffee")), + TextRawData("11", 0.0, Map("beverage" -> "water")) + ).map( textRawData => + ( + textRawData.id.toText, + textRawData.target.toRealNN, + textRawData.textMap.toTextMap + ) + ) + + val (textData, id, target, textMap) = TestFeatureBuilder("id", "target", "textMap", textRawData) + val targetResponse: FeatureLike[RealNN] = target.copy(isResponse = true) + // scalastyle:off private val data = Seq( SanityCheckDataTest("alex", 32, 5.0, 0, 1, 0.5, 0), @@ -314,6 +346,76 @@ class SanityCheckerTest extends FlatSpec with TestSparkContext { "requirement failed: The sanity checker has dropped all of your features, check your input data quality" } + it should "remove individual text hash features independently" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(8).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(textMap).getOutput() + + val checkedFeatures = new SanityChecker() + .setCheckSample(1.0) + .setRemoveBadFeatures(true) + .setRemoveFeatureGroup(true) + .setProtectTextSharedHash(true) + .setMinCorrelation(0.0) + .setMaxCorrelation(0.8) + .setMaxCramersV(0.8) + .setInput(targetResponse, smartMapVectorized) + .getOutput() + + checkedFeatures.originStage shouldBe a[SanityChecker] + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, checkedFeatures).transform(textData) + + val featuresToDrop = Seq("textMap_4", "textMap_7", "textMap_color_NullIndicatorValue_8") + val featuresWithCorr = Seq("textMap_0", "textMap_1", "textMap_2", "textMap_3", "textMap_4", "textMap_5", + "textMap_6", "textMap_color_NullIndicatorValue_8", "textMap_fruit_NullIndicatorValue_9", + "textMap_beverage_NullIndicatorValue_10" + ) + val featuresWithNaNCorr = Seq("textMap_7") + + validateTransformerOutput(checkedFeatures.name, transformed, featuresWithCorr, featuresToDrop, featuresWithNaNCorr) + } + + it should "remove text hash features as groups" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) + .setInput(textMap).getOutput() + + val checkedFeatures = new SanityChecker() + .setCheckSample(1.0) + .setRemoveBadFeatures(true) + .setRemoveFeatureGroup(true) + .setProtectTextSharedHash(true) + .setMinCorrelation(0.0) + .setMaxCorrelation(0.8) + .setMaxCramersV(0.8) + .setInput(targetResponse, smartMapVectorized) + .getOutput() + + checkedFeatures.originStage shouldBe a[SanityChecker] + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, checkedFeatures).transform(textData) + + val featuresToDrop = Seq("textMap_color_0", "textMap_color_1", "textMap_color_2", "textMap_color_3", + "textMap_fruit_4", "textMap_fruit_5", "textMap_fruit_6", "textMap_fruit_7", + "textMap_beverage_8", "textMap_beverage_9", + "textMap_color_NullIndicatorValue_12", "textMap_fruit_NullIndicatorValue_13" + ) + val featuresWithCorr = Seq("textMap_color_0", "textMap_color_3", + "textMap_fruit_5", "textMap_fruit_6", "textMap_fruit_7", + "textMap_beverage_10", "textMap_beverage_11", + "textMap_color_NullIndicatorValue_12", "textMap_fruit_NullIndicatorValue_13", + "textMap_beverage_NullIndicatorValue_14" + ) + val featuresWithNaNCorr = Seq("textMap_color_1", "textMap_color_2", "textMap_fruit_4", + "textMap_beverage_8", "textMap_beverage_9" + ) + + validateTransformerOutput(checkedFeatures.name, transformed, 
featuresWithCorr, featuresToDrop, featuresWithNaNCorr) + } + private def validateEstimatorOutput(outputColName: String, model: BinaryModel[RealNN, OPVector, OPVector], expectedFeaturesToDrop: Seq[String], label: String): Unit = { val metadata = model.getMetadata() diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala new file mode 100644 index 0000000000..1e0ae75d86 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[DecisionTreeRegressionModel], + OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpDecisionTreeRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala new file mode 100644 index 0000000000..dde440e5f9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpGBTRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTRegressionModel], + OpPredictorWrapper[GBTRegressor, GBTRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpGBTRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala new file mode 100644 index 0000000000..58cec6e044 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{GeneralizedLinearRegression, GeneralizedLinearRegressionModel} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpGeneralizedLinearRegressionTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[GeneralizedLinearRegressionModel], + OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpGeneralizedLinearRegression().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0, 9.99), + Prediction(20.0, 19.99), + Prediction(30.0, 29.99), + Prediction(40.0, 40.0), + Prediction(50.0, 50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setRegParam(0.1) + .setFitIntercept(true) + .setTol(1E-4) + .setSolver("normal") + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getFitIntercept shouldBe true + estimator.predictor.getTol shouldBe 1E-4 + estimator.predictor.getSolver shouldBe "normal" + + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala index fa28f442e2..6efcf3232e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala @@ -32,62 +32,53 @@ package com.salesforce.op.stages.impl.regression import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.preparators.SanityChecker -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel} 
+import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.regression.LinearRegressionModel +import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OpLinearRegressionTest extends FlatSpec with TestSparkContext { - val stageNames = Array[String]("LinearRegression_predictionCol") +class OpLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearRegressionModel], + OpPredictorWrapper[LinearRegression, LinearRegressionModel]] with PredictionEquality { - val (ds, rawLabel, features) = TestFeatureBuilder( + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) - ) + ) ) val label = rawLabel.copy(isResponse = true) - val linReg = new OpLinearRegression().setInput(label, features) - - Spec[OpLinearRegression] should "have output with correct origin stage" in { - val output = linReg.getOutput() - assert(output.originStage.isInstanceOf[SwBinaryEstimator[_, _, _, _, _]]) - the[IllegalArgumentException] thrownBy { - linReg.setInput(label.copy(isResponse = true), features.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." 
- } - - it should "return a properly formed LinearRegressionModel when fitted" in { - val model = linReg.setSparkParams("maxIter", 10).fit(ds) - assert(model.isInstanceOf[SwBinaryModel[RealNN, OPVector, RealNN, LinearRegressionModel]]) - - val sparkStage = model.getSparkMlStage() - - sparkStage.isDefined shouldBe true - sparkStage.get shouldBe a[LinearRegressionModel] - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(label.name, features.name) - } + val estimator = new OpLinearRegression().setInput(label, features) + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) it should "allow the user to set the desired spark parameters" in { - linReg.setMaxIter(10).setRegParam(0.1) - linReg.getMaxIter shouldBe 10 - linReg.getRegParam shouldBe 0.1 + estimator + .setMaxIter(10) + .setRegParam(0.1) + .setFitIntercept(true) + .setElasticNetParam(0.1) + .setSolver("normal") + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getFitIntercept shouldBe true + estimator.predictor.getElasticNetParam shouldBe 0.1 + estimator.predictor.getSolver shouldBe "normal" - linReg.setFitIntercept(true).setElasticNetParam(0.1).setSolver("normal") - linReg.getFitIntercept shouldBe true - linReg.getElasticNetParam shouldBe 0.1 - linReg.getSolver shouldBe "normal" } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala new file mode 100644 index 0000000000..7a9080b0f9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpRandomForestRegressorTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[RandomForestRegressionModel], + OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpRandomForestRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(20.0), + Prediction(23.5), + Prediction(31.5), + Prediction(35.5), + Prediction(37.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(7) + .setMaxBins(3) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + .setSeed(42L) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 7 + estimator.predictor.getMaxBins shouldBe 3 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + estimator.predictor.getSeed shouldBe 42L + + } +} diff --git a/core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala similarity index 80% rename from core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala index ea612d29a4..11b75b6772 100644 --- a/core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala @@ -29,19 +29,20 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package org.apache.spark.ml.regression +package com.salesforce.op.stages.impl.regression import com.salesforce.op.features.types.{Prediction, RealNN} +import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter.toOP import com.salesforce.op.test._ import com.salesforce.op.testkit._ -import org.apache.spark.ml.SparkModelConverter.toOP +import org.apache.spark.ml.regression._ import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class OpPredictionModelTest extends FlatSpec with TestSparkContext { +class OpRegressionModelTest extends FlatSpec with TestSparkContext { private val label = RandomIntegral.integrals(0, 2).limit(1000) .map{ v => RealNN(v.value.map(_.toDouble).getOrElse(0.0)) } @@ -57,19 +58,19 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } - Spec[OpLinearPredictionModel] should "produce the same values as the spark version" in { + Spec[OpLinearRegressionModel] should "produce the same values as the spark version" in { val spk = new LinearRegression() .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -80,7 +81,7 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -91,11 +92,21 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } + Spec[OpGeneralizedLinearRegressionModel] should "produce the same values as the spark version" in { + val spk = new GeneralizedLinearRegression() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) + } def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { val sorted1 = df1.collect().sortBy(_.getAs[Double](2)) @@ -106,3 +117,5 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { } } } + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala index 5e98c480f4..f69c7ee0c1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala @@ -52,12 +52,12 @@ class DataBalancerTest extends FlatSpec with TestSparkContext { // Generate positive observations following a distribution ~ N((0.0, 0.0, 0.0), I_3) val positiveData = { RandomRDDs.normalVectorRDD(sc, bigCount, 3, seed = seed) - .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDS() + .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF() } // Generate negative observations following a distribution ~ N((10.0, 
10.0, 10.0), I_3) val negativeData = { RandomRDDs.normalVectorRDD(sc, smallCount, 3, seed = seed) - .map(v => (0.0, Vectors.dense(v.toArray.map(_ + 10.0)), "B")).toDS() + .map(v => (0.0, Vectors.dense(v.toArray.map(_ + 10.0)), "B")).toDF() } val data = positiveData.union(negativeData) @@ -77,7 +77,7 @@ val (downSample, upSample) = dataBalancer.getProportions(smallCount, bigCount, sampleFraction, maxTrainingSample) val reSampled = dataBalancer.rebalance(negativeData, upSample, positiveData, downSample, seed) - val Array(negData, posData) = Array(0.0, 1.0).map(label => reSampled.filter(_._1 == label).persist()) + val Array(negData, posData) = Array(0.0, 1.0).map(label => reSampled.filter(_.getDouble(0) == label).persist()) val negativeCount = negData.count() val positiveCount = posData.count() @@ -107,8 +107,13 @@ balancer.getDownSampleFraction shouldBe downSample balancer.getIsPositiveSmall shouldBe false + // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } @@ -125,7 +130,11 @@ balancer.getAlreadyBalancedFraction shouldBe 1.0 // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } @@ -144,7 +153,11 @@ balancer.getAlreadyBalancedFraction shouldBe maxSize.toDouble / (smallCount + bigCount) // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala index 0293b7799a..706472c37f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala @@ -54,8 +54,8 @@ class DataCutterTest extends FlatSpec with TestSparkContext { val data = labels.zip(vectors).zip(labelsBiased) val dataSize = data.size - val randDF = sc.makeRDD(data.map { case ((l, v), b) => (l.toDouble.get, v.value, b.toString) }).toDS() - val biasDF = sc.makeRDD(data.map { case ((l, v), b) => (b.toDouble.get, v.value, l.toString) }).toDS() + val randDF = sc.makeRDD(data.map { case ((l, v), b) => (l.toDouble.get, v.value, b.toString) }).toDF() + val biasDF = sc.makeRDD(data.map { case ((l, v), b) => (b.toDouble.get, v.value, l.toString) }).toDF() val seed = 42L Spec[DataCutter] should "not filter out any data when the parameters are permissive" in {
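The tuning specs above and below all migrate their fixtures from typed Datasets (toDS()) to untyped DataFrames (toDF()), which is why label access switches from tuple fields to the Row API (reSampled.filter(_.getDouble(0) == label)). A minimal standalone sketch of the two access styles (hypothetical data, assuming Spark 2.x with a local SparkSession; not part of the patch):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
import spark.implicits._

// Typed Dataset: rows are tuples, so the label is the first tuple field
val ds = Seq((1.0, "A"), (0.0, "B")).toDS()
ds.filter(_._1 == 1.0)

// Untyped DataFrame (Dataset[Row]): the label is read positionally off the Row
val df = Seq((1.0, "A"), (0.0, "B")).toDF("label", "group")
df.filter(_.getDouble(0) == 1.0)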
a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala @@ -47,7 +47,7 @@ class DataSplitterTest extends FlatSpec with TestSparkContext { val data = RandomRDDs.normalVectorRDD(sc, 1000, 3, seed = seed) - .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDS() + .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF() val dataSplitter = new DataSplitter().setSeed(seed) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala index 5665bcfde7..116f94f979 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala @@ -34,24 +34,23 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.evaluators.Evaluators import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.classification.ProbabilisticClassifierType.{ProbClassifier, ProbClassifierModel} -import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomIntegral, RandomReal, RandomVector} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.sql.functions.monotonically_increasing_id import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.types.MetadataBuilder +import com.salesforce.op.utils.spark.RichDataset._ @RunWith(classOf[JUnitRunner]) class OpValidatorTest extends FlatSpec with TestSparkContext { // Random Data val count = 1000 val sizeOfVector = 2 - val seed = 1234L + val seed = 12345L val p = 0.325 val multiClassProbabilities = Array(0.21, 0.29, 0.5) val vectors = RandomVector.sparse(RandomReal.uniform[Real](-1.0, 1.0), sizeOfVector).take(count) @@ -75,27 +74,27 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { stratify = true ) - val rdd = data.withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()).rdd + val binaryDS = data.select(label, features) + val multiDS = data.select(multiLabel, features) - val binaryRDD = rdd.map { - case Row(label, features, _, index) => (label, features, index).asInstanceOf[LabelFeaturesKey] - } - - val multiRDD = rdd.map { - case Row(_, features, multiLabel, index) => (multiLabel, features, index).asInstanceOf[LabelFeaturesKey] - } + val condition = cv.isClassification && cv.stratify + val balancer = Option(new DataBalancer()) + val cutter = Option(new DataCutter()) Spec[OpCrossValidation[_, _]] should "stratify binary class data" in { - val splits = cv.createTrainValidationSplits(binaryRDD) + val splits = cv.createTrainValidationSplits(condition, binaryDS, label.name, balancer) + splits.length shouldBe ValidatorParamDefaults.NumFolds splits.foreach { case (train, validate) => assertFractions(Array(1 - p, p), train) assertFractions(Array(1 - p, p), validate) } + balancer.get.metadataBuilder.build() should not be new MetadataBuilder().build() } it should "stratify multi class data" in { - val splits = cv.createTrainValidationSplits(multiRDD) + val splits = cv.createTrainValidationSplits(condition, multiDS, multiLabel.name, cutter) + splits.length 
shouldBe ValidatorParamDefaults.NumFolds splits.foreach { case (train, validate) => assertFractions(multiClassProbabilities, train) assertFractions(multiClassProbabilities, validate) @@ -104,15 +103,16 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { Spec[OpTrainValidationSplit[_, _]] should "stratify binary class data" in { - val splits = ts.createTrainValidationSplits(binaryRDD) + val splits = ts.createTrainValidationSplits(condition, binaryDS, label.name, balancer) splits.foreach { case (train, validate) => assertFractions(Array(1 - p, p), train) assertFractions(Array(1 - p, p), validate) } + balancer.get.metadataBuilder.build() should not be new MetadataBuilder().build() } it should "stratify multi class data" in { - val splits = ts.createTrainValidationSplits(multiRDD) + val splits = ts.createTrainValidationSplits(condition, multiDS, multiLabel.name, cutter) splits.foreach { case (train, validate) => assertFractions(multiClassProbabilities, train) assertFractions(multiClassProbabilities, validate) @@ -132,7 +132,8 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { }.groupByKey().mapValues(_.size / n).sortBy(_._1).values.collect() fractions zip fractionsByClass map { case (expected, actual) => - math.abs(expected - actual) should be < 0.05 } + math.abs(expected - actual) should be < 0.065 + } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala index 659c737ba2..e85cd7f5a5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala @@ -66,13 +66,4 @@ class SparkWrapperParamsTest extends FlatSpec with BeforeAndAfterEach with TestC swEstimator.getSparkMlStage() shouldBe None } - it should "when setting the stage it should also set path" in { - // should should be none because nothing is set - swEstimator.getStageSavePath().get shouldBe swEstimator.getSavePath() - - swEstimator.setSavePath(path) - swEstimator.setSparkMlStage(Some(new StandardScaler())) - - swEstimator.getStageSavePath().get shouldBe swEstimator.getSavePath() - } } diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala index f7830d39a8..b421f255ec 100644 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.features.types._ +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import com.salesforce.op.test.{PrestigeData, TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} @@ -46,11 +47,11 @@ class OpPredictorWrapperTest extends FlatSpec with TestSparkContext with Prestig val log = LoggerFactory.getLogger(this.getClass) - val (ds, targetLabel, featureVector) = TestFeatureBuilder[Real, OPVector]( - prestigeSeq.map(p => p.prestige.toReal -> Vectors.dense(p.education, p.income, p.women).toOPVector) + val (ds, targetLabel, featureVector) = TestFeatureBuilder[RealNN, 
OPVector]( + prestigeSeq.map(p => p.prestige.toRealNN -> Vectors.dense(p.education, p.income, p.women).toOPVector) ) - Spec[OpPredictorWrapper[_, _, _, _]] should + Spec[OpPredictorWrapper[_, _]] should "be able to run a simple logistic regression model (fitIntercept=true)" in { val lrModel: LinearRegressionModel = fitLinRegModel(fitIntercept = true) lrModel.intercept.abs should be > 1E-6 @@ -69,12 +70,11 @@ class OpPredictorWrapperTest extends FlatSpec with TestSparkContext with Prestig .setElasticNetParam(0.8) .setFitIntercept(fitIntercept) - val lr = - new OpPredictorWrapper[Real, Real, LinearRegression, LinearRegressionModel](lrBase) - .setInput(targetLabel, featureVector) + val lr = new OpPredictorWrapper[LinearRegression, LinearRegressionModel](lrBase) + .setInput(targetLabel, featureVector) // Fit the model - val model = lr.fit(ds) + val model = lr.fit(ds).asInstanceOf[SparkWrapperParams[LinearRegressionModel]] val lrModel = model.getSparkMlStage().get // Print the coefficients and intercept for linear regression diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala deleted file mode 100644 index d9db98dc50..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package com.salesforce.op.stages.sparkwrappers.specific - -import com.salesforce.op.features.FeatureSparkTypes -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, LogisticRegression, LogisticRegressionModel} -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -class OpProbabilisticClassifierWrapperTest extends FlatSpec with TestSparkContext { - - val (testData, targetLabel, featureVector) = TestFeatureBuilder("label", "features", - Seq[(RealNN, OPVector)]( - 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, - 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, - 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, - 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, - 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector, - 0.0.toRealNN -> Vectors.dense(12.0, 3.3, -0.1).toOPVector - ) - ) - - Spec[OpProbabilisticClassifierWrapper[_, _]] should "have the correct params set (fitIntercept = true)" in { - val lrClassifierModel: LogisticRegressionModel = fitLrModel(fitInterceptParam = true) - lrClassifierModel.intercept.abs should be > 1E-6 - } - - it should "have the correct params set (logreg with fitIntercept = false)" in { - val lrClassifierModel: LogisticRegressionModel = fitLrModel(fitInterceptParam = false) - lrClassifierModel.intercept.abs should be < Double.MinPositiveValue - } - - it should "should have the expected feature name (decision tree)" in { - val wrappedEstimator = - new OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( - new DecisionTreeClassifier() - ).setInput(targetLabel, featureVector) - - val (out1, out2, out3) = wrappedEstimator.getOutput() - - out1.name shouldBe wrappedEstimator.stage1.getOutput().name - out2.name shouldBe wrappedEstimator.stage2.getOutput().name - out3.name shouldBe wrappedEstimator.stage3.getOutput().name - } - - it should "have the correct params set (decision tree with maxDepth = 1)" in { - val depth = 1 - val dtClassifierModel: DecisionTreeClassificationModel = fitDtModel(depth) - assert(dtClassifierModel.toDebugString.contains(s"depth $depth")) - } - - it should "have the correct params set (decision tree with maxDepth = 2)" in { - val depth = 2 - val dtClassifierModel: DecisionTreeClassificationModel = fitDtModel(depth) - assert(dtClassifierModel.toDebugString.contains(s"depth $depth")) - } - - it should "ignore values set for input and output cols outside the OP wrapper" in { - // configure input classifier and set input col names outside of OP wrapper - val customLabelColName = "indexedLabel" - val customFeaturesColName = "indexedFeatures" - val customProbCol = "xxx" - val customPredCol = "yyy" - val customRawCol = "zzz" - val dtClassifier = new DecisionTreeClassifier() - - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val dtEstimator = - new 
OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel](dtClassifier) - .setInput(targetLabel, featureVector) - - // verify that the colnames configured outside the opwrapper where ignored and are what is expected - val inputNames = dtEstimator.stage1.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(targetLabel.name, featureVector.name) - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val model = dtEstimator.fit(testData) - - dtEstimator.uid shouldBe model.uid - - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val (out1, out2, out3) = model.getOutput() - val output = model.transform(testData) - - output.schema shouldBe StructType(Array( - StructField(targetLabel.name, DoubleType, true), - StructField(featureVector.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out2.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out3.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out1.name, DoubleType, true) - )) - } - - def fitDtModel(depth: Int): DecisionTreeClassificationModel = { - val dtClassifier = new DecisionTreeClassifier().setMaxDepth(depth) - - val dtEstimator = new OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( - dtClassifier - ).setInput(targetLabel, featureVector) - - val model = dtEstimator.fit(testData) - val output = model.transform(testData) - - val dtClassifierModel = model.stage1.getSparkMlStage().get - dtClassifierModel - } - - def fitLrModel(fitInterceptParam: Boolean): LogisticRegressionModel = { - val regParam = 0.3 - val elasticNetParam = 0.8 - val maxIterParam = 100 - val tolParam = 1E-6 - - val lrClassifier = new LogisticRegression() - .setRegParam(regParam) - .setElasticNetParam(elasticNetParam) - .setMaxIter(maxIterParam) - .setTol(tolParam) - .setFitIntercept(fitInterceptParam) - - val testEstimator = new OpProbabilisticClassifierWrapper[LogisticRegression, LogisticRegressionModel]( - lrClassifier - ).setInput(targetLabel, featureVector) - - val model = testEstimator.fit(testData) - val output = model.transform(testData) - - val lrClassifierModel = model.stage1.getSparkMlStage().get - - lrClassifierModel.getRegParam shouldBe regParam - lrClassifierModel.getElasticNetParam shouldBe elasticNetParam - lrClassifierModel.getMaxIter shouldBe maxIterParam - lrClassifierModel.getTol shouldBe tolParam - lrClassifierModel.getFitIntercept shouldBe fitInterceptParam - - lrClassifierModel - } -} - - diff --git a/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala new file mode 100644 index 0000000000..bd94c6cfda --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.feature.NameEntityRecognizer +import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.text.NameEntityType._ +import opennlp.tools.util.Span +import org.junit.runner.RunWith +import org.scalatest._ +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpenNLPNameEntityTaggerTest extends FlatSpec with TestCommon { + + val nerTagger = new OpenNLPNameEntityTagger() + + Spec[OpenNLPNameEntityTagger] should "return consistent results as expected" in { + val input = Seq( + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.", + "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named " + + "a director of this British industrial conglomerate."
+ ) + val tokens: Seq[TextList] = input.map(x => NameEntityRecognizer.Analyzer.analyze(x, Language.English).toTextList) + val expectedOutputs = Seq( + Map("Vinken" -> Set(Person), "Pierre" -> Set(Person)), + Map("Agnew" -> Set(Person), "Rudolph" -> Set(Person)) + ) + tokens.zip(expectedOutputs).foreach { case (tokenInput, expected) => + nerTagger.tag(tokenInput.value, Language.English, Seq(NameEntityType.Person)).tokenTags shouldEqual expected + } + } + + it should "load all the existing name entity recognition models" in { + val languageNameEntityPairs = Seq( + (Language.English, NameEntityType.Date), + (Language.English, NameEntityType.Location), + (Language.English, NameEntityType.Money), + (Language.English, NameEntityType.Organization), + (Language.English, NameEntityType.Percentage), + (Language.English, NameEntityType.Person), + (Language.English, NameEntityType.Time), + (Language.Spanish, NameEntityType.Location), + (Language.Spanish, NameEntityType.Organization), + (Language.Spanish, NameEntityType.Person), + (Language.Spanish, NameEntityType.Misc), + (Language.Dutch, NameEntityType.Location), + (Language.Dutch, NameEntityType.Organization), + (Language.Dutch, NameEntityType.Person), + (Language.Dutch, NameEntityType.Misc) + ) + languageNameEntityPairs.foreach { case (l, n) => + OpenNLPModels.getTokenNameFinderModel(l, n).isDefined shouldBe true + } + } + + it should "not return any model if no such model exists" in { + val languageNameEntityPairs = Seq( + (Language.Unknown, NameEntityType.Other), + (Language.Urdu, NameEntityType.Location) + ) + languageNameEntityPairs.foreach { case (l, n) => + OpenNLPModels.getTokenNameFinderModel(l, n) shouldBe None + } + } + + // test the convertSpansToMap function + it should "retrieve correct information from the output of name entity recognition model" in { + val inputs = Seq(Array("ab", "xx", "yy", "zz", "ss", "dd", "cc") -> + Seq(new Span(2, 4, "person"), new Span(3, 5, "location")), // interweaving entities + Array("a", "b", "c", "d") -> Seq(new Span(3, 4, "location")), // end of sentence entity + Array("a", "b", "c", "d") -> Seq(new Span(0, 2, "location")), // beginning of sentence entity + Array("a", "b", "c", "d") -> Seq.empty + ) + val expectedOutputs = Seq( + Map("yy" -> Set(Person), "zz" -> Set(Person, Location), "ss" -> Set(Location)), + Map("d" -> Set(Location)), + Map("a" -> Set(Location), "b" -> Set(Location)), + Map.empty[String, Set[String]] + ) + + inputs.zip(expectedOutputs).foreach { case (tokensInput, expected) => + val actual = nerTagger.convertSpansToMap(tokensInput._2, tokensInput._1) + actual shouldEqual expected + } + } + +} diff --git a/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala new file mode 100644 index 0000000000..14c6a5bbc9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.feature.TextTokenizer +import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult +import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.text.Language._ +import opennlp.tools.sentdetect.SentenceModel +import opennlp.tools.tokenize.TokenizerModel +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpenNLPSentenceSplitterTest extends FlatSpec with TestCommon { + + val splitter = new OpenNLPSentenceSplitter() + + Spec[OpenNLPSentenceSplitter] should "split an English paragraph into sentences" in { + val input = + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29. " + + "Mr Vinken is chairman of Elsevier N.V., the Dutch publishing group. Rudolph Agnew, 55 years old and " + + "former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate." + + splitter.getSentences(input, language = English) shouldEqual Seq( + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29.", + "Mr Vinken is chairman of Elsevier N.V., the Dutch publishing group.", + "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, " + + "was named a director of this British industrial conglomerate." 
+ ) + + TextTokenizer.tokenize(input.toText, sentenceSplitter = Option(splitter), defaultLanguage = English) shouldEqual + TextTokenizerResult(English, Seq( + Seq("pierr", "vinken", "61", "year", "old", "will", "join", "board", + "nonexecut", "director", "nov", "29").toTextList, + Seq("mr", "vinken", "chairman", "elsevi", "n.v", "dutch", "publish", "group").toTextList, + Seq("rudolph", "agnew", "55", "year", "old", "former", "chairman", "consolid", "gold", "field", "plc", + "name", "director", "british", "industri", "conglomer").toTextList)) + + TextTokenizer.tokenize(input.toText, analyzer = new OpenNLPAnalyzer(), sentenceSplitter = Option(splitter), + defaultLanguage = English) shouldEqual TextTokenizerResult( + English, Seq( + Seq("pierre", "vinken", ",", "61", "years", "old", ",", "will", "join", "the", "board", "as", "a", + "nonexecutive", "director", "nov", "29", ".").toTextList, + Seq("mr", "vinken", "is", "chairman", "of", "elsevier", "n", ".v.", ",", "the", "dutch", "publishing", + "group", ".").toTextList, + Seq("rudolph", "agnew", ",", "55", "years", "old", "and", "former", "chairman", "of", "consolidated", + "gold", "fields", "plc", ",", "was", "named", "a", "director", "of", "this", "british", "industrial", + "conglomerate", ".").toTextList)) + } + + it should "split a Portuguese text into sentences" in { + // scalastyle:off + val input = "Depois de Guimarães, o North Music Festival estaciona este ano no Porto. A partir de sexta-feira, " + + "a Alfândega do Porto recebe a segunda edição deste festival de dois dias. No cartaz há nomes como os " + + "portugueses Linda Martini e Mão Morta, mas também Guano Apes ou os DJ’s portugueses Rich e Mendes." + + splitter.getSentences(input, language = Portuguese) shouldEqual Seq( + "Depois de Guimarães, o North Music Festival estaciona este ano no Porto.", + "A partir de sexta-feira, a Alfândega do Porto recebe a segunda edição deste festival de dois dias.", + "No cartaz há nomes como os portugueses Linda Martini e Mão Morta, mas também Guano Apes ou os DJ’s " + + "portugueses Rich e Mendes." 
+ ) + // scalastyle:on + } + + it should "load a sentence detection and tokenizer model for a language if they exist" in { + val languages = Seq(Danish, Portuguese, English, Dutch, German, Sami) + languages.foreach { language => + OpenNLPModels.getSentenceModel(language).exists(_.isInstanceOf[SentenceModel]) shouldBe true + OpenNLPModels.getTokenizerModel(language).exists(_.isInstanceOf[TokenizerModel]) shouldBe true + } + } + + it should "not load a sentence detection and tokenizer model for a language if they do not exist" in { + val languages = Seq(Japanese, Czech) + languages.foreach { language => + OpenNLPModels.getSentenceModel(language) shouldEqual None + OpenNLPModels.getTokenizerModel(language) shouldEqual None + } + } + + it should "return non-preprocessed input if no such sentence detection model exists" in { + // scalastyle:off + val input = "ピエール・ヴィンケン(61歳)は、11月29日に臨時理事に就任します。" + + "ヴィンケン氏は、オランダの出版グループであるエルゼビアN.V.の会長です。 " + + "55歳のルドルフ・アグニュー(Rudolph Agnew、元コネチカットゴールドフィールドPLC)会長は、" + + "この英国の産業大企業の取締役に任命されました。" + // scalastyle:on + splitter.getSentences(input, language = Language.Japanese) shouldEqual Seq(input) + } +} diff --git a/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala b/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala index b1263b898e..f3c581dd43 100644 --- a/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala +++ b/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala @@ -110,7 +110,6 @@ class TransientFeature */ def asFeatureLike[I <: FeatureType]: FeatureLike[I] = getFeature().asInstanceOf[FeatureLike[I]] - /** * Transform trasient feature into column metadata for use vectors * (for when each feature creates one column of a vector) @@ -167,6 +166,27 @@ class TransientFeature val json = render(toJson) if (pretty) JsonMethods.pretty(json) else compact(json) } + + /** + * Tests the equality of the TransientFeature objects + */ + override def equals(that: Any): Boolean = { + that match { + case t: TransientFeature => + t.name == name && t.isResponse == isResponse && t.isRaw == isRaw && + t.uid == uid && t.typeName == typeName && t.originFeatures == originFeatures && + t.stages == stages + case _ => false + } + } + + /** + * Returns the hash code of this feature + * + * @return hash code + */ + override def hashCode(): Int = uid.hashCode + } object TransientFeature { diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala index e060e305b5..5a81578ddd 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala @@ -176,7 +176,6 @@ object SomeValue { */ object FeatureType { - /** * Returns feature type name * @@ -225,7 +224,8 @@ object FeatureType { * @param t type tag * @return true if this type tag corresponds to one of the feature value types, false otherwise */ - def isFeatureValueType(t: TypeTag[_]): Boolean = FeatureType.featureValueTypeTags.contains(t.tpe.dealias.toString) + def isFeatureValueType(t: TypeTag[_]): Boolean = + FeatureType.featureValueTypeTags.contains(ReflectionUtils.dealisedTypeName(t.tpe)) /** * Feature type tag @@ -351,7 +351,7 @@ object FeatureType { // Text typeTag[Option[String]] ) - typeTags.map(tag => tag.tpe.dealias.toString -> tag).toMap + typeTags.map(tag => ReflectionUtils.dealisedTypeName(tag.tpe) -> tag).toMap } } diff --git
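The switch to ReflectionUtils.dealisedTypeName above is what makes type-alias lookups robust: an alias dealiases to the same canonical name as the underlying feature value type, so both resolve to the same registered tag. A minimal sketch of the intended behavior (the Txt alias is illustrative, not part of the change):

    import scala.reflect.runtime.universe._
    // Txt dealiases to Option[String], which is one of the registered feature value types
    type Txt = Option[String]
    FeatureType.isFeatureValueType(typeTag[Txt]) // expected: true
    FeatureType.isFeatureValueType(typeTag[Option[java.util.UUID]]) // expected: false, not a feature value type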
a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala index eec88c2b5a..4816d38fd6 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala @@ -37,7 +37,7 @@ import org.apache.spark.ml.linalg.Vectors import scala.reflect.runtime.universe._ /** - * Default Feature Type values + * Default values for Feature Types */ case object FeatureTypeDefaults { diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala index ab585b8c6e..1cdf9e1b91 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala @@ -36,7 +36,7 @@ import org.apache.spark.ml.linalg.Vector import scala.reflect.runtime.universe._ /** - * Factory for creating feature type instances + * Factory for creating Feature Type instances * * @tparam T feature type */ @@ -55,7 +55,7 @@ sealed trait FeatureTypeFactory[T <: FeatureType] extends Serializable { } /** - * Factory for creating feature type instances from primitive values + * Factory for creating Feature Type instances from primitive values */ case object FeatureTypeFactory { /** diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala index 9101b288ec..3dc88e9aeb 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala @@ -38,7 +38,7 @@ import scala.reflect.runtime.universe._ /** - * Feature type from/to Spark primitives converter, i.e Real from/to Double etc. + * Feature Type from/to Spark primitives converter, i.e. Real from/to Double etc. * * @tparam T feature type */ @@ -83,6 +83,30 @@ case object FeatureTypeSparkConverter { def fromSpark(value: Any): T = maker(value) } + /** + * For a given feature type class (or [[FeatureType.typeName]]) from/to Spark primitives converter, + * i.e. Real from/to Double etc.
+ * + * @param featureTypeName full class name of the feature type, see [[FeatureType.typeName]] + * @throws IllegalArgumentException if feature type name is unknown + * @return feature type from/to Spark primitives converter + */ + def fromFeatureTypeName(featureTypeName: String): FeatureTypeSparkConverter[_ <: FeatureType] = { + featureTypeSparkConverters.get(featureTypeName) match { + case Some(converter) => converter + case None => throw new IllegalArgumentException(s"Unknown feature type '$featureTypeName'") + } + } + + /** + * A map from feature type class to [[FeatureTypeSparkConverter]] + */ + private[types] val featureTypeSparkConverters: Map[String, FeatureTypeSparkConverter[_ <: FeatureType]] = + FeatureType.featureTypeTags.map { + case (featureTypeClass, featureTypeTag) => + featureTypeClass.getName -> + FeatureTypeSparkConverter[FeatureType]()(featureTypeTag.asInstanceOf[WeakTypeTag[FeatureType]]) + } /** * Converts feature type into a Spark primitive value diff --git a/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala b/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala index 60639cd604..c86d643a0b 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala @@ -40,7 +40,8 @@ import Geolocation._ import scala.util.Try /** - * Represented as a list of latitude, longitude, accuracy (only populated if all are present) + * Represented as a list of latitude, longitude, accuracy. + * The value is only populated if all are present, otherwise [[IllegalArgumentException]] is thrown. * * @param value a list of latitude, longitude, accuracy */ @@ -52,13 +53,33 @@ class Geolocation(val value: Seq[Double]) extends OPList[Double] with Location { } def this(lat: Double, lon: Double, accuracy: GeolocationAccuracy) = this(geolocationData(lat, lon, accuracy)) def this(v: (Double, Double, Double)) = this(geolocationData(v._1, v._2, v._3)) + /** + * Latitude value + */ def lat: Double = if (isEmpty) Double.NaN else value(0) + /** + * Longitude value + */ def lon: Double = if (isEmpty) Double.NaN else value(1) + /** + * Latitude value + */ def latitude: Double = lat + /** + * Longitude value + */ def longitude: Double = lon + + /** + * Geolocation accuracy value [[GeolocationAccuracy]] + */ def accuracy: GeolocationAccuracy = { if (isEmpty) GeolocationAccuracy.Unknown else GeolocationAccuracy.withValue(value(2).toInt) } + + /** + * Convert to [[GeoPoint]] value + */ def toGeoPoint: GeoPoint = { // If this Geolocation object is empty, then return the zero vector as the GeoPoint since we use // GeoPoint coordinates in aggregation functions @@ -110,14 +131,17 @@ sealed abstract class GeolocationAccuracy ( val value: Int, val name: String, - val rangeInMiles: Double) extends IntEnumEntry { - lazy val rangeInUnits: Double = rangeInMiles / EarthRadius + val rangeInMiles: Double +) extends IntEnumEntry { + /** + * Range in units of Earth Radius + */ + def rangeInUnits: Double = rangeInMiles / EarthRadius } case object GeolocationAccuracy extends IntEnum[GeolocationAccuracy] { val values: List[GeolocationAccuracy] = findValues.toList sortBy(_.rangeInMiles) - def geoUnitsToMiles(u: Double): Double = u * EarthRadius // No match for the address was found case object Unknown extends GeolocationAccuracy(0, name = "Unknown", rangeInMiles = EquatorInMiles / 2) @@ -142,14 +166,40 @@ case object GeolocationAccuracy extends IntEnum[GeolocationAccuracy] { //
Center of the state case object State extends GeolocationAccuracy(10, name = "State", rangeInMiles = 150.0) + /** + * Convert units of Earth Radius into miles + * + * @param u units of Earth Radius + * @return miles + */ + def geoUnitsToMiles(u: Double): Double = u * EarthRadius + + /** + * Construct accuracy value for a given range in miles + * + * @param miles range in miles + * @return accuracy + */ def forRangeInMiles(miles: Double): GeolocationAccuracy = { val result = values.dropWhile(_.rangeInMiles < miles * 0.99).headOption getOrElse Unknown result } + /** + * Construct accuracy value for a given range in units of Earth Radius + * + * @param units units of Earth Radius + * @return accuracy + */ def forRangeInUnits(units: Double): GeolocationAccuracy = forRangeInMiles(geoUnitsToMiles(units)) + /** + * Find the worst accuracy value + * + * @param accuracies list of accuracies + * @return worst accuracy + */ def worst(accuracies: GeolocationAccuracy*): GeolocationAccuracy = { forRangeInMiles((Unknown :: accuracies.toList) map (_.rangeInMiles) max) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala index 496c86d45e..cf16a0deb8 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala @@ -31,6 +31,11 @@ package com.salesforce.op.features.types +/** + * A list of text values + * + * @param value list of text values + */ class TextList(val value: Seq[String]) extends OPList[String] { def this(v: String*)(implicit d: DummyImplicit) = this(v) } @@ -39,6 +44,11 @@ object TextList { def empty: TextList = FeatureTypeDefaults.TextList } +/** + * A list of date values + * + * @param value list of date values (values assumed to be in ms since Epoch) + */ class DateList(val value: Seq[Long]) extends OPList[Long] { def this(v: Long*)(implicit d: DummyImplicit) = this(v) } @@ -47,6 +57,11 @@ object DateList { def empty: DateList = FeatureTypeDefaults.DateList } +/** + * A list of date & time values + * + * @param value list of date & time values (values assumed to be in ms since Epoch) + */ class DateTimeList(value: Seq[Long]) extends DateList(value) { def this(v: Long*)(implicit d: DummyImplicit) = this(v) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala index 89951d2eed..e27e6d0f81 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala @@ -33,144 +33,267 @@ package com.salesforce.op.features.types import org.apache.spark.ml.linalg.Vector - +/** + * Map of text values + * + * @param value map of text values + */ class TextMap(val value: Map[String, String]) extends OPMap[String] object TextMap { def apply(value: Map[String, String]): TextMap = new TextMap(value) def empty: TextMap = FeatureTypeDefaults.TextMap } +/** + * Map of email values + * + * @param value map of email values + */ class EmailMap(val value: Map[String, String]) extends OPMap[String] object EmailMap { def apply(value: Map[String, String]): EmailMap = new EmailMap(value) def empty: EmailMap = FeatureTypeDefaults.EmailMap } +/** + * Map of base64 binary encoded values + * + * @param value map of base64 binary encoded values + */ class Base64Map(val value: Map[String, String]) extends OPMap[String] object Base64Map { def 
apply(value: Map[String, String]): Base64Map = new Base64Map(value) def empty: Base64Map = FeatureTypeDefaults.Base64Map } +/** + * Map of phone values + * + * @param value map of phone values + */ class PhoneMap(val value: Map[String, String]) extends OPMap[String] object PhoneMap { def apply(value: Map[String, String]): PhoneMap = new PhoneMap(value) def empty: PhoneMap = FeatureTypeDefaults.PhoneMap } +/** + * Map of ID values + * + * @param value map of ID values + */ class IDMap(val value: Map[String, String]) extends OPMap[String] object IDMap { def apply(value: Map[String, String]): IDMap = new IDMap(value) def empty: IDMap = FeatureTypeDefaults.IDMap } +/** + * Map of URL values + * + * @param value map of URL values + */ class URLMap(val value: Map[String, String]) extends OPMap[String] object URLMap { def apply(value: Map[String, String]): URLMap = new URLMap(value) def empty: URLMap = FeatureTypeDefaults.URLMap } +/** + * Map of text area values + * + * @param value map of text area values + */ class TextAreaMap(val value: Map[String, String]) extends OPMap[String] object TextAreaMap { def apply(value: Map[String, String]): TextAreaMap = new TextAreaMap(value) def empty: TextAreaMap = FeatureTypeDefaults.TextAreaMap } +/** + * Map of picklist values + * + * @param value map of picklist values + */ class PickListMap(val value: Map[String, String]) extends OPMap[String] object PickListMap { def apply(value: Map[String, String]): PickListMap = new PickListMap(value) def empty: PickListMap = FeatureTypeDefaults.PickListMap } +/** + * Map of combobox values + * + * @param value map of combobox values + */ class ComboBoxMap(val value: Map[String, String]) extends OPMap[String] object ComboBoxMap { def apply(value: Map[String, String]): ComboBoxMap = new ComboBoxMap(value) def empty: ComboBoxMap = FeatureTypeDefaults.ComboBoxMap } +/** + * Map of binary values + * + * @param value map of binary values + */ class BinaryMap(val value: Map[String, Boolean]) extends OPMap[Boolean] object BinaryMap { def apply(value: Map[String, Boolean]): BinaryMap = new BinaryMap(value) def empty: BinaryMap = FeatureTypeDefaults.BinaryMap } +/** + * Map of integral values + * + * @param value map of integral values + */ class IntegralMap(val value: Map[String, Long]) extends OPMap[Long] object IntegralMap { def apply(value: Map[String, Long]): IntegralMap = new IntegralMap(value) def empty: IntegralMap = FeatureTypeDefaults.IntegralMap } +/** + * Map of real values + * + * @param value map of real values + */ class RealMap(val value: Map[String, Double]) extends OPMap[Double] object RealMap { def apply(value: Map[String, Double]): RealMap = new RealMap(value) def empty: RealMap = FeatureTypeDefaults.RealMap } +/** + * Map of percent values + * + * @param value map of percent values + */ class PercentMap(val value: Map[String, Double]) extends OPMap[Double] object PercentMap { def apply(value: Map[String, Double]): PercentMap = new PercentMap(value) def empty: PercentMap = FeatureTypeDefaults.PercentMap } +/** + * Map of currency values + * + * @param value map of currency values + */ class CurrencyMap(val value: Map[String, Double]) extends OPMap[Double] object CurrencyMap { def apply(value: Map[String, Double]): CurrencyMap = new CurrencyMap(value) def empty: CurrencyMap = FeatureTypeDefaults.CurrencyMap } +/** + * Map of date values + * + * @param value map of date values + */ class DateMap(val value: Map[String, Long]) extends OPMap[Long] object DateMap { def apply(value: Map[String, Long]): DateMap = 
new DateMap(value) def empty: DateMap = FeatureTypeDefaults.DateMap } + +/** + * Map of date & time values + * + * @param value map of date & time values + */ class DateTimeMap(val value: Map[String, Long]) extends OPMap[Long] object DateTimeMap { def apply(value: Map[String, Long]): DateTimeMap = new DateTimeMap(value) def empty: DateTimeMap = FeatureTypeDefaults.DateTimeMap } +/** + * Map of multi picklist values + * + * @param value map of multi picklist values + */ class MultiPickListMap(val value: Map[String, Set[String]]) extends OPMap[Set[String]] object MultiPickListMap { def apply(value: Map[String, Set[String]]): MultiPickListMap = new MultiPickListMap(value) def empty: MultiPickListMap = FeatureTypeDefaults.MultiPickListMap } +/** + * Map of country values + * + * @param value map of country values + */ class CountryMap(val value: Map[String, String]) extends OPMap[String] with Location object CountryMap { def apply(value: Map[String, String]): CountryMap = new CountryMap(value) def empty: CountryMap = FeatureTypeDefaults.CountryMap } +/** + * Map of state values + * + * @param value map of state values + */ class StateMap(val value: Map[String, String]) extends OPMap[String] with Location object StateMap { def apply(value: Map[String, String]): StateMap = new StateMap(value) def empty: StateMap = FeatureTypeDefaults.StateMap } +/** + * Map of city values + * + * @param value map of city values + */ class CityMap(val value: Map[String, String]) extends OPMap[String] with Location object CityMap { def apply(value: Map[String, String]): CityMap = new CityMap(value) def empty: CityMap = FeatureTypeDefaults.CityMap } +/** + * Map of postal code values + * + * @param value map of postal code values + */ class PostalCodeMap(val value: Map[String, String]) extends OPMap[String] with Location object PostalCodeMap { def apply(value: Map[String, String]): PostalCodeMap = new PostalCodeMap(value) def empty: PostalCodeMap = FeatureTypeDefaults.PostalCodeMap } +/** + * Map of street values + * + * @param value map of street values + */ class StreetMap(val value: Map[String, String]) extends OPMap[String] with Location object StreetMap { def apply(value: Map[String, String]): StreetMap = new StreetMap(value) def empty: StreetMap = FeatureTypeDefaults.StreetMap } +/** + * Map of geolocation values + * + * @param value map of geolocation values + */ class GeolocationMap(val value: Map[String, Seq[Double]]) extends OPMap[Seq[Double]] with Location object GeolocationMap { def apply(value: Map[String, Seq[Double]]): GeolocationMap = new GeolocationMap(value) def empty: GeolocationMap = FeatureTypeDefaults.GeolocationMap } +/** + * Prediction representation - a map containing prediction, and optional raw prediction and probability values. + * + * This value can only be constructed from a non-empty map containing a prediction, + * and optional raw prediction and probability values, otherwise [[NonNullableEmptyException]] is thrown. + * + * @param value map containing prediction, and optional raw prediction and probability values.
+ */ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) with NonNullable { import Prediction.Keys._ @@ -190,9 +313,27 @@ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) s"starting with '$RawPredictionName' or '$ProbabilityName'" ) private def keysStartsWith(name: String): Array[String] = value.keys.filter(_.startsWith(name)).toArray.sorted + + /** + * Prediction value + */ def prediction: Double = value(PredictionName) + + /** + * Raw prediction values + */ def rawPrediction: Array[Double] = keysStartsWith(RawPredictionName).map(value) + + /** + * Probability values + */ def probability: Array[Double] = keysStartsWith(ProbabilityName).map(value) + + /** + * Score values (based on probability or prediction) + * + * @return probability values if present, otherwise the prediction value + */ def score: Array[Double] = { val probKeys = keysStartsWith(ProbabilityName) if (probKeys.nonEmpty) probKeys.map(value) else Array(value(PredictionName)) @@ -206,11 +347,59 @@ object Prediction { } import Keys._ + /** + * Creates [[Prediction]] given a prediction value + * + * @param prediction prediction value + * @return [[Prediction]] + */ def apply(prediction: Double): Prediction = new Prediction(Map(PredictionName -> prediction)) + /** + * Creates [[Prediction]] given a prediction value and raw prediction values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @return [[Prediction]] + */ + def apply(prediction: Double, rawPrediction: Vector): Prediction = { + val rawPred = rawPrediction.toArray.zipWithIndex.map { case (v, i) => s"${RawPredictionName}_$i" -> v } + val pred = PredictionName -> prediction + new Prediction(rawPred.toMap + pred) + } + + /** + * Creates [[Prediction]] given a prediction value and a raw prediction value + * + * @param prediction prediction value + * @param rawPrediction raw prediction value + * @return [[Prediction]] + */ + def apply(prediction: Double, rawPrediction: Double): Prediction = { + val rawPred = s"${RawPredictionName}_0" -> rawPrediction + val pred = PredictionName -> prediction + new Prediction(Map(rawPred, pred)) + } + + /** + * Creates [[Prediction]] given a prediction value, raw prediction and probability values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @param probability probability values + * @return [[Prediction]] + */ def apply(prediction: Double, rawPrediction: Vector, probability: Vector): Prediction = apply(prediction, rawPrediction = rawPrediction.toArray, probability = probability.toArray) + /** + * Creates [[Prediction]] given a prediction value, raw prediction and probability values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @param probability probability values + * @return [[Prediction]] + */ def apply(prediction: Double, rawPrediction: Array[Double], probability: Array[Double]): Prediction = { val rawPred = rawPrediction.zipWithIndex.map { case (v, i) => s"${RawPredictionName}_$i" -> v } val prob = probability.zipWithIndex.map { case (v, i) => s"${ProbabilityName}_$i" -> v } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala b/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala index 0a9b7238bb..7d90c9a102 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala @@ -31,6 +31,13
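Taken together, the apply overloads above encode everything into the flat Map[String, Double] value with a prediction key plus indexed rawPrediction_i and probability_i keys, and the accessors read them back in sorted key order. A small sketch of that round trip (the numbers are made up):

    import org.apache.spark.ml.linalg.Vectors
    val simple = Prediction(prediction = 1.0)
    simple.score // Array(1.0): no probability_* keys, so score falls back to the prediction value
    val full = Prediction(1.0, rawPrediction = Vectors.dense(-2.3, 2.3), probability = Vectors.dense(0.1, 0.9))
    full.rawPrediction // Array(-2.3, 2.3), read back from rawPrediction_0 and rawPrediction_1
    full.probability // Array(0.1, 0.9), read back from probability_0 and probability_1
    full.score // Array(0.1, 0.9): probability keys take precedence over the prediction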
@@ package com.salesforce.op.features.types +/** + * Real value representation + * + * A base class for all the real Feature Types + * + * @param value real + */ class Real(val value: Option[Double]) extends OPNumeric[Double] { def this(value: Double) = this(Option(value)) final def toDouble: Option[Double] = value @@ -42,8 +49,16 @@ object Real { def empty: Real = FeatureTypeDefaults.Real } -class RealNN private[op](v: Option[Double]) extends Real( - if (v == null || v.isEmpty) throw new NonNullableEmptyException(classOf[RealNN]) else v +/** + * Real non-nullable value representation + * + * This value can only be constructed from a concrete [[Double]] value; + * if an empty value is passed then [[NonNullableEmptyException]] is thrown. + * + * @param value real + */ +class RealNN private[op](value: Option[Double]) extends Real( + if (value == null || value.isEmpty) throw new NonNullableEmptyException(classOf[RealNN]) else value ) with NonNullable { def this(value: Double) = this(Option(value)) } @@ -51,6 +66,11 @@ object RealNN { def apply(value: Double): RealNN = new RealNN(value) } +/** + * Binary value representation + * + * @param value binary + */ class Binary(val value: Option[Boolean]) extends OPNumeric[Boolean] with SingleResponse { def this(value: Boolean) = this(Option(value)) final def toDouble: Option[Double] = value.map(if (_) 1.0 else 0.0) @@ -61,6 +81,13 @@ object Binary { def empty: Binary = FeatureTypeDefaults.Binary } +/** + * Integral value representation + * + * A base class for all the integral Feature Types + * + * @param value integral + */ class Integral(val value: Option[Long]) extends OPNumeric[Long] { def this(value: Long) = this(Option(value)) final def toDouble: Option[Double] = value.map(_.toDouble) @@ -71,6 +98,11 @@ object Integral { def empty: Integral = FeatureTypeDefaults.Integral } +/** + * Percentage value representation + * + * @param value percentage + */ class Percent(value: Option[Double]) extends Real(value) { def this(value: Double) = this(Option(value)) } @@ -80,6 +112,11 @@ object Percent { def empty: Percent = FeatureTypeDefaults.Percent } +/** + * Currency value representation + * + * @param value currency + */ class Currency(value: Option[Double]) extends Real(value) { def this(value: Double) = this(Option(value)) } @@ -89,6 +126,11 @@ object Currency { def empty: Currency = FeatureTypeDefaults.Currency } +/** + * Date value representation + * + * @param value date (assumed to be in ms since Epoch) + */ class Date(value: Option[Long]) extends Integral(value) { def this(value: Long) = this(Option(value)) } @@ -98,6 +140,11 @@ object Date { def empty: Date = FeatureTypeDefaults.Date } +/** + * Date & time value representation + * + * @param value date & time (assumed to be in ms since Epoch) + */ class DateTime(value: Option[Long]) extends Date(value) { def this(value: Long) = this(Option(value)) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala b/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala index ef95e7cc9b..fb59a60933 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala @@ -33,6 +33,6 @@ package com.salesforce.op.features.types /** - * A base class for all the collections (arrays, lists, sets, maps, vector etc) + * A base class for all the collection Feature Types (maps, lists, sets, vectors etc.)
*/ abstract class OPCollection extends FeatureType diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPList.scala b/features/src/main/scala/com/salesforce/op/features/types/OPList.scala index 317f6a224c..70ae77c52f 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPList.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPList.scala @@ -34,11 +34,19 @@ package com.salesforce.op.features.types import scala.reflect.ClassTag /** - * A base class for all the list feature types + * A base class for all the list Feature Types + * * @tparam A item type */ abstract class OPList[A](implicit val cta: ClassTag[A]) extends OPCollection { override type Value = Seq[A] + final def isEmpty: Boolean = value.isEmpty + + /** + * Converts list to an array + * + * @return array of A + */ final def toArray: Array[A] = value.toArray(cta) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala b/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala index f71c952dc7..a2374f39d9 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala @@ -32,11 +32,15 @@ package com.salesforce.op.features.types /** - * A base class for all the map feature types + * A base class for all the map Feature Types + * * @tparam A item type */ abstract class OPMap[A] extends OPCollection { type Element = A + override type Value = Map[String, A] + final def isEmpty: Boolean = value.isEmpty + } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala b/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala index f0c9e217fc..14f949a59b 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala @@ -33,11 +33,19 @@ package com.salesforce.op.features.types /** - * A base class for all the numeric feature types + * A base class for all the numeric Feature Types + * * @tparam N number type (Long, Double etc) */ abstract class OPNumeric[N] extends FeatureType { type Value = Option[N] + + /** + * Convert numeric value to [[Double]] representation + * + * @return [[Double]] representation of numeric value + */ def toDouble: Option[Double] + final def isEmpty: Boolean = value.isEmpty } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala b/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala index 10e2b5bcc0..3f4f80d010 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala @@ -35,10 +35,16 @@ import scala.reflect.ClassTag /** - * A base class for all the set feature types + * A base class for all the set Feature Types */ abstract class OPSet[A](implicit val cta: ClassTag[A]) extends OPCollection with MultiResponse { type Value <: scala.collection.Set[A] final def isEmpty: Boolean = value.isEmpty + + /** + * Converts set to an array + * + * @return array of A + */ final def toArray: Array[A] = value.toArray(cta) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala b/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala index 5b39d7fb11..64417d88ff 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala 
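With the isEmpty members added to OPList, OPMap and OPNumeric above, every feature type now exposes a uniform emptiness check. A minimal sketch using the constructors and companions shown in these diffs (example values only):

    new TextList(Seq("hello", "world")).isEmpty // false: backed by a non-empty Seq[String]
    DateList.empty.isEmpty // true: the default empty list
    new RealMap(Map("a" -> 1.0)).isEmpty // false: backed by a non-empty Map[String, Double]
    Real.empty.isEmpty // true: the underlying Option[Double] is None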
@@ -31,9 +31,13 @@ package com.salesforce.op.features.types -import org.apache.spark.ml.linalg.Vector - +import org.apache.spark.ml.linalg._ +/** + * Vector representation + * + * @param value vector ([[SparseVector]] or [[DenseVector]]) + */ class OPVector(val value: Vector) extends OPCollection { type Value = Vector final def isEmpty: Boolean = value.size == 0 diff --git a/features/src/main/scala/com/salesforce/op/features/types/Sets.scala b/features/src/main/scala/com/salesforce/op/features/types/Sets.scala index 231dd4fa4d..b188699500 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Sets.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Sets.scala @@ -31,7 +31,11 @@ package com.salesforce.op.features.types - +/** + * Multi picklist value that represents a multiple selection from a set of values + * + * @param value multiple selection from a set of values + */ class MultiPickList(val value: Set[String]) extends OPSet[String] { type Value = Set[String] } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Text.scala b/features/src/main/scala/com/salesforce/op/features/types/Text.scala index 06fa778652..279fa8243e 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Text.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Text.scala @@ -40,7 +40,9 @@ import org.apache.commons.io.input.CharSequenceInputStream import org.apache.commons.validator.routines.UrlValidator /** - * A base class for all the text feature types + * Text value representation + * + * A base class for all the text Feature Types * * @param value text value */ @@ -56,9 +58,22 @@ object Text { def empty: Text = FeatureTypeDefaults.Text } +/** + * Email value representation + * + * @param value email value + */ class Email(value: Option[String]) extends Text(value) { def this(value: String) = this(Option(value)) + /** + * Extract email prefix + * @return None if the email is invalid or empty; otherwise some value with the prefix + */ def prefix: Option[String] = Email.prefixOrDomain(this, isPrefix = true) + /** + * Extract email domain + * @return None if the email is invalid or empty; otherwise some value with the domain + */ def domain: Option[String] = Email.prefixOrDomain(this, isPrefix = false) } object Email { @@ -79,7 +94,11 @@ object Email { if (!m.matches()) None else if (isPrefix) Option(m.group(1)) else Option(m.group(2)) ) } - +/** + * Base64 encoded binary value representation + * + * @param value base64 encoded binary value + */ class Base64(value: Option[String]) extends Text(value) { def this(value: String) = this(Option(value)) /** @@ -107,13 +126,17 @@ class Base64(value: Option[String]) extends Text(value) { */ def asString: Option[String] = asBytes map (new String(_)) } - object Base64 { def apply(value: Option[String]): Base64 = new Base64(value) def apply(value: String): Base64 = new Base64(value) def empty: Base64 = FeatureTypeDefaults.Base64 } +/** + * Phone number value representation, e.g.
'+1-650-113-111-2222' + * + * @param value phone number + */ class Phone(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -123,6 +146,11 @@ object Phone { def empty: Phone = FeatureTypeDefaults.Phone } +/** + * Unique identifier value representation + * + * @param value unique identifier + */ class ID(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -132,6 +160,11 @@ object ID { def empty: ID = FeatureTypeDefaults.ID } +/** + * URL value representation + * + * @param value url + */ class URL(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) /** @@ -147,7 +180,7 @@ class URL(value: Option[String]) extends Text(value){ */ def isValid(protocols: Array[String]): Boolean = value.exists(new UrlValidator(protocols).isValid) /** - * Extracts url domain, i.e. salesforce.com, data.com etc. + * Extracts url domain, e.g. 'salesforce.com', 'data.com' etc. */ def domain: Option[String] = value map (new java.net.URL(_).getHost) /** @@ -161,6 +194,11 @@ object URL { def empty: URL = FeatureTypeDefaults.URL } +/** + * Large text values (more than 4000 bytes) + * + * @param value large text value + */ class TextArea(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -170,6 +208,11 @@ object TextArea { def empty: TextArea = FeatureTypeDefaults.TextArea } +/** + * A text value that represents a single selection from a set of values + * + * @param value selected text + */ class PickList(value: Option[String]) extends Text(value) with SingleResponse { def this(value: String) = this(Option(value)) } @@ -178,7 +221,11 @@ object PickList { def apply(value: String): PickList = new PickList(value) def empty: PickList = FeatureTypeDefaults.PickList } - +/** + * A single text value that represents a selection from a set of values or a user-specified one + * + * @param value selected or user-specified text + */ class ComboBox(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -188,6 +235,11 @@ object ComboBox { def empty: ComboBox = FeatureTypeDefaults.ComboBox } +/** + * Country value representation, e.g. 'United States of America', 'France' etc. + * + * @param value country + */ class Country(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -197,6 +249,11 @@ object Country { def empty: Country = FeatureTypeDefaults.Country } +/** + * State value representation, e.g. 'CA', 'OR' etc. + * + * @param value state + */ class State(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -206,6 +263,11 @@ object State { def empty: State = FeatureTypeDefaults.State } +/** + * Postal code value representation, e.g. '92101', '72212-341' etc. + * + * @param value postal code + */ class PostalCode(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -215,6 +277,11 @@ object PostalCode { def empty: PostalCode = FeatureTypeDefaults.PostalCode } +/** + * City value representation, e.g. 'New York', 'Paris' etc. + * + * @param value city + */ class City(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -224,6 +291,11 @@ object City { def empty: City = FeatureTypeDefaults.City } +/** + * Street representation, e.g. '123 University Ave' etc.
+ * + * @param value street + */ class Street(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala index e4f40b3514..7f0a121102 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.{Metadata, StructType} /** - * Parameters and functionalities shared across the input features + * Parameters and functions shared across the input features */ trait InputParams extends Params { diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala index 38f41e516a..30d7edb9a1 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala @@ -77,7 +77,7 @@ final class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWrit */ def writeToMap: Map[String, Any] = { // We produce stage metadata for all the Spark params - val metadataJson = SparkDefaultParamsReadWrite.getMetadataToSave(stage, sc) + val metadataJson = SparkDefaultParamsReadWrite.getMetadataToSave(stage) // Add isModel indicator val metadata = parse(metadataJson).extract[Map[String, Any]] + (FieldNames.IsModel.entryName -> isModel) // In case we stumbled upon a model instance, we also include its ctor args @@ -102,10 +102,10 @@ final class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWrit val anyValue = argValue match { // Special handling for Feature Type TypeTags case t: TypeTag[_] if FeatureType.isFeatureType(t) || FeatureType.isFeatureValueType(t) => - AnyValue(`type` = AnyValueTypes.TypeTag, value = t.tpe.dealias.toString) + AnyValue(`type` = AnyValueTypes.TypeTag, value = ReflectionUtils.dealisedTypeName(t.tpe)) case t: TypeTag[_] => throw new RuntimeException( - s"Unknown type tag '${t.tpe.dealias.toString}'. " + + s"Unknown type tag '${t.tpe.toString}'. " + "Only Feature and Feature Value type tags are supported for serialization." ) diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala index df2e5ed98d..e23669cd29 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala @@ -32,9 +32,10 @@ package com.salesforce.op.stages import com.salesforce.op.features._ -import com.salesforce.op.features.types.{FeatureType, OPVector} +import com.salesforce.op.features.types.FeatureType import com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.spark.RichDataType._ +import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{MLWritable, MLWriter} import org.apache.spark.ml.{PipelineStage, Transformer} @@ -203,7 +204,7 @@ trait OpPipelineStage[O <: FeatureType] extends OpPipelineStageBase { * Should output feature be a response? Yes, if any of the input features are.
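For the Email and URL helpers documented in the Text.scala hunks above, a brief usage sketch (the address and URL are illustrative; the no-argument isValid assumes the class's existing default-protocol validator):

import com.salesforce.op.features.types._

val email = Email("astro@salesforce.com")
email.prefix   // Some("astro")
email.domain   // Some("salesforce.com")

val url = URL("https://salesforce.com/products")
url.isValid    // true
url.domain     // Some("salesforce.com")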
* @return true if the output feature should be a response */ - protected def outputIsResponse: Boolean = getTransientFeatures().exists(_.isResponse) + def outputIsResponse: Boolean = getTransientFeatures().exists(_.isResponse) } @@ -553,12 +554,28 @@ trait OpPipelineStageN[I <: FeatureType, O <: FeatureType] extends OpPipelineSta * Trait to mix into transformers that indicates their transform functions can be combined into a single stage */ private[op] trait OpTransformer { - self: OpPipelineStage[_] with Transformer => /** - * Creates a transform function to convert Row to a value - * @return a transform function to convert Row to a value + * Feature name (key) -> value lookup, e.g. Row, Map etc. + */ + type KeyValue = String => Any + + /** + * Creates a transform function to transform Row to a value + * @return a transform function to transform Row to a value + */ + def transformRow: Row => Any = r => transformKeyValue(r.getAny) + + /** + * Creates a transform function to transform Map to a value + * @return a transform function to transform Map to a value + */ + def transformMap: Map[String, Any] => Any = m => transformKeyValue(m.apply) + + /** + * Creates a transform function to transform any key/value to a value + * @return a transform function to transform any key/value to a value */ - def transformRow: Row => Any + def transformKeyValue: KeyValue => Any } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala similarity index 99% rename from core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala index 0a463eb482..1950d28286 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala @@ -35,6 +35,7 @@ import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.{FeatureType, FeatureTypeSparkConverter} import com.salesforce.op.stages.OpPipelineStage2 import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Dataset, Encoder, Encoders} import org.apache.spark.util.ClosureUtils diff --git a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala index 55ccd97a38..4b2b70702e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types._ import com.salesforce.op.stages.{OpPipelineStage2, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -85,9 +84,9 @@ trait OpTransformer2[I1 <:
FeatureType, I2 <: FeatureType, O <: FeatureType] } private val transform2Fn = FeatureSparkTypes.transform2[I1, I2, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name) = (in1.name, in2.name) - (row: Row) => transform2Fn(row.getAny(in1name), row.getAny(in2name)) + (kv: KeyValue) => transform2Fn(kv(in1name), kv(in2name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala index ac487b3ace..19286cd942 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage4, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -90,9 +89,9 @@ trait OpTransformer4[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, I4 } private val transform4Fn = FeatureSparkTypes.transform4[I1, I2, I3, I4, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name, in3name, in4name) = (in1.name, in2.name, in3.name, in4.name) - (row: Row) => transform4Fn(row.getAny(in1name), row.getAny(in2name), row.getAny(in3name), row.getAny(in4name)) + (kv: KeyValue) => transform4Fn(kv(in1name), kv(in2name), kv(in3name), kv(in4name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala index 8126de32b1..8c68aa7eff 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala +++ 
b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStageN, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -84,9 +83,9 @@ trait OpTransformerN[I <: FeatureType, O <: FeatureType] } private val transformNFn = FeatureSparkTypes.transformN[I, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val inNames = inN.map(_.name) - (row: Row) => transformNFn(inNames.map(name => row.getAny(name))) + (kv: KeyValue) => transformNFn(inNames.map(name => kv(name))) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala index 6feabd6d69..1d03d14a41 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage3, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -86,9 +85,9 @@ trait OpTransformer3[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O } private val transform3Fn = FeatureSparkTypes.transform3[I1, I2, I3, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name, in3name) = (in1.name, in2.name, in3.name) - (row: Row) => transform3Fn(row.getAny(in1name), row.getAny(in2name), row.getAny(in3name)) + (kv: KeyValue) => transform3Fn(kv(in1name), kv(in2name), kv(in3name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala diff --git 
a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala index bc6e4229f5..28004bcca3 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage1, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -82,9 +81,9 @@ trait OpTransformer1[I <: FeatureType, O <: FeatureType] } private val transform1Fn = FeatureSparkTypes.transform1[I, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val inName = in1.name - (r: Row) => transform1Fn(r.getAny(inName)) + (kv: KeyValue) => transform1Fn(kv(inName)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala b/features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala similarity index 95% rename from core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala rename to features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala index d08373e710..4977a8b345 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala +++ b/features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala @@ -32,8 +32,8 @@ package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.stages.SparkStageParam -import org.apache.spark.ml.param.{Param, Params, StringArrayParam} import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.{Param, Params, StringArrayParam} /** @@ -41,7 +41,7 @@ import org.apache.spark.ml.PipelineStage * * @tparam S type of spark object to wrap */ -private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Params { +trait SparkWrapperParams[S <: PipelineStage with Params] extends Params { self: PipelineStage => final val sparkInputColParamNames = new StringArrayParam( @@ -59,7 +59,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par /** * this must be private so that the stage can have it's path set properly */ - private final val savePath = new Param[String]( + final val savePath = new Param[String]( parent = this, name = "savePath", doc = "path to save the spark stage" ) @@ -68,7 +68,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par /** * this must be private so that the stage can have it's path set properly */ - private final val sparkMlStage = new SparkStageParam[S]( + final val sparkMlStage = new SparkStageParam[S]( parent = this, name = "sparkMlStage", doc = "the spark stage that is being wrapped for optimus prime" ) @@ -80,7 
+80,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par this } - def setSparkMlStage(stage: Option[S]): this.type = { + protected def setSparkMlStage(stage: Option[S]): this.type = { set(sparkMlStage, stage) sparkMlStage.savePath = Option($(savePath)) this diff --git a/core/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala b/features/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala rename to features/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala diff --git a/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala b/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala new file mode 100644 index 0000000000..d254c6de48 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import com.salesforce.op.features.types.FeatureType +import org.scalactic.Equality +import org.scalatest.Suite + +/** + * Feature Type equality instances mixin. + * Allowing users to customize equality in tests, for example to allow numerical tolerance. 
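Since FeatureTypeEquality is what a spec overrides to relax comparisons, a tolerance-based equality for Real results might look like this sketch, overriding the default instance defined in the trait body that follows (the tolerance is illustrative):

import com.salesforce.op.features.types.Real
import org.scalactic.Equality

// inside a Suite that mixes in FeatureTypeEquality[Real]
implicit override val featureTypeEquality: Equality[Real] = new Equality[Real] {
  def areEqual(a: Real, b: Any): Boolean = b match {
    case r: Real => (a.value, r.value) match {
      case (Some(x), Some(y)) => math.abs(x - y) <= 1e-6 // numerical tolerance
      case (None, None) => true
      case _ => false
    }
    case _ => false
  }
}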
+ * + * @tparam O feature type + */ +trait FeatureTypeEquality[O <: FeatureType] { + self: Suite => + + /** + * Feature type equality + */ + implicit val featureTypeEquality: Equality[O] = new Equality[O] { + def areEqual(a: O, b: Any): Boolean = a.equals(b) + } + + /** + * Feature type sequence equality + */ + implicit val seqEquality: Equality[Seq[O]] = new Equality[Seq[O]] { + def areEqual(a: Seq[O], b: Any): Boolean = b match { + case s: Seq[_] if a.size == s.size => a.zip(s).forall { case (av, bv) => av === bv } + case _ => false + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala new file mode 100644 index 0000000000..51de105af6 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import java.io.File + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.sql.Dataset +import org.scalactic.Equality +import org.scalatest.events.{Event, TestFailed} +import org.scalatest.{Args, Reporter} + +import scala.collection.mutable.ArrayBuffer +import scala.reflect._ +import scala.reflect.runtime.universe._ + +/** + * Base test class for testing OP estimator instances. + * Includes common tests for fitting estimator and verifying the fitted model. 
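Separately, the OpTransformer contract introduced earlier in this change routes row- and map-based transforms through one lookup function. A sketch of the relationship (someTransformer stands for any stage mixing in OpTransformer; the name is hypothetical):

import org.apache.spark.sql.Row
import com.salesforce.op.utils.spark.RichRow._

// KeyValue = String => Any: both entry points delegate to transformKeyValue
val rowFn: Row => Any = someTransformer.transformRow              // r => transformKeyValue(r.getAny)
val mapFn: Map[String, Any] => Any = someTransformer.transformMap // m => transformKeyValue(m.apply)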
+ * + * @tparam O output feature type + * @tparam ModelType model type produced by this estimator + * @tparam EstimatorType type of the estimator being tested + */ +abstract class OpEstimatorSpec[O <: FeatureType : WeakTypeTag : ClassTag, +ModelType <: Model[ModelType] with OpPipelineStage[O] with OpTransformer : ClassTag, +EstimatorType <: Estimator[ModelType] with OpPipelineStage[O] : ClassTag] + extends OpPipelineStageSpec[O, EstimatorType] { + + /** + * Estimator instance to be tested + */ + val estimator: EstimatorType + + /** + * Input Dataset to fit & transform + */ + val inputData: Dataset[_] + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult: Seq[O] + + final override lazy val stage = estimator + + /** + * Model (transformer) to fit + */ + final lazy val model: ModelType = estimator.fit(inputData) + + it should "fit a model" in { + model should not be null + model shouldBe a[ModelType] + } + + it should behave like modelSpec() + + it should "have fitted a model that matches the estimator" in { + withClue("Model doesn't have a parent:") { + model.hasParent shouldBe true + } + withClue("Model parent should be the original estimator instance:") { + model.parent shouldBe estimator + } + withClue("Model and estimator output feature names don't match:") { + model.getOutputFeatureName shouldBe estimator.getOutputFeatureName + } + assert(model.asInstanceOf[OpPipelineStageBase], estimator, expectSameClass = false) + } + + // TODO: test metadata + + + /** + * Register all model spec tests + */ + private def modelSpec(): Unit = { + // Define transformer spec for the fitted model reusing the same inputs & Spark context + val modelSpec = new OpTransformerSpec[O, ModelType] { + override implicit val featureTypeEquality: Equality[O] = OpEstimatorSpec.this.featureTypeEquality + override implicit val seqEquality: Equality[Seq[O]] = OpEstimatorSpec.this.seqEquality + lazy val transformer: ModelType = model.setInputFeatureArray(estimator.getInputFeatures()) + lazy val inputData: Dataset[_] = OpEstimatorSpec.this.inputData + lazy val expectedResult: Seq[O] = OpEstimatorSpec.this.expectedResult + override implicit lazy val spark = OpEstimatorSpec.this.spark + override def specName: String = "model" + override def tempDir: File = OpEstimatorSpec.this.tempDir + } + + // Register all model spec tests + for { + testName <- modelSpec.testNames + } registerTest(testName) { + // Run test & collect failures + val failures = ArrayBuffer.empty[TestFailed] + val reporter = new Reporter { + def apply(event: Event): Unit = event match { + case f: TestFailed => failures += f + case _ => + } + } + // Note: We set 'runTestInNewInstance = true' to avoid restarting Spark context on every test run + val args = Args(reporter, runTestInNewInstance = true) + modelSpec.run(testName = Some(testName), args = args) + + // Propagate the failure if any + for {failure <- failures.headOption} { + failure.throwable.map(fail(failure.message, _)).getOrElse(fail(failure.message)) + } + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala new file mode 100644 index 0000000000..d5c674fc42 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import com.salesforce.op.features.Feature +import com.salesforce.op.features.types._ +import com.salesforce.op.stages._ +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.param.ParamMap +import org.scalatest._ + +import scala.reflect._ +import scala.reflect.runtime.universe._ +import scala.util.Failure + + +/** + * Spec for testing [[OpPipelineStage]] instances (transformers or estimators). + * Includes common tests for output feature, copy, serialization, json read/write etc. 
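With OpEstimatorSpec above, a concrete estimator test reduces to supplying the estimator, input data, and expected output. A hypothetical example (MyNormEstimator, its model, and the values are illustrative placeholders, as is the TestFeatureBuilder usage):

import com.salesforce.op.features.types._
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class MyNormEstimatorTest extends OpEstimatorSpec[Real, MyNormEstimatorModel, MyNormEstimator] {
  val (inputData, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0).map(_.toReal))
  val estimator = new MyNormEstimator().setInput(f1)
  val expectedResult = Seq(0.0, 1.0, 0.5).map(_.toReal)
}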
+ * + * @tparam O output feature type + * @tparam StageType [[OpPipelineStage]] type being tested (transformer or estimator) + */ +abstract class OpPipelineStageSpec[O <: FeatureType : WeakTypeTag : ClassTag, +StageType <: OpPipelineStage[O] : ClassTag] + extends FlatSpec + with FeatureTypeEquality[O] + with TestSparkContext + with OpPipelineStageAsserts { + + /** + * [[OpPipelineStage]] instance to be tested + */ + val stage: StageType + + /** + * Spec name (StageType[O] by default) + */ + def specName: String = Spec[O, StageType] + + specName should "produce output feature" in { + val output = stage.getOutput() + output shouldBe new Feature[O]( + name = stage.getOutputFeatureName, + originStage = stage, + isResponse = stage.outputIsResponse, + parents = stage.getInputFeatures() + ) + } + it should "copy" in { + val copy = stage.copy(new ParamMap()) + copy shouldBe a[StageType] + assert(copy, stage) + } + it should "be serializable" in { + stage.checkSerializable match { + case Failure(e) => fail("Stage is not serializable", e) + case _ => + } + } + it should "be json writable/readable" in { + val loaded = writeAndRead(stage) + assert(loaded, stage) + } + + /** + * A helper function to write and read stage into savePath + * + * @param stage stage instance to write and then read + * @param savePath Spark stage save path + * @return read stage + */ + protected def writeAndRead(stage: StageType, savePath: String = stageSavePath): OpPipelineStageBase = { + val savable = stage match { + case s: SparkWrapperParams[_] => s.setSavePath(savePath) + case s => s + } + val json = new OpPipelineStageWriter(savable).overwrite().writeToJsonString + new OpPipelineStageReader(savable).loadFromJsonString(json) + } + + /** + * Spark stage save path + */ + protected def stageSavePath: String = s"$tempDir/${specName.filter(_.isLetterOrDigit)}-${System.currentTimeMillis()}" + +} + + +/** + * Stage assertion for [[OpPipelineStage]] + */ +trait OpPipelineStageAsserts extends AppendedClues { + self: Matchers => + + /** + * Assert stage instances + * + * @param stage instance to assert + * @param expected instance to assert against + * @param expectSameClass should expect the same class or not + * @return + */ + def assert(stage: OpPipelineStageBase, expected: OpPipelineStageBase, expectSameClass: Boolean = true): Assertion = { + def stageType(s: OpPipelineStageBase) = if (s.isInstanceOf[Estimator[_]]) "estimator" else "transformer" + lazy val stageClue = + if (expectSameClass) s", while asserting ${stage.getClass.getSimpleName} ${stageType(stage)}." + else { + s", while asserting ${stage.getClass.getSimpleName} ${stageType(stage)} " + + s"against ${expected.getClass.getSimpleName} ${stageType(expected)}." 
+ } + def clue[T](msg: String)(fun: => T) = { withClue(msg)(fun) } withClue stageClue + + if (expectSameClass) { + clue("Stage classes don't match:") { + stage.getClass shouldBe expected.getClass + } + clue("Params are not the same:") { + stage.params should contain theSameElementsAs expected.params + } + expected.params.foreach { p => + clue(s"Param '${p.name}' should exist:") { + stage.hasParam(p.name) shouldBe expected.hasParam(p.name) + } + // TODO: add params value comparison (note: can be tricky) + // withClue(s"Param '${p.name}' values do not match:") { + // stage.get(p) shouldBe expected.get(p) + // } + } + } + clue("Stage UIDs don't match:") { + stage.uid shouldBe expected.uid + } + clue("Stage outputs don't match:") { + stage.getOutput() shouldBe expected.getOutput() + } + clue("Operation names don't match:") { + stage.operationName shouldBe expected.operationName + } + clue("Stage names don't match:") { + stage.stageName shouldBe expected.stageName + } + clue("Transient features don't match:") { + stage.getTransientFeatures() should contain theSameElementsAs expected.getTransientFeatures() + } + clue("Input features don't match:") { + stage.getInputFeatures() should contain theSameElementsAs expected.getInputFeatures() + } + clue("Input schemas don't match:") { + stage.getInputSchema() shouldBe expected.getInputSchema() + } + clue("Metadata values don't match:") { + stage.getMetadata() shouldBe expected.getMetadata() + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala new file mode 100644 index 0000000000..ed3934a9dc --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.test + +import com.salesforce.op.features.types._ +import com.salesforce.op.features.{FeatureLike, FeatureSparkTypes} +import com.salesforce.op.stages._ +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.RichRow._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.Dataset + +import scala.reflect._ +import scala.reflect.runtime.universe._ + +/** + * Base test class for testing [[OpTransformer]] instances. + * Includes common tests for schema and data transformations. + * + * @tparam O output feature type + * @tparam TransformerType type of the transformer being tested + */ +abstract class OpTransformerSpec[O <: FeatureType : WeakTypeTag : ClassTag, +TransformerType <: OpPipelineStage[O] with Transformer with OpTransformer : ClassTag] + extends OpPipelineStageSpec[O, TransformerType] { + + /** + * [[OpTransformer]] instance to be tested + */ + val transformer: TransformerType + + /** + * Input Dataset to transform + */ + val inputData: Dataset[_] + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult: Seq[O] + + final override lazy val stage = transformer + protected val convert = FeatureTypeSparkConverter[O]() + + it should "transform schema" in { + val transformedSchema = transformer.transformSchema(inputData.schema) + val output = transformer.getOutput() + val validationResults = + FeatureSparkTypes.validateSchema(transformedSchema, transformer.getInputFeatures() :+ output) + if (validationResults.nonEmpty) { + fail("Dataset schema is invalid. Errors: " + validationResults.mkString("'", "','", "'")) + } + } + it should "transform data" in { + val transformed = transformer.transform(inputData) + val output = transformer.getOutput() + val res: Seq[O] = transformed.collect(output)(convert, classTag[O]).toSeq + res shouldEqual expectedResult + } + it should "transform rows" in { + val rows = inputData.toDF().collect() + val res: Seq[O] = rows.view.map(row => transformer.transformRow(row)).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform maps" in { + val rows = inputData.toDF().collect() + val inputNames = transformer.getTransientFeatures().map(_.name) + val maps = rows.view.map(row => inputNames.map(name => name -> row.getAny(name)).toMap) + val res: Seq[O] = maps.map(transformer.transformMap).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform key/value" in { + val rows = inputData.toDF().collect() + val res: Seq[O] = rows.view.map(row => transformer.transformKeyValue(row.getAny)).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform data after being loaded" in { + val loaded = writeAndRead(stage) + val transformed = loaded.asInstanceOf[TransformerType].transform(inputData) + val output = loaded.getOutput().asInstanceOf[FeatureLike[O]] + val res: Seq[O] = transformed.collect(output)(convert, classTag[O]).toSeq + res shouldEqual expectedResult + } + + // TODO: test metadata + +} diff --git a/core/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala b/features/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala rename to features/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala diff --git
a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala similarity index 91% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala index 7bc8ed214a..6b5f39cee8 100644 --- a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala @@ -31,10 +31,13 @@ package com.salesforce.op.utils.spark +import com.salesforce.op.features.types.{FeatureType, OPMap} import com.salesforce.op.utils.json.JsonLike import org.apache.spark.sql.types.{Metadata, MetadataBuilder} import com.salesforce.op.utils.spark.RichMetadata.{RichMetadata => RichMeta} +import scala.reflect.runtime.universe._ + /** * Represents the metadata of a column in a vector. @@ -113,19 +116,20 @@ case class OpVectorColumnMetadata s"${indicatorValue.map("_" + _).getOrElse("")}_$index" /** - * Does column have parent features that are maps - * @return boolean indicating whether parent feature type sequence contains Map types + * Does column have parent features of the specified feature type O */ - def hasMapParent(): Boolean = { - // TODO: move this class to `features` or `core` sub project to avoid mentioning types as strings - hasParentOfType("Map") || hasParentOfType("Prediction") - } + def hasParentOfType[O <: FeatureType](implicit tt: TypeTag[O]): Boolean = + parentFeatureType.exists { parentTypeName => + FeatureType.featureTypeTag(parentTypeName).tpe =:= tt.tpe + } /** - * Does column have parent features of specified feature type - * @return boolean indicating whether parent feature type sequence contains type name + * Does column have parent features which are subtypes of feature type O */ - def hasParentOfType(typeName: String): Boolean = parentFeatureType.exists(_.contains(typeName)) + def hasParentOfSubType[O <: FeatureType](implicit tt: TypeTag[O]): Boolean = + parentFeatureType.exists { parentTypeName => + FeatureType.featureTypeTag(parentTypeName).tpe <:< tt.tpe + } /** * Return parent feature names with the key (indicatorGroup) from any map parents included in name @@ -133,7 +137,7 @@ case class OpVectorColumnMetadata * for columns with map parent features */ def parentNamesWithMapKeys(): Seq[String] = - if (hasMapParent()) parentFeatureName.map(p => indicatorGroup.map(p + "_" + _).getOrElse(p)) + if (hasParentOfSubType[OPMap[_]]) parentFeatureName.map(p => indicatorGroup.map(p + "_" + _).getOrElse(p)) else parentFeatureName } diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala
b/features/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala b/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala new file mode 100644 index 0000000000..989e3f3f8a --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.utils.spark + +import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} + +/** + * [[org.apache.spark.ml.linalg.Vector]] enrichment functions + */ +object RichVector { + + implicit class RichVector(val v: Vector) extends AnyVal { + + /** + * Add vectors + * + * @param that another vector + * @throws IllegalArgumentException if the vectors have different sizes + * @return vector addition + */ + def +(that: Vector): Vector = { + val res = v.toBreeze + that.toBreeze + toSpark(res) + } + + /** + * Subtract vectors + * + * @param that another vector + * @throws IllegalArgumentException if the vectors have different sizes + * @return vector subtraction + */ + def -(that: Vector): Vector = { + val res = v.toBreeze - that.toBreeze + toSpark(res) + } + + /** + * Convert to [[breeze.linalg.Vector]] + * + * @return [[breeze.linalg.Vector]] + */ + def toBreeze: BreezeVector[Double] = v match { + case s: SparseVector => new BreezeSparseVector[Double](s.indices, s.values, s.size) + case d: DenseVector => new BreezeDenseVector[Double](d.values) + } + + /** + * Convert [[breeze.linalg.Vector]] back to [[org.apache.spark.ml.linalg.Vector]] + * @return [[org.apache.spark.ml.linalg.Vector]] + */ + private def toSpark: BreezeVector[Double] => Vector = { + case s: BreezeSparseVector[Double]@unchecked => new SparseVector(s.length, s.index, s.data) + case d: BreezeDenseVector[Double]@unchecked => new DenseVector(d.data) + } + + } + +} diff --git a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala index 1154b31cc7..65d92a309b 100644 --- a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala +++ b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala @@ -1,40 +1,32 @@ +// scalastyle:off header.matches /* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. + * Modifications: (c) 2017, Salesforce.com, Inc. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * http://www.apache.org/licenses/LICENSE-2.0 * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.spark.ml import com.salesforce.op.stages.OpPipelineStageBase -import org.apache.spark.SparkContext +import org.apache.spark.ml.param.ParamPair import org.apache.spark.ml.util.DefaultParamsReader.{Metadata, loadMetadata} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter} +import org.json4s.JsonDSL._ +import org.json4s._ +import org.json4s.jackson.JsonMethods._ /** * Direct wrappers for ml private [[DefaultParamsWriter]] and [[DefaultParamsReader]] @@ -43,13 +35,39 @@ import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter} case object SparkDefaultParamsReadWrite { /** - * Helper for [[saveMetadata()]] which extracts the JSON to save. + * Helper for [[OpPipelineStageWriter]] which extracts the JSON to save. * This is useful for ensemble models which need to save metadata for many sub-models. * - * @see [[saveMetadata()]] for details on what this includes. + * Note: this method was taken from DefaultParamsWriter.getMetadataToSave, + * but modified to avoid requiring Spark session + * + * @see [[OpPipelineStageWriter]] for details on what this includes. */ - def getMetadataToSave(stage: OpPipelineStageBase, sc: SparkContext): String = - DefaultParamsWriter.getMetadataToSave(stage, sc) + def getMetadataToSave( + stage: OpPipelineStageBase, + extraMetadata: Option[JObject] = None, + paramMap: Option[JValue] = None + ): String = { + val uid = stage.uid + val cls = stage.getClass.getName + val params = stage.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] + val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) => + p.name -> parse(p.jsonEncode(v)) + }.toList)) + val basicMetadata = ("class" -> cls) ~ + ("timestamp" -> System.currentTimeMillis()) ~ + ("sparkVersion" -> org.apache.spark.SPARK_VERSION) ~ + ("uid" -> uid) ~ + ("paramMap" -> jsonParams) + val metadata = extraMetadata match { + case Some(jObject) => + basicMetadata ~ jObject + case None => + basicMetadata + } + val metadataJson: String = compact(render(metadata)) + metadataJson + } /** * Parse metadata JSON string produced by [[DefaultParamsWriter.getMetadataToSave()]]. 
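Because the rewritten getMetadataToSave no longer needs a SparkContext, callers pass only the stage. The rendered metadata string has roughly this shape (the field values below are illustrative):

val metadataJson: String = SparkDefaultParamsReadWrite.getMetadataToSave(stage)
// {"class":"com.salesforce.op.stages...SomeStage","timestamp":1514764800000,
//  "sparkVersion":"2.2.1","uid":"someStage_000000000abc","paramMap":{...}}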
diff --git a/features/src/test/resources/OpParamsWithAltReader.json b/features/src/test/resources/OpParamsWithAltReader.json index c1290ea795..88c87b627b 100644 --- a/features/src/test/resources/OpParamsWithAltReader.json +++ b/features/src/test/resources/OpParamsWithAltReader.json @@ -26,3 +26,4 @@ } } } + diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala index baa39339d9..22e7437ba2 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala @@ -33,9 +33,10 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith -import org.scalatest.{Assertion, Matchers, PropSpec} +import org.scalactic.source import org.scalatest.junit.JUnitRunner import org.scalatest.prop.{PropertyChecks, TableFor1} +import org.scalatest.{Assertion, Matchers, PropSpec} import scala.concurrent.duration._ import scala.util.{Failure, Success, Try} @@ -127,12 +128,32 @@ class FeatureTypeFactoryTest trait FeatureTypeAsserts { self: Matchers => - def assertCreate(ft: => FeatureType): Assertion = Try(ft) match { - case Failure(e) => - e shouldBe a[NonNullableEmptyException] - case Success(v) => - v should not be null - v shouldBe a[FeatureType] + /** + * Asserts creation of the feature type value + * + * @param makeIt block that creates the feature type value + * @return [[Assertion]] + */ + def assertCreate(makeIt: => FeatureType)(implicit pos: source.Position): Assertion = + assertCreate(makeIt, (v: FeatureType) => assert(true)) + + /** + * Asserts creation of the feature type value + * + * @param makeIt block that creates the feature type value + * @param assertion optional assertion + * @return [[Assertion]] + */ + def assertCreate(makeIt: => FeatureType, assertion: FeatureType => Assertion) + (implicit pos: source.Position): Assertion = { + Try(makeIt) match { + case Failure(e) => + e shouldBe a[NonNullableEmptyException] + case Success(v) => + v should not be null + v shouldBe a[FeatureType] + assertion(v) + } } } diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala index 178888a6d2..93fb4737bd 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala @@ -33,6 +33,7 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith +import org.scalacheck.Gen import org.scalatest.PropSpec import org.scalatest.junit.JUnitRunner import org.scalatest.prop.{PropertyChecks, TableFor1} @@ -44,60 +45,13 @@ import scala.concurrent.duration._ class FeatureTypeSparkConverterTest extends PropSpec with PropertyChecks with TestCommon with ConcurrentCheck with FeatureTypeAsserts { - val featureTypeConverters: TableFor1[FeatureTypeSparkConverter[_ <: FeatureType]] = Table("ft", - // Vector - FeatureTypeSparkConverter[OPVector](), - // Lists - FeatureTypeSparkConverter[TextList](), - FeatureTypeSparkConverter[DateList](), - FeatureTypeSparkConverter[DateTimeList](), - // Maps - FeatureTypeSparkConverter[Base64Map](), - FeatureTypeSparkConverter[BinaryMap](), - FeatureTypeSparkConverter[ComboBoxMap](), -
FeatureTypeSparkConverter[CurrencyMap](), - FeatureTypeSparkConverter[DateMap](), - FeatureTypeSparkConverter[DateTimeMap](), - FeatureTypeSparkConverter[EmailMap](), - FeatureTypeSparkConverter[IDMap](), - FeatureTypeSparkConverter[IntegralMap](), - FeatureTypeSparkConverter[MultiPickListMap](), - FeatureTypeSparkConverter[PercentMap](), - FeatureTypeSparkConverter[PhoneMap](), - FeatureTypeSparkConverter[PickListMap](), - FeatureTypeSparkConverter[RealMap](), - FeatureTypeSparkConverter[TextAreaMap](), - FeatureTypeSparkConverter[TextMap](), - FeatureTypeSparkConverter[URLMap](), - FeatureTypeSparkConverter[CountryMap](), - FeatureTypeSparkConverter[StateMap](), - FeatureTypeSparkConverter[CityMap](), - FeatureTypeSparkConverter[PostalCodeMap](), - FeatureTypeSparkConverter[StreetMap](), - FeatureTypeSparkConverter[GeolocationMap](), - FeatureTypeSparkConverter[Prediction](), - // Numerics - FeatureTypeSparkConverter[Binary](), - FeatureTypeSparkConverter[Currency](), - FeatureTypeSparkConverter[Date](), - FeatureTypeSparkConverter[DateTime](), - FeatureTypeSparkConverter[Integral](), - FeatureTypeSparkConverter[Percent](), - FeatureTypeSparkConverter[Real](), - FeatureTypeSparkConverter[RealNN](), - // Sets - FeatureTypeSparkConverter[MultiPickList](), - // Text - FeatureTypeSparkConverter[Base64](), - FeatureTypeSparkConverter[ComboBox](), - FeatureTypeSparkConverter[Email](), - FeatureTypeSparkConverter[ID](), - FeatureTypeSparkConverter[Phone](), - FeatureTypeSparkConverter[PickList](), - FeatureTypeSparkConverter[Text](), - FeatureTypeSparkConverter[TextArea](), - FeatureTypeSparkConverter[URL]() + val featureTypeConverters: TableFor1[FeatureTypeSparkConverter[_ <: FeatureType]] = Table("ftc", + FeatureTypeSparkConverter.featureTypeSparkConverters.values.toSeq: _* ) + val featureTypeNames: TableFor1[String] = Table("ftnames", + FeatureTypeSparkConverter.featureTypeSparkConverters.keys.toSeq: _* + ) + val bogusNames = Gen.alphaNumStr property("is a feature type converter") { forAll(featureTypeConverters) { ft => ft shouldBe a[FeatureTypeSparkConverter[_]] } @@ -105,14 +59,40 @@ class FeatureTypeSparkConverterTest property("is serializable") { forAll(featureTypeConverters) { ft => ft shouldBe a[Serializable] } } + property("make a converter by feature type name") { + forAll(featureTypeNames) { featureTypeName => + val ft: FeatureTypeSparkConverter[_ <: FeatureType] = + FeatureTypeSparkConverter.fromFeatureTypeName(featureTypeName) + assertCreate(ft.fromSpark(null)) + } + } + property("error on making a converter for a non-existent feature type name") { + forAll(bogusNames) { bogusName => + intercept[IllegalArgumentException]( + FeatureTypeSparkConverter.fromFeatureTypeName(bogusName) + ).getMessage shouldBe s"Unknown feature type '$bogusName'" + } + } property("create a feature type instance of null") { forAll(featureTypeConverters)(ft => assertCreate(ft.fromSpark(null))) } - property("create a feature type instance in a timely fashion") { + property("create a feature type instance of null and back") { + forAll(featureTypeConverters) { ft => + assertCreate(ft.fromSpark(null), (v: FeatureType) => { + ft.asInstanceOf[FeatureTypeSparkConverter[FeatureType]].toSpark(v) shouldBe (null: Any) + FeatureTypeSparkConverter.toSpark(v) shouldBe (null: Any) + }) + } + } + property("create a feature type instance and back in a timely fashion") { forAllConcurrentCheck[FeatureTypeSparkConverter[_ <: FeatureType]]( numThreads = 10, numInstancesPerThread = 50000, atMost = 10.seconds, table =
featureTypeConverters, - functionCheck = ft => assertCreate(ft.fromSpark(null)) + functionCheck = ft => { + assertCreate(ft.fromSpark(null), (v: FeatureType) => { + ft.asInstanceOf[FeatureTypeSparkConverter[FeatureType]].toSpark(v) shouldBe (null: Any) + }) + } ) } } diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala index 6f8bb73c89..ca6c960ec6 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.reflection.ReflectionUtils import org.apache.lucene.geo.GeoUtils import org.apache.spark.ml.linalg.DenseVector import org.junit.runner.RunWith @@ -237,8 +238,8 @@ class FeatureTypeValueTest extends PropSpec with PropertyChecks with TestCommon * @tparam FT feature type (OP type) */ private def checkTypeTags[FT <: FeatureType](implicit vtt: TypeTag[FT#Value]): Assertion = { - withClue(s"Feature value type ${vtt.tpe} (dealised: ${vtt.tpe.dealias})") { - val tt = Try(FeatureType.featureValueTypeTag(vtt.tpe.dealias.toString)) + withClue(s"Feature value type ${vtt.tpe} (dealised: ${ReflectionUtils.dealisedTypeName(vtt.tpe)}): ") { + val tt = Try(FeatureType.featureValueTypeTag(ReflectionUtils.dealisedTypeName(vtt.tpe))) if (tt.isFailure) fail(tt.failed.get) tt.get.tpe =:= vtt.tpe shouldBe true FeatureType.isFeatureValueType(vtt) shouldBe true diff --git a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala similarity index 54% rename from core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala index e2aa10227b..4a939ba2d0 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala @@ -32,22 +32,18 @@ package com.salesforce.op.stages.base.binary import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec} - @RunWith(classOf[JUnitRunner]) -class BinaryEstimatorTest extends FlatSpec with TestSparkContext with Assertions { +class BinaryEstimatorTest + extends OpEstimatorSpec[OPVector, BinaryModel[Text, Text, OPVector], BinaryEstimator[Text, Text, OPVector]] { - val (ds, city, country) = TestFeatureBuilder("city", "country", + val (inputData, city, country) = TestFeatureBuilder("city", "country", Seq( (Text("San Francisco"), Text("USA")), (Text("Paris"), Text("France")), @@ -59,59 +55,21 @@ class BinaryEstimatorTest extends FlatSpec with TestSparkContext with Assertions ) ) - val testEstimator: BinaryEstimator[Text, Text, OPVector] = new TestPivotEstimator() - - 
Spec[BinaryEstimator[_, _, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(city, country).getOutput() - outputFeatures shouldBe new Feature[OPVector]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(city, country) - ) - } - - it should "return a BinaryModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(city, country).fit(ds) - - testModel.parent shouldBe testEstimator - testModel.transformFn(Text("San Francisco"), Text("USA")) shouldBe Vectors.dense(1, 0).toOPVector - } - - - it should "create a BinaryModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(city, country).fit(ds) - val testDataTransformed = testModel.setInput(city, country).transform(ds) - val outputFeatures = testEstimator.getOutput() - val transformedValues = testDataTransformed.collect(city, country, outputFeatures).toList + val estimator = new TestPivotEstimator().setInput(city, country) - testDataTransformed.schema.fields.map(_.name).toSet shouldEqual Set(city.name, country.name, outputFeatures.name) + val expectedResult = Seq( + Vectors.dense(1.0, 0.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(1.0, 0.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0) + ).map(_.toOPVector) - transformedValues.toSet shouldEqual Set( - (Text("San Francisco"), Text("USA"), Vectors.dense(1.0, 0.0).toOPVector), - (Text("Paris"), Text("France"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Austin"), Text("USA"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("San Francisco"), Text("USA"), Vectors.dense(1.0, 0.0).toOPVector), - (Text("Paris"), Text("USA"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Puerto Arenas"), Text("Chile"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Iquitos"), Text(None), Vectors.dense(0.0, 1.0).toOPVector) - ) - - } - - it should "copy itself and the model successfully" in { - val est = new TestPivotEstimator() - val mod = new TestPivotModel("", est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } } + class TestPivotEstimator(uid: String = UID[TestPivotEstimator]) extends BinaryEstimator[Text, Text, OPVector](operationName = "pivot", uid = uid) { diff --git a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala similarity index 70% rename from core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala index 14252cfdbb..601048b393 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala @@ -29,24 +29,24 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package com.salesforce.op.stages.base.quaternary +package com.salesforce.op.stages.base.binary -import com.salesforce.op.features.types.Text -import com.salesforce.op.test._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class QuaternaryTransformerTest extends FlatSpec with TestCommon { - - Spec[QuaternaryLambdaTransformer[_, _, _, _ , _]] should "copy successfully" in { - val tr = new QuaternaryLambdaTransformer[Text, Text, Text, Text, Text]( - operationName = "foo", - transformFn = (x, y, z, u) => x - ) - tr.copy(new ParamMap()).uid shouldBe tr.uid - } +class BinaryTransformerTest extends OpTransformerSpec[Real, BinaryTransformer[Real, RealNN, Real]] { + + val sample = Seq(Real(1.0) -> RealNN(0.0), Real(2.0) -> RealNN(2.0), Real.empty -> RealNN(1.0)) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + + val transformer = new BinaryLambdaTransformer[Real, RealNN, Real](operationName = "bmi", + transformFn = (i1, i2) => new Real(for { v1 <- i1.value; v2 <- i2.value } yield v1 / (v2 * v2)) + ).setInput(f1, f2) + + val expectedResult = Seq(Real(Double.PositiveInfinity), Real(0.5), Real.empty) } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala similarity index 58% rename from core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala index 8912148736..08e4a72064 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala @@ -32,76 +32,34 @@ package com.salesforce.op.stages.base.quaternary import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types._ import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class QuaternaryEstimatorTest extends FlatSpec with PassengerSparkFixtureTest { - - var testEstimator: QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real] = new FantasticFourEstimator() - - Spec[QuaternaryEstimator[_, _, _, _, _]] should - "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(age, stringMap, booleanMap, gender).getOutput() - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(age, stringMap, booleanMap, gender) - ) - - } - - it should "create a TernaryModel that uses the specified transform function when fit" in { - val 
testModel = testEstimator.setInput(age, stringMap, booleanMap, gender).fit(passengersDataSet) - val testDataTransformed = testModel.setInput(age, stringMap, booleanMap, gender) - .transform(passengersDataSet.select(age.name, stringMap.name, booleanMap.name, gender.name)) - - testDataTransformed.schema shouldEqual StructType( - Seq(StructField(age.name, DoubleType, true), - StructField(stringMap.name, MapType(StringType, StringType, true), true), - StructField(booleanMap.name, MapType(StringType, BooleanType, true), true), - StructField(gender.name, ArrayType(StringType, true), true), - StructField(testEstimator.getOutputFeatureName, DoubleType, true) - ) +class QuaternaryEstimatorTest + extends OpEstimatorSpec[Real, + QuaternaryModel[Real, TextMap, BinaryMap, MultiPickList, Real], + QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real]] { + + val (inputData, reals, textMap, booleanMap, binary) = TestFeatureBuilder( + Seq( + (Real.empty, TextMap(Map("a" -> "keen")), BinaryMap(Map("a" -> true)), MultiPickList(Set("a"))), + (Real(15.0), TextMap(Map("b" -> "bok")), BinaryMap(Map("b" -> true)), MultiPickList(Set("b"))), + (Real(23.0), TextMap(Map("c" -> "bar")), BinaryMap(Map("c" -> true)), MultiPickList(Set("c"))), + (Real(40.0), TextMap(Map.empty), BinaryMap(Map("d" -> true)), MultiPickList(Set("d"))), + (Real(65.0), TextMap(Map("e" -> "B")), BinaryMap(Map("e" -> true)), MultiPickList(Set("e"))) ) + ) - val expected = Array( - Real(13.833333333333336), - Real(None), - Real(-3.1666666666666643), - Real(-34.166666666666664), - Real(None), - Real(-4.166666666666664) - ) - - testDataTransformed.collect(testModel.getOutput()) shouldEqual expected - } - - it should "copy itself and the model successfully" in { - val est = new FantasticFourEstimator() - val mod = new FantasticFourModel(0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } + val estimator = new FantasticFourEstimator().setInput(reals, textMap, booleanMap, binary) + val expectedResult = Seq(Real.empty, Real(-31.6), Real(-23.6), Real.empty, Real(18.4)) } - class FantasticFourEstimator(uid: String = UID[FantasticFourEstimator]) extends QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real](operationName = "fantasticFour", uid = uid) with FantasticFour { diff --git a/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala new file mode 100644 index 0000000000..1f7ba68c62 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.quaternary + +import com.salesforce.op.features.types._ +import com.salesforce.op.test._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class QuaternaryTransformerTest + extends OpTransformerSpec[Real, QuaternaryTransformer[Real, Integral, Text, Binary, Real]] { + + val sample = Seq( + (Real(1.0), Integral(0), Text("abc"), Binary(false)), + (Real(2.0), Integral(2), Text("a"), Binary(true)), + (Real.empty, Integral(3), Text("abcdefg"), Binary(true)) + ) + + val (inputData, f1, f2, f3, f4) = TestFeatureBuilder(sample) + + val transformer = new QuaternaryLambdaTransformer[Real, Integral, Text, Binary, Real](operationName = "quatro", + transformFn = (r, i, t, b) => + (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0) + + t.value.map(_.length.toDouble).getOrElse(0.0)).toReal + ).setInput(f1, f2, f3, f4) + + val expectedResult = Seq(4.toReal, 6.toReal, 11.toReal) + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala similarity index 55% rename from core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala index 68b5218aec..f4800f72b1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala @@ -32,23 +32,20 @@ package com.salesforce.op.stages.base.sequence import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.SequenceAggregators import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class SequenceEstimatorTest extends FlatSpec with TestSparkContext { +class SequenceEstimatorTest + extends OpEstimatorSpec[OPVector, SequenceModel[DateList, OPVector], SequenceEstimator[DateList, OPVector]] { - val data = Seq[(DateList, DateList, DateList)]( + val sample = Seq[(DateList, DateList, DateList)]( (new DateList(1476726419000L, 1476726019000L), new DateList(1476726919000L), new 
DateList(1476726519000L)), @@ -59,60 +56,18 @@ class SequenceEstimatorTest extends FlatSpec with TestSparkContext { new DateList(1476728919000L), new DateList(1476726619000L, 1476726949000L)) ) - val (ds, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", data) + val (inputData, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", sample) - val testEstimator: SequenceEstimator[DateList, OPVector] = new FractionOfResponsesEstimator() + val estimator = new FractionOfResponsesEstimator().setInput(clicks, opens, purchases) - Spec[SequenceEstimator[_, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(clicks, opens, purchases).getOutput() - outputFeatures shouldBe new Feature[OPVector]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(clicks, opens, purchases) - ) - } - - it should "return a SequenceModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(clicks, opens, purchases).fit(ds) - testModel.parent shouldBe testEstimator - testModel.transformFn( - Seq(new DateList(1476726419000L), new DateList(1476726419000L), new DateList(1476726419000L)) - ) shouldEqual Vectors.dense(0.2, 0.25, 0.25).toOPVector - } - - it should "create a SequenceModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(clicks, opens, purchases).fit(ds) - val testDataTransformed = testModel.setInput(clicks, opens, purchases).transform(ds) - val transformedValues = testDataTransformed.collect(clicks, opens, purchases, testModel.getOutput()) - - // This is string because of vector type being private to spark ml - testDataTransformed.schema.fieldNames.toSet shouldEqual - Set(clicks.name, opens.name, purchases.name, testEstimator.getOutputFeatureName) - - val fractions = Array( - Vectors.dense(0.4, 0.25, 0.25).toOPVector, - Vectors.dense(0.4, 0.5, 0.25).toOPVector, - Vectors.dense(0.2, 0.25, 0.5).toOPVector - ) - val expected = data.zip(fractions) .map { case ((d1, d2, d3), f) => (d1, d2, d3, f)} - - transformedValues shouldBe expected - } - - it should "copy itself and the model successfully" in { - val est = new FractionOfResponsesEstimator() - val mod = new FractionOfResponsesModel(Seq.empty, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } + val expectedResult = Seq( + Vectors.dense(0.4, 0.25, 0.25).toOPVector, + Vectors.dense(0.4, 0.5, 0.25).toOPVector, + Vectors.dense(0.2, 0.25, 0.5).toOPVector + ) } + class FractionOfResponsesEstimator(uid: String = UID[FractionOfResponsesEstimator]) extends SequenceEstimator[DateList, OPVector](operationName = "fractionOfResponses", uid = uid) { def fitFn(dataset: Dataset[Seq[Seq[Long]]]): SequenceModel[DateList, OPVector] = { diff --git a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala new file mode 100644 index 0000000000..16d531016e --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.sequence + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class SequenceTransformerTest extends OpTransformerSpec[MultiPickList, SequenceTransformer[Real, MultiPickList]] { + + val sample = Seq( + 1.toReal -> 1.toReal, + (-1).toReal -> 1.toReal, + 15.toReal -> Real.empty, + 1.111.toReal -> 2.222.toReal + ) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + + val transformer = new SequenceLambdaTransformer[Real, MultiPickList](operationName = "realToMultiPicklist", + transformFn = value => MultiPickList(value.flatMap(_.v.map(_.toString)).toSet) + ).setInput(f1, f2) + + val expectedResult = Seq( + Set("1.0").toMultiPickList, + Set("-1.0", "1.0").toMultiPickList, + Set("15.0").toMultiPickList, + Set("1.111", "2.222").toMultiPickList + ) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala new file mode 100644 index 0000000000..3a11f4db3c --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.ternary + +import com.salesforce.op.UID +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.sql.Dataset +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class TernaryEstimatorTest + extends OpEstimatorSpec[Real, + TernaryModel[MultiPickList, Binary, RealMap, Real], + TernaryEstimator[MultiPickList, Binary, RealMap, Real]] { + + val (inputData, gender, numericMap, survived) = TestFeatureBuilder("gender", "numericMap", "survived", + Seq( + (MultiPickList.empty, RealMap(Map("teen" -> 1.0)), Binary(true)), + (MultiPickList(Set("teen")), RealMap(Map("teen" -> 2.0)), Binary(false)), + (MultiPickList(Set("teen")), RealMap(Map("teen" -> 3.0)), Binary(false)), + (MultiPickList(Set("adult")), RealMap(Map("adult" -> 1.0)), Binary(false)), + (MultiPickList(Set("senior")), RealMap(Map("senior" -> 1.0, "adult" -> 2.0)), Binary(false)) + ) + ) + + val estimator = new TripleInteractionsEstimator().setInput(gender, survived, numericMap) + + val expectedResult = Seq(Real.empty, Real(0.25), Real(1.25), Real(-0.75), Real(-0.75)) +} + +class TripleInteractionsEstimator(uid: String = UID[TripleInteractionsEstimator]) + extends TernaryEstimator[MultiPickList, Binary, RealMap, Real](operationName = "tripleInteractions", uid = uid) + with TripleInteractions { + + // scalastyle:off line.size.limit + def fitFn(dataset: Dataset[(MultiPickList#Value, Binary#Value, RealMap#Value)]): TernaryModel[MultiPickList, Binary, RealMap, Real] = { + import dataset.sparkSession.implicits._ + val mean = { + dataset.map { case (gndr, srvvd, nmrcMp) => + if (survivedAndMatches(gndr, srvvd, nmrcMp)) nmrcMp(gndr.head) else 0.0 + }.filter(_ != 0.0).groupBy().mean().first().getDouble(0) + } + new TripleInteractionsModel(mean = mean, operationName = operationName, uid = uid) + } + // scalastyle:on + +} + +final class TripleInteractionsModel private[op](val mean: Double, operationName: String, uid: String) + extends TernaryModel[MultiPickList, Binary, RealMap, Real](operationName = operationName, uid = uid) + with TripleInteractions { + + def transformFn: (MultiPickList, Binary, RealMap) => Real = (g: MultiPickList, s: Binary, nm: RealMap) => new Real( + if (!survivedAndMatches(g.value, s.value, nm.value)) None + else Some(nm.value(g.value.head) - mean) + ) + +} + +sealed trait TripleInteractions { + def survivedAndMatches(g: MultiPickList#Value, s: Binary#Value, nm: 
RealMap#Value): Boolean = + !s.getOrElse(false) && g.nonEmpty && nm.contains(g.head) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala new file mode 100644 index 0000000000..6b8f1cf527 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.base.ternary + +import com.salesforce.op.features.types._ +import com.salesforce.op.test._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class TernaryTransformerTest extends OpTransformerSpec[Real, TernaryTransformer[Real, Integral, Binary, Real]] { + + val sample = Seq( + (Real(1.0), Integral(0), Binary(false)), + (Real(2.0), Integral(2), Binary(true)), + (Real.empty, Integral(3), Binary(true)) + ) + + val (inputData, f1, f2, f3) = TestFeatureBuilder(sample) + + val transformer = new TernaryLambdaTransformer[Real, Integral, Binary, Real](operationName = "trio", + transformFn = (r, i, b) => (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0)).toReal + ).setInput(f1, f2, f3) + + val expectedResult = Seq(1.toReal, 5.toReal, 4.toReal) + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala similarity index 54% rename from core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala index 27d2f04005..570c3c0270 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala @@ -29,51 +29,53 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.binary +package com.salesforce.op.stages.base.unary -import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.UID import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} import org.junit.runner.RunWith +import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class BinaryTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] { - val bmi = new BinaryLambdaTransformer[Real, RealNN, Real](operationName = "bmi", - transformFn = (i1, i2) => new Real(for { v1 <- i1.value; v2 <- i2.value } yield v1 / (v2 * v2)) - ).setInput(weight, height) + /** + * Input Dataset to fit & transform + */ + val (inputData, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0, 2.0, 6.0).toReal) - Spec[BinaryLambdaTransformer[_, _, _]] should "return single properly formed Feature" in { - val feats = bmi.getOutput() + /** + * Estimator instance to be tested + */ + val estimator = new MinMaxNormEstimator().setInput(f1) - feats shouldBe new Feature[Real]( - name = bmi.getOutputFeatureName, - originStage = bmi, - isResponse = false, - parents = Array(weight, height) - ) - } + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal) - it should "add column to DataFrame when transformed" in { - val transformedData = bmi.transform(passengersDataSet) - val columns = transformedData.columns - 
assert(columns.contains(bmi.getOutputFeatureName)) - val output = bmi.getOutput() - val answer = passengersArray.map(r => - bmi.transformFn(r.getFeatureType[Real](weight), r.getFeatureType[RealNN](height)) - ) - transformedData.collect(output) shouldBe answer - } +} - it should "copy successfully" in { - val copy = bmi.copy(new ParamMap()) - copy.isInstanceOf[BinaryTransformer[_, _, _]] shouldBe true - copy.uid shouldBe bmi.uid +class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) + extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { + + def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { + val grouped = dataset.groupBy() + val maxVal = grouped.max().first().getDouble(0) + val minVal = grouped.min().first().getDouble(0) + new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) } } + +final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) + extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { + def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal +} diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala similarity index 66% rename from core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala rename to features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala index e2707fb3b0..a4108318ea 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala @@ -29,26 +29,33 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.base.unary -import com.salesforce.op.UID import com.salesforce.op.features.types._ -import org.apache.spark.ml.linalg.{Matrix, Vector} - -import scala.reflect.runtime.universe.TypeTag - -class OpLogisticRegressionModel -( - coefficientMatrix: Matrix, - interceptVector: Vector, - numClasses: Int, - val isMultinomial: Boolean, - val operationName: String = "opLR", - uid: String = UID[OpLogisticRegressionModel] -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends LogisticRegressionModel(uid = uid, coefficientMatrix = coefficientMatrix, - interceptVector = interceptVector, numClasses = numClasses, isMultinomial = isMultinomial) with OpClassifierModelBase +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class UnaryTransformerTest extends OpTransformerSpec[Real, UnaryLambdaTransformer[Real, Real]] { + + /** + * Input Dataset to transform + */ + val (inputData, f1) = TestFeatureBuilder(Seq(Some(1), Some(2), Some(3), None).map(_.toReal)) + + /** + * [[OpTransformer]] instance to be tested + */ + val transformer = new UnaryLambdaTransformer[Real, Real]( + operationName = "unary", + transformFn = r => r.v.map(_ * 2.0).toReal + ).setInput(f1) + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult = Seq(Real(2), Real(4), Real(6), Real.empty) + +} diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala similarity index 93% rename from utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala rename to features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index 3e44b3b09d..78577b79ea 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory +import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap} import com.salesforce.op.test.TestCommon import org.apache.spark.sql.types.Metadata import org.junit.runner.RunWith @@ -51,13 +52,19 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks // AttributeGroup and Attribute require non-empty names val genName: Gen[String] = Gen.nonEmptyListOf(alphaNumChar).map(_.mkString) + val genType: Gen[String] = Gen.oneOf( + FeatureType.typeName[DateTime], FeatureType.typeName[Email], FeatureType.typeName[PickList], + FeatureType.typeName[Prediction], FeatureType.typeName[Real], FeatureType.typeName[RealMap], + FeatureType.typeName[TextAreaMap] + ) val genValue: Gen[String] = Gen.oneOf(genName, Gen.oneOf(Seq(OpVectorColumnMetadata.NullString))) val vecColTupleGen: Gen[OpVectorColumnTuple] = for { - seq <- Gen.containerOf[Seq, String](genName) + nameSeq <- Gen.containerOf[Seq, String](genName) + typeSeq <- Gen.listOfN(nameSeq.length, genType) group <- Gen.option(genName) value <- Gen.option(genValue) } yield { - (seq, seq, group, value, 0) + (nameSeq, typeSeq, group, value, 0) } val featHistTupleGen: Gen[FeatureHistoryTuple] = Gen.zip( @@ -181,8 +188,8 
@@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks hist.indicatorValue shouldBe meta.indicatorValue hist.indicatorGroup shouldBe meta.indicatorGroup hist.indicatorValue.contains(OpVectorColumnMetadata.NullString) shouldBe meta.isNullIndicator - hist.parentFeatureType.foreach(p => p.contains(p) shouldBe meta.hasParentOfType(p)) - hist.parentFeatureType.exists(p => p.contains("Map") || p.contains("Prediction")) shouldBe meta.hasMapParent() + hist.parentFeatureType.exists(p => p.contains("Map") || p.contains("Prediction")) shouldBe + meta.hasParentOfSubType[OPMap[_]] } if (colHist.nonEmpty && colHist.head.parentFeatureName.nonEmpty) { colHist.head.parentFeatureName.flatMap(p => history(p).stages).distinct.sorted should diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala similarity index 100% rename from utils/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala rename to features/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala new file mode 100644 index 0000000000..cb4a4bd616 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.utils.spark + +import com.holdenkarau.spark.testing.RDDGenerator +import com.salesforce.op.features.types.ConcurrentCheck +import com.salesforce.op.test.TestSparkContext +import com.twitter.algebird.Monoid +import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} +import org.apache.spark.rdd.RDD +import org.junit.runner.RunWith +import org.scalacheck.Gen +import org.scalactic.TolerantNumerics +import org.scalatest.PropSpec +import org.scalatest.junit.JUnitRunner +import org.scalatest.prop.PropertyChecks + +import scala.concurrent.duration._ + + +@RunWith(classOf[JUnitRunner]) +class RichVectorTest extends PropSpec with PropertyChecks with TestSparkContext with ConcurrentCheck { + + import VectorGenerators._ + import com.salesforce.op.utils.spark.RichVector._ + + lazy val sparseVectorsRDDGen = RDDGenerator.genRDD[Vector](spark.sparkContext)(sparseVectorGen) + + property("Vectors should error on size mismatch") { + forAll(sparseVectorGen) { sparse: SparseVector => + val wrongSize = Vectors.sparse(sparse.size + 1, Array(0), Array(1.0)) + val dense = sparse.toDense + for { + res <- Seq( + () => sparse + wrongSize, + () => sparse - wrongSize, + () => dense + wrongSize, + () => dense - wrongSize, + () => dense + wrongSize.toDense, + () => dense - wrongSize.toDense + ) + } { + intercept[IllegalArgumentException](res()).getMessage should { + startWith("requirement failed: Vectors must") and include("same length") + } + } + } + } + + property("Vectors should '+' add correctly") { + forAll(sparseVectorGen) { sparse: SparseVector => + val expected = sparse.toArray.map(_ * 2) + val dense = sparse.toDense + for {res <- Seq(sparse + sparse, dense + sparse, sparse + dense, dense + dense)} { + res.size shouldBe sparse.size + res.toArray should contain theSameElementsAs expected + } + } + } + + property("Vectors should '-' subtract correctly") { + forAll(sparseVectorGen) { sparse: SparseVector => + val dense = sparse.toDense + for {res <- Seq(sparse - sparse, dense - sparse, sparse - dense, dense - dense)} { + res.size shouldBe sparse.size + res.toArray.foreach(_ shouldBe 0.0) + } + } + } + + property("Vectors convert to breeze vectors correctly") { + forAll(sparseVectorGen) { sparse: SparseVector => + val dense = sparse.toDense + sparse.toBreeze.toArray should contain theSameElementsAs dense.toBreeze.toArray + } + } + + property("Sparse vectors should '+' add efficiently") { + val sparseSize = 100000000 + val sparse = new SparseVector(sparseSize, Array(0, 1, sparseSize - 1), Array(-1.0, 1.0, 3.0)) + val expected = new SparseVector(sparseSize, Array(0, 1, sparseSize - 1), Array(-2.0, 2.0, 6.0)) + + forAllConcurrentCheck[SparseVector]( + numThreads = 10, numInstancesPerThread = 100000, atMost = 10.seconds, + table = Table[SparseVector]("sparseVectors", sparse), + functionCheck = sparse => { + val res = sparse + sparse + res shouldBe a[SparseVector] + res shouldEqual expected + } + ) + } + + property("Vectors add in reduce") { + forAll(sparseVectorsRDDGen) { rdd: RDD[Vector] => + if (!rdd.isEmpty()) { + val tolerance = 1e-9 // we are losing precision here, hence the tolerance + implicit val doubleEq = TolerantNumerics.tolerantDoubleEquality(tolerance) + + val expected = rdd.map(_.toArray).reduce(Monoid.arrayMonoid[Double].plus) + for { + res <- Seq( + () => rdd.reduce(_ + _), + () => rdd.reduce(_.toDense + _), + () => rdd.reduce(_ + _.toDense), + () => rdd.reduce(_.toDense + _.toDense) + ) + (v, exp) <- res().toArray.zip(expected) + } v shouldEqual exp + } + } + }
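+
+  // A minimal usage sketch of the RichVector enrichment exercised above (illustrative
+  // comment only, not an executed test): both operands go through Breeze and back.
+  //
+  //   import org.apache.spark.ml.linalg.Vectors
+  //   import com.salesforce.op.utils.spark.RichVector._
+  //
+  //   val a = Vectors.dense(1.0, 2.0)
+  //   val b = Vectors.sparse(2, Array(1), Array(3.0))
+  //   a + b  // Vectors.dense(1.0, 5.0)
+  //   a - b  // Vectors.dense(1.0, -1.0)
+  //   a + Vectors.dense(1.0)  // throws IllegalArgumentException: vectors must be the same length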
+ +} + +object VectorGenerators { + + val size = 100 + + val sparseVectorGen = for { + indices <- Gen.listOfN(size, Gen.choose(0, size - 1)) + values <- Gen.listOfN(size, Gen.choose(-100000.0, 100000.0).filter(!_.isNaN)) + idx = indices.distinct.sorted.toArray + } yield new SparseVector(size, idx, values.toArray.take(idx.length)) + +} diff --git a/gradle.properties b/gradle.properties index 1363c922c9..3ee2b49445 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,2 +1,2 @@ -version=3.3.1 +version=3.3.3 group=com.salesforce diff --git a/gradle/version-properties.gradle b/gradle/version-properties.gradle index ff2661e452..c879034ed9 100644 --- a/gradle/version-properties.gradle +++ b/gradle/version-properties.gradle @@ -36,4 +36,4 @@ configure(allprojects - project(':templates').subprojects - project(':docs')) { } } } -} +} \ No newline at end of file diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000000..0d15f563aa --- /dev/null +++ b/models/README.md @@ -0,0 +1,5 @@ +# Models + +This project contains all the pretrained models used in OP, e.g. the OpenNLP POS/NER models. + +Include this project as a runtime dependency if you use any of these models in your application; otherwise it's optional. \ No newline at end of file diff --git a/models/build.gradle b/models/build.gradle new file mode 100644 index 0000000000..086e0b38a9 --- /dev/null +++ b/models/build.gradle @@ -0,0 +1,4 @@ +jar { + // Avoid compressing models, since it's quite slow + entryCompression = ZipEntryCompression.STORED +} diff --git a/models/src/main/resources/OpenNLP/da-pos-maxent.bin b/models/src/main/resources/OpenNLP/da-pos-maxent.bin new file mode 100644 index 0000000000..8fade7ce97 Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/da-pos-perceptron.bin b/models/src/main/resources/OpenNLP/da-pos-perceptron.bin new file mode 100644 index 0000000000..baabfda3b6 Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/da-sent.bin b/models/src/main/resources/OpenNLP/da-sent.bin new file mode 100644 index 0000000000..9913d530e7 Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/da-token.bin b/models/src/main/resources/OpenNLP/da-token.bin new file mode 100644 index 0000000000..994d07e090 Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-token.bin differ diff --git a/models/src/main/resources/OpenNLP/de-pos-maxent.bin b/models/src/main/resources/OpenNLP/de-pos-maxent.bin new file mode 100644 index 0000000000..c564d56ceb Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/de-pos-perceptron.bin b/models/src/main/resources/OpenNLP/de-pos-perceptron.bin new file mode 100644 index 0000000000..c79debd6e1 Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/de-sent.bin b/models/src/main/resources/OpenNLP/de-sent.bin new file mode 100644 index 0000000000..71d4e5ddd9 Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/de-token.bin b/models/src/main/resources/OpenNLP/de-token.bin new file mode 100644 index 0000000000..380e7ff058 Binary files /dev/null and
b/models/src/main/resources/OpenNLP/de-token.bin differ diff --git a/models/src/main/resources/OpenNLP/en-chunker.bin b/models/src/main/resources/OpenNLP/en-chunker.bin new file mode 100644 index 0000000000..65d9356888 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-chunker.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-date.bin b/models/src/main/resources/OpenNLP/en-ner-date.bin new file mode 100644 index 0000000000..a69923ac42 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-date.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-location.bin b/models/src/main/resources/OpenNLP/en-ner-location.bin new file mode 100644 index 0000000000..f3788bc1f6 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-location.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-money.bin b/models/src/main/resources/OpenNLP/en-ner-money.bin new file mode 100644 index 0000000000..2431e0f5ee Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-money.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-organization.bin b/models/src/main/resources/OpenNLP/en-ner-organization.bin new file mode 100644 index 0000000000..1fb6d9fa8f Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-organization.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-percentage.bin b/models/src/main/resources/OpenNLP/en-ner-percentage.bin new file mode 100644 index 0000000000..98cee1a341 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-percentage.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-person.bin b/models/src/main/resources/OpenNLP/en-ner-person.bin new file mode 100644 index 0000000000..2f68318203 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-person.bin differ diff --git a/models/src/main/resources/OpenNLP/en-ner-time.bin b/models/src/main/resources/OpenNLP/en-ner-time.bin new file mode 100644 index 0000000000..a5d8aa14d8 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-time.bin differ diff --git a/models/src/main/resources/OpenNLP/en-parser-chunking.bin b/models/src/main/resources/OpenNLP/en-parser-chunking.bin new file mode 100644 index 0000000000..7550609ebc Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-parser-chunking.bin differ diff --git a/models/src/main/resources/OpenNLP/en-pos-maxent.bin b/models/src/main/resources/OpenNLP/en-pos-maxent.bin new file mode 100644 index 0000000000..c8cae23c5f Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/en-pos-perceptron.bin b/models/src/main/resources/OpenNLP/en-pos-perceptron.bin new file mode 100644 index 0000000000..b52903fd10 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/en-sent.bin b/models/src/main/resources/OpenNLP/en-sent.bin new file mode 100644 index 0000000000..e89076be5a Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/en-token.bin b/models/src/main/resources/OpenNLP/en-token.bin new file mode 100644 index 0000000000..c417277ca7 Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-token.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-location.bin b/models/src/main/resources/OpenNLP/es-ner-location.bin new file mode 100644 index 
0000000000..a9a7223c78 Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-location.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-misc.bin b/models/src/main/resources/OpenNLP/es-ner-misc.bin new file mode 100644 index 0000000000..d45eceedef Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-misc.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-organization.bin b/models/src/main/resources/OpenNLP/es-ner-organization.bin new file mode 100644 index 0000000000..efb75ba20c Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-organization.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-person.bin b/models/src/main/resources/OpenNLP/es-ner-person.bin new file mode 100644 index 0000000000..f4a15c0174 Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-person.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-location.bin b/models/src/main/resources/OpenNLP/nl-ner-location.bin new file mode 100644 index 0000000000..3cf9081f78 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-location.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-misc.bin b/models/src/main/resources/OpenNLP/nl-ner-misc.bin new file mode 100644 index 0000000000..cdf8144eb8 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-misc.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-organization.bin b/models/src/main/resources/OpenNLP/nl-ner-organization.bin new file mode 100644 index 0000000000..dd962adbc4 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-organization.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-person.bin b/models/src/main/resources/OpenNLP/nl-ner-person.bin new file mode 100644 index 0000000000..cd3df4efd3 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-person.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-pos-maxent.bin b/models/src/main/resources/OpenNLP/nl-pos-maxent.bin new file mode 100644 index 0000000000..170e1e8b15 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin b/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin new file mode 100644 index 0000000000..7db9bce873 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-sent.bin b/models/src/main/resources/OpenNLP/nl-sent.bin new file mode 100644 index 0000000000..f212e279d5 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-token.bin b/models/src/main/resources/OpenNLP/nl-token.bin new file mode 100644 index 0000000000..cb6190c341 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-token.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-pos-maxent.bin b/models/src/main/resources/OpenNLP/pt-pos-maxent.bin new file mode 100644 index 0000000000..12c666ac66 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin b/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin new file mode 100644 index 0000000000..2fe7ccf293 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-sent.bin 
b/models/src/main/resources/OpenNLP/pt-sent.bin new file mode 100644 index 0000000000..c2c537bb33 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-token.bin b/models/src/main/resources/OpenNLP/pt-token.bin new file mode 100644 index 0000000000..0fc90a3669 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-token.bin differ diff --git a/models/src/main/resources/OpenNLP/se-pos-maxent.bin b/models/src/main/resources/OpenNLP/se-pos-maxent.bin new file mode 100644 index 0000000000..1e4ce32ec5 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/se-pos-perceptron.bin b/models/src/main/resources/OpenNLP/se-pos-perceptron.bin new file mode 100644 index 0000000000..572241ef70 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/se-sent.bin b/models/src/main/resources/OpenNLP/se-sent.bin new file mode 100644 index 0000000000..4a0b702545 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/se-token.bin b/models/src/main/resources/OpenNLP/se-token.bin new file mode 100644 index 0000000000..d66c8709a9 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-token.bin differ diff --git a/models/src/main/resources/OpenNLP/vesion b/models/src/main/resources/OpenNLP/vesion new file mode 100644 index 0000000000..14ac82435a --- /dev/null +++ b/models/src/main/resources/OpenNLP/vesion @@ -0,0 +1,2 @@ +OpenNLP models - Version 1.5 +Downloaded from - http://opennlp.sourceforge.net/models-1.5 diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala index 8602c54f7a..5a250df992 100644 --- a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala +++ b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala @@ -31,7 +31,6 @@ package com.salesforce.op.test -import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.features.{FeatureBuilder, OPFeature} import com.salesforce.op.utils.tuples.RichTuple._ @@ -40,8 +39,6 @@ import org.joda.time.Duration trait PassengerFeaturesTest { - UID.reset() - val age = FeatureBuilder.Real[Passenger] .extract(_.getAge.toReal) .aggregate((l, r) => (l -> r).map(breeze.linalg.max(_, _))) @@ -64,6 +61,7 @@ trait PassengerFeaturesTest { val booleanMap = FeatureBuilder.BinaryMap[Passenger].extract(p => p.getBooleanMap.toBinaryMap).asPredictor val survived = FeatureBuilder.Binary[Passenger].extract(p => Option(p.getSurvived).map(_ == 1).toBinary).asResponse val boardedTime = FeatureBuilder.Date[Passenger].extract(_.getBoarded.toLong.toDate).asPredictor + val boardedTimeAsDateTime = FeatureBuilder.DateTime[Passenger].extract(_.getBoarded.toLong.toDateTime).asPredictor val rawFeatures: Array[OPFeature] = Array( survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala index be17156ee5..2e6802c660 100644 --- a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala +++ b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala 
@@ -57,13 +57,13 @@ trait PassengerSparkFixtureTest extends TestSparkContext with PassengerFeaturesT
     key = _.getPassengerId.toString // Entity to score
   )
 
-  val simpleCsvReader = DataReaders.Simple.csv[PassengerCSV](
+  lazy val simpleCsvReader = DataReaders.Simple.csv[PassengerCSV](
     path = Some(passengerCsvPath), // Path should be optional so can also pass in as a parameter
     schema = PassengerCSV.getClassSchema.toString, // Input schema
     key = _.getPassengerId.toString // Entity to score
   )
 
-  val simpleStreamingReader = StreamingReaders.Simple.avro[Passenger](
+  lazy val simpleStreamingReader = StreamingReaders.Simple.avro[Passenger](
     key = _.getPassengerId.toString // Entity to score
   )
 
diff --git a/resources/materializingdata.png b/resources/materializingdata.png
new file mode 100644
index 0000000000..eab29e2a0e
Binary files /dev/null and b/resources/materializingdata.png differ
diff --git a/resources/stages.png b/resources/stages.png
new file mode 100644
index 0000000000..49e009e3b4
Binary files /dev/null and b/resources/stages.png differ
diff --git a/resources/workflows.png b/resources/workflows.png
new file mode 100644
index 0000000000..5feb167e3c
Binary files /dev/null and b/resources/workflows.png differ
diff --git a/settings.gradle b/settings.gradle
index eb42f67ae1..1771889027 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -1,4 +1,4 @@
 rootProject.name='optimus-prime'
 
-include 'utils', 'features', 'readers', 'core', 'testkit', 'cli', 'templates:simple', 'docs'
+include 'utils', 'features', 'readers', 'core', 'models', 'testkit', 'cli', 'templates:simple', 'docs'
 
diff --git a/test-data/DataGeneration.sc b/test-data/DataGeneration.sc
index f29bbd0a4f..ac1a9e41d5 100644
--- a/test-data/DataGeneration.sc
+++ b/test-data/DataGeneration.sc
@@ -1,6 +1,32 @@
 /*
  * Copyright (c) 2017, Salesforce.com, Inc.
  * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 import org.apache.spark.ml.linalg.Vectors
@@ -28,8 +54,8 @@ object DataGeneration {
   val df2 = df.toDF(names: _*)
 
   case class PassengerDG(passengerId: Int, age: Option[Int], gender: String,
-    height: Int, weight: Int, description: Option[String],
-    boarded: Long, recordDate: Long, survived: Boolean)
+                         height: Int, weight: Int, description: Option[String],
+                         boarded: Long, recordDate: Long, survived: Boolean)
 
   val ds = df2.as[PassengerDG]
 
diff --git a/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala b/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
index ec5982ffc6..743ea787a3 100644
--- a/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
+++ b/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
@@ -64,7 +64,6 @@ trait TestCommon extends Matchers with Assertions {
 
   case object Spec {
     def apply[T: ClassTag]: String = apply(classTag[T].runtimeClass)
     def apply[T1: ClassTag, T2: ClassTag]: String = apply[T2] + "[" + apply[T1] + "]"
-
     def apply(klazz: Class[_]): String = klazz.getSimpleName.stripSuffix("$")
   }
 
diff --git a/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala b/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
index 5b24a7ecba..598d8922db 100644
--- a/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
+++ b/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
@@ -36,9 +36,8 @@ import org.scalatest.Suite
 
 trait TestSparkContext extends TempDirectoryTest with TestCommon {
   self: Suite =>
 
-  // Remove Logging of OWLQN and LBFGS used in LogisticRegression
-  Logger.getLogger("breeze.optimize.OWLQN").setLevel(Level.WARN)
-  Logger.getLogger("breeze.optimize.LBFGS").setLevel(Level.WARN)
+  // Remove Breeze logging noise
+  Logger.getLogger("breeze.optimize").setLevel(Level.WARN)
 
   lazy val kryoClasses: Array[Class[_]] = Array(
     classOf[com.salesforce.op.test.Passenger],
diff --git a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
index 20ebbe5838..ad00a19699 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
@@ -146,16 +146,34 @@
   def reflectSetterMethod[T: ClassTag](
     instance: T,
     setterName: String,
+    inputs: Seq[Any],
     classLoader: ClassLoader = defaultClassLoader
-  ): Option[MethodMirror] = {
+  ): Any = {
+    reflectMethod(instance, s"set$setterName", classLoader).apply(inputs: _*)
+  }
+
+  /**
+   * Find a method with the provided name on the given instance
+   * @param instance instance to reflect the method on
+   * @param methodName name of the method to find
+   * @param classLoader class loader to use
+   * @tparam T type of the instance
+   * @return mirror of the reflected method
+   */
+  def reflectMethod[T: ClassTag](
+    instance: T,
+    methodName: String,
+    classLoader: ClassLoader = defaultClassLoader
+  ): MethodMirror = {
     val klazz = instance.getClass
     val (runtimeMirror, classMirror) = mirrors(klazz, classLoader)
     val classType = runtimeMirror.classSymbol(klazz).toType
     val tMembers = classType.members
-    val settrs = tMembers.collect { case m: MethodSymbol if m.isPublic &&
-      termNameStr(m.name).compareToIgnoreCase(s"set$setterName") == 0 => m }
+    val methods = tMembers.collect { case m: MethodSymbol if m.isMethod &&
+      termNameStr(m.name).compareToIgnoreCase(methodName) == 0 => m
+    }
     val instanceMirror = runtimeMirror.reflect(instance)
-    settrs.headOption.map(instanceMirror.reflectMethod(_))
+    instanceMirror.reflectMethod(methods.head)
   }
 
   /**
@@ -168,6 +186,26 @@
    */
   def classForName(name: String, classLoader: ClassLoader = defaultClassLoader): Class[_] = classLoader.loadClass(name)
+
+  /**
+   * Fully dealiased type name for [[Type]].
+   * This method dealiases recursively, whereas a regular type.dealias dealiases one level only.
+   *
+   * E.g. given a type of "Map[String,Double]" the result is
+   * "scala.collection.immutable.Map[java.lang.String,scala.Double]"
+   *
+   * @param t type to dealias
+   * @return fully dealiased type name
+   */
+  def dealisedTypeName(t: Type): String = {
+    val dealised = t.dealias
+    if (dealised.typeArgs.isEmpty) dealised.typeSymbol.fullName
+    else {
+      dealised.typeConstructor.dealias.typeSymbol.fullName +
+        dealised.typeArgs.map(dealisedTypeName).mkString("[", ",", "]")
+    }
+  }
+
   /**
    * Create a TypeTag for Type
    *
diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala b/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
index 31d5735fe6..753e2b3622 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
@@ -34,6 +34,7 @@ package com.salesforce.op.utils.spark
 
 import org.apache.spark.sql.types._
 
 import scala.collection.mutable.{Map => MMap}
+import shapeless._
 
 object RichMetadata {
 
@@ -103,7 +104,7 @@
         case (Some(av: String), Some(bv: String)) => av + bv
         case (Some(av: Metadata), Some(bv: Metadata)) => av.deepMerge(bv)
         case (Some(av), Some(bv)) => throw new RuntimeException(
-          s"Failed to merge metadatas for key $key due to incompatible value types '$av' and '$bv'"
+          s"Failed to merge metadata for key $key due to incompatible value types '$av' and '$bv'"
         )
       }
       res += key -> resVal
@@ -150,24 +151,40 @@ object RichMetadata {
     }
   }
 
+  private val booleanSeq = TypeCase[Seq[Boolean]]
+  private val longSeq = TypeCase[Seq[Long]]
+  private val intSeq = TypeCase[Seq[Int]]
+  private val doubleSeq = TypeCase[Seq[Double]]
+  private val stringSeq = TypeCase[Seq[String]]
+
   /**
    * Enrichment functions for Maps
    * @param theMap Map[String, Any]
    */
   implicit class RichMap(val theMap: Map[String, Any]) extends AnyVal {
 
-    def toMetadata: Metadata = theMap.foldLeft(new MetadataBuilder()) {
-      case (m, (k, v: Boolean)) => m.putBoolean(k, v)
-      case (m, (k, v: Double)) => m.putDouble(k, v)
-      case (m, (k, v: Long)) => m.putLong(k, v)
-      case (m, (k, v: String)) => m.putString(k, v)
-      case (m, (k, v: Array[Boolean])) => m.putBooleanArray(k, v)
-      case (m, (k, v: Array[Double])) => m.putDoubleArray(k, v)
-      case (m, (k, v: Array[Long])) => m.putLongArray(k, v)
-      case (m, (k, v: Array[String])) => m.putStringArray(k, v)
-      case (_, (k, v)) => throw new RuntimeException(s"Key '$k' has unsupported value type")
-    }.build()
-
+    def toMetadata: Metadata = {
+      val builder = new MetadataBuilder()
+      def unsupported(k: String) = throw new RuntimeException(s"Key '$k' has unsupported value type")
+      def putCollection(key: String, seq: Seq[Any]): MetadataBuilder = seq match {
+        case booleanSeq(v) => builder.putBooleanArray(key, v.toArray)
+        case intSeq(v) => builder.putLongArray(key, v.map(_.toLong).toArray)
+        case longSeq(v) => builder.putLongArray(key, v.toArray)
+        case doubleSeq(v) => builder.putDoubleArray(key, v.toArray)
+        case stringSeq(v) => builder.putStringArray(key, v.toArray)
+        case _ => unsupported(key)
+      }
+      theMap.foldLeft(builder) {
+        case (m, (k, v: Boolean)) => m.putBoolean(k, v)
+        case (m, (k, v: Double)) => m.putDouble(k, v)
+        case (m, (k, v: Long)) => m.putLong(k, v)
+        case (m, (k, v: String)) => m.putString(k, v)
+        case (m, (k, v: Seq[_])) => putCollection(k, v)
+        case (m, (k, v: Array[_])) => putCollection(k, v)
+        case (m, (k, v: Map[_, _])) => m.putMetadata(k, v.map { case (k, v) => k.toString -> v }.toMetadata)
+        case (_, (k, _)) => unsupported(k)
+      }.build()
+    }
   }
 }
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala b/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
index fdfa1311fd..57c8a1606e 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
@@ -66,9 +66,11 @@
   case object Asturian extends Language("ast")
   case object Belarusian extends Language("be")
   case object Breton extends Language("br")
-  case object Catalan extends Language("ca")
   case object Bulgarian extends Language("bg")
   case object Bengali extends Language("bn")
+  case object Brazilian extends Language("br")
+  case object Catalan extends Language("ca")
+  case object Sorani extends Language("ckb")
   case object Czech extends Language("cs")
   case object Welsh extends Language("cy")
   case object Danish extends Language("da")
@@ -112,6 +114,7 @@
   case object Portuguese extends Language("pt")
   case object Romanian extends Language("ro")
   case object Russian extends Language("ru")
+  case object Sami extends Language("se")
   case object Slovak extends Language("sk")
   case object Slovene extends Language("sl")
   case object Somali extends Language("so")
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala b/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala
new file mode 100644
index 0000000000..5a1da39261
--- /dev/null
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.utils.text
+
+import enumeratum.{Enum, EnumEntry}
+
+/**
+ * Interface for a Named Entity Recognition tagger
+ *
+ * @tparam Result result of the [[NameEntityTagger.tag]] function call
+ */
+trait NameEntityTagger[Result <: TaggerResult] extends Serializable {
+
+  /**
+   * Apply the named entity recognition model to the sentence tokens to retrieve information
+   *
+   * @param tokens sentence tokens
+   * @param language language
+   * @param entitiesToTag entities to tag if found
+   * @return map of entities and corresponding tokens
+   */
+  def tag(tokens: Seq[String], language: Language, entitiesToTag: Seq[NameEntityType]): Result
+
+}
+
+/**
+ * Result of a [[NameEntityTagger.tag]] function call
+ */
+trait TaggerResult extends Serializable {
+
+  /**
+   * Result must be convertible to a Map,
+   * where keys are tokens and values are the entities matching each token
+   */
+  def tokenTags: Map[String, Set[NameEntityType]]
+
+}
+
+
+/**
+ * Named Entity Recognition entity type
+ */
+sealed trait NameEntityType extends EnumEntry with Serializable
+
+/**
+ * Named Entity Recognition entity type
+ */
+object NameEntityType extends Enum[NameEntityType] {
+  val values = findValues
+  case object Date extends NameEntityType
+  case object Location extends NameEntityType
+  case object Money extends NameEntityType
+  case object Organization extends NameEntityType
+  case object Percentage extends NameEntityType
+  case object Person extends NameEntityType
+  case object Time extends NameEntityType
+  case object Misc extends NameEntityType
+  case object Other extends NameEntityType
+}
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala b/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala
new file mode 100644
index 0000000000..fddf2dbeb0
--- /dev/null
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.utils.text
+
+/**
+ * Interface for a Sentence Splitter that detects and splits text
+ * into separate sentences.
+ */
+trait SentenceSplitter extends Serializable {
+
+  /**
+   * Get sentences from the text
+   *
+   * @param input text input
+   * @param language language
+   * @return sentences
+   */
+  def getSentences(input: String, language: Language): Seq[String]
+}
diff --git a/utils/src/test/scala/com/salesforce/op/UIDTest.scala b/utils/src/test/scala/com/salesforce/op/UIDTest.scala
index 74d6191cf3..3f717b1346 100644
--- a/utils/src/test/scala/com/salesforce/op/UIDTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/UIDTest.scala
@@ -75,3 +75,4 @@ class UIDTest extends FlatSpec with TestCommon {
     intercept[IllegalArgumentException](UID.fromString("foo")).getMessage shouldBe "Invalid UID: foo"
   }
 }
+
diff --git a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
index 11e9fb8fa9..639d11e915 100644
--- a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
@@ -79,6 +79,8 @@ class TestClassVar {
     myVar = Option(s)
     this
   }
+  private def getValue: Int = 2
+  def getValuePerf: Int = 2
 }
 
 @RunWith(classOf[JUnitRunner])
@@ -109,6 +111,14 @@ class ReflectionUtilsTest extends FlatSpec with Matchers {
     dealiasedTag.tpe shouldBe aliasTag.tpe.dealias
   }
 
+  it should "deep dealias types" in {
+    val tt = typeTag[Map[String, Seq[(Double, ListStringAlias)]]].tpe
+    ReflectionUtils.dealisedTypeName(tt) shouldBe
+      "scala.collection.immutable.Map[" +
+        "java.lang.String," +
+        "scala.collection.Seq[scala.Tuple2[scala.Double,scala.collection.immutable.List[java.lang.String]]]]"
+  }
+
   it should "allow copying a class" in {
     val orig = new TestClass[TestValClass](
       i = 123,
@@ -191,9 +201,31 @@
 
   it should "allow you to find and use a setter for a class" in {
     val myClass = new TestClassVar()
-    val setter = ReflectionUtils.reflectSetterMethod(myClass, "myVar")
-    setter.map(_.apply("yay"))
+    val setter = ReflectionUtils.reflectSetterMethod(myClass, "myVar", Seq("yay"))
     myClass.myVar shouldBe Some("yay")
   }
 
+  it should "allow you to find and use a private method for a class" in {
+    val myClass = new TestClassVar()
+    val value = ReflectionUtils.reflectMethod(myClass, "getValue").apply()
+    value shouldBe 2
+  }
+
+  it should "execute a reflected method quickly" in {
+    val myClass = new TestClassVar()
+    val method = ReflectionUtils.reflectMethod(myClass, "getValue")
+    val max = 100000
+    def measure(fun: => Int): Long = {
+      val start = System.currentTimeMillis()
+      (0 until max).foreach(_ => fun shouldBe 2)
+      System.currentTimeMillis() - start
+    }
+    val warmUp = measure(method.apply().asInstanceOf[Int]) -> measure(myClass.getValuePerf) // warm up
+    val elapsedReflect = measure(method.apply().asInstanceOf[Int])
+    val actual = measure(myClass.getValuePerf)
+
+    elapsedReflect should be <= 5 * actual
+  }
 }
+
diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala b/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
index ae4be94d1b..3007b15601 100644
--- a/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
@@ -33,6 +33,8 @@ package com.salesforce.op.utils.spark
 
 import com.salesforce.op.test.TestCommon
 import org.apache.spark.sql.types.{MetadataBuilder, StructField}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.Serialization
 import org.junit.runner.RunWith
 import org.scalatest.junit.JUnitRunner
 import org.scalatest.{FlatSpec, Matchers}
@@ -46,10 +48,12 @@ class RichMetadataTest extends FlatSpec with TestCommon {
 
   Spec[RichMetadata] should "create a metadata from a map" in {
     val expected = Map(
       "1" -> 1L, "2" -> 1.0, "3" -> true, "4" -> "1",
-      "5" -> Array(1L), "6" -> Array(1.0), "6" -> Array(true), "7" -> Array("1")
+      "5" -> Array(1L), "6" -> Array(1.0), "7" -> Array(true), "8" -> Array("1"),
+      "9" -> Seq(1L), "10" -> Seq(1.0), "11" -> Seq(true), "12" -> Seq("1")
     )
     val meta = expected.toMetadata
-    meta.underlyingMap.toSeq shouldBe expected.toSeq
+    implicit val formats = DefaultFormats
+    meta.json shouldBe Serialization.write(expected)
   }
 
   it should "throw an error on unsupported type in a map" in {
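The `RichMap.toMetadata` change above extends the supported value types from primitives and primitive arrays to `Seq` values (with `Seq[Int]` widened to long arrays) and nested `Map`s (stored as nested `Metadata`). A minimal usage sketch, not part of the change set: it assumes the `utils` module and its `shapeless` dependency are on the classpath, and all map keys and values here are illustrative.

```scala
import com.salesforce.op.utils.spark.RichMetadata._

// Build Spark Metadata from a heterogeneous Map[String, Any].
// Primitives go through putLong/putDouble/putBoolean/putString,
// Seq values through the new TypeCase-based putCollection, and
// nested Maps recurse into nested Metadata objects.
val meta = Map(
  "count" -> 42L,                    // putLong
  "weights" -> Seq(0.1, 0.9),        // putDoubleArray via the new Seq support
  "flags" -> Seq(true, false),       // putBooleanArray
  "nested" -> Map("label" -> "yes")  // putMetadata with a nested Metadata
).toMetadata

// Metadata renders as JSON, which is what the updated RichMetadataTest asserts
println(meta.json)
```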