Merge pull request #90 from MadDataScience/master

Fix SimpleIndexer fit method to set inputCol and outputCol correctly
high-performance-spark · Jul 22, 2017 · 04f7d83 · 04f7d83
2 parents bb5e995 + dd03847
commit 04f7d83
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ project/plugins/project/
 # Scala-IDE specific
 .scala_dependencies
 .worksheet
+.idea/
 
 # emacs stuff
 \#*\#

diff --git a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala
@@ -122,7 +122,9 @@ class SimpleIndexer(override val uid: String)
     import dataset.sparkSession.implicits._
     val words = dataset.select(dataset($(inputCol)).as[String]).distinct
       .collect()
-    new SimpleIndexerModel(uid, words)
+    val model = new SimpleIndexerModel(uid, words)
+    this.copyValues(model)
+    model
   }
 }
 

diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala
@@ -0,0 +1,40 @@
+/**
+  * Simple tests for our CustomPipeline demo pipeline stage
+  */
+package com.highperformancespark.examples.ml
+
+import com.holdenkarau.spark.testing.DataFrameSuiteBase
+import org.apache.spark.sql.Dataset
+import org.scalatest.FunSuite
+
+case class TestRow(id: Int, inputColumn: String) // fixture row; inputColumn feeds SimpleIndexer
+
+class CustomPipelineSuite extends FunSuite with DataFrameSuiteBase {
+  // Fixture: ids 0 to 5 paired with repeating labels (fewer distinct labels than rows).
+  val d = List("a", "b", "c", "a", "a", "c")
+    .zipWithIndex
+    .map { case (label, id) => TestRow(id, label) }
+
+  // Sanity check that the suite's SparkSession can run a simple job.
+  test("test spark context") {
+    val numbers = spark.sparkContext.parallelize(1 to 10)
+    assert(numbers.sum === 55)
+  }
+
+  // Fits a SimpleIndexer on the fixture, then checks that the fitted model's
+  // transform output carries the configured output column.
+  test("simple indexer test") {
+    val session = spark
+    import session.implicits._
+
+    val input: Dataset[TestRow] = session.createDataset(d)
+    val estimator = new SimpleIndexer()
+    estimator.setInputCol("inputColumn")
+    estimator.setOutputCol("categoryIndex")
+
+    val indexed = estimator.fit(input).transform(input)
+    assert(indexed.columns.contains("categoryIndex"))
+    // Printed for manual inspection only; nothing is asserted on the rows.
+    indexed.show()
+  }
+}