Description
The last part of the installation needs its own "mkdir lib". Now, at least on Ubuntu, Maven is part of the standard system, but sbt is not and I was off to a wild goose chase for a while to find a good way to install it. The first two methods I tried were defunct, finally I settled for the script https://raw.githubusercontent.com/paulp/sbt-extras/master/sbt , no doubt that script may disappear or become undesirable with time but it would be a courtesy to users of this module to point to a currently working method.
Also, the Spark script invocation in the documentation needs to be corrected to use the actual example name, "ml-forest-example.scala". I am not really a Spark user — I just downloaded Spark for this purpose — and I got the following errors:
scala> :load "../redis/spark-redis-ml/scripts/ml-forest-example.scala"
Loading ../redis/spark-redis-ml/scripts/ml-forest-example.scala...
import scala.collection.mutable
import scala.language.reflectiveCalls
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
import org.apache.spark.ml.tree.{CategoricalSplit, ContinuousSplit, Split}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{SparkSession, _}
import redis.clients.jedis.Protocol.Command
import redis.clients.jedis.{Jedis, _}
import com.redislabs.client.redisml.MLClient
import com.redislabs.provider.redis.ml.Forest
loadData: (spark: org.apache.spark.sql.SparkSession, path: String, format: String, expectedNumFeatures: Option[Int])org.apache.spark.sql.DataFrame
loadDatasets: (input: String, dataFormat: String, testInput: String, algo: String, fracTest: Double)(org.apache.spark.sql.DataFrame, org.apache.spark.sql.DataFrame)
defined class Params
params: Params = Params(file:///root/spark/data/mllib/sample_libsvm_data.txt,,libsvm,classification,5,32,1,0.0,10,auto,0.2,false,None,10)
algo: String = classification
RandomForestExample with parameters:
Params(file:///root/spark/data/mllib/sample_libsvm_data.txt,,libsvm,classification,5,32,1,0.0,10,auto,0.2,false,None,10)
org.apache.spark.sql.AnalysisException: Path does not exist: file:/root/spark/data/mllib/sample_libsvm_data.txt;
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:382)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:370)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:370)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:135)
at loadData(:54)
at loadDatasets(:54)
... 76 elided
stages: scala.collection.mutable.ArrayBuffer[org.apache.spark.ml.PipelineStage] = ArrayBuffer()
labelColName: String = indexedLabel
res4: Any = ArrayBuffer(strIdx_348bb105c92b)
featuresIndexer: org.apache.spark.ml.feature.VectorIndexer = vecIdx_8176e0e50d19
res5: stages.type = ArrayBuffer(strIdx_348bb105c92b, vecIdx_8176e0e50d19)
dt: org.apache.spark.ml.classification.RandomForestClassifier = rfc_4bee75d8596f
res6: stages.type = ArrayBuffer(strIdx_348bb105c92b, vecIdx_8176e0e50d19, rfc_4bee75d8596f)
pipeline: org.apache.spark.ml.Pipeline = pipeline_b4782508197c
startTime: Long = 22579135088679
:45: error: not found: value training
val pipelineModel = pipeline.fit(training)