diff --git a/photon-ml/src/integTest/scala/com/linkedin/photon/ml/cli/game/training/DriverTest.scala b/photon-ml/src/integTest/scala/com/linkedin/photon/ml/cli/game/training/DriverTest.scala index eb1dc2f0..ce83990c 100644 --- a/photon-ml/src/integTest/scala/com/linkedin/photon/ml/cli/game/training/DriverTest.scala +++ b/photon-ml/src/integTest/scala/com/linkedin/photon/ml/cli/game/training/DriverTest.scala @@ -14,7 +14,13 @@ */ package com.linkedin.photon.ml.cli.game.training +import java.nio.file.{FileSystems, Files, Path} + import collection.JavaConversions._ +import org.apache.spark.SparkConf +import org.testng.Assert._ +import org.testng.annotations.Test + import com.linkedin.photon.ml.SparkContextConfiguration import com.linkedin.photon.ml.avro.AvroIOUtils import com.linkedin.photon.ml.avro.data.NameAndTerm @@ -23,18 +29,12 @@ import com.linkedin.photon.ml.avro.model.ModelProcessingUtils import com.linkedin.photon.ml.data.{FixedEffectDataSet, RandomEffectDataSet} import com.linkedin.photon.ml.io.ModelOutputMode import com.linkedin.photon.ml.supervised.TaskType -import com.linkedin.photon.ml.supervised.regression.LinearRegressionModel import com.linkedin.photon.ml.test.{CommonTestUtils, SparkTestUtils, TestTemplateWithTmpDir} -import com.linkedin.photon.ml.util.{Utils, PhotonLogger} -import org.apache.spark.SparkConf -import org.testng.annotations.Test -import org.testng.Assert._ - -import java.nio.file.{Files, FileSystems, Path} +import com.linkedin.photon.ml.util.{PhotonLogger, Utils} class DriverTest extends SparkTestUtils with TestTemplateWithTmpDir { - import DriverTest._ import CommonTestUtils._ + import DriverTest._ @Test def testFixedEffectsWithIntercept() = sparkTest("testFixedEffectsWithIntercept", useKryo = true) { diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Driver.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Driver.scala index 43f07edd..976cea9f 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Driver.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Driver.scala @@ -14,10 +14,12 @@ */ package com.linkedin.photon.ml.cli.game.training -import com.linkedin.photon.ml.optimization.{ - GeneralizedLinearOptimizationProblem, SmoothedHingeLossLinearSVMOptimizationProblem, PoissonRegressionOptimizationProblem, LogisticRegressionOptimizationProblem, LinearRegressionOptimizationProblem} -import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration -import com.linkedin.photon.ml.{RDDLike, SparkContextConfiguration} +import scala.collection.Map + +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD + import com.linkedin.photon.ml.algorithm._ import com.linkedin.photon.ml.avro.AvroUtils import com.linkedin.photon.ml.avro.data.{DataProcessingUtils, NameAndTerm, NameAndTermFeatureSetContainer} @@ -28,16 +30,13 @@ import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.function.DiffFunction import com.linkedin.photon.ml.io.ModelOutputMode import com.linkedin.photon.ml.model.GAMEModel -import com.linkedin.photon.ml.optimization.game.{FactoredRandomEffectOptimizationProblem, RandomEffectOptimizationProblem} +import com.linkedin.photon.ml.optimization.game.{GLMOptimizationConfiguration, FactoredRandomEffectOptimizationProblem, RandomEffectOptimizationProblem} +import com.linkedin.photon.ml.optimization.{GeneralizedLinearOptimizationProblem, LinearRegressionOptimizationProblem, 
LogisticRegressionOptimizationProblem, PoissonRegressionOptimizationProblem, SmoothedHingeLossLinearSVMOptimizationProblem} import com.linkedin.photon.ml.projector.IdentityProjection import com.linkedin.photon.ml.supervised.TaskType._ import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util._ -import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD - -import scala.collection.Map +import com.linkedin.photon.ml.{RDDLike, SparkContextConfiguration} /** * The driver class, which provides the main entrance to GAME model training @@ -48,8 +47,6 @@ final class Driver(val params: Params, val sparkContext: SparkContext, val logge private val hadoopConfiguration = sparkContext.hadoopConfiguration - private val isTrackingState = true - /** * Builds feature name-and-term to index maps according to configuration * @@ -321,15 +318,14 @@ final class Driver(val params: Params, val sparkContext: SparkContext, val logge trainingEvaluator: Evaluator, validatingDataAndEvaluatorOption: Option[(RDD[(Long, GameDatum)], Evaluator)]): Map[String, GAMEModel] = { - val optimizationProblemBuilder: - Function3[GLMOptimizationConfiguration, Int, Boolean, - GeneralizedLinearOptimizationProblem[GeneralizedLinearModel, DiffFunction[LabeledPoint]]] = taskType match { + val optimizationProblemBuilder: (GLMOptimizationConfiguration, Int, Boolean, Boolean) => + GeneralizedLinearOptimizationProblem[GeneralizedLinearModel, DiffFunction[LabeledPoint]] = taskType match { - case LOGISTIC_REGRESSION => LogisticRegressionOptimizationProblem.buildOptimizationProblem _ - case LINEAR_REGRESSION => LinearRegressionOptimizationProblem.buildOptimizationProblem _ - case POISSON_REGRESSION => PoissonRegressionOptimizationProblem.buildOptimizationProblem _ + case LOGISTIC_REGRESSION => LogisticRegressionOptimizationProblem.buildOptimizationProblem + case LINEAR_REGRESSION => LinearRegressionOptimizationProblem.buildOptimizationProblem + case POISSON_REGRESSION => PoissonRegressionOptimizationProblem.buildOptimizationProblem case SMOOTHED_HINGE_LOSS_LINEAR_SVM => - SmoothedHingeLossLinearSVMOptimizationProblem.buildOptimizationProblem _ + SmoothedHingeLossLinearSVMOptimizationProblem.buildOptimizationProblem case _ => throw new Exception(s"Loss function for taskType $taskType is currently not supported.") } @@ -354,20 +350,28 @@ final class Driver(val params: Params, val sparkContext: SparkContext, val logge val optimizationConfiguration = fixedEffectOptimizationConfiguration(coordinateId) // If number of features is from moderate to large (>200000), then use tree aggregate, // otherwise use aggregate. 
- val treeAggregateDepth = if (fixedEffectDataSet.numFeatures < 200000) 1 else 2 + val treeAggregateDepth = if (fixedEffectDataSet.numFeatures < 200000) { + Driver.DEFAULT_TREE_AGGREGATE_DEPTH + } else { + Driver.DEEP_TREE_AGGREGATE_DEPTH + } val optimizationProblem = optimizationProblemBuilder( optimizationConfiguration, treeAggregateDepth, - isTrackingState) + Driver.TRACK_STATE, + computeVariance) new FixedEffectCoordinate(fixedEffectDataSet, optimizationProblem) case randomEffectDataSetInProjectedSpace: RandomEffectDataSetInProjectedSpace => // Random effect coordinate val optimizationConfiguration = randomEffectOptimizationConfiguration(coordinateId) - val randomEffectOptimizationProblem = RandomEffectOptimizationProblem.buildRandomEffectOptimizationProblem( + val randomEffectOptimizationProblem = RandomEffectOptimizationProblem + .buildRandomEffectOptimizationProblem( optimizationProblemBuilder, optimizationConfiguration, - randomEffectDataSetInProjectedSpace) + randomEffectDataSetInProjectedSpace, + Driver.DEFAULT_TREE_AGGREGATE_DEPTH, + computeVariance) .setName(s"Random effect optimization problem of coordinate $coordinateId") .persistRDD(StorageLevel.INFREQUENT_REUSE_RDD_STORAGE_LEVEL) new RandomEffectCoordinateInProjectedSpace( @@ -385,7 +389,10 @@ final class Driver(val params: Params, val sparkContext: SparkContext, val logge randomEffectOptimizationConfiguration, latentFactorOptimizationConfiguration, mfOptimizationConfiguration, - randomEffectDataSet) + randomEffectDataSet, + Driver.DEFAULT_TREE_AGGREGATE_DEPTH, + Driver.TRACK_STATE, + computeVariance) .setName(s"Factored random effect optimization problem of coordinate $coordinateId") .persistRDD(StorageLevel.INFREQUENT_REUSE_RDD_STORAGE_LEVEL) new FactoredRandomEffectCoordinate(randomEffectDataSet, factoredRandomEffectOptimizationProblem) @@ -517,6 +524,9 @@ final class Driver(val params: Params, val sparkContext: SparkContext, val logge } object Driver { + val DEFAULT_TREE_AGGREGATE_DEPTH = 1 + val DEEP_TREE_AGGREGATE_DEPTH = 2 + val TRACK_STATE = false val LOGS = "logs" /** diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Params.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Params.scala index baa577b9..ead688bb 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Params.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/cli/game/training/Params.scala @@ -109,6 +109,11 @@ class Params { */ var numIterations: Int = 1 + /** + * Whether to compute coefficient variance + */ + var computeVariance: Boolean = false + /** * Updating order of the ordinates (separated by commas) in the coordinate descent algorithm. 
*/ @@ -192,6 +197,7 @@ class Params { .map(_.mkString("\n")).mkString("\n")}\n" + s"randomEffectDataConfigurations:\n${randomEffectDataConfigurations.mkString("\n")}\n" + s"taskType: $taskType\n" + + s"computeVariance: $computeVariance\n" + s"modelOutputOption: $modelOutputMode\n" + s"numberOfOutputFilesForRandomEffectModel: $numberOfOutputFilesForRandomEffectModel\n" + s"deleteOutputDirIfExists: $deleteOutputDirIfExists\n" + @@ -355,6 +361,9 @@ object Params { } .toMap ) + opt[Boolean]("compute-variance") + .text(s"Whether to compute the coefficient variance, default: ${defaultParams.computeVariance}") + .foreach(x => params.computeVariance = x) opt[Boolean]("save-models-to-hdfs") .text(s"DEPRECATED -- USE model-output-mode") .foreach(x => params.modelOutputMode = if (x) ALL else NONE) diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/LogisticLossFunction.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/LogisticLossFunction.scala index 72057796..c0e85b11 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/LogisticLossFunction.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/LogisticLossFunction.scala @@ -19,48 +19,48 @@ import com.linkedin.photon.ml.normalization.{NoNormalization, NormalizationConte import com.linkedin.photon.ml.util.Utils /** - * Class for the logistic loss function: - * sum_i (w_i*(y_i*log(1 + exp(-(theta'x_i + o_i))) + (1-y_i)*log(1 + exp(theta'x_i + o_i)))), - * where \theta is the coefficients of the data features to be estimated, (y_i, x_i, o_i, w_i) are the tuple - * for label, features, offset, and weight of the i'th labeled data point, respectively. - * Note that the above equation assumes the label y_i \in {0, 1}. However, the code below would also work when - * y_i \in {-1, 1}. - */ + * Class for the logistic loss function: + * sum_i (w_i*(y_i*log(1 + exp(-(theta'x_i + o_i))) + (1-y_i)*log(1 + exp(theta'x_i + o_i)))), + * where \theta is the coefficients of the data features to be estimated, (y_i, x_i, o_i, w_i) are the tuple + * for label, features, offset, and weight of the i'th labeled data point, respectively. + * Note that the above equation assumes the label y_i \in {0, 1}. However, the code below would also work when + * y_i \in {-1, 1}. 
+ */ class LogisticLossFunction(normalizationContext: ObjectProvider[NormalizationContext] = new SimpleObjectProvider[NormalizationContext](NoNormalization)) extends GeneralizedLinearModelLossFunction(PointwiseLogisticLossFunction, normalizationContext) /** - * A single logistic loss function - * - * l(z, y) = - log [1/(1+exp(-z))] if this is a positive sample - * - * or - log [1 - 1/(1+exp(-z))] if this is a negative sample - */ + * A single logistic loss function + * + * l(z, y) = - log [1 / (1 + exp(-z))] if this is a positive sample + * + * - log [1 - (1 / (1 + exp(-z)))] if this is a negative sample + */ @SerialVersionUID(1L) object PointwiseLogisticLossFunction extends PointwiseLossFunction { /** - * The sigmoid function 1/(1+exp(-z)) - * - * @param z z - * @return The value - */ + * The sigmoid function 1 / (1 + exp(-z)) + * + * @param z z + * @return The value + */ private def sigmoid(z: Double): Double = 1.0 / (1.0 + math.exp(-z)) /** - * l(z, y) = - log [1 / (1 + exp(-z))] = log [1 + exp(-z)] if this is a positive sample - * - * - log [1 - 1/(1+exp(-z))] = log [1 + exp(z)] if this is a negative sample - * - * dl/dz = - 1 / (1 + exp(z)) if this is a positive sample - * - * 1 / (1 + exp(-z)) if this is a negative sample - * - * @param margin The margin, i.e. z in l(z, y) - * @param label The label, i.e. y in l(z, y) - * @return The value and the 1st derivative - */ + * l(z, y) = - log [1 / (1 + exp(-z))] = log [1 + exp(-z)] if this is a positive sample + * + * - log [1 - (1 / (1 + exp(-z)))] = log [1 + exp(z)] if this is a negative sample + * + * dl/dz = -1 / (1 + exp(z)) if this is a positive sample + * + * 1 / (1 + exp(-z)) if this is a negative sample + * + * @param margin The margin, i.e. z in l(z, y) + * @param label The label, i.e. y in l(z, y) + * @return The value and the 1st derivative + */ override def loss(margin: Double, label: Double): (Double, Double) = { if (label > 0) { // The following is equivalent to log(1 + exp(-margin)) but more numerically stable. @@ -71,12 +71,12 @@ object PointwiseLogisticLossFunction extends PointwiseLossFunction { } /** - * d^2^l/dz^2^ = sigmoid(z) * (1 - sigmoid(z)) - * - * @param margin The margin, i.e. z in l(z, y) - * @param label The label, i.e. y in l(z, y) - * @return The value and the 2st derivative with respect to z - */ + * d^2^l/dz^2^ = sigmoid(z) * (1 - sigmoid(z)) + * + * @param margin The margin, i.e. z in l(z, y) + * @param label The label, i.e. 
y in l(z, y) + * @return The value and the 2nd derivative with respect to z + */ override def d2lossdz2(margin: Double, label: Double): Double = { val s = sigmoid(margin) s * (1 - s) diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/PoissonLossFunction.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/PoissonLossFunction.scala index d9035f7e..af57e21d 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/PoissonLossFunction.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/PoissonLossFunction.scala @@ -14,17 +14,13 @@ */ package com.linkedin.photon.ml.function - -import com.linkedin.photon.ml.data.{SimpleObjectProvider, ObjectProvider} +import com.linkedin.photon.ml.data.{ObjectProvider, SimpleObjectProvider} import com.linkedin.photon.ml.normalization.{NoNormalization, NormalizationContext} - /** * Class for the Poisson loss function: sum_i (w_i*(exp(theta'x_i + o_i) - y_i*(theta'x_i + o_i))), * where \theta is the coefficients of the data features to be estimated, (y_i, x_i, o_i, w_i) are the tuple * for label, features, offset, and weight of the i'th labeled data point, respectively. - * @author asaha - * @author dpeng */ class PoissonLossFunction( normalizationContext: ObjectProvider[NormalizationContext] = @@ -42,8 +38,8 @@ class PoissonLossFunction( object PointwisePoissonLossFunction extends PointwiseLossFunction { /** * l(z, y) = exp(z) - y * z + * dl/dz = exp(z) - y * - * dl/dz = exp(z) - y * @param margin The margin, i.e. z in l(z, y) * @param label The label, i.e. y in l(z, y) * @return The value and the 1st derivative @@ -55,6 +51,7 @@ object PointwisePoissonLossFunction extends PointwiseLossFunction { /** * d^2^l/dz^2^ = exp(z) + * * @param margin The margin, i.e. z in l(z, y) * @param label The label, i.e. y in l(z, y) * @return The value and the 2st derivative with respect to z diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/SquaredLossFunction.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/SquaredLossFunction.scala index 40931fb6..34607e53 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/function/SquaredLossFunction.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/function/SquaredLossFunction.scala @@ -17,15 +17,11 @@ package com.linkedin.photon.ml.function import com.linkedin.photon.ml.data.{ObjectProvider, SimpleObjectProvider} import com.linkedin.photon.ml.normalization.{NoNormalization, NormalizationContext} - /** * Class for the squared loss function: sum_i w_i/2*(theta'x_i + o_i - y_i)**2, where theta is the weight coefficients * of the data features to be estimated, (y_i, x_i, o_i, w_i) are the label, features, offset, and weight of * the i'th labeled data point, respectively. - * @author xazhang - * @author dpeng */ - class SquaredLossFunction( normalizationContext: ObjectProvider[NormalizationContext] = new SimpleObjectProvider[NormalizationContext](NoNormalization)) @@ -56,6 +52,7 @@ object PointwiseSquareLossFunction extends PointwiseLossFunction { /** * d^2^l/dz^2^ = 1 + * * @param margin The margin, i.e. z in l(z, y) * @param label The label, i.e.
y in l(z, y) * @return The value and the 2st derivative with respect to z diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/normalization/NormalizationContext.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/normalization/NormalizationContext.scala index 3c9ed113..c5023039 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/normalization/NormalizationContext.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/normalization/NormalizationContext.scala @@ -15,8 +15,8 @@ package com.linkedin.photon.ml.normalization import breeze.linalg.{DenseVector, Vector} -import com.linkedin.photon.ml.stat.BasicStatisticalSummary +import com.linkedin.photon.ml.stat.BasicStatisticalSummary /** * The normalization approach for the optimization problem, especially for generalized linear model. This gives concrete @@ -116,39 +116,44 @@ private[ml] object NormalizationContext { * @param interceptId The index of the intercept * @return The normalization context */ - def apply(normalizationType: NormalizationType, summary: => BasicStatisticalSummary, - interceptId: Option[Int]): NormalizationContext = { - normalizationType match { - case NormalizationType.NONE => - new NormalizationContext(None, None, interceptId) - case NormalizationType.SCALE_WITH_MAX_MAGNITUDE => - val factors = summary.max.toArray.zip(summary.min.toArray).map { - case (max, min) => - val magnitude = math.max(math.abs(max), math.abs(min)) - if (magnitude == 0) 1.0 else 1.0 / magnitude - } - new NormalizationContext(Some(DenseVector(factors)), None, interceptId) - case NormalizationType.SCALE_WITH_STANDARD_DEVIATION => - val factors = summary.variance.map(x => { - val std = math.sqrt(x) - if (std == 0) 1.0 else 1.0 / std - }) - new NormalizationContext(Some(factors), None, interceptId) - case NormalizationType.STANDARDIZATION => - val factors = summary.variance.map(x => { - val std = math.sqrt(x) - if (std == 0) 1.0 else 1.0 / std - }) - val shifts = summary.mean.copy - // Do not transform intercept - interceptId.foreach(id => { - shifts(id) = 0.0 - factors(id) = 1.0 - }) - new NormalizationContext(Some(factors), Some(shifts), interceptId) - case _ => - throw new IllegalArgumentException(s"NormalizationType $normalizationType not recognized.") - } + def apply( + normalizationType: NormalizationType, + summary: => BasicStatisticalSummary, + interceptId: Option[Int]): NormalizationContext = normalizationType match { + + case NormalizationType.NONE => + new NormalizationContext(None, None, interceptId) + + case NormalizationType.SCALE_WITH_MAX_MAGNITUDE => + val factors = summary.max.toArray.zip(summary.min.toArray).map { + case (max, min) => + val magnitude = math.max(math.abs(max), math.abs(min)) + if (magnitude == 0) 1.0 else 1.0 / magnitude + } + new NormalizationContext(Some(DenseVector(factors)), None, interceptId) + + case NormalizationType.SCALE_WITH_STANDARD_DEVIATION => + val factors = summary.variance.map(x => { + val std = math.sqrt(x) + if (std == 0) 1.0 else 1.0 / std + }) + new NormalizationContext(Some(factors), None, interceptId) + + case NormalizationType.STANDARDIZATION => + val factors = summary.variance.map(x => { + val std = math.sqrt(x) + if (std == 0) 1.0 else 1.0 / std + }) + val shifts = summary.mean.copy + // Do not transform intercept + interceptId.foreach(id => { + shifts(id) = 0.0 + factors(id) = 1.0 + }) + new NormalizationContext(Some(factors), Some(shifts), interceptId) + + case _ => + throw new IllegalArgumentException(s"NormalizationType $normalizationType not recognized.") } } 
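The `NormalizationContext.apply` rewrite above is purely structural (the match expression becomes the method body); the factors/shifts math is unchanged. As a quick illustration of what the `STANDARDIZATION` branch computes: factors are inverse standard deviations (zero-variance features are left unscaled), shifts are the feature means, and the intercept column is never transformed. A minimal self-contained sketch follows; the summary values are made up for the example, whereas photon-ml reads them from `BasicStatisticalSummary`.

```scala
import breeze.linalg.DenseVector

object StandardizationSketch extends App {
  // Toy per-feature summary statistics; index 2 plays the role of the intercept.
  val variance = DenseVector(4.0, 0.0, 1.0)
  val mean = DenseVector(1.0, 5.0, 1.0)
  val interceptId: Option[Int] = Some(2)

  // factors = 1 / stddev, guarding against zero-variance features.
  val factors = variance.map { v =>
    val std = math.sqrt(v)
    if (std == 0) 1.0 else 1.0 / std
  }
  // shifts = mean, except for the intercept, which is never scaled or shifted.
  val shifts = mean.copy
  interceptId.foreach { id =>
    shifts(id) = 0.0
    factors(id) = 1.0
  }

  println(s"factors: $factors") // DenseVector(0.5, 1.0, 1.0)
  println(s"shifts: $shifts")   // DenseVector(1.0, 5.0, 0.0)
}
```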
diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala index 97202c1c..267a9b1f 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala @@ -15,9 +15,10 @@ package com.linkedin.photon.ml.optimization import scala.collection.mutable +import scala.math.abs +import scala.reflect.ClassTag -import breeze.linalg.Vector -import breeze.linalg.sum +import breeze.linalg.{Vector, sum} import org.apache.spark.Logging import org.apache.spark.rdd.RDD @@ -27,9 +28,6 @@ import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.sampler.DownSampler import com.linkedin.photon.ml.supervised.model.{GeneralizedLinearModel, ModelTracker} -import scala.reflect.ClassTag -import scala.math.abs - /** * GeneralizedOptimizationProblem implements methods to train a Generalized Linear Model (GLM). * This class should be extended with a loss function and the createModel function to create a new GLM. diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblem.scala index 5ff495cc..d877d0ab 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblem.scala @@ -145,8 +145,6 @@ case class LinearRegressionOptimizationProblem( } object LinearRegressionOptimizationProblem { - val COMPUTING_VARIANCE = false - /** * Build a logistic regression optimization problem * @@ -158,7 +156,8 @@ object LinearRegressionOptimizationProblem { protected[ml] def buildOptimizationProblem( configuration: GLMOptimizationConfiguration, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = true): LinearRegressionOptimizationProblem = { + isTrackingState: Boolean = true, + isComputingVariance: Boolean = false): LinearRegressionOptimizationProblem = { val optimizerConfig = configuration.optimizerConfig val regularizationContext = configuration.regularizationContext @@ -182,7 +181,7 @@ object LinearRegressionOptimizationProblem { regularizationWeight, if (isTrackingState) { Some(new mutable.ListBuffer[ModelTracker]())} else { None }, treeAggregateDepth, - COMPUTING_VARIANCE) + isComputingVariance) } def initializeZeroModel(dimension: Int): LinearRegressionModel = diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblem.scala index 12a86d63..04b9fd7b 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblem.scala @@ -145,8 +145,6 @@ case class LogisticRegressionOptimizationProblem( } object LogisticRegressionOptimizationProblem { - val COMPUTING_VARIANCE = false - /** * Build a logistic regression optimization problem * @@ -158,7 +156,8 @@ object LogisticRegressionOptimizationProblem { protected[ml] def buildOptimizationProblem( configuration: 
GLMOptimizationConfiguration, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = true): LogisticRegressionOptimizationProblem = { + isTrackingState: Boolean = true, + isComputingVariance: Boolean = false): LogisticRegressionOptimizationProblem = { val optimizerConfig = configuration.optimizerConfig val regularizationContext = configuration.regularizationContext @@ -184,7 +183,7 @@ object LogisticRegressionOptimizationProblem { regularizationWeight, if (isTrackingState) { Some(new mutable.ListBuffer[ModelTracker]())} else { None }, treeAggregateDepth, - COMPUTING_VARIANCE) + isComputingVariance) } def initializeZeroModel(dimension: Int): LogisticRegressionModel = diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblem.scala index de8c0633..1a4242ed 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblem.scala @@ -145,8 +145,6 @@ case class PoissonRegressionOptimizationProblem( } object PoissonRegressionOptimizationProblem { - val COMPUTING_VARIANCE = false - /** * Build a logistic regression optimization problem * @@ -158,7 +156,8 @@ object PoissonRegressionOptimizationProblem { protected[ml] def buildOptimizationProblem( configuration: GLMOptimizationConfiguration, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = true): PoissonRegressionOptimizationProblem = { + isTrackingState: Boolean = true, + isComputingVariance: Boolean = false): PoissonRegressionOptimizationProblem = { val optimizerConfig = configuration.optimizerConfig val regularizationContext = configuration.regularizationContext @@ -182,7 +181,7 @@ object PoissonRegressionOptimizationProblem { regularizationWeight, if (isTrackingState) { Some(new mutable.ListBuffer[ModelTracker]())} else { None }, treeAggregateDepth, - COMPUTING_VARIANCE) + isComputingVariance) } def initializeZeroModel(dimension: Int): PoissonRegressionModel = diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblem.scala index 59e3020e..2f2cdc2d 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblem.scala @@ -131,8 +131,6 @@ case class SmoothedHingeLossLinearSVMOptimizationProblem( } object SmoothedHingeLossLinearSVMOptimizationProblem { - val COMPUTING_VARIANCE = false - /** * Build a logistic regression optimization problem * @@ -144,7 +142,8 @@ object SmoothedHingeLossLinearSVMOptimizationProblem { protected[ml] def buildOptimizationProblem( configuration: GLMOptimizationConfiguration, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = true): SmoothedHingeLossLinearSVMOptimizationProblem = { + isTrackingState: Boolean = true, + isComputingVariance: Boolean = false): SmoothedHingeLossLinearSVMOptimizationProblem = { val optimizerConfig = configuration.optimizerConfig val regularizationContext = configuration.regularizationContext @@ -168,7 +167,7 @@ object SmoothedHingeLossLinearSVMOptimizationProblem { regularizationWeight, if (isTrackingState) { Some(new 
mutable.ListBuffer[ModelTracker]())} else { None }, treeAggregateDepth, - COMPUTING_VARIANCE) + isComputingVariance) } def initializeZeroModel(dimension: Int): SmoothedHingeLossLinearSVMModel = diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/FactoredRandomEffectOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/FactoredRandomEffectOptimizationProblem.scala index 66817e26..af06fa38 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/FactoredRandomEffectOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/FactoredRandomEffectOptimizationProblem.scala @@ -105,23 +105,27 @@ object FactoredRandomEffectOptimizationProblem { */ protected[ml] def buildFactoredRandomEffectOptimizationProblem[GLM <: GeneralizedLinearModel, F <: DiffFunction[LabeledPoint]]( - builder: (GLMOptimizationConfiguration, Int, Boolean) => GeneralizedLinearOptimizationProblem[GLM, F], + builder: (GLMOptimizationConfiguration, Int, Boolean, Boolean) => GeneralizedLinearOptimizationProblem[GLM, F], randomEffectOptimizationConfiguration: GLMOptimizationConfiguration, latentFactorOptimizationConfiguration: GLMOptimizationConfiguration, mfOptimizationConfiguration: MFOptimizationConfiguration, randomEffectDataSet: RandomEffectDataSet, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = false): FactoredRandomEffectOptimizationProblem[GLM, F] = { + isTrackingState: Boolean = false, + isComputingVariance: Boolean = false): FactoredRandomEffectOptimizationProblem[GLM, F] = { val MFOptimizationConfiguration(numInnerIterations, latentSpaceDimension) = mfOptimizationConfiguration val latentFactorOptimizationProblem = builder( latentFactorOptimizationConfiguration, treeAggregateDepth, - isTrackingState) + isTrackingState, + isComputingVariance) val randomEffectOptimizationProblem = RandomEffectOptimizationProblem.buildRandomEffectOptimizationProblem( builder, randomEffectOptimizationConfiguration, - randomEffectDataSet) + randomEffectDataSet, + treeAggregateDepth, + isComputingVariance) new FactoredRandomEffectOptimizationProblem( randomEffectOptimizationProblem, diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/GLMOptimizationConfiguration.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/GLMOptimizationConfiguration.scala index 94bdffc5..bfa0b62b 100644 --- a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/GLMOptimizationConfiguration.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/GLMOptimizationConfiguration.scala @@ -26,9 +26,9 @@ import com.linkedin.photon.ml.optimization._ */ protected[ml] case class GLMOptimizationConfiguration ( optimizerConfig: OptimizerConfig = OptimizerConfig(OptimizerType.TRON, 20, 1E-5, None), - regularizationContext: RegularizationContext = L2RegularizationContext, - regularizationWeight: Double = 50, - downSamplingRate: Double = 1) { + regularizationContext: RegularizationContext = NoRegularizationContext, + regularizationWeight: Double = 0D, + downSamplingRate: Double = 1D) { override def toString: String = { s"optimizerConfig: ${optimizerConfig.toSummaryString}," + diff --git a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala index 3ef8bce7..c36e9cd5 100644 --- 
a/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala +++ b/photon-ml/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala @@ -37,8 +37,7 @@ import org.apache.spark.storage.StorageLevel * optimization problem */ protected[ml] class RandomEffectOptimizationProblem[GLM <: GeneralizedLinearModel, F <: DiffFunction[LabeledPoint]]( - val optimizationProblems: RDD[(String, GeneralizedLinearOptimizationProblem[GLM, F])], - baseOptimizationProblem: GeneralizedLinearOptimizationProblem[GLM, F]) + val optimizationProblems: RDD[(String, GeneralizedLinearOptimizationProblem[GLM, F])]) extends RDDLike { def sparkContext: SparkContext = optimizationProblems.sparkContext @@ -73,7 +72,7 @@ protected[ml] class RandomEffectOptimizationProblem[GLM <: GeneralizedLinearMode * @param dimension The dimensionality of the model coefficients * @return A model with zero coefficients */ - def initializeModel(dimension: Int): GLM = baseOptimizationProblem.initializeZeroModel(dimension) + def initializeModel(dimension: Int): GLM = optimizationProblems.first()._2.initializeZeroModel(dimension) /** * Compute the regularization term value @@ -92,6 +91,12 @@ protected[ml] class RandomEffectOptimizationProblem[GLM <: GeneralizedLinearMode } object RandomEffectOptimizationProblem { + // Random effect models should not track optimization states per random effect ID. This info is not currently used + // anywhere and would waste memory. + // + // In addition, when enabled the 'run' method in the GeneralizedLinearOptimizationProblem will fail due to an implicit + // cast of mutable.ListBuffer to mutable.ArrayBuffer, the cause of which is currently undetermined. + val TRACK_STATE = false /** * Build an instance of random effect optimization problem @@ -100,23 +105,20 @@ object RandomEffectOptimizationProblem { * @param configuration Optimizer configuration * @param randomEffectDataSet The training dataset * @param treeAggregateDepth - * @param isTrackingState * @return A new optimization problem instance */ protected[ml] def buildRandomEffectOptimizationProblem[GLM <: GeneralizedLinearModel, F <: DiffFunction[LabeledPoint]]( - builder: (GLMOptimizationConfiguration, Int, Boolean) => GeneralizedLinearOptimizationProblem[GLM, F], + builder: (GLMOptimizationConfiguration, Int, Boolean, Boolean) => GeneralizedLinearOptimizationProblem[GLM, F], configuration: GLMOptimizationConfiguration, randomEffectDataSet: RandomEffectDataSet, treeAggregateDepth: Int = 1, - isTrackingState: Boolean = false): RandomEffectOptimizationProblem[GLM, F] = { + isComputingVariance: Boolean = false): RandomEffectOptimizationProblem[GLM, F] = { // Build an optimization problem for each random effect type val optimizationProblems = randomEffectDataSet.activeData.mapValues(_ => - builder(configuration, treeAggregateDepth, isTrackingState) + builder(configuration, treeAggregateDepth, TRACK_STATE, isComputingVariance) ) - new RandomEffectOptimizationProblem( - optimizationProblems, - builder(configuration, treeAggregateDepth, isTrackingState)) + new RandomEffectOptimizationProblem(optimizationProblems) } } diff --git a/photon-ml/src/test/scala/com/linkedin/photon/ml/cli/game/training/ParamsTest.scala b/photon-ml/src/test/scala/com/linkedin/photon/ml/cli/game/training/ParamsTest.scala index ee803202..b6ee7448 100644 --- a/photon-ml/src/test/scala/com/linkedin/photon/ml/cli/game/training/ParamsTest.scala +++ 
b/photon-ml/src/test/scala/com/linkedin/photon/ml/cli/game/training/ParamsTest.scala @@ -265,6 +265,14 @@ class ParamsTest { assertEquals(params.randomEffectDataConfigurations, expectedValue) } + @Test + def testComputeVariance(): Unit = { + val paramsAll = Params.parseFromCommandLine(setOneMoreArg(COMPUTE_VARIANCE, "trUE")) + assertEquals(paramsAll.computeVariance, true) + val paramsNone = Params.parseFromCommandLine(setOneMoreArg(COMPUTE_VARIANCE, "fAlSe")) + assertEquals(paramsNone.computeVariance, false) + } + @Test def testSaveModelsToHDFS(): Unit = { val paramsAll = Params.parseFromCommandLine(setOneMoreArg(SAVE_MODELS_TO_HDFS, "true")) @@ -336,6 +344,7 @@ object ParamsTest { val FEATURE_SHARD_ID_TO_FEATURE_SECTION_KEYS_MAP = "feature-shard-id-to-feature-section-keys-map" val FEATURE_SHARD_ID_TO_INTERCEPT_MAP = "feature-shard-id-to-intercept-map" val NUM_ITERATIONS = "num-iterations" + val COMPUTE_VARIANCE = "compute-variance" val FIXED_EFFECT_OPTIMIZATION_CONFIGURATIONS = "fixed-effect-optimization-configurations" val FIXED_EFFECT_DATA_CONFIGURATIONS = "fixed-effect-data-configurations" val RANDOM_EFFECT_OPTIMIZATION_CONFIGURATIONS = "random-effect-optimization-configurations" diff --git a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblemTest.scala b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblemTest.scala index 26dd5250..1240093b 100644 --- a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblemTest.scala +++ b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LinearRegressionOptimizationProblemTest.scala @@ -14,24 +14,60 @@ */ package com.linkedin.photon.ml.optimization -import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} -import com.linkedin.photon.ml.model.Coefficients -import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration -import com.linkedin.photon.ml.test.CommonTestUtils +import java.util.Random +import breeze.linalg.{DenseVector, Vector} import org.apache.spark.rdd.RDD import org.mockito.Mockito._ import org.testng.Assert._ -import org.testng.annotations.Test +import org.testng.annotations.{DataProvider, Test} -class LinearRegressionOptimizationProblemTest { - import LinearRegressionOptimizationProblemTest._ +import com.linkedin.photon.ml.constants.MathConst +import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} +import com.linkedin.photon.ml.model.Coefficients +import com.linkedin.photon.ml.normalization.NormalizationContext +import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration +import com.linkedin.photon.ml.test.{CommonTestUtils, SparkTestUtils} + +class LinearRegressionOptimizationProblemTest extends SparkTestUtils { import CommonTestUtils._ + import LinearRegressionOptimizationProblemTest._ + + def generateUnweightedBenignLocalDataSet: List[LabeledPoint] = { + drawSampleFromNumericallyBenignDenseFeaturesForLinearRegressionLocal(DATA_RANDOM_SEED, TRAINING_SAMPLES, DIMENSIONS) + .map { case (label, features) => + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new LabeledPoint(label, features) + } + .toList + } + + def generateWeightedBenignLocalDataSet: List[LabeledPoint] = { + val r: Random = new Random(WEIGHT_RANDOM_SEED) + + drawSampleFromNumericallyBenignDenseFeaturesForLinearRegressionLocal(DATA_RANDOM_SEED, TRAINING_SAMPLES, 
DIMENSIONS) + .map { case (label, features) => + val offset = 0D + val weight = r.nextDouble() * WEIGHT_MAX + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new LabeledPoint(label, features, offset, weight) + } + .toList + } + + @DataProvider(parallel = true) + def getDataAndWeights: Array[Array[Object]] = { + val weightsToTest = Array(0.1, 1.0, 10.0, 100.0) + val dataSet = generateWeightedBenignLocalDataSet + + weightsToTest.map( Array(_, dataSet).asInstanceOf[Array[Object]] ) + } @Test def testUpdateObjective(): Unit = { - val problem = createProblem + val problem = createProblem() val normalizationContext = new SimpleObjectProvider(mock(classOf[NormalizationContext])) val regularizationWeight = 1D @@ -44,41 +80,110 @@ class LinearRegressionOptimizationProblemTest { @Test def testInitializeZeroModel(): Unit = { - val problem = createProblem - val zeroModel = problem.initializeZeroModel(Dimensions) + val problem = createProblem() + val zeroModel = problem.initializeZeroModel(DIMENSIONS) - assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(Dimensions)) + assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(DIMENSIONS)) } @Test def testCreateModel(): Unit = { - val problem = createProblem - val coefficients = generateDenseVector(Dimensions) + val problem = createProblem() + val coefficients = generateDenseVector(DIMENSIONS) val model = problem.createModel(coefficients, None) assertEquals(model.coefficients.means, coefficients) } @Test - def testComputeVariances(): Unit = { - val problem = createProblem + def testComputeVariancesDisabled(): Unit = { + val problem = createProblem() val input = mock(classOf[RDD[LabeledPoint]]) - val coefficients = generateDenseVector(Dimensions) + val coefficients = generateDenseVector(DIMENSIONS) - // TODO: computeVarainces is currently disabled. This test will need to be updated when the default changes assertEquals(problem.computeVariances(input, coefficients), None) } + + @Test + def testComputeVariancesSimple(): Unit = { + val problem = createProblem(computeVariance = true) + val input = generateUnweightedBenignLocalDataSet + val coefficients = generateDenseVector(DIMENSIONS) + + // For linear regression, the second derivative of the loss function (with regard to z = X_i * B) is 1. + val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + + diagonal + (features :* features) + } + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). + val expected: Vector[Double] = hessianDiagonal.map( v => 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } + + @Test(dataProvider = "getDataAndWeights") + def testComputeVariancesComplex(regularizationWeight: Double, input: Iterable[LabeledPoint]): Unit = { + val problem = createProblem(L2RegularizationContext, regularizationWeight, computeVariance = true) + val coefficients = generateDenseVector(DIMENSIONS) + + // For linear regression, the second derivative of the loss function (with regard to z = X_i * B) is 1. 
+ val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + val weight: Double = datum.weight + + diagonal + (weight * features :* features) + } + // Add the regularization to the Hessian. The second derivative of the L2 regularization term is the regularization + // weight. + val hessianDiagonalWithL2: Vector[Double] = hessianDiagonal + regularizationWeight + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). + val expected: Vector[Double] = hessianDiagonalWithL2.map( v => + 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } } object LinearRegressionOptimizationProblemTest { - val Dimensions = 10 - - def createProblem() = { - val config = new GLMOptimizationConfiguration + val DATA_RANDOM_SEED: Int = 7 + val WEIGHT_RANDOM_SEED: Int = 13 + val WEIGHT_MAX: Double = 10.0 + val DIMENSIONS: Int = 5 + val TRAINING_SAMPLES: Int = DIMENSIONS * DIMENSIONS + + def createProblem( + regularizationContext: RegularizationContext = NoRegularizationContext, + regularizationWeight: Double = 0D, + computeVariance: Boolean = false): LinearRegressionOptimizationProblem = { + + val config = new GLMOptimizationConfiguration( + optimizerConfig = OptimizerConfig(OptimizerType.LBFGS, 100, 1E-10, None), + regularizationContext = regularizationContext, + regularizationWeight = regularizationWeight, + downSamplingRate = 1D) val treeAggregateDepth = 1 val isTrackingState = false LinearRegressionOptimizationProblem.buildOptimizationProblem( - config, treeAggregateDepth, isTrackingState) + config, + treeAggregateDepth, + isTrackingState, + computeVariance) } } diff --git a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblemTest.scala b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblemTest.scala index 3ce6b3b3..886e0c3f 100644 --- a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblemTest.scala +++ b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/LogisticRegressionOptimizationProblemTest.scala @@ -14,24 +14,68 @@ */ package com.linkedin.photon.ml.optimization -import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} -import com.linkedin.photon.ml.model.Coefficients -import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration -import com.linkedin.photon.ml.test.CommonTestUtils +import java.util.Random +import breeze.linalg.{DenseVector, Vector} import org.apache.spark.rdd.RDD import org.mockito.Mockito._ import org.testng.Assert._ -import org.testng.annotations.Test +import org.testng.annotations.{DataProvider, Test} -class LogisticRegressionOptimizationProblemTest { - import LogisticRegressionOptimizationProblemTest._ +import com.linkedin.photon.ml.constants.MathConst +import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} +import com.linkedin.photon.ml.model.Coefficients +import com.linkedin.photon.ml.normalization.NormalizationContext +import 
com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration +import com.linkedin.photon.ml.test.{CommonTestUtils, SparkTestUtils} + +class LogisticRegressionOptimizationProblemTest extends SparkTestUtils { import CommonTestUtils._ + import LogisticRegressionOptimizationProblemTest._ + + def sigmoid(z: Double): Double = 1.0 / (1.0 + math.exp(-z)) + + def generateUnweightedBenignLocalDataSet: List[LabeledPoint] = { + drawBalancedSampleFromNumericallyBenignDenseFeaturesForBinaryClassifierLocal( + DATA_RANDOM_SEED, + TRAINING_SAMPLES, + DIMENSIONS) + .map { case (label, features) => + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new LabeledPoint(label, features) + } + .toList + } + + def generateWeightedBenignLocalDataSet: List[LabeledPoint] = { + val r: Random = new Random(WEIGHT_RANDOM_SEED) + + drawBalancedSampleFromNumericallyBenignDenseFeaturesForBinaryClassifierLocal( + DATA_RANDOM_SEED, + TRAINING_SAMPLES, + DIMENSIONS) + .map { case (label, features) => + val offset = 0D + val weight = r.nextDouble() * WEIGHT_MAX + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new LabeledPoint(label, features, offset, weight) + } + .toList + } + + @DataProvider(parallel = true) + def getDataAndWeights: Array[Array[Object]] = { + val weightsToTest = Array(0.1, 1.0, 10.0, 100.0) + val dataSet = generateWeightedBenignLocalDataSet + + weightsToTest.map( Array(_, dataSet).asInstanceOf[Array[Object]] ) + } @Test def testUpdateObjective(): Unit = { - val problem = createProblem + val problem = createProblem() val normalizationContext = new SimpleObjectProvider(mock(classOf[NormalizationContext])) val regularizationWeight = 1D @@ -44,41 +88,118 @@ class LogisticRegressionOptimizationProblemTest { @Test def testInitializeZeroModel(): Unit = { - val problem = createProblem - val zeroModel = problem.initializeZeroModel(Dimensions) + val problem = createProblem() + val zeroModel = problem.initializeZeroModel(DIMENSIONS) - assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(Dimensions)) + assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(DIMENSIONS)) } @Test def testCreateModel(): Unit = { - val problem = createProblem - val coefficients = generateDenseVector(Dimensions) + val problem = createProblem() + val coefficients = generateDenseVector(DIMENSIONS) val model = problem.createModel(coefficients, None) assertEquals(model.coefficients.means, coefficients) } @Test - def testComputeVariances(): Unit = { - val problem = createProblem + def testComputeVariancesDisabled(): Unit = { + val problem = createProblem() val input = mock(classOf[RDD[LabeledPoint]]) - val coefficients = generateDenseVector(Dimensions) + val coefficients = generateDenseVector(DIMENSIONS) - // TODO: computeVarainces is currently disabled. 
This test will need to be updated when the default changes assertEquals(problem.computeVariances(input, coefficients), None) } + + @Test + def testComputeVariancesSimple(): Unit = { + val problem = createProblem(computeVariance = true) + val input = generateUnweightedBenignLocalDataSet + val coefficients = generateDenseVector(DIMENSIONS) + + // For logistic regression, the second derivative of the loss function (with regard to z = X_i * B) is: + // sigmoid(z) * (1 - sigmoid(z)) + val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + val z: Double = datum.computeMargin(coefficients) + val sigm: Double = sigmoid(z) + val d2lossdz2: Double = sigm * (1.0 - sigm) + + diagonal + (d2lossdz2 * features :* features) + } + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). + val expected: Vector[Double] = hessianDiagonal.map( v => 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } + + @Test(dataProvider = "getDataAndWeights") + def testComputeVariancesComplex(regularizationWeight: Double, input: Iterable[LabeledPoint]): Unit = { + val problem = createProblem(L2RegularizationContext, regularizationWeight, computeVariance = true) + val coefficients = generateDenseVector(DIMENSIONS) + + // For logistic regression, the second derivative of the loss function (with regard to z = X_i * B) is: + // sigmoid(z) * (1 - sigmoid(z)) + val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + val weight: Double = datum.weight + val z: Double = datum.computeMargin(coefficients) + val sigm: Double = sigmoid(z) + val d2lossdz2: Double = sigm * (1.0 - sigm) + + diagonal + (weight * d2lossdz2 * features :* features) + } + // Add the regularization to the Hessian. The second derivative of the L2 regularization term is the regularization + // weight. + val hessianDiagonalWithL2: Vector[Double] = hessianDiagonal + regularizationWeight + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). 
+ val expected: Vector[Double] = hessianDiagonalWithL2.map( v => + 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } } object LogisticRegressionOptimizationProblemTest { - val Dimensions = 10 - - def createProblem() = { - val config = new GLMOptimizationConfiguration + val DATA_RANDOM_SEED: Int = 7 + val WEIGHT_RANDOM_SEED: Int = 13 + val WEIGHT_MAX: Double = 10.0 + val DIMENSIONS: Int = 5 + val TRAINING_SAMPLES: Int = DIMENSIONS * DIMENSIONS + + def createProblem( + regularizationContext: RegularizationContext = NoRegularizationContext, + regularizationWeight: Double = 0D, + computeVariance: Boolean = false): LogisticRegressionOptimizationProblem = { + + val config = new GLMOptimizationConfiguration( + optimizerConfig = OptimizerConfig(OptimizerType.LBFGS, 100, 1E-10, None), + regularizationContext = regularizationContext, + regularizationWeight = regularizationWeight, + downSamplingRate = 1D) val treeAggregateDepth = 1 val isTrackingState = false LogisticRegressionOptimizationProblem.buildOptimizationProblem( - config, treeAggregateDepth, isTrackingState) + config, + treeAggregateDepth, + isTrackingState, + computeVariance) } } diff --git a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblemTest.scala b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblemTest.scala index d7f952eb..6a8f3ad9 100644 --- a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblemTest.scala +++ b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/PoissonRegressionOptimizationProblemTest.scala @@ -14,24 +14,66 @@ */ package com.linkedin.photon.ml.optimization -import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} -import com.linkedin.photon.ml.model.Coefficients -import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration -import com.linkedin.photon.ml.test.CommonTestUtils +import java.util.Random +import breeze.linalg.{DenseVector, Vector} import org.apache.spark.rdd.RDD import org.mockito.Mockito._ import org.testng.Assert._ -import org.testng.annotations.Test +import org.testng.annotations.{DataProvider, Test} -class PoissonRegressionOptimizationProblemTest { - import PoissonRegressionOptimizationProblemTest._ +import com.linkedin.photon.ml.constants.MathConst +import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} +import com.linkedin.photon.ml.model.Coefficients +import com.linkedin.photon.ml.normalization.NormalizationContext +import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration +import com.linkedin.photon.ml.test.{CommonTestUtils, SparkTestUtils} + +class PoissonRegressionOptimizationProblemTest extends SparkTestUtils { import CommonTestUtils._ + import PoissonRegressionOptimizationProblemTest._ + + def generateUnweightedBenignLocalDataSet: List[LabeledPoint] = { + drawSampleFromNumericallyBenignDenseFeaturesForPoissonRegressionLocal( + DATA_RANDOM_SEED, + TRAINING_SAMPLES, + DIMENSIONS) + .map { case (label, features) => + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new
LabeledPoint(label, features) + } + .toList + } + + def generateWeightedBenignLocalDataSet: List[LabeledPoint] = { + val r: Random = new Random(WEIGHT_RANDOM_SEED) + + drawSampleFromNumericallyBenignDenseFeaturesForPoissonRegressionLocal( + DATA_RANDOM_SEED, + TRAINING_SAMPLES, + DIMENSIONS) + .map { case (label, features) => + val offset = 0D + val weight = r.nextDouble() * WEIGHT_MAX + assertEquals(features.length, DIMENSIONS, "Samples should have expected lengths") + + new LabeledPoint(label, features, offset, weight) + } + .toList + } + + @DataProvider(parallel = true) + def getDataAndWeights: Array[Array[Object]] = { + val weightsToTest = Array(0.1, 1.0, 10.0, 100.0) + val dataSet = generateWeightedBenignLocalDataSet + + weightsToTest.map( Array(_, dataSet).asInstanceOf[Array[Object]] ) + } @Test def testUpdateObjective(): Unit = { - val problem = createProblem + val problem = createProblem() val normalizationContext = new SimpleObjectProvider(mock(classOf[NormalizationContext])) val regularizationWeight = 1D @@ -44,41 +86,114 @@ class PoissonRegressionOptimizationProblemTest { @Test def testInitializeZeroModel(): Unit = { - val problem = createProblem - val zeroModel = problem.initializeZeroModel(Dimensions) + val problem = createProblem() + val zeroModel = problem.initializeZeroModel(DIMENSIONS) - assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(Dimensions)) + assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(DIMENSIONS)) } @Test def testCreateModel(): Unit = { - val problem = createProblem - val coefficients = generateDenseVector(Dimensions) + val problem = createProblem() + val coefficients = generateDenseVector(DIMENSIONS) val model = problem.createModel(coefficients, None) assertEquals(model.coefficients.means, coefficients) } @Test - def testComputeVariances(): Unit = { - val problem = createProblem + def testComputeVariancesDisabled(): Unit = { + val problem = createProblem() val input = mock(classOf[RDD[LabeledPoint]]) - val coefficients = generateDenseVector(Dimensions) + val coefficients = generateDenseVector(DIMENSIONS) - // TODO: computeVarainces is currently disabled. This test will need to be updated when the default changes assertEquals(problem.computeVariances(input, coefficients), None) } + + @Test + def testComputeVariancesSimple(): Unit = { + val problem = createProblem(computeVariance = true) + val input = generateUnweightedBenignLocalDataSet + val coefficients = generateDenseVector(DIMENSIONS) + + // For Poisson regression, the second derivative of the loss function (with regard to z = X_i * B) is e^z. + val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + val z: Double = datum.computeMargin(coefficients) + val d2lossdz2 = math.exp(z) + + diagonal + (d2lossdz2 * features :* features) + } + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). 
+ val expected: Vector[Double] = hessianDiagonal.map( v => 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } + + @Test(dataProvider = "getDataAndWeights") + def testComputeVariancesComplex(regularizationWeight: Double, input: Iterable[LabeledPoint]): Unit = { + val problem = createProblem(L2RegularizationContext, regularizationWeight, computeVariance = true) + val coefficients = generateDenseVector(DIMENSIONS) + + // For Poisson regression, the second derivative of the loss function (with regard to z = X_i * B) is e^z. + val hessianDiagonal: Vector[Double] = input.foldLeft(new DenseVector[Double](DIMENSIONS)) + { (diagonal: DenseVector[Double], datum: LabeledPoint) => + val features: Vector[Double] = datum.features + val weight: Double = datum.weight + val z: Double = datum.computeMargin(coefficients) + val d2lossdz2 = math.exp(z) + + diagonal + (weight * d2lossdz2 * features :* features) + } + // Add the regularization to the Hessian. The second derivative of the L2 regularization term is the regularization + // weight. + val hessianDiagonalWithL2: Vector[Double] = hessianDiagonal + regularizationWeight + // Simple estimate of the diagonal of the covariance matrix (instead of a full inverse). + val expected: Vector[Double] = hessianDiagonalWithL2.map( v => + 1D / (v + MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + ) + + val actual: Vector[Double] = problem.computeVariances(input, coefficients).get + + assertEquals(actual.length, DIMENSIONS) + assertEquals(actual.length, expected.length) + for (i <- 0 until DIMENSIONS) { + assertEquals(actual(i), expected(i), MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD) + } + } } object PoissonRegressionOptimizationProblemTest { - val Dimensions = 10 - - def createProblem() = { - val config = new GLMOptimizationConfiguration + val DATA_RANDOM_SEED: Int = 7 + val WEIGHT_RANDOM_SEED: Int = 13 + val WEIGHT_MAX: Double = 10.0 + val DIMENSIONS: Int = 5 + val TRAINING_SAMPLES: Int = DIMENSIONS * DIMENSIONS + + def createProblem( + regularizationContext: RegularizationContext = NoRegularizationContext, + regularizationWeight: Double = 0D, + computeVariance: Boolean = false): PoissonRegressionOptimizationProblem = { + + val config = new GLMOptimizationConfiguration( + optimizerConfig = OptimizerConfig(OptimizerType.LBFGS, 100, 1E-10, None), + regularizationContext = regularizationContext, + regularizationWeight = regularizationWeight, + downSamplingRate = 1D) val treeAggregateDepth = 1 val isTrackingState = false PoissonRegressionOptimizationProblem.buildOptimizationProblem( - config, treeAggregateDepth, isTrackingState) + config, + treeAggregateDepth, + isTrackingState, + computeVariance) } } diff --git a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblemTest.scala b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblemTest.scala index 38e5d248..0eb76193 100644 --- a/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblemTest.scala +++ b/photon-ml/src/test/scala/com/linkedin/photon/ml/optimization/SmoothedHingeLossLinearSVMOptimizationProblemTest.scala @@ -14,24 +14,24 @@ */ package
com.linkedin.photon.ml.optimization +import org.apache.spark.rdd.RDD +import org.mockito.Mockito._ +import org.testng.Assert._ +import org.testng.annotations.Test + import com.linkedin.photon.ml.data.{LabeledPoint, SimpleObjectProvider} import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.game.GLMOptimizationConfiguration import com.linkedin.photon.ml.test.CommonTestUtils -import org.apache.spark.rdd.RDD -import org.mockito.Mockito._ -import org.testng.Assert._ -import org.testng.annotations.Test - class SmoothedHingeLossLinearSVMOptimizationProblemTest { - import SmoothedHingeLossLinearSVMOptimizationProblemTest._ import CommonTestUtils._ + import SmoothedHingeLossLinearSVMOptimizationProblemTest._ @Test def testUpdateObjective(): Unit = { - val problem = createProblem + val problem = createProblem() val normalizationContext = new SimpleObjectProvider(mock(classOf[NormalizationContext])) val regularizationWeight = 1D @@ -44,42 +44,53 @@ class SmoothedHingeLossLinearSVMOptimizationProblemTest { @Test def testInitializeZeroModel(): Unit = { - val problem = createProblem - val zeroModel = problem.initializeZeroModel(Dimensions) + val problem = createProblem() + val zeroModel = problem.initializeZeroModel(DIMENSIONS) - assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(Dimensions)) + assertEquals(zeroModel.coefficients, Coefficients.initializeZeroCoefficients(DIMENSIONS)) } @Test def testCreateModel(): Unit = { - val problem = createProblem - val coefficients = generateDenseVector(Dimensions) + val problem = createProblem() + val coefficients = generateDenseVector(DIMENSIONS) val model = problem.createModel(coefficients, None) assertEquals(model.coefficients.means, coefficients) } @Test - def testComputeVariances(): Unit = { - val problem = createProblem + def testComputeVariancesDisabled(): Unit = { + val problem = createProblem() + val input = mock(classOf[RDD[LabeledPoint]]) + val coefficients = generateDenseVector(DIMENSIONS) + + assertEquals(problem.computeVariances(input, coefficients), None) + } + + @Test + def testComputeVariancesEnabled(): Unit = { + val problem = createProblem(computeVariance = true) val input = mock(classOf[RDD[LabeledPoint]]) - val coefficients = generateDenseVector(Dimensions) + val coefficients = generateDenseVector(DIMENSIONS) - // TODO: computeVarainces is currently disabled. This test will need to be updated when the default changes assertEquals(problem.computeVariances(input, coefficients), None) } } object SmoothedHingeLossLinearSVMOptimizationProblemTest { - val Dimensions = 10 + val DIMENSIONS = 10 - def createProblem() = { - val optimizerConfig = OptimizerConfig(OptimizerType.LBFGS, 1, 1e-5, None) - val config = new GLMOptimizationConfiguration(optimizerConfig) + def createProblem(computeVariance: Boolean = false) = { + val config = new GLMOptimizationConfiguration( + optimizerConfig = OptimizerConfig(OptimizerType.LBFGS, 100, 1E-10, None)) val treeAggregateDepth = 1 val isTrackingState = false SmoothedHingeLossLinearSVMOptimizationProblem.buildOptimizationProblem( - config, treeAggregateDepth, isTrackingState) + config, + treeAggregateDepth, + isTrackingState, + computeVariance) } }
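Taken together, these changes thread one user-facing switch from the GAME command line down to every per-loss `buildOptimizationProblem`, replacing the hard-coded `COMPUTING_VARIANCE = false` constants. The quantity the new tests assert against is a diagonal-Hessian approximation of the coefficient variances, var_j = 1 / (sum_i w_i * l''(z_i) * x_ij^2 + lambda + epsilon), rather than a full Hessian inverse. Below is a standalone sketch of that estimate using breeze vectors; `VarianceSketch`, `Epsilon`, and `d2lossdz2` are illustrative names for this sketch, not photon-ml API.

```scala
import breeze.linalg.{DenseVector, Vector}

object VarianceSketch {
  // Stand-in for MathConst.HIGH_PRECISION_TOLERANCE_THRESHOLD; value assumed for the sketch.
  val Epsilon = 1e-12

  /**
   * Diagonal-Hessian variance estimate. `d2lossdz2` is the loss's second derivative with
   * respect to the margin z: 1 for linear regression, sigmoid(z) * (1 - sigmoid(z)) for
   * logistic regression, exp(z) for Poisson regression.
   */
  def diagonalVariances(
      data: Iterable[(Vector[Double], Double)], // (features, weight) pairs
      coefficients: Vector[Double],
      d2lossdz2: Double => Double,
      l2Weight: Double): Vector[Double] = {

    val zero: Vector[Double] = DenseVector.zeros[Double](coefficients.length)
    // Accumulate sum_i w_i * l''(z_i) * (x_i :* x_i), the diagonal of the Hessian.
    val hessianDiagonal = data.foldLeft(zero) { case (diagonal, (features, weight)) =>
      val z = features dot coefficients
      diagonal + ((features :* features) * (weight * d2lossdz2(z)))
    }
    // L2 regularization adds its weight to every diagonal entry; invert elementwise.
    (hessianDiagonal + l2Weight).map(v => 1.0 / (v + Epsilon))
  }
}
```

Variance computation stays off by default; from the GAME training CLI it is enabled with `--compute-variance true` (Boolean parsing is case-insensitive, which is what `ParamsTest` exercises with "trUE" and "fAlSe").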