diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala index 8c3457ad..81a8370f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala @@ -184,7 +184,7 @@ object ModelTraining extends Logging { // Initialize the list with the result from the first regularization weight optimizationProblem.updateRegularizationWeight(currentWeight) - val glm = if (numWarmStartModels == 0) { + val (glm, stateTracker) = if (numWarmStartModels == 0) { logger.info(s"No warm start model found; beginning training with a 0-coefficients model") @@ -199,14 +199,14 @@ object ModelTraining extends Logging { optimizationProblem.run(trainingData, warmStartModels(maxLambda)) } - List((currentWeight, glm, optimizationProblem.getStatesTracker)) + List((currentWeight, glm, stateTracker)) case (latestWeightsModelsAndTrackers, currentWeight) => optimizationProblem.updateRegularizationWeight(currentWeight) // Train the rest of the models - val glm = if (useWarmStart) { + val (glm, stateTracker) = if (useWarmStart) { val previousModel = latestWeightsModelsAndTrackers.head._2 logger.info(s"Training model with regularization weight $currentWeight started (warm start)") @@ -219,7 +219,7 @@ object ModelTraining extends Logging { optimizationProblem.run(trainingData) } - (currentWeight, glm, optimizationProblem.getStatesTracker) +: latestWeightsModelsAndTrackers + (currentWeight, glm, stateTracker) +: latestWeightsModelsAndTrackers } broadcastNormalizationContext.unpersist() diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala index f1b3df63..e6556eb3 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala @@ -21,7 +21,7 @@ import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.SparkSession import org.apache.spark.SparkConf -import com.linkedin.photon.ml.data.{GameDatum, LabeledPoint, LocalDataset} +import com.linkedin.photon.ml.data.{GameDatum, LabeledPoint} import com.linkedin.photon.ml.function._ import com.linkedin.photon.ml.function.glm.{HessianVectorAggregator, ValueAndGradientAggregator} import com.linkedin.photon.ml.model.Coefficients @@ -57,7 +57,6 @@ object SparkSessionConfiguration { classOf[LabeledPoint], classOf[LBFGS], classOf[LinearRegressionModel], - classOf[LocalDataset], classOf[LogisticRegressionModel], classOf[Matrix[Double]], classOf[NormalizationContext], diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala index 95d8bb28..3104f00a 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala @@ -14,9 +14,12 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.{Dataset, FixedEffectDataset, RandomEffectDataset} -import com.linkedin.photon.ml.function.ObjectiveFunctionHelper.{DistributedObjectiveFunctionFactory, ObjectiveFunctionFactoryFactory, SingleNodeObjectiveFunctionFactory} +import org.apache.spark.sql.{DataFrame, SparkSession} + +import 
com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.function.ObjectiveFunction +import com.linkedin.photon.ml.function.ObjectiveFunctionHelper.{DistributedObjectiveFunctionFactory, ObjectiveFunctionFactoryFactory, SingleNodeObjectiveFunctionFactory} import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.DistributedOptimizationProblem @@ -34,11 +37,12 @@ import com.linkedin.photon.ml.util.PhotonBroadcast object CoordinateFactory { /** - * Creates a [[Coordinate]] of the appropriate type, given the input [[Dataset]], + * Creates a [[Coordinate]] of the appropriate type, given the input data set, * [[CoordinateOptimizationConfiguration]], and [[ObjectiveFunction]]. * - * @tparam D Some type of [[Dataset]] * @param dataset The input data to use for training + * @param featureShardId + * @param inputColumnsNames * @param coordinateOptConfig The optimization settings for training * @param lossFunctionFactoryConstructor A constructor for the loss function factory function * @param glmConstructor A constructor for the type of [[GeneralizedLinearModel]] being trained @@ -46,23 +50,27 @@ object CoordinateFactory { * @param normalizationContext The [[NormalizationContext]] * @param varianceComputationType Should the trained coefficient variances be computed in addition to the means? * @param interceptIndexOpt The index of the intercept, if one is present - * @return A [[Coordinate]] for the [[Dataset]] of type [[D]] + * @param rETypeOpt + * @return A [[Coordinate]] instance */ - def build[D <: Dataset[D]]( - dataset: D, + def build( + dataset: DataFrame, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, coordinateOptConfig: CoordinateOptimizationConfiguration, lossFunctionFactoryConstructor: ObjectiveFunctionFactoryFactory, glmConstructor: Coefficients => GeneralizedLinearModel, downSamplerFactory: DownSamplerFactory, normalizationContext: NormalizationContext, varianceComputationType: VarianceComputationType, - interceptIndexOpt: Option[Int]): Coordinate[D] = { + interceptIndexOpt: Option[Int], + rETypeOpt: Option[REType]): Coordinate = { val lossFunctionFactory = lossFunctionFactoryConstructor(coordinateOptConfig) - (dataset, coordinateOptConfig, lossFunctionFactory) match { + (rETypeOpt, coordinateOptConfig, lossFunctionFactory) match { case ( - fEDataset: FixedEffectDataset, + None, fEOptConfig: FixedEffectOptimizationConfiguration, distributedLossFunctionFactory: DistributedObjectiveFunctionFactory) => @@ -71,36 +79,43 @@ object CoordinateFactory { } else { None } - val normalizationPhotonBroadcast = PhotonBroadcast(fEDataset.sparkContext.broadcast(normalizationContext)) + val normalizationPhotonBroadcast = PhotonBroadcast( + SparkSession.builder.getOrCreate.sparkContext + .broadcast(normalizationContext)) new FixedEffectCoordinate( - fEDataset, + dataset, DistributedOptimizationProblem( fEOptConfig, distributedLossFunctionFactory(interceptIndexOpt), downSamplerOpt, glmConstructor, normalizationPhotonBroadcast, - varianceComputationType)).asInstanceOf[Coordinate[D]] + varianceComputationType), + featureShardId, + inputColumnsNames).asInstanceOf[Coordinate] case ( - rEDataset: RandomEffectDataset, + Some(rEType), rEOptConfig: RandomEffectOptimizationConfiguration, singleNodeLossFunctionFactory: SingleNodeObjectiveFunctionFactory) => RandomEffectCoordinate( - rEDataset, + dataset, + 
rEType, + featureShardId, + inputColumnsNames, rEOptConfig, singleNodeLossFunctionFactory, glmConstructor, normalizationContext, varianceComputationType, - interceptIndexOpt).asInstanceOf[Coordinate[D]] + interceptIndexOpt).asInstanceOf[Coordinate] case _ => throw new UnsupportedOperationException( s"""Cannot build coordinate for the following input class combination: - | ${dataset.getClass.getName} + | ${rETypeOpt.getOrElse("fixed-effect")} | ${coordinateOptConfig.getClass.getName} | ${lossFunctionFactory.getClass.getName}""".stripMargin) } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 201691f4..7b2f157c 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -14,52 +14,41 @@ */ package com.linkedin.photon.ml.algorithm -import org.apache.spark.rdd.RDD +import org.apache.spark.ml.linalg.{Vector => SparkVector} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col +import org.apache.spark.storage.StorageLevel import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.DistributedObjectiveFunction import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} import com.linkedin.photon.ml.optimization.{DistributedOptimizationProblem, FixedEffectOptimizationTracker, OptimizationTracker} +import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} /** * The optimization problem coordinate for a fixed effect model. * * @tparam Objective The type of objective function used to solve the fixed effect optimization problem - * @param dataset The training dataset + * @param dataset The raw training data * @param optimizationProblem The fixed effect optimization problem + * @param inputColumnsNames */ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunction]( - override protected val dataset: FixedEffectDataset, - optimizationProblem: DistributedOptimizationProblem[Objective]) - extends Coordinate[FixedEffectDataset](dataset) { + var dataset: DataFrame, + optimizationProblem: DistributedOptimizationProblem[Objective], + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) + extends Coordinate { - /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - override protected[algorithm] def updateCoordinateWithDataset( - dataset: FixedEffectDataset): FixedEffectCoordinate[Objective] = - new FixedEffectCoordinate[Objective](dataset, optimizationProblem) - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. 
- * - * @return A (updated model, optimization state tracking information) tuple - */ - override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { - - val updatedFixedEffectModel = FixedEffectCoordinate.trainModel( - dataset.labeledPoints, - optimizationProblem, - dataset.featureShardId, - None) - val optimizationTracker = new FixedEffectOptimizationTracker(optimizationProblem.getStatesTracker) - - (updatedFixedEffectModel, optimizationTracker) + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { + model match { + case fixedEffectModel: FixedEffectModel => + dataset = FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId, inputColumnsNames) + case _ => + throw new UnsupportedOperationException(s"Unsupported model type: ${model.modelType}") + } } /** @@ -72,43 +61,37 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct override protected[algorithm] def trainModel(model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) = model match { case fixedEffectModel: FixedEffectModel => - val updatedFixedEffectModel = FixedEffectCoordinate.trainModel( - dataset.labeledPoints, + FixedEffectCoordinate.trainModel( + dataset, optimizationProblem, - dataset.featureShardId, + featureShardId, + inputColumnsNames, Some(fixedEffectModel)) - val optimizationTracker = new FixedEffectOptimizationTracker(optimizationProblem.getStatesTracker) - - (updatedFixedEffectModel, optimizationTracker) case _ => throw new UnsupportedOperationException( s"Training model of type ${model.getClass} in ${this.getClass} is not supported") } + /** - * Compute scores for the coordinate dataset using the given model. + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. * - * @param model The input model - * @return The dataset scores + * @return A (updated model, optimization state tracking information) tuple */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { + override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = + FixedEffectCoordinate.trainModel(dataset, optimizationProblem, featureShardId, inputColumnsNames, None) - case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) - - case _ => - throw new UnsupportedOperationException( - s"Scoring with model of type ${model.getClass} in ${this.getClass} is not supported") - } } object FixedEffectCoordinate { + def SCORE_FIELD = "fixed_score" + /** * Train a new [[FixedEffectModel]] (i.e. run model optimization). 
* - * @param input The training dataset + * @param dataset The training dataset * @param optimizationProblem The optimization problem * @param featureShardId The ID of the feature shard for the training data * @param initialFixedEffectModelOpt An optional existing [[FixedEffectModel]] to use as a starting point for @@ -116,39 +99,62 @@ object FixedEffectCoordinate { * @return A new [[FixedEffectModel]] */ private def trainModel[Function <: DistributedObjectiveFunction]( - input: RDD[(UniqueSampleId, LabeledPoint)], - optimizationProblem: DistributedOptimizationProblem[Function], - featureShardId: FeatureShardId, - initialFixedEffectModelOpt: Option[FixedEffectModel]): FixedEffectModel = { - - val newModel = initialFixedEffectModelOpt - .map { initialFixedEffectModel => - optimizationProblem.runWithSampling(input, initialFixedEffectModel.model) + dataset: DataFrame, + optimizationProblem: DistributedOptimizationProblem[Function], + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, + initialFixedEffectModelOpt: Option[FixedEffectModel]): (FixedEffectModel, OptimizationTracker) = { + + val rdd = dataset + .rdd + .map { row => + val uid = row.getAs[UniqueSampleId](DataConst.ID) + val features = row.getAs[SparkVector](featureShardId) + val label = row.getAs[Double](inputColumnsNames(InputColumnsNames.RESPONSE)) + + (uid, LabeledPoint(label, VectorUtils.mlToBreeze(features))) } - .getOrElse(optimizationProblem.runWithSampling(input)) - val updatedModelBroadcast = input.sparkContext.broadcast(newModel) + rdd.persist(StorageLevel.MEMORY_ONLY) - new FixedEffectModel(updatedModelBroadcast, featureShardId) - } + val (glm, stateTracker) = initialFixedEffectModelOpt + .map ( initialFixedEffectModel => + optimizationProblem.runWithSampling(rdd, initialFixedEffectModel.model) + ) + .getOrElse(optimizationProblem.runWithSampling(rdd)) - /** - * Score a [[FixedEffectDataset]] using a given [[FixedEffectModel]]. - * - * @note The score is the dot product of the model coefficients with the feature values (i.e., it does not go - * through a non-linear link function). 
-   * @param fixedEffectDataset The dataset to score
-   * @param fixedEffectModel The model used to score the dataset
-   * @return The computed scores
-   */
-  protected[algorithm] def score(
-    fixedEffectDataset: FixedEffectDataset,
-    fixedEffectModel: FixedEffectModel): CoordinateDataScores = {
+    rdd.unpersist()
 
-    val modelBroadcast = fixedEffectModel.modelBroadcast
-    val scores = fixedEffectDataset.labeledPoints.mapValues { case LabeledPoint(_, features, _, _) =>
-      modelBroadcast.value.computeScore(features)
-    }
+    (FixedEffectModel(glm, featureShardId), new FixedEffectOptimizationTracker(stateTracker))
+  }
 
-    new CoordinateDataScores(scores)
+  /**
+   * Add the scores of a [[FixedEffectModel]] to the dataset offsets, replacing any score previously contributed by
+   * this coordinate.
+   *
+   * @param dataset The dataset whose offset column should be updated
+   * @param fixedEffectModel The model with which to score the dataset
+   * @param featureShardId The ID of the feature shard used by the model
+   * @param inputColumnsNames The names of the input data columns
+   * @return The dataset with updated offsets
+   */
+  def updateOffset(
+      dataset: DataFrame, fixedEffectModel: FixedEffectModel, featureShardId: FeatureShardId,
+      inputColumnsNames: InputColumnsNames): DataFrame = {
+
+    require(
+      featureShardId == fixedEffectModel.featureShardId,
+      s"Fixed effect coordinate featureShardId ${featureShardId} != model.featureShardId ${
+        fixedEffectModel
+          .featureShardId
+      }")
+
+    val offset = inputColumnsNames(InputColumnsNames.OFFSET)
+    val hasOffsetField = ApiUtils.hasColumn(dataset, offset)
+    val hasCoordinateScoreField = ApiUtils.hasColumn(dataset, SCORE_FIELD)
+
+    if (hasOffsetField && hasCoordinateScoreField) {
+      // offset = offset - old_coordinateScore + new_coordinateScore
+      val datasetMinusOldScore = dataset.withColumn(offset, col(offset) - col(SCORE_FIELD))
+      fixedEffectModel.computeScore(datasetMinusOldScore, SCORE_FIELD)
+        .withColumn(offset, col(offset) + col(SCORE_FIELD))
+    } else if (!hasOffsetField && !hasCoordinateScoreField) {
+      fixedEffectModel.computeScore(dataset, SCORE_FIELD)
+        .withColumn(offset, col(SCORE_FIELD))
+    } else if (hasOffsetField && !hasCoordinateScoreField) {
+      fixedEffectModel.computeScore(dataset, SCORE_FIELD)
+        .withColumn(offset, col(offset) + col(SCORE_FIELD))
+    } else {
+      throw new UnsupportedOperationException(
+        s"Dataset has a '$SCORE_FIELD' column but no '$offset' column; this state should be unreachable")
+    }
   }
 }
diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala
index 19355f0f..12f57417 100644
--- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala
+++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala
@@ -14,16 +14,22 @@
  */
 package com.linkedin.photon.ml.algorithm
 
-import com.linkedin.photon.ml.data.FixedEffectDataset
-import com.linkedin.photon.ml.data.scoring.CoordinateDataScores
+import org.apache.spark.sql.DataFrame
+
+import com.linkedin.photon.ml.Types.FeatureShardId
+import com.linkedin.photon.ml.data.InputColumnsNames
 import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel}
 
 /**
  * The optimization problem coordinate for a pre-trained fixed effect model.
  *
  * @param dataset The training dataset
+ * @param featureShardId The ID of the feature shard for the training data
+ * @param inputColumnsNames The names of the input data columns
  */
-class FixedEffectModelCoordinate(dataset: FixedEffectDataset) extends ModelCoordinate(dataset) {
+class FixedEffectModelCoordinate(
+    dataset: DataFrame,
+    featureShardId: FeatureShardId,
+    inputColumnsNames: InputColumnsNames) extends ModelCoordinate {
 
   /**
    * Score the effect-specific dataset in the coordinate with the input model.
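The offset bookkeeping introduced above replaces the old CoordinateDataScores plumbing: each coordinate writes its latest contribution into a dedicated score column and folds it into the shared offset column, so every other coordinate trains against the residual. Below is a minimal, self-contained sketch of that column arithmetic, assuming only a local SparkSession; the sample values are made up, and only the column names `offset` and `fixed_score` are taken from the patch.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

// Toy illustration of: offset := offset - old coordinate score + new coordinate score.
object OffsetUpdateSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[1]").appName("offset-update-sketch").getOrCreate()
    import spark.implicits._

    // Two samples that already carry an offset and a stale score from a previous pass of this coordinate
    val data = Seq((1L, 0.7, 0.2), (2L, -0.1, 0.05)).toDF("uid", "offset", "fixed_score")

    // 1) Remove the stale score from the offset (the intermediate DataFrame must be kept, not discarded)
    val withoutOldScore = data.withColumn("offset", col("offset") - col("fixed_score"))

    // 2) Re-score with the freshly trained model (a constant stand-in here)
    val rescored = withoutOldScore.withColumn("fixed_score", col("uid") * 0.01)

    // 3) Fold the fresh score back into the offset for the next coordinate to consume
    rescored.withColumn("offset", col("offset") + col("fixed_score")).show()

    spark.stop()
  }
}
```

The key invariant is that the stale score is subtracted before the new one is added; because DataFrames are immutable, the intermediate result of the subtraction has to be threaded into the scoring call rather than dropped.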
@@ -31,10 +37,11 @@ class FixedEffectModelCoordinate(dataset: FixedEffectDataset) extends ModelCoord * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { + model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) + FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId, inputColumnsNames) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala deleted file mode 100644 index 503df9d3..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.algorithm - -import com.linkedin.photon.ml.data.RandomEffectDataset -import com.linkedin.photon.ml.model.{Coefficients, RandomEffectModel} - -/** - * Trait to encapsulate [[RandomEffectModel]] projection. Needed as the random effects have their feature space - * collapsed to reduce the amount of memory used and training time. - */ -trait ModelProjection extends Coordinate[RandomEffectDataset] { - - /** - * Project a [[RandomEffectModel]] from the original space to the projected space. - * - * @param randomEffectModel The [[RandomEffectModel]] in the original space - * @return The same [[RandomEffectModel]] in the projected space - */ - protected[algorithm] def projectModelForward(randomEffectModel: RandomEffectModel): RandomEffectModel = { - - // Left join the models to projectors for cases where we have a prior model but no new model (and hence no - // projectors) - val linearSubspaceProjectorsRDD = dataset.projectors - val newModels = randomEffectModel - .modelsRDD - .leftOuterJoin(linearSubspaceProjectorsRDD) - .mapValues { case (model, projectorOpt) => - projectorOpt - .map { projector => - val oldCoefficients = model.coefficients - val newCoefficients = Coefficients( - projector.projectForward(oldCoefficients.means), - oldCoefficients.variancesOption.map(projector.projectForward)) - - model.updateCoefficients(newCoefficients) - } - .getOrElse(model) - } - - randomEffectModel.update(newModels) - } - - /** - * Project a [[RandomEffectModel]] from the projected space to the original space. 
- * - * @param randomEffectModel The [[RandomEffectModel]] in the projected space - * @return The same [[RandomEffectModel]] in the original space - */ - protected[algorithm] def projectModelBackward(randomEffectModel: RandomEffectModel): RandomEffectModel = { - - // Left join the models to projectors for cases where we have a prior model but no new model (and hence no - // projectors) - val linearSubspaceProjectorsRDD = dataset.projectors - val newModels = randomEffectModel - .modelsRDD - .leftOuterJoin(linearSubspaceProjectorsRDD) - .mapValues { case (model, projectorOpt) => - projectorOpt - .map { projector => - val oldCoefficients = model.coefficients - val newCoefficients = Coefficients( - projector.projectBackward(oldCoefficients.means), - oldCoefficients.variancesOption.map(projector.projectBackward)) - - model.updateCoefficients(newCoefficients) - } - .getOrElse(model) - } - - randomEffectModel.update(newModels) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 8afa2949..b69cbbd5 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -14,60 +14,81 @@ */ package com.linkedin.photon.ml.algorithm -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD +import scala.collection.mutable + +import org.apache.spark.ml.linalg.{Vector => SparkVector} +import org.apache.spark.sql.functions.{col, collect_list} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel +import com.linkedin.photon.ml.TaskType +import com.linkedin.photon.ml.TaskType.TaskType +import com.linkedin.photon.ml.Types.{FeatureShardId, REId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffectModel} import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} -import com.linkedin.photon.ml.optimization._ import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType -import com.linkedin.photon.ml.spark.RDDLike +import com.linkedin.photon.ml.optimization._ +import com.linkedin.photon.ml.optimization.game.RandomEffectOptimizationConfiguration +import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel +import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} +import com.linkedin.photon.ml.util.{ApiUtils, PhotonNonBroadcast, VectorUtils} /** * The optimization problem coordinate for a random effect model. 
* * @tparam Objective The type of objective function used to solve individual random effect optimization problems - * @param dataset The training dataset - * @param optimizationProblem The random effect optimization problem + * @param rEType The random effect type + * @param rawData The raw training dataframe + * @param optimizationProblem The single node optimization problem + * @param inputColumnsNames */ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunction]( - override protected val dataset: RandomEffectDataset, - protected val optimizationProblem: RandomEffectOptimizationProblem[Objective]) - extends Coordinate[RandomEffectDataset](dataset) - with ModelProjection - with RDDLike { + rEType: REType, + var rawData: DataFrame, + optimizationProblem: SingleNodeOptimizationProblem[Objective], + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) + extends Coordinate { + + /* Get the training data from raw data */ + var dataset: DataFrame = null + + protected def updateDataset(): Unit = { + + val label = inputColumnsNames(InputColumnsNames.RESPONSE) + val offset = inputColumnsNames(InputColumnsNames.OFFSET) + val weight = inputColumnsNames(InputColumnsNames.WEIGHT) + + dataset = rawData + .select(rEType, featureShardId, label, offset, weight) + .groupBy(rEType) + .agg( + collect_list(featureShardId).alias("features"), + collect_list(label).alias("labels"), + collect_list(offset).alias("offsets"), + collect_list(weight).alias("weights")) + } // // Coordinate functions // + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { - /** - * Update the coordinate with a new [[RandomEffectDataset]]. - * - * @param dataset The updated [[RandomEffectDataset]] - * @return A new coordinate with the updated [[RandomEffectDataset]] - */ - override protected[algorithm] def updateCoordinateWithDataset( - dataset: RandomEffectDataset): RandomEffectCoordinate[Objective] = - new RandomEffectCoordinate(dataset, optimizationProblem) - - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. 
- * - * @return A (updated model, optional optimization tracking information) tuple - */ - override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { + model match { + case randomEffectModel: RandomEffectModel => + rawData = RandomEffectCoordinate.updateOffset( + rawData, randomEffectModel, featureShardId, + rEType, inputColumnsNames) - val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel(dataset, optimizationProblem, None) + updateDataset() - (projectModelBackward(newModel), optimizationTracker) + case _ => + throw new UnsupportedOperationException(s"Unsupported model type: ${model.modelType}") + } } /** @@ -78,100 +99,50 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct * @return A (updated model, optional optimization tracking information) tuple */ override protected[algorithm] def trainModel( - model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) = + model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) = { + + if (dataset == null) { + updateDataset() + } model match { case randomEffectModel: RandomEffectModel => val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel( dataset, + rEType, + featureShardId, optimizationProblem, - Some(projectModelForward(randomEffectModel))) - - (projectModelBackward(newModel), optimizationTracker) + inputColumnsNames, + Some(randomEffectModel)) + (newModel, optimizationTracker) case _ => throw new UnsupportedOperationException( s"Updating model of type ${model.getClass} in ${this.getClass} is not supported") } - - /** - * Compute scores for the coordinate data using a given model. - * - * @param model The input model - * @return The dataset scores - */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { - - case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, projectModelForward(randomEffectModel)) - - case _ => - throw new UnsupportedOperationException( - s"Scoring with model of type ${model.getClass} in ${this.getClass} is not supported") - } - - // - // RDDLike Functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = optimizationProblem.sparkContext - - /** - * Assign a given name to the [[optimizationProblem]] [[RDD]]. - * - * @param name The parent name for all [[RDD]] objects in this class - * @return This object with the name of the [[optimizationProblem]] [[RDD]] assigned - */ - override def setName(name: String): RandomEffectCoordinate[Objective] = { - - optimizationProblem.setName(name) - - this } /** - * Set the persistence storage level of the [[optimizationProblem]] [[RDD]]. - * - * @param storageLevel The storage level - * @return This object with the storage level of the [[optimizationProblem]] [[RDD]] set - */ - override def persistRDD(storageLevel: StorageLevel): RandomEffectCoordinate[Objective] = { - - optimizationProblem.persistRDD(storageLevel) - - this - } - - /** - * Mark the [[optimizationProblem]] [[RDD]] as unused, and asynchronously remove all blocks for it from memory and - * disk. + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. 
* - * @return This object with the [[optimizationProblem]] [[RDD]] unpersisted + * @return A (updated model, optimization state tracking information) tuple */ - override def unpersistRDD(): RandomEffectCoordinate[Objective] = { + override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { + if (dataset == null) { + updateDataset() + } - optimizationProblem.unpersistRDD() + val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel( + dataset, + rEType, + featureShardId, + optimizationProblem, + inputColumnsNames, + None) - this + (newModel, optimizationTracker) } - /** - * Materialize the [[optimizationProblem]] [[RDD]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be - * evaluated). - * - * @return This object with the [[optimizationProblem]] [[RDD]] materialized - */ - override def materialize(): RandomEffectCoordinate[Objective] = { - - optimizationProblem.materialize() - - this - } } object RandomEffectCoordinate { @@ -179,9 +150,11 @@ object RandomEffectCoordinate { /** * Helper function to construct [[RandomEffectCoordinate]] objects. * - * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization - * problems - * @param randomEffectDataset The data on which to run the optimization algorithm + * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization problems + * @param data The data on which to run the optimization algorithm + * @param rEType + * @param featureShardId + * @param inputColumnsNames * @param configuration The optimization problem configuration * @param objectiveFunctionFactory The objective function to optimize * @param glmConstructor The function to use for producing GLMs from trained coefficients @@ -191,62 +164,62 @@ object RandomEffectCoordinate { * @return A new [[RandomEffectCoordinate]] object */ protected[ml] def apply[RandomEffectObjective <: SingleNodeObjectiveFunction]( - randomEffectDataset: RandomEffectDataset, - configuration: RandomEffectOptimizationConfiguration, - objectiveFunctionFactory: Option[Int] => RandomEffectObjective, - glmConstructor: Coefficients => GeneralizedLinearModel, - normalizationContext: NormalizationContext, - varianceComputationType: VarianceComputationType = VarianceComputationType.NONE, - interceptIndexOpt: Option[Int] = None): RandomEffectCoordinate[RandomEffectObjective] = { + data: DataFrame, + rEType: REType, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, + configuration: RandomEffectOptimizationConfiguration, + objectiveFunctionFactory: Option[Int] => RandomEffectObjective, + glmConstructor: Coefficients => GeneralizedLinearModel, + normalizationContext: NormalizationContext, + varianceComputationType: VarianceComputationType, + interceptIndexOpt: Option[Int] = None): RandomEffectCoordinate[RandomEffectObjective] = { // Generate parameters of ProjectedRandomEffectCoordinate - val randomEffectOptimizationProblem = RandomEffectOptimizationProblem( - randomEffectDataset.projectors, + val optimizationProblem = SingleNodeOptimizationProblem( configuration, - objectiveFunctionFactory, + objectiveFunctionFactory(interceptIndexOpt), glmConstructor, - normalizationContext, - varianceComputationType, - interceptIndexOpt) + PhotonNonBroadcast(normalizationContext), + varianceComputationType) - new RandomEffectCoordinate(randomEffectDataset, randomEffectOptimizationProblem) + new RandomEffectCoordinate(rEType, data, optimizationProblem, 
featureShardId, inputColumnsNames) } /** * Train a new [[RandomEffectModel]] (i.e. run model optimization for each entity). * * @tparam Function The type of objective function used to solve individual random effect optimization problems - * @param randomEffectDataset The training dataset - * @param randomEffectOptimizationProblem The per-entity optimization problems + * @param trainingData The training dataset + * @param randomEffectType + * @param featureShardId + * @param optimizationProblem The per-entity optimization problems * @param initialRandomEffectModelOpt An optional existing [[RandomEffectModel]] to use as a starting point for * optimization * @return A (new [[RandomEffectModel]], optional optimization stats) tuple */ protected[algorithm] def trainModel[Function <: SingleNodeObjectiveFunction]( - randomEffectDataset: RandomEffectDataset, - randomEffectOptimizationProblem: RandomEffectOptimizationProblem[Function], - initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { - - // All 3 RDDs involved in the joins below use the same partitioner + trainingData: DataFrame, + randomEffectType: REType, + featureShardId: FeatureShardId, + optimizationProblem: SingleNodeOptimizationProblem[Function], + inputColumnsNames: InputColumnsNames, + initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { - // Optimization problems are created for each entity with a projector, and thus guaranteed to match active data - // exactly (see RandomEffectDataset.apply) - val dataAndOptimizationProblems = randomEffectDataset - .activeData - .join(randomEffectOptimizationProblem.optimizationProblems) - - // Left join the models to the (data, optimization problem) tuple for cases where we have a prior model but no new - // data val (newModels, randomEffectOptimizationTracker) = initialRandomEffectModelOpt .map { randomEffectModel => - val modelsAndTrackers = randomEffectModel - .modelsRDD - .leftOuterJoin(dataAndOptimizationProblems) + + val modelsAndTrackers = randomEffectModel.models.join(trainingData, col(randomEffectType), "left_outer") + .rdd + .map { row => + val reid = row.getAs[REId](randomEffectType) + val labeledPoints: Option[Array[LabeledPoint]] = getLabeledPoints(row) + val model = getModel(row) + (reid, (model, labeledPoints)) + } .mapValues { - case (localModel, Some((localDataset, optimizationProblem))) => - val trainingLabeledPoints = localDataset.dataPoints.map(_._2) - val updatedModel = optimizationProblem.run(trainingLabeledPoints, localModel) - val stateTrackers = optimizationProblem.getStatesTracker + case (localModel, Some((localDataset))) => + val (updatedModel, stateTrackers) = optimizationProblem.run(localDataset, localModel) (updatedModel, Some(stateTrackers)) @@ -261,73 +234,136 @@ object RandomEffectCoordinate { (models, optimizationTracker) } .getOrElse { - val modelsAndTrackers = dataAndOptimizationProblems.mapValues { case (localDataset, optimizationProblem) => - val trainingLabeledPoints = localDataset.dataPoints.map(_._2) - val newModel = optimizationProblem.run(trainingLabeledPoints) - val stateTrackers = optimizationProblem.getStatesTracker - - (newModel, stateTrackers) - } - modelsAndTrackers.persist(StorageLevel.MEMORY_ONLY_SER) + val modelsAndTrackers = trainingData + .rdd + .map( + row => { + val reid = row.getAs[REId](randomEffectType) + (reid, getLabeledPoints(row).get) + }) + .mapValues(optimizationProblem.run(_)) + 
modelsAndTrackers.persist(StorageLevel.MEMORY_AND_DISK_SER) val models = modelsAndTrackers.mapValues(_._1) val optimizationTracker = RandomEffectOptimizationTracker(modelsAndTrackers.map(_._2._2)) - (models, optimizationTracker) } val newRandomEffectModel = new RandomEffectModel( - newModels, - randomEffectDataset.randomEffectType, - randomEffectDataset.featureShardId) + RandomEffectModel.toDataFrame(newModels), + randomEffectType, + featureShardId) (newRandomEffectModel, randomEffectOptimizationTracker) } /** - * Score a [[RandomEffectDataset]] using a given [[RandomEffectModel]]. + * Get the score field name + * @param rEType Random effect type + * @return A field name + */ + def getScoreFieldName(rEType: REType): String = { + return s"${rEType}_score" + } + + /** + * Create a generalized linear model from an input row + * @param row An input row + * @return A generalized linear model + */ + def getModel(row: Row): GeneralizedLinearModel = { + val coefficients = Coefficients(VectorUtils.mlToBreeze(row.getAs[SparkVector](DataConst.COEFFICIENTS))) + val modelType: TaskType = TaskType.withName(row.getAs[String](DataConst.MODEL_TYPE)) + val model = modelType match { + case TaskType.LINEAR_REGRESSION => + LinearRegressionModel(coefficients) + case TaskType.LOGISTIC_REGRESSION => + LogisticRegressionModel(coefficients) + case TaskType.POISSON_REGRESSION => + PoissonRegressionModel(coefficients) + case TaskType.SMOOTHED_HINGE_LOSS_LINEAR_SVM => + SmoothedHingeLossLinearSVMModel(coefficients) + } + model + } + + /** + * Create an optional array of labeled points + * @param row An input row + * @return An optional array of labeled points + */ + def getLabeledPoints(row: Row): Option[Array[LabeledPoint]] = { + + val features = row.getAs[List[SparkVector]]("features") + val labels = row.getAs[List[Double]]("labels") + val offsets = row.getAs[List[Double]]("offsets") + val weights = row.getAs[List[Double]]("weights") + + if (features != null) { + require(features.size == labels.size) + require(features.size == offsets.size) + require(features.size == weights.size) + + val result = new mutable.ArrayBuffer[LabeledPoint](features.size) + + for (i <- features.indices) + result += LabeledPoint(labels(i), VectorUtils.mlToBreeze(features(i)), offsets(i), weights(i)) + + Option.apply(result.toArray) + } else { + None + } + } + + /** + * Score a dataset using a given [[RandomEffectModel]]. * - * For information about the differences between active and passive data, see the [[RandomEffectDataset]] + * For information about the differences between active and passive data * documentation. * * @note The score is the raw dot product of the model coefficients and the feature values - it does not go through a * non-linear link function. - * @param randomEffectDataset The [[RandomEffectDataset]] to score + * @param dataset The data set to score * @param randomEffectModel The [[RandomEffectModel]] with which to score * @return The computed scores */ - protected[algorithm] def score( - randomEffectDataset: RandomEffectDataset, - randomEffectModel: RandomEffectModel): CoordinateDataScores = { - - // There may be more models than active data. However, since we're computing residuals for future coordinates, no - // data means no residual. Therefore, we use an inner join. Note that the active data and models use the same - // partitioner, but scores need to use GameDatum partitioner. 
-    val activeScores = randomEffectDataset
-      .activeData
-      .join(randomEffectModel.modelsRDD)
-      .flatMap { case (_, (localDataset, model)) =>
-        localDataset.dataPoints.map { case (uniqueId, labeledPoint) =>
-          (uniqueId, model.computeScore(labeledPoint.features))
-        }
-      }
-      .partitionBy(randomEffectDataset.uniqueIdPartitioner)
-
-    // Passive data already uses the GameDatum partitioner. Note that this code assumes few (if any) entities have a
-    // passive dataset.
-    val passiveDataREIds = randomEffectDataset.passiveDataREIds
-    val modelsForPassiveData = randomEffectModel
-      .modelsRDD
-      .filter { case (reId, _) =>
-        passiveDataREIds.value.contains(reId)
-      }
-      .collectAsMap()
-    val passiveScores = randomEffectDataset
-      .passiveData
-      .mapValues { case (randomEffectId, labeledPoint) =>
-        modelsForPassiveData(randomEffectId).computeScore(labeledPoint.features)
-      }
-
-    new CoordinateDataScores(activeScores ++ passiveScores)
+  def updateOffset(
+      dataset: DataFrame, randomEffectModel: RandomEffectModel, featureShardId: FeatureShardId,
+      rEType: REType,
+      inputColumnsNames: InputColumnsNames): DataFrame = {
+
+    require(
+      featureShardId == randomEffectModel.featureShardId,
+      s"Random effect coordinate featureShardId ${featureShardId} != model.featureShardId ${
+        randomEffectModel
+          .featureShardId
+      }")
+
+    require(
+      rEType == randomEffectModel.randomEffectType,
+      s"Random effect coordinate randomEffectType ${rEType} != model.randomEffectType ${
+        randomEffectModel
+          .randomEffectType
+      }")
+
+    val scoreField = getScoreFieldName(rEType)
+    val offset = inputColumnsNames(InputColumnsNames.OFFSET)
+    val hasOffsetField = ApiUtils.hasColumn(dataset, offset)
+    val hasCoordinateScoreField = ApiUtils.hasColumn(dataset, scoreField)
+
+    if (hasOffsetField && hasCoordinateScoreField) {
+      // offset = offset - old_coordinateScore + new_coordinateScore
+      val datasetMinusOldScore = dataset.withColumn(offset, col(offset) - col(scoreField))
+      randomEffectModel.computeScore(datasetMinusOldScore, scoreField)
+        .withColumn(offset, col(offset) + col(scoreField))
+    } else if (!hasOffsetField && !hasCoordinateScoreField) {
+      randomEffectModel.computeScore(dataset, scoreField)
+        .withColumn(offset, col(scoreField))
+    } else if (hasOffsetField && !hasCoordinateScoreField) {
+      randomEffectModel.computeScore(dataset, scoreField)
+        .withColumn(offset, col(offset) + col(scoreField))
+    } else {
+      throw new UnsupportedOperationException(
+        s"Dataset has a '$scoreField' column but no '$offset' column; this state should be unreachable")
+    }
   }
 }
diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala
index 58543b38..a6ec1b29 100644
--- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala
+++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala
@@ -14,8 +14,10 @@
  */
 package com.linkedin.photon.ml.algorithm
 
-import com.linkedin.photon.ml.data.RandomEffectDataset
-import com.linkedin.photon.ml.data.scoring.CoordinateDataScores
+import org.apache.spark.sql.DataFrame
+
+import com.linkedin.photon.ml.Types.{FeatureShardId, REType}
+import com.linkedin.photon.ml.data.InputColumnsNames
 import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel}
 
 /**
@@ -23,9 +25,12 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel}
  *
  * @param dataset The training dataset
  */
-class RandomEffectModelCoordinate(dataset: RandomEffectDataset)
-  extends ModelCoordinate(dataset)
-  with ModelProjection {
+class 
RandomEffectModelCoordinate( + rEType: REType, + dataset: DataFrame, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) + extends ModelCoordinate { /** * Score the effect-specific dataset in the coordinate with the input model. @@ -33,10 +38,11 @@ class RandomEffectModelCoordinate(dataset: RandomEffectDataset) * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { + model match { case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, projectModelForward(randomEffectModel)) + RandomEffectCoordinate.updateOffset(dataset, randomEffectModel, featureShardId, rEType, inputColumnsNames) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala index f1a2642c..1a06a51b 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala @@ -28,7 +28,7 @@ sealed trait CoordinateDataConfiguration { } /** - * Configuration needed in order to generate a [[com.linkedin.photon.ml.data.FixedEffectDataset]]. + * Configuration needed in order to generate a FixedEffectCoordinate. * * @param featureShardId Key of the feature shard used to generate the dataset * @param minNumPartitions Minimum number of data partitions @@ -39,7 +39,7 @@ case class FixedEffectDataConfiguration( extends CoordinateDataConfiguration /** - * Configurations needed in order to generate a [[com.linkedin.photon.ml.data.RandomEffectDataset]]. + * Configurations needed in order to generate a RandomEffectCoordinate. * * @param randomEffectType The corresponding random effect type of the dataset * @param featureShardId Key of the feature shard used to generate the dataset diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala deleted file mode 100644 index 5c7154e1..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel - -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.spark.RDDLike - -/** - * Dataset implementation for fixed effect datasets. 
- * - * @param labeledPoints The input data - * @param featureShardId The feature shard id - */ -protected[ml] class FixedEffectDataset( - val labeledPoints: RDD[(UniqueSampleId, LabeledPoint)], - val featureShardId: FeatureShardId) - extends Dataset[FixedEffectDataset] - with RDDLike { - - lazy val numFeatures: Int = labeledPoints.first()._2.features.length - - /** - * Add scores to data offsets. - * - * @param scores The scores used throughout the coordinate descent algorithm - * @return An updated dataset with scores added to offsets - */ - override def addScoresToOffsets(scores: CoordinateDataScores): FixedEffectDataset = { - - // It's possible that other coordinates did not score some data. Since we're trying to add scores to the offset and - // the default score is 0, the result of a left join vs. an inner join is the same. However, an inner join will drop - // data which does not have a score. Thus, we need a left join. - val updatedLabeledPoints = labeledPoints - .leftOuterJoin(scores.scoresRdd) - .mapValues { case (LabeledPoint(label, features, offset, weight), scoreOpt) => - LabeledPoint(label, features, offset + scoreOpt.getOrElse(MathConst.DEFAULT_SCORE), weight) - } - - new FixedEffectDataset(updatedLabeledPoints, featureShardId) - } - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = labeledPoints.sparkContext - - /** - * Assign a given name to [[labeledPoints]]. - * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[labeledPoints]] assigned - */ - override def setName(name: String): FixedEffectDataset = { - - labeledPoints.setName(name) - - this - } - - /** - * Set the storage level of [[labeledPoints]], and persist their values across the cluster the first time they are - * computed. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[labeledPoints]] set - */ - override def persistRDD(storageLevel: StorageLevel): FixedEffectDataset = { - - if (!labeledPoints.getStorageLevel.isValid) labeledPoints.persist(storageLevel) - - this - } - - /** - * Mark [[labeledPoints]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[labeledPoints]] marked non-persistent - */ - override def unpersistRDD(): FixedEffectDataset = { - - if (labeledPoints.getStorageLevel.isValid) labeledPoints.unpersist() - - this - } - - /** - * Materialize [[labeledPoints]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). - * - * @return This object with [[labeledPoints]] materialized - */ - override def materialize(): FixedEffectDataset = { - - labeledPoints.count() - - this - } - - /** - * Build a summary string for the dataset. - * - * @return A String representation of the dataset - */ - override def toSummaryString: String = { - - val numSamples = labeledPoints.count() - val weightSum = labeledPoints.values.map(_.weight).sum() - val responseSum = labeledPoints.values.map(_.label).sum() - val featureStats = labeledPoints.values.map(_.features.activeSize).stats() - - s"numSamples: $numSamples\n" + - s"weightSum: $weightSum\n" + - s"responseSum: $responseSum\n" + - s"numFeatures: $numFeatures\n" + - s"featureStats: $featureStats" - } -} - -object FixedEffectDataset { - - /** - * Build an instance of a fixed effect dataset for the given feature shard. 
- * - * @param gameDataset The input dataset - * @param featureShardId The feature shard ID - * @return A new dataset with given configuration - */ - protected[ml] def apply( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - featureShardId: FeatureShardId): FixedEffectDataset = { - - val labeledPoints = gameDataset.mapValues(_.generateLabeledPointWithFeatureShardId(featureShardId)) - - new FixedEffectDataset(labeledPoints, featureShardId) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala deleted file mode 100644 index 71d80498..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.linalg.SparseVector -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} - -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.util.VectorUtils - -/** - * A collection of utility functions for converting to and from GAME datasets. - */ -object GameConverters { - - /** - * Converts a [[DataFrame]] into an [[RDD]] of type [[GameDatum]]. - * - * @note We "decode" the map of column names into an Array[String] which we broadcast for performance. The - * "inputColumnNames" contains the user-specified custom names of columns required by GAME, with default names - * for the unspecified columns. - * @param data The source [[DataFrame]] - * @param featureShards A set of feature shard ids - * @param idTagSet The set of columns/metadata fields expected for each [[Row]] in the [[DataFrame]] - * @param isResponseRequired Whether a response column is mandatory. For example: [[GameDatum]] used for training - * require a response for each [[Row]]; [[GameDatum]] used for scoring do not. 
- * @param inputColumnsNames User-supplied input column names to read the input data - * @return An [[RDD]] of type [[GameDatum]] - */ - protected[ml] def getGameDatasetFromDataFrame( - data: DataFrame, - featureShards: Set[FeatureShardId], - idTagSet: Set[String], - isResponseRequired: Boolean, - inputColumnsNames: InputColumnsNames = InputColumnsNames()): RDD[(UniqueSampleId, GameDatum)] = { - - val colNamesSet = inputColumnsNames.getNames - - // Cannot use response, offset, weight, or uid fields as fields for grouping random effects or queries - require( - idTagSet.intersect(colNamesSet).isEmpty, - s"Cannot use required columns (${colNamesSet.mkString(", ")}) for random effect/validation grouping.") - - val inputColumnsNamesBroadcast = data.sqlContext.sparkContext.broadcast(inputColumnsNames) - - data - .rdd - .zipWithUniqueId - .map { case (row, index) => - (index, getGameDatumFromRow(row, featureShards, idTagSet, isResponseRequired, inputColumnsNamesBroadcast)) - } - } - - /** - * Build a [[GameDatum]] from a [[DataFrame]] [[Row]]. - * - * @param row The source [[DataFrame]] [[Row]] (must contain [[SparseVector]] instances) - * @param featureShards A set of feature shard ids - * @param idTagSet The set of columns/metadata fields expected for the [[Row]] - * @param isResponseRequired Whether a response column is mandatory. For example: [[GameDatum]] used for training - * require a response for the [[Row]]; [[GameDatum]] used for scoring do not. - * @param columnsBroadcast The names of the columns to look for in the input rows, in order - * @return A [[GameDatum]] - */ - protected[data] def getGameDatumFromRow( - row: Row, - featureShards: Set[String], - idTagSet: Set[String], - isResponseRequired: Boolean, - columnsBroadcast: Broadcast[InputColumnsNames]): GameDatum = { - - val columns = columnsBroadcast.value - - val featureShardContainer = featureShards.map { shardId => - val features = row.getAs[SparseVector](shardId) - (shardId, VectorUtils.mlToBreeze(features)) - }.toMap - - val response = if (isResponseRequired) { - row.getAs[Number](columns(InputColumnsNames.RESPONSE)).doubleValue - } else { - if (row.schema.fieldNames.contains(columns(InputColumnsNames.RESPONSE))) { - row.getAs[Number](columns(InputColumnsNames.RESPONSE)).doubleValue - } else { - Double.NaN - } - } - - val offset = if (row.schema.fieldNames.contains(columns(InputColumnsNames.OFFSET))) { - Option(row.getAs[Number](columns(InputColumnsNames.OFFSET))).map(_.doubleValue) - } else { - None - } - - val weight = if (row.schema.fieldNames.contains(columns(InputColumnsNames.WEIGHT))) { - Option(row.getAs[Number](columns(InputColumnsNames.WEIGHT))).map(_.doubleValue) - } else { - None - } - - val idTagToValueMap = - // TODO: find a better way to handle the field "uid", which is used in ScoringResult - if (row.schema.fieldNames.contains(columns(InputColumnsNames.UID)) - && row.getAs[Any](columns(InputColumnsNames.UID)) != null) { - getIdTagToValueMapFromRow(row, idTagSet, columns) + - (InputColumnsNames.UID.toString -> row.getAs[Any](columns(InputColumnsNames.UID)).toString) - } else { - getIdTagToValueMapFromRow(row, idTagSet, columns) - } - - new GameDatum( - response, - offset, - weight, - featureShardContainer, - idTagToValueMap) - } - - /** - * Given a [[DataFrame]] [[Row]], build a map of ID tag to ID value. 
- * - * @param row The source DataFrame row - * @param idTagSet The set of columns/metadata fields expected for the [[Row]] - * @return The map of ID tag to ID value map for the [[Row]] - */ - protected[data] def getIdTagToValueMapFromRow( - row: Row, - idTagSet: Set[String], - columns: InputColumnsNames = InputColumnsNames()): Map[String, String] = { - - val metaMap: Option[Map[String, String]] = if (row.schema.fieldNames.contains(columns(InputColumnsNames.META_DATA_MAP))) { - Some(row.getAs[Map[String, String]](columns(InputColumnsNames.META_DATA_MAP))) - } else { - None - } - - idTagSet - .map { idTag => - val idFromRow: Option[String] = if (row.schema.fieldNames.contains(idTag)) { - Some(row.getAs[Any](idTag).toString) - } else { - None - } - - val id = idFromRow - .orElse { - metaMap.flatMap(_.get(idTag)) - } - .getOrElse( - throw new IllegalArgumentException( - s"Cannot find id in either record field: $idTag or in metadataMap with key: #$idTag")) - - // random effect group name -> random effect group id value - // random effect types are assumed to be strings - (idTag, id) - } - .toMap - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala deleted file mode 100644 index 487d7ceb..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import scala.collection.mutable - -import breeze.linalg.Vector - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.util.VectorUtils - -/** - * Local dataset implementation. - * - * @note One design concern is whether to store the local data as a [[Map]] or an [[Array]] (high sort cost, but low - * merge cost vs. no sort cost but high merge cost). Currently, we use an [[Array]] since the data is only sorted - * once, and used as the base for all other data/score [[Array]]s. 
- * - * @param dataPoints Local data points consists of (globalId, labeledPoint) pairs - */ -protected[ml] case class LocalDataset(dataPoints: Array[(UniqueSampleId, LabeledPoint)]) { - - require( - dataPoints.length > 0, - "Cannot create LocalDataset with empty data array") - - val numDataPoints: Int = dataPoints.length - val numFeatures: Int = dataPoints - .head - ._2 - .features - .length - - /** - * - * @return - */ - def getLabels: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.label) - } - - /** - * - * @return - */ - def getWeights: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.weight) - } - - /** - * - * @return - */ - def getOffsets: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.offset) - } - - /** - * - * @return - */ - def getUniqueIds: Array[UniqueSampleId] = dataPoints.map(_._1) - - /** - * Add the residual scores to the offsets. - * - * @param residualScores The residual scores - * @return The [[LocalDataset]] with updated offsets - */ - def addScoresToOffsets(residualScores: Array[(UniqueSampleId, Double)]): LocalDataset = { - - val updatedDataPoints = dataPoints - .zip(residualScores) - .map { case ((dataId, LabeledPoint(label, features, offset, weight)), (residualScoreId, residualScoreDatum)) => - - require(residualScoreId == dataId, s"residual score Id ($residualScoreId) and data Id ($dataId) don't match!") - - (dataId, LabeledPoint(label, features, residualScoreDatum + offset, weight)) - } - - LocalDataset(updatedDataPoints) - } - - /** - * Filter features by Pearson correlation score. - * - * @param numFeaturesToKeep The number of features to keep - * @return The filtered dataset - */ - def filterFeaturesByPearsonCorrelationScore(numFeaturesToKeep: Int): LocalDataset = { - - val numActiveFeatures: Int = dataPoints.flatMap(_._2.features.activeKeysIterator).toSet.size - - if (numFeaturesToKeep < numActiveFeatures) { - val labelAndFeatures = dataPoints.map { case (_, labeledPoint) => (labeledPoint.label, labeledPoint.features) } - val pearsonScores = LocalDataset.stableComputePearsonCorrelationScore(labelAndFeatures) - - val filteredFeaturesIndexSet = pearsonScores - .toArray - .sortBy { case (_, score) => math.abs(score) } - .takeRight(numFeaturesToKeep) - .map(_._1) - .toSet - - val filteredActivities = dataPoints.map { case (id, LabeledPoint(label, features, offset, weight)) => - - val filteredFeatures = LocalDataset.filterFeaturesWithFeatureIndexSet(features, filteredFeaturesIndexSet) - - (id, LabeledPoint(label, filteredFeatures, offset, weight)) - } - - LocalDataset(filteredActivities) - } else { - this - } - } -} - -object LocalDataset { - - /** - * Factory method for LocalDataset. - * - * @param dataPoints The array of underlying data - * @param isSortedByFirstIndex Whether or not to sort the data by global ID - * @return A new LocalDataset - */ - protected[ml] def apply( - dataPoints: Array[(UniqueSampleId, LabeledPoint)], - isSortedByFirstIndex: Boolean): LocalDataset = { - - if (isSortedByFirstIndex) { - LocalDataset(dataPoints) - } else { - LocalDataset(dataPoints.sortBy(_._1)) - } - } - - /** - * Filter features by feature index. 
- * - * @param features The original feature set - * @param featureIndexSet The feature index set - * @return The filtered feature vector - */ - private def filterFeaturesWithFeatureIndexSet( - features: Vector[Double], - featureIndexSet: Set[Int]): Vector[Double] = { - - val result = VectorUtils.zeroOfSameType(features) - - features.activeIterator.foreach { case (key, value) => - if (featureIndexSet.contains(key)) { - result(key) = value - } - } - - result - } - - /** - * Compute Pearson correlation scores using a numerically stable algorithm. - * - * @param labelAndFeatures An array of (label, feature) tuples - * @return The Pearson correlation scores for each tuple - */ - protected[ml] def stableComputePearsonCorrelationScore( - labelAndFeatures: Array[(Double, Vector[Double])]): Map[Int, Double] = { - - val featureMeans = mutable.Map[Int, Double]() - val featureUnscaledVars = mutable.Map[Int, Double]() - var labelMean = 0.0 - var labelUnscaledVariance = 0.0 - val unscaledCovariances = mutable.Map[Int, Double]() - var interceptAdded = false - var numSamples = 0 - - labelAndFeatures.foreach { case (label, features) => - numSamples += 1 - - val deltaLabel = label - labelMean - labelMean += deltaLabel / numSamples - labelUnscaledVariance += deltaLabel * (label - labelMean) - - // Note that, if there is duplicated keys in the feature vector, then the following Pearson correlation scores - // calculation will screw up - features.iterator.foreach { case (key, value) => - val prevFeatureMean = featureMeans.getOrElse(key, 0.0) - val deltaFeature = value - prevFeatureMean - val featureMean = prevFeatureMean + deltaFeature / numSamples - - val prevFeatureUnscaledVar = featureUnscaledVars.getOrElse(key, 0.0) - val featureUnscaledVar = prevFeatureUnscaledVar + deltaFeature * (value - featureMean) - - val prevCovariance = unscaledCovariances.getOrElse(key, 0.0) - val unscaledCovariance = prevCovariance + deltaFeature * deltaLabel * (numSamples - 1) / numSamples - - featureMeans.update(key, featureMean) - featureUnscaledVars.update(key, featureUnscaledVar) - unscaledCovariances.update(key, unscaledCovariance) - } - } - - val labelStd = math.sqrt(labelUnscaledVariance) - - featureMeans - .iterator - .map { case (key, featureMean) => - val featureStd = math.sqrt(featureUnscaledVars(key)) - val covariance = unscaledCovariances(key) - - // When the standard deviation of the feature is close to 0 we treat it as the intercept term. - val score = if (featureStd < math.sqrt(numSamples) * MathConst.EPSILON) { - // Note that if the mean and standard deviation are equal to zero, it either means that the feature is constant - if (featureMean == 1.0 && !interceptAdded) { - interceptAdded = true - 1.0 - } else { - 0.0 - } - } else { - covariance / (labelStd * featureStd + MathConst.EPSILON) - } - - require(math.abs(score) <= 1 + MathConst.EPSILON, - s"Computed pearson correlation score is $score, while the score's magnitude should be less than 1. " + - s"(Diagnosis:\n" + - s"featureKey=$key\n" + - s"featureStd=$featureStd\n" + - s"labelStd=$labelStd\n" + - s"covariance=$covariance\n" + - s"numSamples=$numSamples\n" + - s"labelAndFeatures used to compute Pearson correlation score:\n${labelAndFeatures.mkString("\n")}})") - - (key, score) - } - .toMap - } - - /** - * Compute Pearson correlation scores. 
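The `stableComputePearsonCorrelationScore` method above accumulates means, variances, and covariances in a single pass using Welford-style streaming updates, which avoids the cancellation problems of the naive sum-of-products formula. A minimal single-feature sketch of the same updates follows; the object and method names are hypothetical, and photon-ml keeps one such accumulator per feature key rather than a single stream.

```scala
// Hypothetical standalone helper: Welford-style streaming Pearson correlation for one
// (label, feature) stream, mirroring the per-key updates in the deleted code above.
object StablePearsonSketch {

  def correlation(pairs: Seq[(Double, Double)]): Double = {
    var n = 0
    var labelMean = 0.0
    var featureMean = 0.0
    var labelM2 = 0.0    // unscaled label variance
    var featureM2 = 0.0  // unscaled feature variance
    var coMoment = 0.0   // unscaled covariance

    pairs.foreach { case (label, feature) =>
      n += 1

      val deltaLabel = label - labelMean
      labelMean += deltaLabel / n
      labelM2 += deltaLabel * (label - labelMean)

      val deltaFeature = feature - featureMean
      featureMean += deltaFeature / n
      featureM2 += deltaFeature * (feature - featureMean)

      // Same co-moment update as the deleted code: deltaFeature * deltaLabel * (n - 1) / n
      coMoment += deltaFeature * deltaLabel * (n - 1) / n
    }

    coMoment / (math.sqrt(labelM2) * math.sqrt(featureM2) + 1e-15)
  }

  def main(args: Array[String]): Unit =
    // Perfectly linear data => correlation of ~1.0
    println(correlation(Seq((1.0, 2.0), (2.0, 4.0), (3.0, 6.0), (4.0, 8.0))))
}
```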
- * - * @param labelAndFeatures An array of (label, feature) tuples - * @return The Pearson correlation scores for each tuple - */ - protected[ml] def computePearsonCorrelationScore( - labelAndFeatures: Array[(Double, Vector[Double])]): Map[Int, Double] = { - - val featureLabelProductSums = mutable.Map[Int, Double]() - val featureFirstOrderSums = mutable.Map[Int, Double]() - val featureSecondOrderSums = mutable.Map[Int, Double]() - var labelFirstOrderSum = 0.0 - var labelSecondOrderSum = 0.0 - var numSamples = 0 - var interceptAdded = false - - labelAndFeatures.foreach { case (label, features) => - numSamples += 1 - labelFirstOrderSum += label - labelSecondOrderSum += label * label - // Note that, if there is duplicated keys in the feature vector, then the following Pearson correlation scores - // calculation will screw up - features.activeIterator.foreach { case (key, value) => - featureFirstOrderSums.update(key, featureFirstOrderSums.getOrElse(key, 0.0) + value) - featureSecondOrderSums.update(key, featureSecondOrderSums.getOrElse(key, 0.0) + value * value) - featureLabelProductSums.update(key, featureLabelProductSums.getOrElse(key, 0.0) + value * label) - } - } - - featureFirstOrderSums - .keySet - .map { key => - val featureFirstOrderSum = featureFirstOrderSums(key) - val featureSecondOrderSum = featureSecondOrderSums(key) - val featureLabelProductSum = featureLabelProductSums(key) - val numerator = numSamples * featureLabelProductSum - featureFirstOrderSum * labelFirstOrderSum - val std = math.sqrt(math.abs(numSamples * featureSecondOrderSum - featureFirstOrderSum * featureFirstOrderSum)) - val denominator = std * math.sqrt(numSamples * labelSecondOrderSum - labelFirstOrderSum * labelFirstOrderSum) - - // When the standard deviation of the feature is close to 0, we treat it as the intercept term - val score = if (std < MathConst.EPSILON) { - if (interceptAdded) { - 0.0 - } else { - interceptAdded = true - 1.0 - } - } else { - numerator / (denominator + MathConst.EPSILON) - } - - require(math.abs(score) <= 1 + MathConst.EPSILON, - s"Computed pearson correlation score is $score, while the score's magnitude should be less than 1. " + - s"(Diagnosis:\n" + - s"numerator=$numerator\n" + - s"denominator=$denominator\n" + - s"numSamples=$numSamples\n" + - s"featureFirstOrderSum=$featureFirstOrderSum\n" + - s"featureSecondOrderSum=$featureSecondOrderSum\n" + - s"featureLabelProductSum=$featureLabelProductSum\n" + - s"labelFirstOrderSum=$labelFirstOrderSum\n" + - s"labelSecondOrderSum=$labelSecondOrderSum\n" + - s"labelAndFeatures used to compute Pearson correlation score:\n${labelAndFeatures.mkString("\n")}})") - - (key, score) - } - .toMap - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala deleted file mode 100644 index f59e3653..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala +++ /dev/null @@ -1,647 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import scala.collection.mutable -import scala.util.hashing.byteswap64 - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.storage.StorageLevel -import org.apache.spark.{Partitioner, SparkContext} - -import com.linkedin.photon.ml.Types.{FeatureShardId, REId, REType, UniqueSampleId} -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.projector.LinearSubspaceProjector -import com.linkedin.photon.ml.spark.{BroadcastLike, RDDLike} -import com.linkedin.photon.ml.util.VectorUtils - -/** - * Dataset implementation for random effect data. - * - * All of the training data for a single random effect must fit into on Spark partition. The size limit of a single - * Spark partition is 2 GB. If the size of (samples * features) exceeds the maximum size of a single Spark partition, - * the data is split into two sections: active and passive data. - * - * activeData + passiveData = full data set - * - * Active data is used for both training and scoring (to determine residuals for partial score). Passive data is used - * only for scoring. In the vast majority of cases, all data is active data. - * - * @param activeData Per-entity datasets used to train per-entity models and to compute residuals - * @param passiveData Per-entity datasets used only to compute residuals - * @param activeUniqueIdToRandomEffectIds Map of unique sample id to random effect id for active data samples - * @param projectors The per-entity [[LinearSubspaceProjector]] objects used to compress the per-entity feature spaces - * @param randomEffectType The random effect type (e.g. "memberId") - * @param featureShardId The ID of the data feature shard used by this dataset - */ -protected[ml] class RandomEffectDataset( - val activeData: RDD[(REId, LocalDataset)], - val passiveData: RDD[(UniqueSampleId, (REId, LabeledPoint))], - val activeUniqueIdToRandomEffectIds: RDD[(UniqueSampleId, REId)], - val projectors: RDD[(REId, LinearSubspaceProjector)], - val randomEffectType: REType, - val featureShardId: FeatureShardId) - extends Dataset[RandomEffectDataset] - with BroadcastLike - with RDDLike { - - lazy val passiveDataREIds: Broadcast[Set[REId]] = SparkSession - .builder() - .getOrCreate() - .sparkContext - .broadcast(passiveData.map(_._2._1).distinct().collect().toSet) - val randomEffectIdPartitioner: Partitioner = activeData.partitioner.get - val uniqueIdPartitioner: Partitioner = passiveData.partitioner.get - - // - // Dataset functions - // - - /** - * Add residual scores to the data offsets. - * - * @param scores The residual scores - * @return The dataset with updated offsets - */ - override def addScoresToOffsets(scores: CoordinateDataScores): RandomEffectDataset = { - - // It's possible that other coordinates did not score some data. Since we're trying to add scores to the offset and - // the default score is 0, the result of a left join vs. an inner join is the same. However, an inner join will drop - // data which does not have a score. Thus, we need a left join. 
- val scoresGroupedByRandomEffectId = activeUniqueIdToRandomEffectIds - .leftOuterJoin(scores.scoresRdd, uniqueIdPartitioner) - .map { case (uniqueId, (reId, scoreOpt)) => - (reId, (uniqueId, scoreOpt.getOrElse(MathConst.DEFAULT_SCORE))) - } - .groupByKey(randomEffectIdPartitioner) - .mapValues(_.toArray.sortBy(_._1)) - - // Since we use a left join above, we're guaranteed to have each random effect entity from the active data present - // and thus use an inner join - val updatedActiveData = activeData - .join(scoresGroupedByRandomEffectId, randomEffectIdPartitioner) - .mapValues { case (localData, localScore) => localData.addScoresToOffsets(localScore) } - - // The resultant dataset is only used for training a new model, thus only the active data needs to have scores added - new RandomEffectDataset( - updatedActiveData, - passiveData, - activeUniqueIdToRandomEffectIds, - projectors, - randomEffectType, - featureShardId) - } - - // - // BroadcastLike Functions - // - - /** - * Asynchronously delete cached copies of [[passiveDataREIds]] on all executors. - * - * @return This [[RandomEffectDataset]] with [[passiveDataREIds]] unpersisted - */ - override protected[ml] def unpersistBroadcast(): RandomEffectDataset = { - - passiveDataREIds.unpersist() - - this - } - - // - // RDDLike Functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = activeData.sparkContext - - /** - * Assign a given name to [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]]. - * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the names [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] - * assigned - */ - override def setName(name: String): RandomEffectDataset = { - - activeData.setName(s"$name - Active Data") - passiveData.setName(s"$name - Passive Data") - activeUniqueIdToRandomEffectIds.setName(s"$name - UID to REID") - projectors.setName(s"$name - Projectors") - - this - } - - /** - * Set the storage level of [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]], and persist - * their values across the cluster the first time they are computed. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[activeData]], [[activeUniqueIdToRandomEffectIds]], and - * [[passiveData]] set - */ - override def persistRDD(storageLevel: StorageLevel): RandomEffectDataset = { - - if (!activeData.getStorageLevel.isValid) activeData.persist(storageLevel) - if (!passiveData.getStorageLevel.isValid) passiveData.persist(storageLevel) - if (!activeUniqueIdToRandomEffectIds.getStorageLevel.isValid) activeUniqueIdToRandomEffectIds.persist(storageLevel) - if (!projectors.getStorageLevel.isValid) projectors.persist(storageLevel) - - this - } - - /** - * Mark [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] as non-persistent, and remove all - * blocks for them from memory and disk. 
- * - * @return This object with [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] marked - * non-persistent - */ - override def unpersistRDD(): RandomEffectDataset = { - - if (activeData.getStorageLevel.isValid) activeData.unpersist() - if (passiveData.getStorageLevel.isValid) passiveData.unpersist() - if (activeUniqueIdToRandomEffectIds.getStorageLevel.isValid) activeUniqueIdToRandomEffectIds.unpersist() - if (projectors.getStorageLevel.isValid) projectors.unpersist() - - this - } - - /** - * Materialize [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] (Spark [[RDD]]s are lazy - * evaluated: this method forces them to be evaluated). - * - * @return This object with [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] materialized - */ - override def materialize(): RandomEffectDataset = { - - activeData.count() - passiveData.count() - activeUniqueIdToRandomEffectIds.count() - projectors.count() - - this - } - - // - // Summarizable Functions - // - - /** - * Build a human-readable summary for [[RandomEffectDataset]]. - * - * @return A summary of the object in string representation - */ - override def toSummaryString: String = { - - val stringBuilder = new StringBuilder("Random Effect Data Set:") - - val activeDataValues = activeData.values.persist(StorageLevel.MEMORY_ONLY_SER) - - val numActiveSamples = activeUniqueIdToRandomEffectIds.count() - val activeSampleWeightSum = activeDataValues.map(_.getWeights.map(_._2).sum).sum() - val activeSampleResponseSum = activeDataValues.map(_.getLabels.map(_._2).sum).sum() - val numPassiveSamples = passiveData.count() - val passiveSampleResponsesSum = passiveData.values.map(_._2.label).sum() - val numAllSamples = numActiveSamples + numPassiveSamples - val numActiveSamplesStats = activeDataValues.map(_.numDataPoints).stats() - val activeSamplerResponseSumStats = activeDataValues.map(_.getLabels.map(_._2).sum).stats() - val numFeaturesStats = activeDataValues.map(_.numFeatures).stats() - - activeDataValues.unpersist() - - // TODO: Need more descriptive text than just the variable name - stringBuilder.append(s"\nnumActiveSamples: $numActiveSamples") - stringBuilder.append(s"\nactiveSampleWeightSum: $activeSampleWeightSum") - stringBuilder.append(s"\nactiveSampleResponseSum: $activeSampleResponseSum") - stringBuilder.append(s"\nnumPassiveSamples: $numPassiveSamples") - stringBuilder.append(s"\npassiveSampleResponsesSum: $passiveSampleResponsesSum") - stringBuilder.append(s"\nnumAllSamples: $numAllSamples") - stringBuilder.append(s"\nnumActiveSamplesStats: $numActiveSamplesStats") - stringBuilder.append(s"\nactiveSamplerResponseSumStats: $activeSamplerResponseSumStats") - stringBuilder.append(s"\nnumFeaturesStats: $numFeaturesStats") - - stringBuilder.toString() - } -} - -object RandomEffectDataset { - - /** - * Build a new [[RandomEffectDataset]] from the raw data using the given configuration. 
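The `persistRDD`/`unpersistRDD`/`materialize` plumbing above follows a common Spark idiom: persist an RDD only when no valid storage level is already set, then force the lazily evaluated RDD to be cached by running an action. A hedged, generic sketch of that idiom (the helper name is illustrative and not part of photon-ml):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object PersistSketch {

  // Persist an RDD if it is not already persisted, then force the cached blocks to be
  // computed with an action (RDDs are lazily evaluated until an action runs).
  def persistAndMaterialize[T](rdd: RDD[T], storageLevel: StorageLevel): RDD[T] = {
    if (!rdd.getStorageLevel.isValid) {
      rdd.persist(storageLevel)
    }
    rdd.count()
    rdd
  }
}
```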
- * - * @param gameDataset The [[RDD]] of [[GameDatum]] used to generate the random effect dataset - * @param randomEffectDataConfiguration The data configuration for the random effect dataset - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @param existingModelKeysRddOpt Optional set of entities that have existing models - * @return A new [[RandomEffectDataset]] - */ - def apply( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - randomEffectDataConfiguration: RandomEffectDataConfiguration, - randomEffectPartitioner: RandomEffectDatasetPartitioner, - existingModelKeysRddOpt: Option[RDD[REId]], - storageLevel: StorageLevel): RandomEffectDataset = { - - val uniqueIdPartitioner = gameDataset.partitioner.get - - // - // Generate RDDs - // - - val keyedGameDataset = generateKeyedGameDataset(gameDataset, randomEffectDataConfiguration) - keyedGameDataset.persist(StorageLevel.MEMORY_ONLY_SER).count - - // In this RDD, there is a projector for every entity (even those which may later be filtered by the lower bound) - val unfilteredProjectors = generateLinearSubspaceProjectors(keyedGameDataset, randomEffectPartitioner) - unfilteredProjectors.persist(storageLevel).count - - val projectedKeyedGameDataset = generateProjectedDataset(keyedGameDataset, unfilteredProjectors, randomEffectPartitioner) - projectedKeyedGameDataset.persist(StorageLevel.MEMORY_ONLY_SER).count - - val unfilteredActiveData = generateGroupedActiveData( - projectedKeyedGameDataset, - randomEffectDataConfiguration, - randomEffectPartitioner) - - val (activeData, passiveData, uniqueIdToRandomEffectIds, projectors) = - randomEffectDataConfiguration.numActiveDataPointsLowerBound match { - - case Some(activeDataLowerBound) => - - unfilteredActiveData.persist(StorageLevel.MEMORY_ONLY_SER) - - // Filter entities which do not meet active data lower bound threshold - val filteredActiveData = filterActiveData( - unfilteredActiveData, - activeDataLowerBound, - existingModelKeysRddOpt) - filteredActiveData.persist(storageLevel).count - - val passiveData = generatePassiveData( - projectedKeyedGameDataset, - generateIdMap(unfilteredActiveData, uniqueIdPartitioner)) - passiveData.persist(storageLevel).count - - val uniqueIdToRandomEffectIds = generateIdMap(filteredActiveData, uniqueIdPartitioner) - uniqueIdToRandomEffectIds.persist(storageLevel).count - - val filteredProjectors = filterProjectors(unfilteredProjectors, filteredActiveData) - filteredProjectors.persist(storageLevel).count - - unfilteredActiveData.unpersist() - unfilteredProjectors.unpersist() - - (filteredActiveData, passiveData, uniqueIdToRandomEffectIds, filteredProjectors) - - case None => - - unfilteredActiveData.persist(storageLevel).count - - val uniqueIdToRandomEffectIds = generateIdMap(unfilteredActiveData, uniqueIdPartitioner) - uniqueIdToRandomEffectIds.persist(storageLevel).count - - val passiveData = generatePassiveData(projectedKeyedGameDataset, uniqueIdToRandomEffectIds) - passiveData.persist(storageLevel).count - - (unfilteredActiveData, passiveData, uniqueIdToRandomEffectIds, unfilteredProjectors) - } - - // - // Unpersist component RDDs - // - - keyedGameDataset.unpersist() - projectedKeyedGameDataset.unpersist() - - // - // Return new dataset - // - - new RandomEffectDataset( - activeData, - passiveData, - uniqueIdToRandomEffectIds, - projectors, - randomEffectDataConfiguration.randomEffectType, - randomEffectDataConfiguration.featureShardId) 
- } - - /** - * Process the raw data to be keyed by the [[REId]]s for the given [[REType]], and filter the feature vector for only - * the given shard. - * - * @param gameDataset The [[RDD]] of [[GameDatum]] used to generate the random effect dataset - * @param randomEffectDataConfiguration The data configuration for the random effect dataset - * @return The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - */ - protected[data] def generateKeyedGameDataset( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - randomEffectDataConfiguration: RandomEffectDataConfiguration): RDD[(REId, (UniqueSampleId, LabeledPoint))] = { - - val randomEffectType = randomEffectDataConfiguration.randomEffectType - val featureShardId = randomEffectDataConfiguration.featureShardId - - gameDataset - .map { case (uniqueId, gameData) => - val randomEffectId = gameData.idTagToValueMap(randomEffectType) - val labeledPoint = gameData.generateLabeledPointWithFeatureShardId(featureShardId) - - (randomEffectId, (uniqueId, labeledPoint)) - } - } - - /** - * Generate the [[LinearSubspaceProjector]] objects used to compress the feature vectors for each per-entity dataset. - * - * @param keyedGameDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return An [[RDD]] of per-entity [[LinearSubspaceProjector]] objects - */ - protected[data] def generateLinearSubspaceProjectors( - keyedGameDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - randomEffectPartitioner: RandomEffectDatasetPartitioner): RDD[(REId, LinearSubspaceProjector)] = { - - val originalSpaceDimension = keyedGameDataset - .take(1) - .head - ._2 - ._2 - .features - .length - - keyedGameDataset - .mapValues { case (_, labeledPoint) => - VectorUtils.getActiveIndices(labeledPoint.features) - } - .foldByKey(mutable.Set[Int](), randomEffectPartitioner)(_.union(_)) - .mapValues(activeIndices => new LinearSubspaceProjector(activeIndices.toSet, originalSpaceDimension)) - } - - /** - * Project the per-entity datasets to a linear subspace - thus reducing the size of their feature vectors (for faster - * optimization). 
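The projector construction above only gathers each entity's set of active feature indices; the `LinearSubspaceProjector` implementation itself is not part of this diff. As orientation, here is a hypothetical sketch of what the forward projection plausibly does, assuming a dense remapping of active indices onto a compact range; the class name and details are assumptions, not the actual photon-ml class.

```scala
import breeze.linalg.{SparseVector, Vector}

// Hypothetical sketch: remap each active original index to a compact index in
// [0, activeIndices.size) and drop all other dimensions. originalDim is kept only for parity
// with the constructor used above; the real class presumably needs it for backward projection.
class SubspaceProjectorSketch(activeIndices: Set[Int], originalDim: Int) {

  private val originalToProjected: Map[Int, Int] = activeIndices.toSeq.sorted.zipWithIndex.toMap

  val projectedDim: Int = originalToProjected.size

  def projectForward(features: Vector[Double]): Vector[Double] = {
    val projected = SparseVector.zeros[Double](projectedDim)

    features.activeIterator.foreach { case (originalIndex, value) =>
      originalToProjected.get(originalIndex).foreach { projectedIndex =>
        projected(projectedIndex) = value
      }
    }

    projected
  }
}
```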
- * - * @param keyedGameDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param projectors An [[RDD]] of per-entity [[LinearSubspaceProjector]] objects - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]], with feature vectors - * reduced to the smallest linear subspace possible without loss - */ - protected[data] def generateProjectedDataset( - keyedGameDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - projectors: RDD[(REId, LinearSubspaceProjector)], - randomEffectPartitioner: RandomEffectDatasetPartitioner): RDD[(REId, (UniqueSampleId, LabeledPoint))] = - - keyedGameDataset - .partitionBy(randomEffectPartitioner) - .zipPartitions(projectors) { case (dataIt, projectorsIt) => - - val projectorLookupTable = projectorsIt.toMap - - dataIt.map { case (rEID, (uID, LabeledPoint(label, features, offset, weight))) => - - val projector = projectorLookupTable(rEID) - val projectedFeatures = projector.projectForward(features) - - (rEID, (uID, LabeledPoint(label, projectedFeatures, offset, weight))) - } - } - - /** - * Generate active data, down-sampling using reservoir sampling if the data for any entity exceeds the upper bound. - * - * @param projectedKeyedDataset The input data, keyed by entity ID - * @param randomEffectDataConfiguration The random effect data configuration - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return The input data, grouped by entity ID, and down-sampled if necessary - */ - protected[data] def generateGroupedActiveData( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - randomEffectDataConfiguration: RandomEffectDataConfiguration, - randomEffectPartitioner: Partitioner): RDD[(REId, LocalDataset)] = { - - // Filter data using reservoir sampling if active data size is bounded - val groupedActiveData = randomEffectDataConfiguration - .numActiveDataPointsUpperBound - .map { activeDataUpperBound => - groupDataByKeyAndSample( - projectedKeyedDataset, - randomEffectPartitioner, - activeDataUpperBound, - randomEffectDataConfiguration.randomEffectType) - } - .getOrElse(projectedKeyedDataset.groupByKey(randomEffectPartitioner)) - .mapValues { iterable => - LocalDataset(iterable.toArray, isSortedByFirstIndex = false) - } - - // Filter features if feature dimension of active data is bounded - featureSelectionOnActiveData(groupedActiveData, randomEffectDataConfiguration.numFeaturesToSamplesRatioUpperBound) - } - - /** - * Generate a dataset grouped by random effect ID and limited to a maximum number of samples selected via reservoir - * sampling. - * - * The 'Min Heap' reservoir sampling algorithm is used for two reasons: - * 1. The exact sampling must be reproducible so that [[RDD]] partitions can be recovered - * 2. 
The linear algorithm is non-trivial to combine in a distributed manner - * - * @param projectedKeyedDataset The raw dataset, with samples keyed by random effect ID - * @param partitioner The partitioner - * @param sampleCap The sample cap - * @param randomEffectType The type of random effect - * @return An [[RDD]] of data grouped by individual ID - */ - private def groupDataByKeyAndSample( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - partitioner: Partitioner, - sampleCap: Int, - randomEffectType: REType): RDD[(REId, Iterable[(UniqueSampleId, LabeledPoint)])] = { - - // Helper class for defining a constant ordering between data samples (necessary for RDD re-computation) - case class ComparableLabeledPointWithId(comparableKey: Int, uniqueId: UniqueSampleId, labeledPoint: LabeledPoint) - extends Comparable[ComparableLabeledPointWithId] { - - override def compareTo(comparableLabeledPointWithId: ComparableLabeledPointWithId): Int = { - if (comparableKey - comparableLabeledPointWithId.comparableKey > 0) { - 1 - } else { - -1 - } - } - } - - val createCombiner = - (comparableLabeledPointWithId: ComparableLabeledPointWithId) => { - new MinHeapWithFixedCapacity[ComparableLabeledPointWithId](sampleCap) += comparableLabeledPointWithId - } - - val mergeValue = ( - minHeapWithFixedCapacity: MinHeapWithFixedCapacity[ComparableLabeledPointWithId], - comparableLabeledPointWithId: ComparableLabeledPointWithId) => { - minHeapWithFixedCapacity += comparableLabeledPointWithId - } - - val mergeCombiners = ( - minHeapWithFixedCapacity1: MinHeapWithFixedCapacity[ComparableLabeledPointWithId], - minHeapWithFixedCapacity2: MinHeapWithFixedCapacity[ComparableLabeledPointWithId]) => { - minHeapWithFixedCapacity1 ++= minHeapWithFixedCapacity2 - } - - // The reservoir sampling algorithm is fault tolerant, assuming that the uniqueId for a sample is recovered after - // node failure. We attempt to maximize the likelihood of successful recovery through RDD replication, however there - // is a non-zero possibility of massive failure. If this becomes an issue, we may need to resort to check-pointing - // the raw data RDD after uniqueId assignment. - projectedKeyedDataset - .mapValues { case (uniqueId, labeledPoint) => - val comparableKey = (byteswap64(randomEffectType.hashCode) ^ byteswap64(uniqueId)).hashCode() - ComparableLabeledPointWithId(comparableKey, uniqueId, labeledPoint) - } - .combineByKey[MinHeapWithFixedCapacity[ComparableLabeledPointWithId]]( - createCombiner, - mergeValue, - mergeCombiners, - partitioner) - .mapValues { minHeapWithFixedCapacity => - val count = minHeapWithFixedCapacity.getCount - val data = minHeapWithFixedCapacity.getData - val weightMultiplierOpt = if (count > sampleCap) Some(1D * count / sampleCap) else None - - data.map { case ComparableLabeledPointWithId(_, uniqueId, LabeledPoint(label, features, offset, weight)) => - (uniqueId, LabeledPoint(label, features, offset, weightMultiplierOpt.map(_ * weight).getOrElse(weight))) - } - } - } - - /** - * Filter out entities with less data than a given threshold. 
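The sampling in `groupDataByKeyAndSample` above rests on two ingredients: a sort key that is a pure function of (randomEffectType, uniqueId), so a recomputed partition selects exactly the same subset, and a fixed-capacity heap that keeps at most `sampleCap` samples per entity. A local, non-distributed sketch of both ideas is below, using a plain `PriorityQueue` in place of `MinHeapWithFixedCapacity`; note that the real code additionally rescales sample weights by count / sampleCap when samples are dropped, which this sketch omits.

```scala
import scala.collection.mutable
import scala.util.hashing.byteswap64

object ReservoirSketch {

  // Deterministically keep at most `cap` samples per entity. The key depends only on
  // (randomEffectType, uniqueId), so recomputation selects the same subset. Whether the
  // smallest or largest keys are kept is immaterial: the keys behave like pseudo-random numbers.
  def sample[T](samples: Seq[(Long, T)], randomEffectType: String, cap: Int): Seq[(Long, T)] = {

    // Reverse the key ordering so that dequeue() removes the smallest key, keeping the largest `cap`
    val ordering: Ordering[(Int, Long, T)] = Ordering.by[(Int, Long, T), Int](_._1).reverse
    val heap = mutable.PriorityQueue.empty[(Int, Long, T)](ordering)

    samples.foreach { case (uniqueId, datum) =>
      val comparableKey = (byteswap64(randomEffectType.hashCode) ^ byteswap64(uniqueId)).hashCode()

      heap.enqueue((comparableKey, uniqueId, datum))
      if (heap.size > cap) {
        heap.dequeue()
      }
    }

    heap.toSeq.map { case (_, uniqueId, datum) => (uniqueId, datum) }
  }
}
```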
- * - * @param groupedActiveData An [[RDD]] of data grouped by entity ID - * @param numActiveDataPointsLowerBound Threshold for number of data points require to receive a per-entity model - * @param existingModelKeysRddOpt Optional set of entities that have existing models - * @return The input data with entities that did not meet the minimum sample threshold removed - */ - protected[data] def filterActiveData( - groupedActiveData: RDD[(REId, LocalDataset)], - numActiveDataPointsLowerBound: Int, - existingModelKeysRddOpt: Option[RDD[REId]]): RDD[(REId, LocalDataset)] = - - existingModelKeysRddOpt match { - case Some(existingModelKeysRdd) => - groupedActiveData.zipPartitions(existingModelKeysRdd, preservesPartitioning = true) { (dataIt, existingKeysIt) => - - val lookupTable = existingKeysIt.toSet - - dataIt.filter { case (key, data) => - (data.numDataPoints >= numActiveDataPointsLowerBound) || !lookupTable.contains(key) - } - } - - case None => - groupedActiveData.filter { case (_, data) => - data.numDataPoints >= numActiveDataPointsLowerBound - } - } - - /** - * Reduce active data feature dimension for entities with few samples. The maximum feature dimension is limited to - * the number of samples multiplied by the feature dimension ratio. Features are chosen by greatest Pearson - * correlation score. - * - * @param activeData An [[RDD]] of data grouped by entity ID - * @param numFeaturesToSamplesRatioUpperBoundOpt Optional ratio of samples to feature dimension - * @return The input data with feature dimension reduced for entities whose feature dimension greatly exceeded the - * number of available samples - */ - private def featureSelectionOnActiveData( - activeData: RDD[(REId, LocalDataset)], - numFeaturesToSamplesRatioUpperBoundOpt: Option[Double]): RDD[(REId, LocalDataset)] = - numFeaturesToSamplesRatioUpperBoundOpt - .map { numFeaturesToSamplesRatioUpperBound => - activeData.mapValues { localDataset => - - var numFeaturesToKeep = math.ceil(numFeaturesToSamplesRatioUpperBound * localDataset.numDataPoints).toInt - // In case the above product overflows - if (numFeaturesToKeep < 0) numFeaturesToKeep = Int.MaxValue - - localDataset.filterFeaturesByPearsonCorrelationScore(numFeaturesToKeep) - } - } - .getOrElse(activeData) - - /** - * Generate a map of unique sample id to random effect id for active data samples. - * - * @param activeData The active dataset - * @param partitioner The [[Partitioner]] to use for the [[RDD]] of unique sample ID to random effect ID - * @return A map of unique sample id to random effect id for active data samples - */ - protected[data] def generateIdMap( - activeData: RDD[(REId, LocalDataset)], - partitioner: Partitioner): RDD[(UniqueSampleId, REId)] = - activeData - .flatMap { case (individualId, localDataset) => - localDataset.getUniqueIds.map((_, individualId)) - } - .partitionBy(partitioner) - - /** - * Generate passive dataset. 
- * - * @param projectedKeyedDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param activeUniqueIDs The unique IDs of the active dataset - * @return The passive dataset - */ - protected[data] def generatePassiveData( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - activeUniqueIDs: RDD[(UniqueSampleId, REId)]): RDD[(UniqueSampleId, (REId, LabeledPoint))] = { - - val passiveDataPool = projectedKeyedDataset.map { case (rEID, (uniqueID, labeledPoint)) => - (uniqueID, (rEID, labeledPoint)) - } - - passiveDataPool.subtractByKey(activeUniqueIDs) - } - - /** - * Filter out projectors for entities which were filtered out. - * - * @param unfilteredProjectors The unfiltered projectors - * @param filteredActiveData The filtered active data - * @return [[unfilteredProjectors]] with all projectors for entities not in [[filteredActiveData]] removed - */ - protected[data] def filterProjectors( - unfilteredProjectors: RDD[(REId, LinearSubspaceProjector)], - filteredActiveData: RDD[(REId, LocalDataset)]): RDD[(REId, LinearSubspaceProjector)] = - // Both RDDs use the same partitioner, thus there should be no shuffle. Use inner join to drop projectors for - // filtered entities. - filteredActiveData - .join(unfilteredProjectors) - .map { case (rEId, (_, projector)) => (rEId, projector) } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala deleted file mode 100644 index fcc81e13..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import scala.collection.{Map, immutable, mutable} - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.{HashPartitioner, Partitioner} - -import com.linkedin.photon.ml.Types.REId -import com.linkedin.photon.ml.spark.BroadcastLike - -/** - * Partitioner implementation for random effect datasets. - * - * In GAME, we can improve on Spark default partitioning by using domain-specific knowledge in two ways. First, we can - * reduce time spent in shuffle operations by leveraging training record keys (helping joins). Second, we assume that - * each random effect has less than the maximum partition size of associated training data, i.e. that all the training - * data for a given RE will fit within a single Spark data partition. So we can group the training records so that they - * all land in the same partition for a given RE, which is what RandomEffectDatasetPartitioner is about. 
- * - * RandomEffectDatasetPartitioner also makes sure that partitions are as equally balanced as possible, to equalize the - * workload of the executors: because we assume the data for each random effect is small, it will usually not even fill - * a Spark data partition, so we fill up the partition (i.e. add (id/partition) records to idToPartitionMap with data - * for multiple random effects). However, since idToPartitionMap is eventually broadcast to the executors, we also want - * to keep the size of that Map under control (see parameter partitionerCapacity below). - * - * @param numPartitions Number of partitions across which to split random effects - * @param idToPartitionMap Random effect type to partition map - */ -protected[ml] class RandomEffectDatasetPartitioner( - val numPartitions: Int, - private val idToPartitionMap: Broadcast[Map[REId, Int]]) - extends Partitioner - with BroadcastLike { - - // Backup partitioner for random effect IDs not found in the primary assignment Map - lazy private val backupPartitioner: HashPartitioner = new HashPartitioner(numPartitions) - - /** - * Asynchronously delete cached copies of this broadcast on the executors. - * - * @return This object with all its broadcast variables unpersisted - */ - override def unpersistBroadcast(): this.type = { - idToPartitionMap.unpersist() - this - } - - /** - * Compares two [[RandomEffectDatasetPartitioner]] objects. - * - * @param that Some other object - * @return True if the two partitioners have the same idToPartitionMap, false otherwise - */ - override def equals(that: Any): Boolean = - that match { - case other: RandomEffectDatasetPartitioner => this.idToPartitionMap.value.equals(other.idToPartitionMap.value) - case _ => false - } - - /** - * Returns a hash code value for the object. - * - * @return An [[Int]] hash code - */ - override def hashCode: Int = idToPartitionMap.hashCode() - - /** - * For a given key, get the corresponding partition id. If the key is not in any partition, we randomly assign - * the training vector to a partition (with Spark's HashPartitioner). - * - * @param key A training vector key (String). - * @return The partition id to which the training vector belongs. - */ - def getPartition(key: Any): Int = key match { - case reId: REId => - idToPartitionMap.value.getOrElse(reId, backupPartitioner.getPartition(reId)) - - case any => - throw new IllegalArgumentException(s"Expected key of ${this.getClass} is String, but ${any.getClass} found") - } -} - -object RandomEffectDatasetPartitioner { - - /** - * Generate a partitioner for one random effect model. - * - * Multiple random effect models, one per random effect ID (e.g. "user123"), are instantiated for a single random - * effect type (e.g. "per-user"), and each of these instantiations is trained with training vectors marked for that - * random effect ID. We collect the training vector ids that correspond to the random effect type, then build an id - * to partition map. Data should be distributed across partitions as equally as possible. Since some items have more - * data points than others, this partitioner uses simple 'bin packing' for distributing data load across partitions - * (using minHeap). - * - * We stop filling in idToPartitionMap at partitionerCapacity records, because this map is passed to the executors - * and we therefore wish to control/limit its size. 
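The "bin packing" described above is a greedy assignment: walk the entities from most to least data and always place the next one on the currently lightest partition, tracked with a priority queue. A small local sketch of that loop, mirroring the `fromGameDataset` implementation that follows (names here are illustrative only):

```scala
import scala.collection.mutable

object BinPackingSketch {

  // Greedy bin packing: assign each entity, largest first, to the partition with the smallest
  // accumulated sample count so far.
  def assignPartitions(entityCounts: Seq[(String, Int)], numPartitions: Int): Map[String, Int] = {

    // Order by current load, reversed, so that dequeue() returns the lightest partition
    val ordering: Ordering[(Int, Int)] = Ordering.by[(Int, Int), Int](_._2).reverse

    val minHeap = mutable.PriorityQueue.empty[(Int, Int)](ordering)
    minHeap ++= (0 until numPartitions).map(partition => (partition, 0))

    val assignment = Map.newBuilder[String, Int]

    entityCounts.sortBy(-_._2).foreach { case (entityId, count) =>
      val (partition, currentLoad) = minHeap.dequeue()
      assignment += entityId -> partition
      minHeap.enqueue((partition, currentLoad + count))
    }

    assignment.result()
  }
}

// e.g. BinPackingSketch.assignPartitions(Seq("a" -> 1000, "b" -> 900, "c" -> 10, "d" -> 5), 3)
// places "a" and "b" on different partitions and packs the small entities alongside them.
```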
- * - * @param gameDataset The GAME training dataset - * @param reConfig The random effect data configuration options - * @param partitionerCapacity The partitioner capacity - * @return A partitioner for one random effect model - */ - def fromGameDataset( - gameDataset: RDD[(Long, GameDatum)], - reConfig: RandomEffectDataConfiguration, - partitionerCapacity: Int = 10000): RandomEffectDatasetPartitioner = { - - val numPartitions = reConfig.minNumPartitions - val randomEffectType = reConfig.randomEffectType - val activeDataUpperBoundOpt = reConfig.numActiveDataPointsUpperBound - - require(numPartitions > 0, s"Number of partitions ($numPartitions) has to be larger than 0.") - - val rawSortedRandomEffectTypes = gameDataset - .values - .filter(_.idTagToValueMap.contains(randomEffectType)) - .map(gameData => (gameData.idTagToValueMap(randomEffectType), 1)) - .reduceByKey(_ + _) - .collect() - .sortBy(_._2 * -1) - .take(partitionerCapacity) - - // If the number of active samples is bounded, we can partition them better by using the bound as the count - val sortedRandomEffectTypes = activeDataUpperBoundOpt match { - case Some(bound) => - rawSortedRandomEffectTypes.map { case (reId, count) => - - val newCount = if (count > bound) bound else count - - (reId, newCount) - } - - case None => - rawSortedRandomEffectTypes - } - - val ordering = new Ordering[(Int, Int)] { - def compare(pair1: (Int, Int), pair2: (Int, Int)): Int = pair2._2 compare pair1._2 - } - - val minHeap = mutable.PriorityQueue.newBuilder[(Int, Int)](ordering) - minHeap ++= Array.tabulate[(Int, Int)](numPartitions)(i => (i, 0)) - val idToPartitionMapBuilder = immutable.Map.newBuilder[String, Int] - idToPartitionMapBuilder.sizeHint(numPartitions) - - sortedRandomEffectTypes.foreach { case (id, size) => - val (partition, currentSize) = minHeap.dequeue() - idToPartitionMapBuilder += id -> partition - minHeap.enqueue((partition, currentSize + size)) - } - - new RandomEffectDatasetPartitioner( - numPartitions, - gameDataset.sparkContext.broadcast(idToPartitionMapBuilder.result())) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index c78d51d3..092dbf32 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -15,6 +15,7 @@ package com.linkedin.photon.ml.estimators import scala.language.existentials +import scala.util.Random import org.apache.commons.cli.MissingArgumentException import org.apache.spark.SparkContext @@ -22,18 +23,20 @@ import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.udf import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId, UniqueSampleId} +import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} import com.linkedin.photon.ml.algorithm._ +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.function.ObjectiveFunctionHelper import com.linkedin.photon.ml.function.glm._ -import com.linkedin.photon.ml.model.{GameModel, 
RandomEffectModel} +import com.linkedin.photon.ml.model.GameModel import com.linkedin.photon.ml.normalization._ import com.linkedin.photon.ml.optimization.VarianceComputationType import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType @@ -302,14 +305,14 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P * Fits a GAME model to the training dataset, once per configuration. * * @param data The training set - * @param validationData Optional validation set for per-iteration validation + * @param validationDataOpt Optional validation set for per-iteration validation * @param optimizationConfigurations A set of GAME optimization configurations * @return A set of (trained GAME model, optional evaluation results, GAME model configuration) tuples, one for each * configuration */ def fit( data: DataFrame, - validationData: Option[DataFrame], + validationDataOpt: Option[DataFrame], optimizationConfigurations: Seq[GameOptimizationConfiguration]): Seq[GameResult] = { // Verify valid GameEstimator settings @@ -318,45 +321,16 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P // Verify valid function input validateInput(optimizationConfigurations) - // Group additional columns to include in GameDatum - val randomEffectIdCols: Set[String] = getRequiredParam(coordinateDataConfigurations) - .flatMap { case (_, config) => - config match { - case reConfig: RandomEffectDataConfiguration => Some(reConfig.randomEffectType) - case _ => None - } - } - .toSet - val evaluatorCols = get(validationEvaluators).map(MultiEvaluatorType.getMultiEvaluatorIdTags).getOrElse(Set()) - val additionalCols = randomEffectIdCols ++ evaluatorCols - - // Gather the names of the feature shards used by the coordinates - val featureShards = getRequiredParam(coordinateDataConfigurations) - .map { case (_, coordinateDataConfig) => - coordinateDataConfig.featureShardId - } - .toSet - - // Transform the GAME training data set into fixed and random effect specific datasets - val gameDataset = Timed("Process training data from raw DataFrame to RDD of samples") { - prepareGameDataset(data, featureShards, additionalCols) - } - val trainingDatasets = Timed("Prepare training data") { - prepareTrainingDatasets(gameDataset) - } - // Transform the GAME validation data set into fixed and random effect specific data sets - val validationDatasetAndEvaluationSuiteOpt = Timed("Prepare validation data, if any") { - prepareValidationDatasetAndEvaluators( - validationData, - featureShards, - additionalCols) + val evaluationSuiteOpt = Timed("Prepare validation data, if any") { + validationDataOpt.map(validationData => prepareValidationEvaluators(validationData)) } val coordinateDescent = new CoordinateDescent( getRequiredParam(coordinateUpdateSequence), getOrDefault(coordinateDescentIterations), - validationDatasetAndEvaluationSuiteOpt, + validationDataOpt, + evaluationSuiteOpt, getOrDefault(partialRetrainLockedCoordinates), logger) @@ -370,8 +344,8 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P optimizationConfigurations.map { optimizationConfiguration => val (gameModel, evaluations) = train( + data, optimizationConfiguration, - trainingDatasets, coordinateDescent, prevGameModel) @@ -381,23 +355,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - // Purge the raw GAME data, training data, validation data, and normalization contexts in reverse order of - // definition - 
gameDataset.unpersist() - trainingDatasets.foreach { case (_, dataset) => - dataset match { - case rddLike: RDDLike => rddLike.unpersistRDD() - case _ => - } - dataset match { - case broadcastLike: BroadcastLike => broadcastLike.unpersistBroadcast() - case _ => - } - } - validationDatasetAndEvaluationSuiteOpt.map { case (validationDataset, evaluationSuite) => - validationDataset.unpersist() - evaluationSuite.unpersistRDD() - } + evaluationSuiteOpt.map(_.unpersistRDD()) // Return the trained models, along with validation information (if any), and model configuration results @@ -438,145 +396,22 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - /** - * Construct a [[RDD]] of data processed into GAME format from a raw [[DataFrame]]. - * - * @param data The raw [[DataFrame]] - * @param featureShards The IDs of the feature shards to keep - * @param additionalCols The names of fields containing information necessary for random effects or evaluation - * @return A [[RDD]] of data processed into GAME format - */ - protected def prepareGameDataset( - data: DataFrame, - featureShards: Set[FeatureShardId], - additionalCols: Set[String]): RDD[(UniqueSampleId, GameDatum)] = - GameConverters - .getGameDatasetFromDataFrame( - data, - featureShards, - additionalCols, - isResponseRequired = true, - getOrDefault(inputColumnNames)) - .partitionBy(new LongHashPartitioner(data.rdd.getNumPartitions)) - .setName("GAME training data") - .persist(StorageLevel.DISK_ONLY) - - /** - * Construct one or more [[Dataset]]s from an [[RDD]] of samples. - * - * @param gameDataset The training data samples - * @return A map of coordinate ID to training [[Dataset]] - */ - protected def prepareTrainingDatasets( - gameDataset: RDD[(UniqueSampleId, GameDatum)]): Map[CoordinateId, D forSome { type D <: Dataset[D] }] = { - - val coordinateDataConfigs = getRequiredParam(coordinateDataConfigurations) - - coordinateDataConfigs.map { case (coordinateId, config) => - - val result = config match { - - case feConfig: FixedEffectDataConfiguration => - - val fixedEffectDataset = FixedEffectDataset(gameDataset, feConfig.featureShardId) - .setName(s"Fixed Effect Dataset: $coordinateId") - .persistRDD(StorageLevel.DISK_ONLY) - - if (logger.isDebugEnabled) { - // Eval this only in debug mode, because the call to "toSummaryString" can be very expensive - logger.debug( - s"Summary of fixed effect dataset with coordinate ID '$coordinateId':\n" + - s"${fixedEffectDataset.toSummaryString}") - } - - (coordinateId, fixedEffectDataset) - - case reConfig: RandomEffectDataConfiguration => - - val rePartitioner = RandomEffectDatasetPartitioner.fromGameDataset(gameDataset, reConfig) - val existingModelKeysRddOpt = if (getOrDefault(ignoreThresholdForNewModels)) { - getRequiredParam(initialModel).getModel(coordinateId).map { - case rem: RandomEffectModel => - rem.modelsRDD.partitionBy(rePartitioner).keys - - case other => - throw new IllegalArgumentException( - s"Model type mismatch: expected Random Effect Model but found '${other.getClass}'") - } - } else { - None - } - - val randomEffectDataset = RandomEffectDataset( - gameDataset, - reConfig, - rePartitioner, - existingModelKeysRddOpt, - StorageLevel.DISK_ONLY) - randomEffectDataset.setName(s"Random Effect Data Set: $coordinateId") - - if (logger.isDebugEnabled) { - // Eval this only in debug mode, because the call to "toSummaryString" can be very expensive - logger.debug( - s"Summary of random effect dataset with coordinate ID $coordinateId:\n" + - 
s"${randomEffectDataset.toSummaryString}\n") - } - - (coordinateId, randomEffectDataset) - } - - result.asInstanceOf[(CoordinateId, D forSome { type D <: Dataset[D] })] - } - } - - /** - * Optionally construct an [[RDD]] of validation data samples, and an [[EvaluationSuite]] to compute evaluation metrics - * over the validation data. - * - * @param dataOpt Optional [[DataFrame]] of validation data - * @param featureShards The feature shard columns to import from the [[DataFrame]] - * @param additionalCols A set of additional columns whose values should be maintained for validation evaluation - * @return An optional ([[RDD]] of validation data, validation metric [[EvaluationSuite]]) tuple - */ - protected def prepareValidationDatasetAndEvaluators( - dataOpt: Option[DataFrame], - featureShards: Set[FeatureShardId], - additionalCols: Set[String]): Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)] = - - dataOpt.map { data => - val partitioner = new LongHashPartitioner(data.rdd.partitions.length) - val gameDataset = Timed("Convert validation data from raw DataFrame to processed RDD of GAME data") { - GameConverters - .getGameDatasetFromDataFrame( - data, - featureShards, - additionalCols, - isResponseRequired = true, - getOrDefault(inputColumnNames)) - .partitionBy(partitioner) - .setName("Validation Game dataset") - .persist(StorageLevel.DISK_ONLY) - } - val evaluationSuite = Timed("Prepare validation metric evaluators") { - prepareValidationEvaluators(gameDataset) - } - - (gameDataset, evaluationSuite) - } - /** * Construct the validation [[EvaluationSuite]]. * - * @param gameDataset An [[RDD]] of validation data samples + * @param dataset An [[RDD]] of validation data samples * @return [[EvaluationSuite]] containing one or more validation metric [[Evaluator]] objects */ - protected def prepareValidationEvaluators(gameDataset: RDD[(UniqueSampleId, GameDatum)]): EvaluationSuite = { + protected def prepareValidationEvaluators(dataset: DataFrame): EvaluationSuite = { + + val columnsNames = getOrDefault(inputColumnNames) + val response = columnsNames(InputColumnsNames.RESPONSE) + val offset = columnsNames(InputColumnsNames.OFFSET) + val weight = columnsNames(InputColumnsNames.WEIGHT) + val validatingLabelsAndOffsetsAndWeights = dataset.select(DataConst.ID, response, offset, weight) - val validatingLabelsAndOffsetsAndWeights = gameDataset.mapValues { gameData => - (gameData.response, gameData.offset, gameData.weight) - } val evaluators = get(validationEvaluators) - .map(_.map(EvaluatorFactory.buildEvaluator(_, gameDataset))) + .map(_.map(EvaluatorFactory.buildEvaluator(_, dataset))) .getOrElse { // Get default evaluators given the task type val taskType = getRequiredParam(trainingTask) @@ -589,13 +424,22 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P Seq(defaultEvaluator) } - val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeights) + + val validatingLabelsAndOffsetsAndWeightsRdd = validatingLabelsAndOffsetsAndWeights + .rdd.map(row => (row.getAs[UniqueSampleId](0), (row.getDouble(1), row.getDouble(2), row.getDouble(3)))) + val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeightsRdd) .setName(s"Evaluation: validation data labels, offsets, and weights") .persistRDD(StorageLevel.MEMORY_AND_DISK) if (logger.isDebugEnabled) { - val randomScores = gameDataset.mapValues(_ => math.random).persist() + val randUdf = udf({() => Random.nextInt()}) + val randomScores = dataset.withColumn(DataConst.SCORE, 
randUdf()) + .select(DataConst.ID, DataConst.SCORE) + .rdd + .map(row => (row.getAs[UniqueSampleId](0), row.getDouble(1))) + + randomScores.persist() evaluationSuite .evaluate(randomScores) @@ -619,15 +463,15 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P * with the most general 'coordinates' and end with the least general - each successive update learning the residuals * of the previous 'coordinates'. * + * @param data Input training data set * @param configuration The configuration for the GAME optimization problem - * @param trainingDatasets The training datasets for each coordinate of the GAME optimization problem * @param coordinateDescent The coordinate descent driver * @param initialModelOpt An optional existing GAME model who's components should be used to warm-start training * @return A trained GAME model */ protected def train( + data: DataFrame, configuration: GameOptimizationConfiguration, - trainingDatasets: Map[CoordinateId, D forSome { type D <: Dataset[D] }], coordinateDescent: CoordinateDescent, initialModelOpt: Option[GameModel] = None): (GameModel, Option[EvaluationResults]) = Timed(s"Train model:") { @@ -638,6 +482,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val task = getRequiredParam(trainingTask) val updateSequence = getRequiredParam(coordinateUpdateSequence) + val dataConfigs = getRequiredParam(coordinateDataConfigurations) val normalizationContexts = get(coordinateNormalizationContexts).getOrElse(Map()) val variance = getOrDefault(varianceComputationType) val lossFunctionFactoryFactory = ObjectiveFunctionHelper.buildFactory(task, getOrDefault(treeAggregateDepth)) @@ -652,26 +497,41 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val lockedCoordinates = get(partialRetrainLockedCoordinates).getOrElse(Set()) val interceptIndices = getOrDefault(coordinateInterceptIndices) + val columnsNames = getOrDefault(inputColumnNames) + // Create the optimization coordinates for each component model - val coordinates: Map[CoordinateId, C forSome { type C <: Coordinate[_] }] = + val coordinates: Map[CoordinateId, C forSome { type C <: Coordinate }] = updateSequence .map { coordinateId => - val coordinate: C forSome { type C <: Coordinate[_] } = if (lockedCoordinates.contains(coordinateId)) { - trainingDatasets(coordinateId) match { - case feDataset: FixedEffectDataset => new FixedEffectModelCoordinate(feDataset) - case reDataset: RandomEffectDataset => new RandomEffectModelCoordinate(reDataset) - case dataset => throw new UnsupportedOperationException(s"Unsupported dataset type: ${dataset.getClass}") + + val dataConfiguration: CoordinateDataConfiguration = dataConfigs(coordinateId) + val coordinate: C forSome {type C <: Coordinate} = if (lockedCoordinates.contains(coordinateId)) { + dataConfiguration match { + case fedc: FixedEffectDataConfiguration => new FixedEffectModelCoordinate( + data, fedc.featureShardId, + columnsNames) + case redc: RandomEffectDataConfiguration => new RandomEffectModelCoordinate( + redc.randomEffectType, data, + redc.featureShardId, columnsNames) + case oConfig => throw new UnsupportedOperationException( + s"Unsupported coordinate type: ${oConfig.getClass}") } } else { CoordinateFactory.build( - trainingDatasets(coordinateId), + data, + dataConfiguration.featureShardId, + columnsNames, configuration(coordinateId), lossFunctionFactoryFactory, glmConstructor, downSamplerFactory, normalizationContexts.getOrElse(coordinateId, NoNormalization()), 
variance, - interceptIndices.get(coordinateId)) + interceptIndices.get(coordinateId), + dataConfiguration match { + case redc: RandomEffectDataConfiguration => Some(redc.randomEffectType) + case _: FixedEffectDataConfiguration => None + }) } (coordinateId, coordinate) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala index 740efe79..b7d83c40 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala @@ -14,10 +14,10 @@ */ package com.linkedin.photon.ml.evaluation -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.data.GameDatum +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.evaluation.EvaluatorType._ /** @@ -29,13 +29,13 @@ object EvaluatorFactory { * Construct [[Evaluator]] objects. * * @param evaluatorType The [[EvaluatorType]] - * @param gameDataset A [[RDD]] of (unique ID, GAME data point) which may be necessary to construct [[MultiEvaluator]] + * @param gameDataset A [[DataFrame]] of (unique ID, GAME data point, scores) which may be necessary to construct [[MultiEvaluator]] * objects * @return A new [[Evaluator]] */ protected[ml] def buildEvaluator( evaluatorType: EvaluatorType, - gameDataset: RDD[(UniqueSampleId, GameDatum)]): Evaluator = + gameDataset: DataFrame): Evaluator = evaluatorType match { case AUC => AreaUnderROCCurveEvaluator @@ -52,12 +52,14 @@ object EvaluatorFactory { case SquaredLoss => SquaredLossEvaluator case MultiPrecisionAtK(k, idTag) => - val ids = gameDataset.mapValues(_.idTagToValueMap(idTag)) - new PrecisionAtKMultiEvaluator(k, idTag, ids) + val idsRDD = gameDataset.select(DataConst.ID, idTag) + .rdd.map(row => (row.getAs[UniqueSampleId](0), row.getString(1))) + new PrecisionAtKMultiEvaluator(k, idTag, idsRDD) case MultiAUC(idTag) => - val ids = gameDataset.mapValues(_.idTagToValueMap(idTag)) - new AreaUnderROCCurveMultiEvaluator(idTag, ids) + val idsRDD = gameDataset.select(DataConst.ID, idTag) + .rdd.map(row => (row.getAs[UniqueSampleId](0), row.getString(1))) + new AreaUnderROCCurveMultiEvaluator(idTag, idsRDD) case _ => throw new UnsupportedOperationException(s"Unsupported evaluator type: $evaluatorType") diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala index da2dae64..ce3b7c1f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala @@ -15,14 +15,15 @@ package com.linkedin.photon.ml.model import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.{DataFrame, SparkSession} import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} +import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.spark.BroadcastLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel 
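The evaluator changes above repeatedly bridge from the new DataFrame representation back to the keyed RDDs that the existing multi-evaluators still consume. A minimal sketch of that bridging pattern, assuming only that the frame carries the DataConst.ID column and a string-valued idTag column (the method name below is illustrative, not part of the patch):

    import org.apache.spark.sql.DataFrame

    import com.linkedin.photon.ml.Types.UniqueSampleId
    import com.linkedin.photon.ml.constants.DataConst

    // Extract (unique sample id, grouping tag) pairs so that the RDD-based
    // multi-evaluators (per-query AUC, precision@k) keep working unchanged.
    def idTagPairs(df: DataFrame, idTag: String) =
      df.select(DataConst.ID, idTag)
        .rdd
        .map(row => (row.getAs[UniqueSampleId](0), row.getString(1)))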
+import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} /** * Representation of a fixed effect model. @@ -31,10 +32,10 @@ import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel * @param featureShardId The feature shard id */ class FixedEffectModel( - val modelBroadcast: Broadcast[GeneralizedLinearModel], - val featureShardId: String) + val modelBroadcast: Broadcast[GeneralizedLinearModel], + val featureShardId: String) extends DatumScoringModel - with BroadcastLike { + with BroadcastLike { override val modelType: TaskType = modelBroadcast.value.modelType @@ -50,25 +51,30 @@ class FixedEffectModel( * * @note Use a static method to avoid serializing entire model object during RDD operations. * @param dataPoints The dataset to score + * @param scoreField The name of the score field * @return The computed scores */ - override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = - FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, ModelDataScores.toScore, ModelDataScores.apply) + override def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame = { + + FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, scoreField) + } /** - * Compute the scores for the GAME dataset, and store the scores only. + * Accumulatively compute the scores for the GAME dataset. * - * @note Use a static method to avoid serializing entire model object during RDD operations. + * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example) * @param dataPoints The dataset to score + * @param scoreField The field name of the score + * @param accumulativeScoreField The field name of the accumulativeScore * @return The computed scores */ - override protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores = - FixedEffectModel.score( - dataPoints, - modelBroadcast, - featureShardId, - CoordinateDataScores.toScore, - CoordinateDataScores.apply) + override def computeScore( + dataPoints: DataFrame, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, scoreField, DataConst.SCORE) + } /** * Build a summary string for the coefficients. @@ -82,6 +88,7 @@ class FixedEffectModel( * Clean up coefficient broadcast. */ override protected[ml] def unpersistBroadcast(): BroadcastLike = { + modelBroadcast.unpersist() this } @@ -93,6 +100,7 @@ class FixedEffectModel( * @return True if both models have the same feature shard ID and underlying models, false otherwise */ override def equals(that: Any): Boolean = { + that match { case other: FixedEffectModel => val sameMetaData = this.featureShardId == other.featureShardId @@ -112,25 +120,55 @@ class FixedEffectModel( object FixedEffectModel { + def apply(glm: GeneralizedLinearModel, featureShardId: FeatureShardId): FixedEffectModel = { + + new FixedEffectModel(SparkSession.builder.getOrCreate.sparkContext.broadcast(glm), featureShardId) + } + /** * Compute the scores for the dataset. 
* - * @param dataPoints The dataset to score + * @param dataset The dataset to score * @param modelBroadcast The model to use for scoring * @param featureShardId The feature shard id * @return The scores */ - private def score[T, V]( - dataPoints: RDD[(UniqueSampleId, GameDatum)], - modelBroadcast: Broadcast[GeneralizedLinearModel], - featureShardId: FeatureShardId, - toScore: (GameDatum, Double) => T, - toResult: RDD[(UniqueSampleId, T)] => V): V = { - - val scores = dataPoints.mapValues { gameDatum => - toScore(gameDatum, modelBroadcast.value.computeScore(gameDatum.featureShardContainer(featureShardId))) - } + private def score( + dataset: DataFrame, + modelBroadcast: Broadcast[GeneralizedLinearModel], + featureShardId: FeatureShardId, + scoreField: String): DataFrame = { - toResult(scores) + val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + } + + /** + * Compute the scores for the dataset. + * + * @param dataset The dataset to score + * @param modelBroadcast The model to use for scoring + * @param featureShardId The feature shard id + * @return The scores + */ + private def score( + dataset: DataFrame, + modelBroadcast: Broadcast[GeneralizedLinearModel], + featureShardId: FeatureShardId, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) + + if (ApiUtils.hasColumn(dataset, DataConst.SCORE)) { + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(scoreField)) + } else { + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .withColumn(DataConst.SCORE, col(scoreField)) + } } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala index 7ce23a66..e167a12f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala @@ -15,32 +15,30 @@ package com.linkedin.photon.ml.model import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD._ -import org.apache.spark.storage.StorageLevel -import org.apache.spark.{HashPartitioner, SparkContext} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, lit} +import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{UniqueSampleId, REId, REType, FeatureShardId} -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} -import com.linkedin.photon.ml.spark.RDDLike +import com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel +import com.linkedin.photon.ml.util.ApiUtils /** * Representation of a random effect model. 
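The second private score overload above follows a small accumulation idiom: compute a per-coordinate score column, then either create or increment the shared DataConst.SCORE total depending on whether it already exists. The same idiom reappears in RandomEffectModel below. A standalone sketch, with delta and total as illustrative column names:

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.col

    import com.linkedin.photon.ml.util.ApiUtils

    // Add `delta` into a running `total` column, creating the total on first use.
    def accumulate(df: DataFrame, delta: String, total: String): DataFrame =
      if (ApiUtils.hasColumn(df, total)) {
        df.withColumn(total, col(total) + col(delta))
      } else {
        df.withColumn(total, col(delta))
      }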
* - * @param modelsRDD The models, one for each unique random effect value + * @param models The models, one for each unique random effect value * @param randomEffectType The random effect type * @param featureShardId The feature shard id */ class RandomEffectModel( - val modelsRDD: RDD[(REId, GeneralizedLinearModel)], + val models: DataFrame, val randomEffectType: REType, val featureShardId: FeatureShardId) - extends DatumScoringModel - with RDDLike { + extends DatumScoringModel { - override val modelType: TaskType = RandomEffectModel.determineModelType(modelsRDD) + override val modelType: TaskType = RandomEffectModel.determineModelType(models) // // RandomEffectModel functions @@ -49,11 +47,11 @@ class RandomEffectModel( /** * Create a new [[RandomEffectModel]] with new underlying models. * - * @param newModelsRdd The new underlying models, one per entity + * @param newModels The new underlying models, one per entity * @return A new [[RandomEffectModel]] */ - def update(newModelsRdd: RDD[(REId, GeneralizedLinearModel)]): RandomEffectModel = - new RandomEffectModel(newModelsRdd, randomEffectType, featureShardId) + def update(newModels: DataFrame): RandomEffectModel = + new RandomEffectModel(newModels, randomEffectType, featureShardId) // // DatumScoringModel functions @@ -63,35 +61,35 @@ class RandomEffectModel( * Compute the score for the dataset. * * @note Use a static method to avoid serializing entire model object during RDD operations. - * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired - * [[GameDatum]] object, referred to in the GAME code as the "unique id") + * @param dataset The dataset to score * @return The computed scores */ - override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = + override def computeScore(dataset: DataFrame, scoreField: String): DataFrame = { + RandomEffectModel.score( - dataPoints, - modelsRDD, + dataset, + models, randomEffectType, featureShardId, - ModelDataScores.toScore, - ModelDataScores.apply) + scoreField) + } /** - * Compute the scores for the GAME dataset, and store the scores only. + * Accumulatively compute the scores for the GAME dataset. * - * @note Use a static method to avoid serializing entire model object during RDD operations. 
- * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired - * [[GameDatum]] object, referred to in the GAME code as the "unique id") + * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example) + * @param dataPoints The dataset to score + * @param scoreField The field name of the score + * @param accumulativeScoreField The field name of the accumulativeScore * @return The computed scores */ - override def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores = - RandomEffectModel.score( - dataPoints, - modelsRDD, - randomEffectType, - featureShardId, - CoordinateDataScores.toScore, - CoordinateDataScores.apply) + override def computeScore( + dataPoints: DataFrame, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + RandomEffectModel.score(dataPoints, models, randomEffectType, featureShardId, scoreField, DataConst.SCORE) + } // // Summarizable functions @@ -108,78 +106,15 @@ class RandomEffectModel( stringBuilder.append(s"\nRandom Effect Type: '$randomEffectType'") stringBuilder.append(s"\nFeature Shard ID: '$featureShardId'") - stringBuilder.append(s"\nLength: ${modelsRDD.values.map(_.coefficients.means.length).stats()}") - stringBuilder.append(s"\nMean: ${modelsRDD.values.map(_.coefficients.meansL2Norm).stats()}") - if (modelsRDD.first()._2.coefficients.variancesOption.isDefined) { - stringBuilder.append(s"\nVariance: ${modelsRDD.values.map(_.coefficients.variancesL2NormOption.get).stats()}") - } + //stringBuilder.append(s"\nLength: ${modelsRDD.values.map(_.coefficients.means.length).stats()}") + //stringBuilder.append(s"\nMean: ${modelsRDD.values.map(_.coefficients.meansL2Norm).stats()}") + //if (modelsRDD.first()._2.coefficients.variancesOption.isDefined) { + // stringBuilder.append(s"\nVariance: ${modelsRDD.values.map(_.coefficients.variancesL2NormOption.get).stats()}") + //} stringBuilder.toString() } - // - // RDDLike functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override protected[ml] def sparkContext: SparkContext = modelsRDD.sparkContext - - /** - * Assign a given name to [[modelsRDD]]. - * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[modelsRDD]] assigned - */ - override protected[ml] def setName(name: String): RandomEffectModel = { - - modelsRDD.setName(name) - - this - } - - /** - * Set the storage level of [[modelsRDD]], and persist their values across the cluster the first time they are - * computed. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[modelsRDD]] set - */ - override protected[ml] def persistRDD(storageLevel: StorageLevel): RandomEffectModel = { - - if (!modelsRDD.getStorageLevel.isValid) modelsRDD.persist(storageLevel) - - this - } - - /** - * Mark [[modelsRDD]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[modelsRDD]] marked non-persistent - */ - override protected[ml] def unpersistRDD(): RandomEffectModel = { - - if (modelsRDD.getStorageLevel.isValid) modelsRDD.unpersist() - - this - } - - /** - * Materialize [[modelsRDD]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). 
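toSummaryString above loses its per-entity statistics because the models no longer live in an RDD of GLM objects. One way the same numbers could be recovered from the models DataFrame, sketched under the assumption that DataConst.COEFFICIENTS holds an org.apache.spark.ml.linalg.Vector column (the UDFs and output column names are illustrative):

    import org.apache.spark.ml.linalg.{Vector => SparkMLVector}
    import org.apache.spark.sql.functions.{col, udf}

    import com.linkedin.photon.ml.constants.DataConst

    // Coefficient vector length and L2 norm per entity, summarized across entities.
    val lengthUdf = udf((v: SparkMLVector) => v.size)
    val normUdf = udf((v: SparkMLVector) => math.sqrt(v.toArray.map(x => x * x).sum))

    models
      .select(
        lengthUdf(col(DataConst.COEFFICIENTS)).as("length"),
        normUdf(col(DataConst.COEFFICIENTS)).as("l2Norm"))
      .describe("length", "l2Norm")
      .show()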
- * - * @return This object with [[modelsRDD]] materialized - */ - override protected[ml] def materialize(): RandomEffectModel = { - - modelsRDD.count() - - this - } - /** * Compares two [[RandomEffectModel]] objects. * @@ -194,18 +129,12 @@ class RandomEffectModel( val areTypesEqual = this.randomEffectType == other.randomEffectType val areShardsEqual = this.featureShardId == other.featureShardId lazy val areAllModelsEqual = this - .modelsRDD - .fullOuterJoin(other.modelsRDD) - .mapPartitions { iterator => - - val areModelsEqual = iterator.forall { - case (_, (Some(model1), Some(model2))) => model1.equals(model2) - case _ => false - } - - Iterator.single(areModelsEqual) - } - .fold(true)(_ && _) + .models + .withColumnRenamed(DataConst.COEFFICIENTS, "s1") + .join(other.models.withColumnRenamed(DataConst.COEFFICIENTS, "s2"), col(DataConst.ID), "fullouter") + .filter("s1 is null or s2 is null or s1 != s2") //TODO: add udf to compare two vectors + .head(1) + .isEmpty areTypesEqual && areShardsEqual && areAllModelsEqual @@ -221,6 +150,7 @@ class RandomEffectModel( * @return An [[Int]] hash code */ override def hashCode(): Int = super.hashCode() + } object RandomEffectModel { @@ -233,67 +163,74 @@ object RandomEffectModel { * that type - it will be faster for large numbers of random effect models. Note that it may still be a * bottleneck if we check each time a new RandomEffectModel is created. * - * @param modelsRDD The random effect models + * @param models The random effect models * @return The GAME model type */ - protected def determineModelType(modelsRDD: RDD[(REId, GeneralizedLinearModel)]): TaskType = { + protected def determineModelType(models: DataFrame): TaskType = { - val modelTypes = modelsRDD.values.map(_.modelType).distinct().collect() + val modelTypes = models.select(GeneralizedLinearModel.MODEL_TYPE).head(1) require( modelTypes.length == 1, - s"${modelsRDD.name} has multiple model types:\n${modelTypes.mkString(", ")}") + s"models has multiple model types:\n${modelTypes.mkString(", ")}") - modelTypes.head + TaskType.withName(modelTypes(0).getString(0)) } /** * Compute the scores for a dataset, using random effect models. * - * @param dataPoints The dataset to score - * @param modelsRDD The individual random effect models to use for scoring + * @param dataset The dataset to score + * @param models The individual random effect models to use for scoring * @param randomEffectType The random effect type * @param featureShardId The feature shard id * @return The scores */ - private def score[T, V]( - dataPoints: RDD[(UniqueSampleId, GameDatum)], - modelsRDD: RDD[(REId, GeneralizedLinearModel)], + private def score ( + dataset: DataFrame, + models: DataFrame, randomEffectType: REType, featureShardId: FeatureShardId, - toScore: (GameDatum, Double) => T, - toResult: RDD[(UniqueSampleId, T)] => V): V = { - - val hashPartitioner = new HashPartitioner(dataPoints.getNumPartitions) - - /* - * We perform a replicated partitioned hash join here under the assumption that we can fit the per partition - * random effect models in memory. We first partition both relations using the same partitioner and then zip them. - * This ensures that the same keys from both relations go in the same partition. Given above, we can now perform the - * join by doing the following operations per partition: - * 1. Load the random effect models in memory - * 2. Iterate over the data points - * 3. 
For each data point, look up the corresponding random effect model in the in memory map and score
-     */
-    val scores = dataPoints
-      .map { case (uniqueId, gameDatum) =>
-        (gameDatum.idTagToValueMap(randomEffectType), (uniqueId, gameDatum))
-      }
-      .partitionBy(hashPartitioner)
-      .zipPartitions(modelsRDD.partitionBy(hashPartitioner)) { (dataIt, modelIt) =>
-
-        val lookupTable = modelIt.toMap
-
-        dataIt.map { case (id, (uid, datum)) =>
-          val score = lookupTable
-            .get(id)
-            .map(_.computeScore(datum.featureShardContainer(featureShardId)))
-            .getOrElse(0.0)
-
-          (uid, toScore(datum, score))
-        }
-      }
-
-    toResult(scores)
+      scoreField: String): DataFrame = {
+
+    dataset
+      .join(models, randomEffectType)
+      .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId)))
+  }
+
+  /**
+   * Compute the scores for the dataset.
+   *
+   * @param dataset The dataset to score
+   * @param models The individual random effect models to use for scoring
+   * @param randomEffectType The random effect type
+   * @param featureShardId The feature shard id
+   * @param scoreField The field name of the score
+   * @param accumulativeScoreField The field name of the accumulative score
+   * @return The scores
+   */
+  private def score(
+      dataset: DataFrame,
+      models: DataFrame,
+      randomEffectType: REType,
+      featureShardId: FeatureShardId,
+      scoreField: String,
+      accumulativeScoreField: String): DataFrame = {
+
+    if (ApiUtils.hasColumn(dataset, DataConst.SCORE)) {
+      dataset
+        .join(models, randomEffectType)
+        .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId)))
+        .withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(scoreField))
+    } else {
+      dataset
+        .join(models, randomEffectType)
+        .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId)))
+        .withColumn(DataConst.SCORE, col(scoreField))
+    }
+  }
+
+  def toDataFrame(input: RDD[(REType, GeneralizedLinearModel)]): DataFrame = {
+    // TODO: not yet implemented; should emit rows matching GeneralizedLinearModel.schema
+    null
+  }
 }
diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala
index 2573bf3b..d56aa475 100644
--- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala
+++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala
@@ -17,7 +17,6 @@ package com.linkedin.photon.ml.optimization
 import breeze.linalg.{Vector, cholesky, diag}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
-
 import com.linkedin.photon.ml.Types.UniqueSampleId
 import com.linkedin.photon.ml.constants.MathConst
 import com.linkedin.photon.ml.data.LabeledPoint
@@ -110,7 +109,7 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec
    * @param input The training data
    * @return The learned [[GeneralizedLinearModel]]
    */
-  override def run(input: RDD[LabeledPoint]): GeneralizedLinearModel =
+  override def run(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) =
     run(input, initializeZeroModel(input.first.features.size))
 
   /**
@@ -121,13 +120,16 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec
    * @param input The training data
    * @param initialModel The initial model from which to begin optimization
    * @return The learned [[GeneralizedLinearModel]]
    */
-  override def run(input: RDD[LabeledPoint], initialModel: GeneralizedLinearModel):
GeneralizedLinearModel = { + override def run( + input: RDD[LabeledPoint], + initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { val normalizationContext = optimizer.getNormalizationContext - val (optimizedCoefficients, _) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) + val (optimizedCoefficients, stateTracker) = optimizer + .optimize(objectiveFunction, initialModel.coefficients.means)(input) val optimizedVariances = computeVariances(input, optimizedCoefficients) - createModel(normalizationContext, optimizedCoefficients, optimizedVariances) + (createModel(normalizationContext, optimizedCoefficients, optimizedVariances), stateTracker) } /** @@ -137,8 +139,8 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param input The training data * @return The learned [[GeneralizedLinearModel]] */ - def runWithSampling(input: RDD[(UniqueSampleId, LabeledPoint)]): GeneralizedLinearModel = - runWithSampling(input, initializeZeroModel(input.first._2.features.size)) + def runWithSampling(input: RDD[(UniqueSampleId, LabeledPoint)]): (GeneralizedLinearModel, OptimizationStatesTracker) = + runWithSampling(input, initializeZeroModel(input.values.first.features.size)) /** * Run the algorithm with the configured parameters, starting from the initial model provided, and down-sample the @@ -150,7 +152,7 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec */ def runWithSampling( input: RDD[(UniqueSampleId, LabeledPoint)], - initialModel: GeneralizedLinearModel): GeneralizedLinearModel = { + initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { val data = (samplerOption match { case Some(sampler) => sampler.downSample(input).values diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala index 4766cc2a..09577c87 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala @@ -41,13 +41,6 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O glmConstructor: Coefficients => GeneralizedLinearModel, varianceComputation: VarianceComputationType) extends Logging { - /** - * Get the optimization state trackers for the optimization problems solved - * - * @return Some(OptimizationStatesTracker) if optimization states were tracked, otherwise None - */ - def getStatesTracker: OptimizationStatesTracker = optimizer.getStateTracker - /** * Create a default generalized linear model with 0-valued coefficients * @@ -98,7 +91,7 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O * @param input The training data * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight */ - def run(input: objectiveFunction.Data): GeneralizedLinearModel + def run(input: objectiveFunction.Data): (GeneralizedLinearModel, OptimizationStatesTracker) /** * Run the optimization algorithm on the input data, starting from the initial model provided. 
@@ -107,7 +100,7 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O * @param initialModel The initial model from which to begin optimization * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight */ - def run(input: objectiveFunction.Data, initialModel: GeneralizedLinearModel): GeneralizedLinearModel + def run(input: objectiveFunction.Data, initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) /** * Compute the regularization term value diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala index c5875a8b..aea3db19 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala @@ -15,7 +15,6 @@ package com.linkedin.photon.ml.optimization import breeze.linalg.{Vector, cholesky, diag} - import com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.LabeledPoint import com.linkedin.photon.ml.function._ @@ -79,23 +78,24 @@ protected[ml] class SingleNodeOptimizationProblem[Objective <: SingleNodeObjecti * @param input The training data * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight */ - override def run(input: Iterable[LabeledPoint]): GeneralizedLinearModel = + override def run(input: Iterable[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = run(input, initializeZeroModel(input.head.features.size)) + /** - * Run the optimization algorithm on the input data, starting from the initial model provided. + * Run the algorithm with the configured parameters, starting from the initial model provided + * (warm start in iterations over the regularization weights for hyperparameter tuning). 
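With the signature changes above and below, each optimization problem now returns the trained model together with its OptimizationStatesTracker rather than exposing the tracker through a separate getStatesTracker call. A hedged usage sketch; optimizationProblem, trainingData, and currentWeight are placeholders:

    // The learned GLM and its optimization trace now travel together, so callers
    // can keep them paired per regularization weight without a second lookup.
    val (model, stateTracker) = optimizationProblem.run(trainingData)
    val result = (currentWeight, model, stateTracker)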
* * @param input The training data - * @param initialModel The initial model from which to begin optimization - * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight + * @return The learned [[GeneralizedLinearModel]] */ - override def run(input: Iterable[LabeledPoint], initialModel: GeneralizedLinearModel): GeneralizedLinearModel = { + override def run(input: Iterable[LabeledPoint], initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { val normalizationContext = optimizer.getNormalizationContext - val (optimizedCoefficients, _) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) + val (optimizedCoefficients, stateTracker) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) val optimizedVariances = computeVariances(input, optimizedCoefficients) - createModel(normalizationContext, optimizedCoefficients, optimizedVariances) + (createModel(normalizationContext, optimizedCoefficients, optimizedVariances), stateTracker) } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala index c0a0201e..25de24d7 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala @@ -16,15 +16,16 @@ package com.linkedin.photon.ml.optimization.game import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.REId +import com.linkedin.photon.ml.Types.{REId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.{SingleNodeOptimizationProblem, VarianceComputationType} import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType -import com.linkedin.photon.ml.projector.LinearSubspaceProjector +import com.linkedin.photon.ml.optimization.{SingleNodeOptimizationProblem, VarianceComputationType} import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util.PhotonNonBroadcast @@ -124,8 +125,6 @@ object RandomEffectOptimizationProblem { * * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization * problems - * @param linearSubspaceProjectorsRDD The per-entity [[LinearSubspaceProjector]] objects used to compress the - * per-entity feature spaces * @param configuration The optimization problem configuration * @param objectiveFunctionFactory The objective function to optimize * @param glmConstructor The function to use for producing GLMs from trained coefficients @@ -135,7 +134,8 @@ object RandomEffectOptimizationProblem { * @return A new [[RandomEffectOptimizationProblem]] object */ def apply[RandomEffectObjective <: SingleNodeObjectiveFunction]( - linearSubspaceProjectorsRDD: RDD[(REId, LinearSubspaceProjector)], + data: DataFrame, + rEType: REType, configuration: RandomEffectOptimizationConfiguration, 
objectiveFunctionFactory: Option[Int] => RandomEffectObjective, glmConstructor: Coefficients => GeneralizedLinearModel, @@ -144,29 +144,21 @@ object RandomEffectOptimizationProblem { interceptIndexOpt: Option[Int]): RandomEffectOptimizationProblem[RandomEffectObjective] = { // Generate new NormalizationContext and SingleNodeOptimizationProblem objects - val optimizationProblems = linearSubspaceProjectorsRDD - .mapValues { projector => - val factors = normalizationContext.factorsOpt.map(factors => projector.projectForward(factors)) - val shiftsAndIntercept = normalizationContext - .shiftsAndInterceptOpt - .map { case (shifts, intercept) => - val newShifts = projector.projectForward(shifts) - val newIntercept = projector.originalToProjectedSpaceMap(intercept) - - (newShifts, newIntercept) - } - val projectedNormalizationContext = new NormalizationContext(factors, shiftsAndIntercept) - val projectedInterceptOpt = interceptIndexOpt.map { interceptIndex => - projector.originalToProjectedSpaceMap(interceptIndex) - } - - // TODO: Broadcast arguments to SingleNodeOptimizationProblem? - SingleNodeOptimizationProblem( + val optimizationProblems = data + .select(rEType, DataConst.ID) + .groupBy(rEType) + .count + .rdd + .map { row => + val reid = row.getInt(0).toString + val problem = SingleNodeOptimizationProblem( configuration, - objectiveFunctionFactory(projectedInterceptOpt), + objectiveFunctionFactory(interceptIndexOpt), glmConstructor, - PhotonNonBroadcast(projectedNormalizationContext), + PhotonNonBroadcast(normalizationContext), varianceComputationType) + + (reid, problem) } new RandomEffectOptimizationProblem(optimizationProblems, glmConstructor) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala index 8f55fbf0..ac711c30 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala @@ -15,9 +15,13 @@ package com.linkedin.photon.ml.supervised.model import breeze.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector => SparkVector} import org.apache.spark.rdd.RDD - +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import com.linkedin.photon.ml.TaskType.TaskType +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.util.Summarizable @@ -50,25 +54,6 @@ abstract class GeneralizedLinearModel(val coefficients: Coefficients) extends Se */ protected[ml] def computeMean(features: Vector[Double], offset: Double): Double - /** - * Compute the score for the given features. - * - * @note "score" = coefficients * features (no link function in the case of logistic regression: see above) - * - * @param features The input data point's feature - * @return The score for the passed features - */ - def computeScore(features: Vector[Double]): Double = coefficients.computeScore(features) - - /** - * Compute the value of the mean function of the generalized linear model given one data point using the estimated - * coefficients. 
- * - * @param features Vector representing a single data point's features - * @return Computed mean function value - */ - def computeMeanFunction(features: Vector[Double]): Double = computeMeanFunctionWithOffset(features, 0.0) - /** * Compute the value of the mean function of the generalized linear model given one data point using the estimated * coefficients. @@ -136,6 +121,15 @@ abstract class GeneralizedLinearModel(val coefficients: Coefficients) extends Se } object GeneralizedLinearModel { + + // Schema for [[DataFrame]] + def schema: StructType = StructType(Array( + StructField(DataConst.MODEL_ID, StringType, false), + StructField(DataConst.MODEL_TYPE, StringType, false), + StructField(DataConst.COEFFICIENTS, VectorType , false), + StructField(DataConst.VARIANCES, VectorType, true) + )) + /** * Compute the value of the mean functions of the generalized linear model given a RDD of data points using the * estimated coefficients and intercept. @@ -165,4 +159,35 @@ object GeneralizedLinearModel { broadcastModel.unpersist() result } + + val MODEL_TYPE = "modelType" + + /** + * A UDF to compute scores given a linear model and a feature vector + * + * @return The score which is the dot product of model coefficients and features + */ + def scoreUdf = udf[Double, SparseVector, SparseVector]( + { (coefficients: SparkVector, features: SparkVector) => + require( + coefficients.size == features.size, + s"Coefficients.size = ${coefficients.size} and features.size = ${features.size}") + + var score = 0D + + coefficients match { + case denseCoef: DenseVector => + features.foreachActive { case (index, value) => + score += value * denseCoef(index) + } + + case sparseCoef: SparseVector => + sparseCoef.foreachActive { case (index, coefficient) => + score += coefficient * features(index) + } + } + + score + }) + } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala index 82fb6c39..a8fb9318 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala @@ -18,18 +18,19 @@ import org.apache.commons.cli.MissingArgumentException import org.apache.spark.SparkContext import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, count, monotonically_increasing_id} import org.apache.spark.storage.StorageLevel import org.slf4j.Logger -import com.linkedin.photon.ml.Types.{FeatureShardId, REType, UniqueSampleId} -import com.linkedin.photon.ml.data.scoring.ModelDataScores -import com.linkedin.photon.ml.data.{GameConverters, GameDatum, InputColumnsNames} +import com.linkedin.photon.ml.Types.{REType, UniqueSampleId} +import com.linkedin.photon.ml.constants.DataConst +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.model.{FixedEffectModel, GameModel, RandomEffectModel} import com.linkedin.photon.ml.util._ + /** * Scores input data using a [[GameModel]]. Plays a similar role to the [[org.apache.spark.ml.Model]]. 
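scoreUdf above is a plain dot product between a coefficient vector and a feature vector column, so a model can be applied without broadcasting the full GLM object. A small usage sketch, assuming glm is a trained GeneralizedLinearModel and featureShardId names an org.apache.spark.ml.linalg.Vector column with the same dimension as the coefficients (mirroring the FixedEffectModel changes earlier in this patch):

    import org.apache.spark.sql.functions.{col, lit}

    import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel
    import com.linkedin.photon.ml.util.VectorUtils

    // Ship the coefficients as a literal vector column; the UDF then computes
    // coefficients . features for every row of the dataset.
    val coefficients = VectorUtils.breezeToMl(glm.coefficients.means)
    val scored = dataset.withColumn(
      "score",
      GeneralizedLinearModel.scoreUdf(lit(coefficients), col(featureShardId)))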
* @@ -147,7 +148,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends * @param data Input [[DataFrame]] of samples * @return Scored data samples */ - def transform(data: DataFrame): ModelDataScores = { + def transform(data: DataFrame): DataFrame = { validateParams() @@ -169,7 +170,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends .toSet val gameDataset = Timed("Preparing GAME dataset") { - prepareGameDataset(data, randomEffectTypes, featureShards) + data.withColumn(DataConst.ID, monotonically_increasing_id) } if (getOrDefault(logDataAndModelStats)) { @@ -177,55 +178,27 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends logger.debug(s"GAME model summary:\n${getRequiredParam(model).toSummaryString}") } - val scores = Timed("Computing scores") { - scoreGameDataset(gameDataset) + val storageLevel = if (getOrDefault(spillScoresToDisk)) { + StorageLevel.MEMORY_AND_DISK + } else { + StorageLevel.MEMORY_ONLY } - - gameDataset.unpersist() + val gameDataWithScores = Timed("Computing scores") { + getRequiredParam(model).score(gameDataset) + } + gameDataWithScores.persist(storageLevel) Timed("Evaluating scores") { get(validationEvaluators).foreach( _.foreach { evaluatorType => - val evaluationMetricValue = evaluateScores(evaluatorType, gameDataset, scores) + val evaluationMetricValue = evaluateScores(evaluatorType, gameDataWithScores) logger.info(s"Evaluation metric value on scores with $evaluatorType: $evaluationMetricValue") }) } - // TODO: Instead, we should merge the scores back into the DataFrame in a new column (at least optionally) - - scores + gameDataWithScores } - /** - * Builds a GAME dataset according to input data configuration. - * - * @param dataFrame A [[DataFrame]] of raw input data - * @param randomEffectTypes The set of unique identifier fields used by the random effects of the model - * @param featureShards The set of feature shards used by the model - * @return The prepared GAME dataset - */ - protected def prepareGameDataset( - dataFrame: DataFrame, - randomEffectTypes: Set[REType], - featureShards: Set[FeatureShardId]): RDD[(UniqueSampleId, GameDatum)] = { - - val parallelism = sc.getConf.get("spark.default.parallelism", s"${sc.getExecutorStorageStatus.length * 3}").toInt - val partitioner = new LongHashPartitioner(parallelism) - val idTagSet = randomEffectTypes ++ - get(validationEvaluators).map(MultiEvaluatorType.getMultiEvaluatorIdTags).getOrElse(Seq()) - val gameDataset = GameConverters - .getGameDatasetFromDataFrame( - dataFrame, - featureShards, - idTagSet, - isResponseRequired = false, - getOrDefault(inputColumnNames)) - .partitionBy(partitioner) - .setName("Game dataset with UIDs for scoring") - .persist(StorageLevel.DISK_ONLY) - - gameDataset - } /** * Log some simple summary statistics for the GAME dataset. 
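transform above now derives the unique sample id by appending a monotonically_increasing_id column instead of converting the input into a keyed GAME RDD. Those ids are unique and increasing but not consecutive, which is sufficient here because they only serve as join and grouping keys. A minimal sketch of the assignment, with df standing in for any input DataFrame:

    import org.apache.spark.sql.functions.monotonically_increasing_id

    import com.linkedin.photon.ml.constants.DataConst

    // 64-bit ids, unique across the frame; gaps are expected since the value
    // encodes the partition index in its upper bits.
    val withIds = df.withColumn(DataConst.ID, monotonically_increasing_id)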
@@ -233,7 +206,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends * @param gameDataset The GAME dataset * @param randomEffectTypes The set of unique identifier fields used by the random effects of the model */ - private def logGameDataset(gameDataset: RDD[(UniqueSampleId, GameDatum)], randomEffectTypes: Set[REType]): Unit = { + private def logGameDataset(gameDataset: DataFrame, randomEffectTypes: Set[REType]): Unit = { val numSamples = gameDataset.count() @@ -242,63 +215,46 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends randomEffectTypes.foreach { idTag => val numSamplesStats = gameDataset - .map { case (_, gameData) => - val idValue = gameData.idTagToValueMap(idTag) - (idValue, 1) - } - .reduceByKey(_ + _) - .values - .stats() + .groupBy(idTag).agg(count("*").alias("cnt")) + .describe("cnt") + .collect() + .map(t => t.getString(0) + "\t" + t.getDouble(1) + "\t" + t.getDouble(2)) + .mkString("\n") logger.debug(s"numSamples for $idTag: $numSamplesStats") } } - /** - * Load the GAME model and score the GAME dataset. - * - * @param gameDataset The GAME dataset - * @return The scores - */ - protected def scoreGameDataset(gameDataset: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = { - - val storageLevel = if (getOrDefault(spillScoresToDisk)) { - StorageLevel.MEMORY_AND_DISK - } else { - StorageLevel.MEMORY_ONLY - } - // Need to split these calls to keep correct return type - val scores = getRequiredParam(model).score(gameDataset) - scores.persistRDD(storageLevel).materialize() - - scores - } /** * Evaluate the computed scores with the given evaluator type. * * @param evaluatorType The evaluator type - * @param scores The computed scores - * @param gameDataset The GAME dataset + * @param gameDatasetWithscores The GAME dataset * @return The evaluation metric */ protected def evaluateScores( evaluatorType: EvaluatorType, - gameDataset: RDD[(UniqueSampleId, GameDatum)], - scores: ModelDataScores): Double = { + gameDatasetWithscores: DataFrame): Double = { - val evaluator = EvaluatorFactory.buildEvaluator(evaluatorType, gameDataset) + val evaluator = EvaluatorFactory.buildEvaluator(evaluatorType, gameDatasetWithscores) + val columnsNames = getOrDefault(inputColumnNames) + val offset = columnsNames(InputColumnsNames.OFFSET) + val response = columnsNames(InputColumnsNames.RESPONSE) + val weight = columnsNames(InputColumnsNames.WEIGHT) evaluator match { case se: SingleEvaluator => - val scoresRDD = scores.scoresRdd.map { case (_, sGD) => - (sGD.score + sGD.offset, sGD.response, sGD.weight) - } + val scoresRDD = gameDatasetWithscores + .select(col(DataConst.SCORE) + col(offset), col(response), col(weight)) + .rdd.map (row => (row.getDouble(0), row.getDouble(1), row.getDouble(2))) se.evaluate(scoresRDD) case me: MultiEvaluator => - val scoresRDD = scores.scoresRdd.mapValues(sGD => (sGD.score + sGD.offset, sGD.response, sGD.weight)) + val scoresRDD = gameDatasetWithscores + .select(col(DataConst.ID), col(DataConst.SCORE) + col(offset), col(response), col(weight)) + .rdd.map (row => (row.getAs[UniqueSampleId](0), (row.getDouble(1), row.getDouble(2), row.getDouble(3)))) me.evaluate(scoresRDD) diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala similarity index 56% rename from photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala rename to photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala index 
0217b5af..cb14e65f 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala @@ -12,21 +12,13 @@ * License for the specific language governing permissions and limitations * under the License. */ -package com.linkedin.photon.ml.data -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.util.Summarizable +package com.linkedin.photon.ml.util -/** - * Interface for GAME dataset implementations. - */ -protected[ml] trait Dataset[D <: Dataset[D]] extends Summarizable { +import scala.util.Try +import org.apache.spark.sql.DataFrame + +object ApiUtils { - /** - * Add residual scores to the data offsets. - * - * @param keyScore The residual scores - * @return The dataset with updated offsets - */ - def addScoresToOffsets(keyScore: CoordinateDataScores): D + def hasColumn(df: DataFrame, path: String): Boolean = Try(df(path)).isSuccess } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala index 55f0a458..040ac2c8 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala @@ -19,19 +19,20 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfiguration, TaskType} -import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId, REType} import com.linkedin.photon.ml.cli.game.GameDriver +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data.avro._ -import com.linkedin.photon.ml.data.scoring.ModelDataScores import com.linkedin.photon.ml.data.{DataValidators, InputColumnsNames} import com.linkedin.photon.ml.index.IndexMapLoader import com.linkedin.photon.ml.io.scopt.game.ScoptGameScoringParametersParser import com.linkedin.photon.ml.model.RandomEffectModel import com.linkedin.photon.ml.transformers.GameTransformer import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfiguration, TaskType} /** * Driver for GAME full model scoring. @@ -182,18 +183,24 @@ object GameScoringDriver extends GameDriver { transformer } - val scores = Timed("Score data") { + val gameDataWithScores = Timed("Score data") { gameTransformer.transform(dataFrame) } - gameModel.toMap.foreach { - case (_, model: RandomEffectModel) => model.unpersistRDD() - case _ => - } +// gameModel.toMap.foreach { +// case (_, model: RandomEffectModel) => model.unpersistRDD() +// case _ => +// } Timed("Save scores") { - saveScoresToHDFS(scores) + val reTypes = gameModel.toMap.values.collect { + case rem: RandomEffectModel => rem.randomEffectType + } + + saveScoresToHDFS(gameDataWithScores, reTypes) } + + gameDataWithScores.unpersist() } /** @@ -224,34 +231,29 @@ object GameScoringDriver extends GameDriver { /** * Save the computed scores to HDFS with auxiliary info. 
   *
-   * @param scores The computed scores
+   * @param data The game dataset with computed scores
+   * @param reTypes The random effect types present in the model
    */
-  protected def saveScoresToHDFS(scores: ModelDataScores): Unit = {
+  protected def saveScoresToHDFS(data: DataFrame, reTypes: Iterable[REType]): Unit = {
 
     // Take the offset information into account when writing the scores to HDFS
-    val scoredItems = scores.scoresRdd.map { case (_, scoredGameDatum) =>
-      ScoredItem(
-        scoredGameDatum.score + scoredGameDatum.offset,
-        Some(scoredGameDatum.response),
-        Some(scoredGameDatum.weight),
-        scoredGameDatum.idTagToValueMap)
-    }
+    val columnsNames = getOrDefault(inputColumnNames)
+    val scoredItems = data.withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(columnsNames(InputColumnsNames.OFFSET)))
 
     if (getOrDefault(logDataAndModelStats)) {
       // Persist scored items here since we introduce multiple passes
-      scoredItems.setName("Scored items").persist(StorageLevel.MEMORY_AND_DISK)
+      scoredItems.persist(StorageLevel.MEMORY_AND_DISK)
 
       val numScoredItems = scoredItems.count()
 
       logger.info(s"Number of scored items to be written to HDFS: $numScoredItems \n")
     }
 
     val scoredItemsToBeSaved = get(outputFilesLimit) match {
-      case Some(limit) if limit < scoredItems.partitions.length => scoredItems.coalesce(getOrDefault(outputFilesLimit))
+      case Some(limit) => scoredItems.coalesce(limit)
       case _ => scoredItems
     }
     val scoresDir = new Path(getRequiredParam(rootOutputDirectory), SCORES_DIR)
 
-    ScoreProcessingUtils.saveScoredItemsToHDFS(scoredItemsToBeSaved, scoresDir.toString, get(modelId))
+    ScoreProcessingUtils.saveScoredItemsToHDFS(scoredItemsToBeSaved, reTypes, scoresDir.toString, get(modelId))
 
     scoredItems.unpersist()
   }
diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala
index 94ca91e1..20938a30 100644
--- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala
+++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala
@@ -20,23 +20,25 @@ import org.apache.spark.SparkContext
 import org.apache.spark.ml.linalg.{Vector => SparkMLVector}
 import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params}
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.monotonically_increasing_id
 import org.apache.spark.storage.StorageLevel
 
-import com.linkedin.photon.ml._
 import com.linkedin.photon.ml.HyperparameterTunerName.HyperparameterTunerName
 import com.linkedin.photon.ml.HyperparameterTuningMode.HyperparameterTuningMode
 import com.linkedin.photon.ml.TaskType.TaskType
 import com.linkedin.photon.ml.Types._
+import com.linkedin.photon.ml._
 import com.linkedin.photon.ml.cli.game.GameDriver
-import com.linkedin.photon.ml.data.{DataValidators, FixedEffectDataConfiguration, InputColumnsNames, RandomEffectDataConfiguration}
+import com.linkedin.photon.ml.constants.DataConst
 import com.linkedin.photon.ml.data.avro.{AvroDataReader, ModelProcessingUtils}
+import com.linkedin.photon.ml.data.{DataValidators, FixedEffectDataConfiguration, InputColumnsNames, RandomEffectDataConfiguration}
 import com.linkedin.photon.ml.estimators.GameEstimator.GameOptimizationConfiguration
 import com.linkedin.photon.ml.estimators.{GameEstimator, GameEstimatorEvaluationFunction}
 import com.linkedin.photon.ml.hyperparameter.tuner.HyperparameterTunerFactory
 import com.linkedin.photon.ml.index.{IndexMap, IndexMapLoader}
-import
com.linkedin.photon.ml.io.{CoordinateConfiguration, ModelOutputMode, RandomEffectCoordinateConfiguration} import com.linkedin.photon.ml.io.ModelOutputMode.ModelOutputMode import com.linkedin.photon.ml.io.scopt.game.ScoptGameTrainingParametersParser +import com.linkedin.photon.ml.io.{CoordinateConfiguration, ModelOutputMode, RandomEffectCoordinateConfiguration} import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel, RandomEffectModel} import com.linkedin.photon.ml.normalization.NormalizationType.NormalizationType import com.linkedin.photon.ml.normalization.{NormalizationContext, NormalizationType} @@ -45,8 +47,7 @@ import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceCompu import com.linkedin.photon.ml.optimization.game.CoordinateOptimizationConfiguration import com.linkedin.photon.ml.stat.FeatureDataStatistics import com.linkedin.photon.ml.util.Implicits._ -import com.linkedin.photon.ml.util.Utils -import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.util.{Utils, _} /** * This object is the entry point and driver for GAME training. There is a separate driver object for scoring. @@ -359,9 +360,16 @@ object GameTrainingDriver extends GameDriver { val (trainingData, featureIndexMapLoaders) = Timed(s"Read training data") { readTrainingData(avroDataReader, featureIndexMapLoadersOpt) } + val gameTrainingData = Timed("Prepare GAME training data") { + trainingData.withColumn(DataConst.ID, monotonically_increasing_id) + } + val validationData = Timed(s"Read validation data") { readValidationData(avroDataReader, featureIndexMapLoaders) } + val gameValidationData = Timed("Prepare GAME validation data") { + validationData.map(_.withColumn(DataConst.ID, monotonically_increasing_id)) + } val interceptIndices = featureIndexMapLoaders.flatMap { case (coordinateId, indexMap) => indexMap.indexMapForDriver().getIndex(Constants.INTERCEPT_KEY) match { @@ -371,8 +379,8 @@ object GameTrainingDriver extends GameDriver { } } - trainingData.persist(StorageLevel.DISK_ONLY) - validationData.map(_.persist(StorageLevel.DISK_ONLY)) + gameTrainingData.persist(StorageLevel.DISK_ONLY) + gameValidationData.map(_.persist(StorageLevel.DISK_ONLY)) val modelOpt = get(modelInputDirectory).map { modelDir => Timed("Load model for warm-start training") { @@ -420,7 +428,7 @@ object GameTrainingDriver extends GameDriver { getOrDefault(inputColumnNames), getRequiredParam(featureShardConfigurations).keySet) - validationData match { + gameValidationData match { case Some(x) => DataValidators.sanityCheckDataFrameForTraining( x, getRequiredParam(trainingTask), @@ -470,17 +478,17 @@ object GameTrainingDriver extends GameDriver { } val explicitModels = Timed("Fit models") { - gameEstimator.fit(trainingData, validationData, gameOptimizationConfigs) + gameEstimator.fit(gameTrainingData, gameValidationData, gameOptimizationConfigs) } val tunedModels = Timed("Tune hyperparameters") { // Disable warm start for autotuning gameEstimator.setUseWarmStart(false) - runHyperparameterTuning(gameEstimator, trainingData, validationData, explicitModels) + runHyperparameterTuning(gameEstimator, gameTrainingData, gameValidationData, explicitModels) } - trainingData.unpersist() - validationData.map(_.unpersist()) + gameTrainingData.unpersist() + gameValidationData.map(_.unpersist()) val (outputModels, bestModel) = selectModels(explicitModels, tunedModels) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala 
b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala index a900ae23..897da1f4 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala @@ -40,6 +40,7 @@ import com.linkedin.photon.ml.index.{DefaultIndexMap, DefaultIndexMapLoader, Ind import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.TaskType.TaskType /** * Some basic functions to read/write Avro's [[GenericRecord]] from/to HDFS. @@ -325,7 +326,7 @@ object AvroUtils { * @param featureMap The map from feature index of type [[Int]] to feature name of type [[NameAndTerm]] * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero * @return The Avro record that contains the information of the input coefficients - */ + protected[avro] def convertGLMModelToBayesianLinearModelAvro( model: GeneralizedLinearModel, modelId: String, @@ -349,6 +350,40 @@ object AvroUtils { avroFile.setVariances(variancesAvrosOption.get.toList) } + avroFile.build() + }*/ + + /** + * Convert the coefficients of type [[Coefficients]] to Avro record of type [[BayesianLinearModelAvro]]. + * + * @param modelId The model's id + * @param featureMap The map from feature index of type [[Int]] to feature name of type [[NameAndTerm]] + * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero + * @return The Avro record that contains the information of the input coefficients + */ + protected[avro] def convertGLMModelToBayesianLinearModelAvro( + modelClassName: String, + modelCoefficients: Vector[Double], + variancesOption: Option[Vector[Double]], + modelId: String, + featureMap: IndexMap, + sparsityThreshold: Double = VectorUtils.DEFAULT_SPARSITY_THRESHOLD): BayesianLinearModelAvro = { + + val meansAvros = convertVectorAsArrayOfNameTermValueAvros(modelCoefficients, featureMap, sparsityThreshold) + val variancesAvrosOption = variancesOption + .map(convertVectorAsArrayOfNameTermValueAvros(_, featureMap, sparsityThreshold)) + // TODO: Output type of model. 
+ val avroFile = BayesianLinearModelAvro + .newBuilder() + .setModelId(modelId) + .setModelClass(modelClassName) + .setLossFunction("") + .setMeans(meansAvros.toList) + + if (variancesAvrosOption.isDefined) { + avroFile.setVariances(variancesAvrosOption.get.toList) + } + avroFile.build() } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala index 7c03c199..1699a333 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala @@ -23,14 +23,16 @@ import scala.io.Source import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.{Vector => SparkMLVector} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.storage.StorageLevel import com.linkedin.photon.avro.generated.{BayesianLinearModelAvro, FeatureSummarizationResultAvro} import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId} import com.linkedin.photon.ml.cli.game.training.GameTrainingDriver +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.estimators.GameEstimator import com.linkedin.photon.ml.index.{IndexMap, IndexMapLoader} import com.linkedin.photon.ml.model._ @@ -227,9 +229,9 @@ object ModelProcessingUtils { s"Missing feature shard definition for '$featureShardId' required by coordinate '$name' in loaded model") } val modelsRDDInputPath = new Path(innerPath, AvroConstants.COEFFICIENTS) - val modelsRDD = loadModelsRDDFromHDFS(modelsRDDInputPath.toString, indexMapLoader, sc) + val models = loadModelsDataFrameFromHDFS(modelsRDDInputPath.toString, indexMapLoader, sc) - (name, new RandomEffectModel(modelsRDD, randomEffectType, featureShardId).persistRDD(storageLevel)) + (name, new RandomEffectModel(models, randomEffectType, featureShardId)/*.persist(storageLevel)*/) } } else { @@ -276,18 +278,18 @@ object ModelProcessingUtils { //Write the coefficientsRDD val coefficientsRDDOutputDir = new Path(randomEffectModelOutputDir, AvroConstants.COEFFICIENTS).toString - val modelsRDD = randomEffectModelFileLimit match { + val models = randomEffectModelFileLimit match { case Some(fileLimit) => require(fileLimit > 0, "Attempt to coalesce random effect model RDD into fewer than 1 partitions") // Control the number of output files by re-partitioning the RDD. - randomEffectModel.modelsRDD.coalesce(fileLimit) + randomEffectModel.models.coalesce(fileLimit) case None => - randomEffectModel.modelsRDD + randomEffectModel.models } - saveModelsRDDToHDFS(modelsRDD, indexMapLoader, coefficientsRDDOutputDir, sparsityThreshold) + saveModelsDataFrameToHDFS(models, indexMapLoader, coefficientsRDDOutputDir, sparsityThreshold) } /** @@ -307,7 +309,9 @@ object ModelProcessingUtils { sparsityThreshold: Double): Unit = { val bayesianLinearModelAvro = AvroUtils.convertGLMModelToBayesianLinearModelAvro( - model, + model.getClass.getName, + model.coefficients.means, + model.coefficients.variancesOption, AvroConstants.FIXED_EFFECT, featureMap, sparsityThreshold) @@ -344,23 +348,39 @@ object ModelProcessingUtils { } /** - * Save an [[RDD]] of GLM to HDFS. + * Save an [[DataFrame]] of GLM to HDFS. 
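For orientation, a small standalone sketch (not part of this patch) of the per-model DataFrame layout that saveModelsDataFrameToHDFS and loadModelsDataFrameFromHDFS in this file appear to assume: one row per random effect model, with coefficient means and optional variances stored as Spark ML vectors. The literal column strings and the local SparkSession are illustration-only stand-ins; the real code goes through DataConst (introduced later in this patch) and VectorUtils.mlToBreeze.

    import breeze.linalg.{DenseVector => BDV}
    import org.apache.spark.ml.linalg.{SQLDataTypes, Vectors, Vector => SparkMLVector}
    import org.apache.spark.sql.{Row, SparkSession}
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    val spark = SparkSession.builder.master("local[2]").appName("glm-dataframe-sketch").getOrCreate()

    // Column names mirror DataConst.{MODEL_ID, MODEL_TYPE, COEFFICIENTS, VARIANCES}.
    val schema = StructType(Seq(
      StructField("mId", StringType, nullable = false),
      StructField("modelType", StringType, nullable = false),
      StructField("coefficients", SQLDataTypes.VectorType, nullable = false),
      StructField("variances", SQLDataTypes.VectorType, nullable = true)))

    val rows = spark.sparkContext.parallelize(Seq(
      Row("memberA", "LogisticRegressionModel", Vectors.dense(0.1, -0.3), null),
      Row("memberB", "LogisticRegressionModel", Vectors.dense(0.4, 0.0), Vectors.dense(0.01, 0.02))))
    val models = spark.createDataFrame(rows, schema)

    // Per-row extraction, analogous to the mapping inside saveModelsDataFrameToHDFS:
    // Spark ML vectors become Breeze vectors, and a null variance column becomes None.
    val extracted = models.rdd.map { row =>
      val id = row.getAs[String]("mId")
      val means = BDV(row.getAs[SparkMLVector]("coefficients").toArray)
      val variancesOpt = Option(row.getAs[SparkMLVector]("variances")).map(v => BDV(v.toArray))
      (id, means, variancesOpt)
    }
    extracted.collect().foreach(println)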
* - * @param modelsRDD The models to save + * @param models The models to save * @param featureMapLoader A loader for the feature to index map * @param outputDir The directory to which to save the models * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero */ - private def saveModelsRDDToHDFS( - modelsRDD: RDD[(String, GeneralizedLinearModel)], + private def saveModelsDataFrameToHDFS( + models: DataFrame, /*(RDD[(String, GeneralizedLinearModel)],*/ featureMapLoader: IndexMapLoader, outputDir: String, sparsityThreshold: Double): Unit = { + val modelsRDD = models + .rdd + .map(row => { + val id = row.getAs[String](DataConst.MODEL_ID) + val modelType = row.getAs[String](DataConst.MODEL_TYPE) + val coefficients = VectorUtils.mlToBreeze(row.getAs[SparkMLVector](DataConst.COEFFICIENTS)) + val variances = row.getAs[SparkMLVector](DataConst.VARIANCES) + val variancesOption = if (variances != null) { + Option.apply(VectorUtils.mlToBreeze(variances)) + } else { + None + } + (id, modelType, coefficients, variancesOption) + } + ) + val linearModelAvro = modelsRDD.mapPartitions { iter => val featureMap = featureMapLoader.indexMapForRDD() - iter.map { case (modelId, model) => - AvroUtils.convertGLMModelToBayesianLinearModelAvro(model, modelId, featureMap, sparsityThreshold) + iter.map { case (modelId, modelType, coefficients, variancesOption) => + AvroUtils.convertGLMModelToBayesianLinearModelAvro(modelType, coefficients, variancesOption, modelId, featureMap, sparsityThreshold) } } @@ -368,35 +388,36 @@ object ModelProcessingUtils { } /** - * Load multiple GLM into a [[RDD]]. + * Load multiple GLM into a [[DataFrame]]. * * TODO: Currently only the means of the coefficients are loaded, the variances are discarded * * @param coefficientsRDDInputDir The input directory from which to read models * @param indexMapLoader A loader for the feature to index map * @param sc The Spark context - * @return A [[RDD]] of GLMs loaded from HDFS and a loader for the feature to index map it uses + * @return A [[DataFrame]] of GLMs loaded from HDFS and a loader for the feature to index map it uses */ - private def loadModelsRDDFromHDFS( + private def loadModelsDataFrameFromHDFS( coefficientsRDDInputDir: String, indexMapLoader: IndexMapLoader, - sc: SparkContext): RDD[(String, GeneralizedLinearModel)] = { + sc: SparkContext): DataFrame = { val modelAvros = AvroUtils.readAvroFilesInDir[BayesianLinearModelAvro]( sc, coefficientsRDDInputDir, minNumPartitions = sc.defaultParallelism) - modelAvros.mapPartitions { iter => + val rdd = modelAvros.mapPartitions { iter => val indexMap = indexMapLoader.indexMapForRDD() iter.map { modelAvro => val modelId = modelAvro.getModelId.toString val glm = AvroUtils.convertBayesianLinearModelAvroToGLM(modelAvro, indexMap) - - (modelId, glm) + Row.fromTuple(modelId, glm.modelType, glm.coefficients.means, glm.coefficients.variancesOption.getOrElse(null)) } } + + SparkSession.builder().getOrCreate().createDataFrame(rdd, GeneralizedLinearModel.schema) } /** diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala index d0df6c01..df001cbd 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala @@ -18,8 +18,10 @@ import scala.collection.JavaConverters._ import 
org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.avro.generated.ScoringResultAvro +import com.linkedin.photon.ml.Types.REType import com.linkedin.photon.ml.cli.game.scoring.ScoredItem /** @@ -62,27 +64,34 @@ object ScoreProcessingUtils { /** * Save the scored items of type [[ScoredItem]] to the given output directory on HDFS. * - * @param scoredItems An [[RDD]] of scored items of type [[ScoredItem]] + * @param scoredItems An [[DataFrame]] of scored items [score, label, weight] * @param modelId The model's id that used to compute the scores * @param outputDir The given output directory */ protected[ml] def saveScoredItemsToHDFS( - scoredItems: RDD[ScoredItem], - outputDir: String, - modelId: Option[String]): Unit = { + scoredItems: DataFrame, + reTypes: Iterable[REType], + outputDir: String, + modelId: Option[String]): Unit = { - val scoringResultAvros = scoredItems.map { case ScoredItem(predictionScore, labelOpt, weightOpt, ids) => - val metaDataMap = collection.mutable.Map(ids.toMap[CharSequence, CharSequence].toSeq: _*).asJava - val builder = ScoringResultAvro.newBuilder() - builder.setPredictionScore(predictionScore) - builder.setModelId(modelId.getOrElse(DEFAULT_MODEL_ID)) - ids.get(ResponsePredictionFieldNames.UID).foreach(builder.setUid(_)) - labelOpt.foreach(builder.setLabel(_)) - weightOpt.foreach(builder.setWeight(_)) - builder.setMetadataMap(metaDataMap) - builder.build() - } + val scoringResultAvros = scoredItems + .rdd + .map { row => + val predictionScore = row.getDouble(0) + val label = row.getDouble(1) // Nullable + val weight = row.getDouble(2) // Nullable + val ids = reTypes.map(reType => (reType, row.getAs[String](reType))).toMap + val metaDataMap = collection.mutable.Map(ids.toMap[CharSequence, CharSequence].toSeq: _*).asJava + val builder = ScoringResultAvro.newBuilder() + builder.setPredictionScore(predictionScore) + builder.setModelId(modelId.getOrElse(DEFAULT_MODEL_ID)) + ids.get(ResponsePredictionFieldNames.UID).foreach(builder.setUid(_)) + Option.apply(label).foreach(builder.setLabel(_)) + Option.apply(weight).foreach(builder.setWeight(_)) + builder.setMetadataMap(metaDataMap) + builder.build() + } AvroUtils.saveAsAvro(scoringResultAvros, outputDir, ScoringResultAvro.getClassSchema.toString) } } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala index 76f92a23..19b9b9d5 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala @@ -42,6 +42,7 @@ object Utils { * @return The feature name */ def getFeatureKey(record: GenericRecord, nameKey: String, termKey: String, delimiter: String): String = { + val name = getStringAvro(record, nameKey) val term = getStringAvro(record, termKey, isNullOK = true) getFeatureKey(name, term, delimiter) @@ -66,6 +67,7 @@ object Utils { * @return The feature name */ def getFeatureNameFromKey(key: String, delimiter: String = Constants.DELIMITER): String = { + require(delimiter.r.findAllIn(key).length == 1, s"Provided input [$key] is not a valid feature key") key.split(delimiter).headOption.getOrElse("") } @@ -78,6 +80,7 @@ object Utils { * @return The feature term */ def getFeatureTermFromKey(key: String, delimiter: String = Constants.DELIMITER): String = { + require(delimiter.r.findAllIn(key).length == 1, s"Provided input [$key] is not a valid feature key") 
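    // (Editor's illustration, separate from this patch.) A standalone sketch of the feature-key
    // convention the new require checks enforce: a key is "<name><delimiter><term>" containing the
    // delimiter exactly once, so name and term can be recovered unambiguously. The tab delimiter
    // below is only a stand-in; the production value comes from Constants.DELIMITER.
    object FeatureKeySketch {
      val delimiter: String = "\t"

      def featureKey(name: String, term: String): String = s"$name$delimiter$term"

      def main(args: Array[String]): Unit = {
        val key = featureKey("age", "30-40")
        // Mirrors the validation added above: a valid key contains exactly one delimiter.
        require(delimiter.r.findAllIn(key).length == 1, s"Provided input [$key] is not a valid feature key")
        val name = key.split(delimiter).headOption.getOrElse("")
        val term = key.split(delimiter).lift(1).getOrElse("")
        println(s"name=$name term=$term")
      }
    }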
key.split(delimiter).lift(1).getOrElse("") } @@ -88,10 +91,11 @@ object Utils { * @param record The generic record * @param key The key of the field * @param isNullOK Whether null is accepted. If set to true, then an empty string will be returned if the - * corresponding field of the key is null, otherwise, exception will be thrown. + * corresponding field of the key is null, otherwise, exception will be thrown. * @return The String typed field */ def getStringAvro(record: GenericRecord, key: String, isNullOK: Boolean = false): String = { + record.get(key) match { case id@(_: Utf8 | _: JString) => id.toString case number: JNumber => number.toString @@ -108,6 +112,7 @@ object Utils { * @return The Double typed field */ def getDoubleAvro(record: GenericRecord, key: String): Double = { + record.get(key) match { case number: JNumber => number.doubleValue case id@(_: Utf8 | _: JString) => atod(id.toString) @@ -124,9 +129,9 @@ object Utils { * @return A java map of String -> Object */ def getMapAvro( - record: GenericRecord, - key: String, - isNullOK: Boolean = false): Map[String, JObject] = { + record: GenericRecord, + key: String, + isNullOK: Boolean = false): Map[String, JObject] = { type T = java.util.Map[Any, JObject] // to avoid type erasure warning record.get(key) match { @@ -135,7 +140,7 @@ object Utils { // Need to convert Utf8 values to String here, because otherwise we get schema casting errors and misleading // equivalence failures downstream. case s@(_: Utf8 | _: JString) => s.toString - case x@(_: Number | _: JBoolean) => x + case x@(_: Number | _: JBoolean) => x case _ => null }) }.filter(_._2 != null).toMap @@ -152,6 +157,7 @@ object Utils { * @return The double parsed from the string, or an exception if string is empty or double is NaN or Infinity */ private def atod(string: String): Double = { + if (string.length() < 1) { throw new IllegalArgumentException("Can't convert empty string to double") } @@ -172,6 +178,7 @@ object Utils { * @return The Float typed field */ def getFloatAvro(record: GenericRecord, key: String): Float = { + record.get(key) match { case number: JNumber => number.floatValue case id@(_: Utf8 | _: JString) => atof(id.toString) @@ -187,6 +194,7 @@ object Utils { * @return A float parse from the string, or an exception if the string is empty or the flat is NaN or Infinity */ private def atof(string: String): Float = { + if (string.length() < 1) { throw new IllegalArgumentException("Can't convert empty string to float") } @@ -207,6 +215,7 @@ object Utils { * @return The Int typed field */ def getIntAvro(record: GenericRecord, key: String): Int = { + record.get(key) match { case number: JNumber => number.intValue case id@(_: Utf8 | _: JString) => id.toString.toInt @@ -223,6 +232,7 @@ object Utils { * @return The Long typed field */ def getLongAvro(record: GenericRecord, key: String): Long = { + record.get(key) match { case number: JNumber => number.longValue() case id@(_: Utf8 | _: JString) => id.toString.toLong @@ -239,6 +249,7 @@ object Utils { * @return The Boolean typed field */ def getBooleanAvro(record: GenericRecord, key: String): Boolean = { + record.get(key) match { case booleanValue: JBoolean => booleanValue.booleanValue // NOTE Scala String#toBoolean method is better than JBoolean#parseBoolean in the sense that it only accepts @@ -256,6 +267,7 @@ object Utils { * @param hadoopConf The Hadoop Configuration object */ def deleteHDFSDir(dir: Path, hadoopConf: Configuration): Unit = { + val fs = dir.getFileSystem(hadoopConf) if (fs.exists(dir)) 
fs.delete(dir, true) } @@ -267,6 +279,7 @@ object Utils { * @param hadoopConf The Hadoop Configuration object */ def createHDFSDir(dir: Path, hadoopConf: Configuration): Unit = { + val fs = dir.getFileSystem(hadoopConf) if (!fs.exists(dir)) fs.mkdirs(dir) } @@ -281,17 +294,18 @@ object Utils { * @param map Input map to look up * @param key The key to be looked up in the provided map * @param elseBranch If one wants to fail on not finding a value of type [[T]] in the map, an - * [[IllegalArgumentException]] will be thrown with the error message provided. If one wants to - * continue without failure, a default value is expected that will be returned + * [[IllegalArgumentException]] will be thrown with the error message provided. If one wants to + * continue without failure, a default value is expected that will be returned * @tparam T Intended return type of the method * @throws java.lang.IllegalArgumentException Exception thrown if a value of type [[T]] isn't found in the map and - * the error message is non-empty + * the error message is non-empty * @return A value of type [[T]] or throw an [[IllegalArgumentException]] */ @throws(classOf[IllegalArgumentException]) def getKeyFromMapOrElse[T](map: Map[String, Any], key: String, elseBranch: Either[String, T]): T = { + map.get(key) match { - case Some(x: T) => x // type erasure warning here + case Some(x: T) => x // type erasure warning here case _ => elseBranch match { case Left(errorMsg) => throw new IllegalArgumentException(errorMsg) @@ -332,4 +346,4 @@ object Utils { * @return Some[T] if p or None */ def filter[T](p: => Boolean)(f: => T): Option[T] = if (p) Some(f) else None -} +} \ No newline at end of file diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala index d20e1af5..37e42de6 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala @@ -14,26 +14,14 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.Dataset -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import com.linkedin.photon.ml.optimization.OptimizationTracker /** * The optimization problem coordinate for each effect model. * - * @tparam D The training dataset type - * @param dataset The training dataset */ -protected[ml] abstract class Coordinate[D <: Dataset[D]](protected val dataset: D) { - - /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - protected[algorithm] def updateCoordinateWithDataset(dataset: D): Coordinate[D] +protected[ml] abstract class Coordinate { /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. @@ -42,16 +30,6 @@ protected[ml] abstract class Coordinate[D <: Dataset[D]](protected val dataset: */ protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset with residuals from other - * coordinates. 
- * - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ - protected[algorithm] def trainModel(score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = - updateCoordinateWithDataset(dataset.addScoresToOffsets(score)).trainModel() - /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as * a starting point. @@ -62,23 +40,11 @@ protected[ml] abstract class Coordinate[D <: Dataset[D]](protected val dataset: protected[algorithm] def trainModel(model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as - * a starting point and with residuals from other coordinates. - * - * @param model The existing model - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ - protected[algorithm] def trainModel( - model: DatumScoringModel, - score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = - updateCoordinateWithDataset(dataset.addScoresToOffsets(score)).trainModel(model) - - /** - * Compute scores for the coordinate data using a given model. + * Generate a new dataset with updated offset. * - * @param model The input model - * @return The dataset scores + * @param model The model of previous coordinate + * @return A new dataset with the updated offsets */ - protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores + protected[algorithm] def updateOffset(model: DatumScoringModel) } + diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala index 109af0c6..42dfb647 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala @@ -16,14 +16,12 @@ package com.linkedin.photon.ml.algorithm import scala.collection.mutable -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.evaluation.{EvaluationResults, EvaluationSuite, EvaluatorType} import com.linkedin.photon.ml.model.{DatumScoringModel, GameModel} import com.linkedin.photon.ml.optimization.OptimizationTracker @@ -35,15 +33,17 @@ import com.linkedin.photon.ml.util.Timed * * @param updateSequence The order in which to update coordinates * @param descentIterations Number of coordinate descent iterations (updates to each coordinate in order) - * @param validationDataAndEvaluationSuiteOpt Optional validation data and [[EvaluationSuite]] of validation metric - * [[com.linkedin.photon.ml.evaluation.Evaluator]] objects + * @param validationOpt Optional validation data + * @param evaluationSuiteOpt Optional [[EvaluationSuite]] of validation metric + * [[com.linkedin.photon.ml.evaluation.Evaluator]] objects * @param lockedCoordinates Set of locked coordinates within the initial model for 
performing partial retraining * @param logger A logger instance */ class CoordinateDescent( updateSequence: Seq[CoordinateId], descentIterations: Int, - validationDataAndEvaluationSuiteOpt: Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)], + validationOpt: Option[DataFrame], + evaluationSuiteOpt: Option[EvaluationSuite], lockedCoordinates: Set[CoordinateId], implicit private val logger: Logger) { @@ -98,7 +98,7 @@ class CoordinateDescent( * @param initialModelsOpt An optional map of existing models */ private def checkInput( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], initialModelsOpt: Option[Map[CoordinateId, DatumScoringModel]]): Unit = { // All coordinates in the update sequence must be passed as input @@ -130,7 +130,7 @@ class CoordinateDescent( * at the conclusion of coordinate descent). */ def run( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], initialModelsOpt: Option[Map[CoordinateId, DatumScoringModel]]): (GameModel, Option[EvaluationResults]) = { checkInput(coordinates, initialModelsOpt) @@ -145,10 +145,12 @@ class CoordinateDescent( coordinateId, coordinates(coordinateId), initialModels.get(coordinateId), - validationDataAndEvaluationSuiteOpt) + validationOpt, + evaluationSuiteOpt) - } else if (validationDataAndEvaluationSuiteOpt.isDefined) { - val (validationData, evaluationSuite) = validationDataAndEvaluationSuiteOpt.get + } else if (validationOpt.isDefined && evaluationSuiteOpt.isDefined) { + val validationData = validationOpt.get + val evaluationSuite = evaluationSuiteOpt.get val (model, evaluationsResults) = descendWithValidation( coordinates, updateSequence, @@ -176,43 +178,30 @@ object CoordinateDescent { * @param iteration The current iteration of coordinate descent (for logging purposes) * @param initialModelOpt An optional initial model whose coefficients should be used as a starting point for * optimization - * @param residualsOpt Optional residual scores to add to the training data offsets * @param logger An implicit logger * @return The new model trained for the coordinate */ protected[algorithm] def trainCoordinateModel( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, iteration: Int, initialModelOpt: Option[DatumScoringModel], - residualsOpt: Option[CoordinateDataScores])( + prevModelOpt: Option[DatumScoringModel])( implicit logger: Logger): DatumScoringModel = Timed(s"Optimizing coordinate '$coordinateId' for iteration $iteration") { logger.debug(s"Updating coordinate of class ${coordinate.getClass}") - val (model, tracker) = (initialModelOpt, residualsOpt) match { - case (Some(initialModel), Some(residuals)) => - Timed(s"Train new model with residuals using existing model as starting point") { - coordinate.trainModel(initialModel, residuals) - } - - case (Some(initialModel), None) => - Timed(s"Train new model using existing model as starting point") { - coordinate.trainModel(initialModel) - } + prevModelOpt.map(model => coordinate.updateOffset(model)) - case (None, Some(residuals)) => - Timed(s"Train new model with residuals") { - coordinate.trainModel(residuals) - } - - case (None, None) => - Timed(s"Train new model") { - coordinate.trainModel() - } - } + val (model, tracker) = initialModelOpt.map( + initialModel => Timed(s"Train new model using existing model as starting point") { + coordinate.trainModel(initialModel) + }).getOrElse( + Timed(s"Train new model") { + coordinate.trainModel() + }) 
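    // (Editor's sketch with hypothetical toy types, not part of the patch.) It illustrates the
    // control flow trainCoordinateModel now follows: the previous coordinate's model updates this
    // coordinate's offsets, and training then warm-starts from an initial model when one exists.
    trait SketchModel
    trait SketchTracker
    trait SketchCoordinate {
      def trainModel(): (SketchModel, SketchTracker)
      def trainModel(initial: SketchModel): (SketchModel, SketchTracker)
      def updateOffset(prev: SketchModel): Unit
    }

    def trainOneCoordinate(
        coordinate: SketchCoordinate,
        initialModelOpt: Option[SketchModel],
        prevModelOpt: Option[SketchModel]): (SketchModel, SketchTracker) = {

      // Fold the previous coordinate's scores into this coordinate's offsets.
      prevModelOpt.foreach(coordinate.updateOffset)

      // Warm-start when an initial model is available, otherwise train from scratch.
      initialModelOpt.map(m => coordinate.trainModel(m)).getOrElse(coordinate.trainModel())
    }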
logOptimizationSummary(logger, coordinateId, model, tracker) @@ -273,23 +262,24 @@ object CoordinateDescent { * @param coordinatesToTrain A list of coordinates for which to train new models * @param initialModelOpt An optional initial model whose coefficients should be used as a starting point for * optimization - * @param residualsOpt Optional residual scores to add to the training data offsets +// * @param residualsOpt Optional residual scores to add to the training data offsets * @param logger An implicit logger * @return The locked model if a new model should not be trained for this coordinate, a newly trained model otherwise. */ protected[algorithm] def trainOrFetchCoordinateModel( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, coordinatesToTrain: Seq[CoordinateId], initialModelOpt: Option[DatumScoringModel], - residualsOpt: Option[CoordinateDataScores])( + prevModelOpt: Option[DatumScoringModel])( implicit logger: Logger): DatumScoringModel = if (coordinatesToTrain.contains(coordinateId)) { - val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, residualsOpt) + prevModelOpt.map(coordinate.updateOffset(_)) + val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, prevModelOpt) - persistModel(newModel, coordinateId, iteration = 1) + //persistModel(newModel, coordinateId, iteration = 1) newModel @@ -310,17 +300,21 @@ object CoordinateDescent { * [[com.linkedin.photon.ml.evaluation.Evaluator]] */ protected[algorithm] def evaluateModel( - modelToEvaluate: DatumScoringModel, - validationData: RDD[(UniqueSampleId, GameDatum)], + modelToEvaluate: GameModel, + validationData: DataFrame, evaluationSuite: EvaluationSuite)( implicit logger: Logger): EvaluationResults = Timed("Validate GAME model") { val validatingScores = Timed(s"Compute validation scores") { - modelToEvaluate.scoreForCoordinateDescent(validationData) + modelToEvaluate.score(validationData) } Timed(s"Compute evaluation metrics") { - val results = evaluationSuite.evaluate(validatingScores.scoresRdd) + val scoresRdd = validatingScores.select(DataConst.ID, DataConst.SCORE) + .rdd + .map(row => (row.getAs[UniqueSampleId](0), row.getDouble(1))) + + val results = evaluationSuite.evaluate(scoresRdd) results .evaluations @@ -332,14 +326,6 @@ object CoordinateDescent { } } - /** - * Cache summed residual scores to memory/disk. - * - * @param coordinateDataScores The residual scores to cache - */ - protected[algorithm] def persistSummedScores(coordinateDataScores: CoordinateDataScores): Unit = - coordinateDataScores.setName(s"Summed scores").persistRDD(StorageLevel.MEMORY_AND_DISK_SER).materialize() - /** * Remove a cached model from cache. 
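To make the new iteration structure easier to follow, here is a compact, self-contained sketch (editor's illustration with hypothetical generic types, not part of the patch, and simplified: the real code restricts later passes to coordinatesToTrain and persists/unpersists models) of the loop that descend now implements. The model trained for one coordinate becomes prevModelOpt for the next, and only models are tracked, with no per-coordinate score RDDs.

    import scala.collection.mutable

    def descendSketch[Model](
        updateSequence: Seq[String],
        train: (String, Option[Model]) => Model,
        iterations: Int): Map[String, Model] = {

      val currentModels = mutable.Map.empty[String, Model]
      var prevModelOpt: Option[Model] = None

      // First iteration: walk the full update sequence, chaining each new model forward.
      updateSequence.foreach { coordinateId =>
        val newModel = train(coordinateId, prevModelOpt)
        currentModels.put(coordinateId, newModel)
        prevModelOpt = Some(newModel)
      }

      // Subsequent iterations: retrain each coordinate against the latest neighbouring models.
      (1 until iterations).foreach { _ =>
        updateSequence.foreach { coordinateId =>
          val newModel = train(coordinateId, prevModelOpt)
          currentModels.put(coordinateId, newModel)
          prevModelOpt = Some(newModel)
        }
      }

      currentModels.toMap
    }

    // Example: two fake "coordinates" whose models are just descriptive strings.
    val result = descendSketch[String](
      Seq("fixed", "perMember"),
      (id, prev) => s"$id-model(prev=${prev.getOrElse("none")})",
      iterations = 2)
    println(result)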
* @@ -371,48 +357,22 @@ object CoordinateDescent { * @return A new [[GameModel]] */ private def descend( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], updateSequence: Seq[CoordinateId], coordinatesToTrain: Seq[CoordinateId], iterations: Int, initialModels: Map[CoordinateId, DatumScoringModel])( implicit logger: Logger): GameModel = { - var i: Int = 2 - - // - // First coordinate, first iteration - // - - val firstCoordinateId = updateSequence.head - val firstCoordinate = coordinates(firstCoordinateId) - val firstCoordinateModel = trainOrFetchCoordinateModel( - firstCoordinateId, - firstCoordinate, - coordinatesToTrain, - initialModels.get(firstCoordinateId), - residualsOpt = None) - - var previousScores = firstCoordinate.score(firstCoordinateModel) - var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().sparkContext.emptyRDD) - val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = - mutable.Map(firstCoordinateId -> firstCoordinateModel) - val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = - mutable.Map(firstCoordinateId -> previousScores) - - previousScores.persistRDD(StorageLevel.DISK_ONLY) + var i: Int = 1 + val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map() + // The optional model of previous coordinate + var prevModelOpt: Option[DatumScoringModel] = None // - // Subsequent coordinates, first iteration + // First iteration // - - updateSequence.tail.foreach { coordinateId => - - val newSummedScores = previousScores + summedScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - summedScores = newSummedScores + updateSequence.foreach { coordinateId => val coordinate = coordinates(coordinateId) val newModel = trainOrFetchCoordinateModel( @@ -420,52 +380,36 @@ object CoordinateDescent { coordinate, coordinatesToTrain, initialModels.get(coordinateId), - Some(summedScores)) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) + prevModelOpt) + // persist the new model + persistModel(newModel, coordinateId, 1) currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + prevModelOpt = Option.apply(newModel) } // - // Subsequent coordinates, subsequent iterations + // Subsequent iterations // - - while (i <= iterations) { + while (i < iterations) { coordinatesToTrain.foreach { coordinateId => - val oldScores = currentScores(coordinateId) - val newSummedScores = summedScores - oldScores + previousScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - oldScores.unpersistRDD() - summedScores = newSummedScores - val coordinate = coordinates(coordinateId) val oldModelOpt = currentModels.get(coordinateId) - val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, Some(summedScores)) + val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, prevModelOpt) persistModel(newModel, coordinateId, i) - unpersistModel(oldModelOpt.get) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) - currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + unpersistModel(oldModelOpt.get) + prevModelOpt = Option.apply(newModel) } i += 1 } - summedScores.unpersistRDD() - currentScores.foreach { case (_, scores) => - scores.unpersistRDD() + currentModels.foreach { case (_, model) => + unpersistModel(model) } new 
GameModel(currentModels.toMap) @@ -491,57 +435,30 @@ object CoordinateDescent { * @return A (new [[GameModel]], model [[EvaluationResults]]) tuple */ private def descendWithValidation( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], updateSequence: Seq[CoordinateId], coordinatesToTrain: Seq[CoordinateId], iterations: Int, initialModels: Map[CoordinateId, DatumScoringModel], - validationData: RDD[(UniqueSampleId, GameDatum)], + validationData: DataFrame, evaluationSuite: EvaluationSuite)( implicit logger: Logger): (GameModel, EvaluationResults) = { - val evaluatorType: EvaluatorType = evaluationSuite.primaryEvaluator.evaluatorType - - var i: Int = 2 - - // - // First coordinate, first iteration - // - - val firstCoordinateId = updateSequence.head - val firstCoordinate = coordinates(firstCoordinateId) - val firstCoordinateModel = trainOrFetchCoordinateModel( - firstCoordinateId, - firstCoordinate, - coordinatesToTrain, - initialModels.get(firstCoordinateId), - residualsOpt = None) - - var previousScores = firstCoordinate.score(firstCoordinateModel) - var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().sparkContext.emptyRDD) - val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = - mutable.Map(firstCoordinateId -> firstCoordinateModel) - val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = - mutable.Map(firstCoordinateId -> previousScores) - var bestModels: Map[CoordinateId, DatumScoringModel] = currentModels.toMap - var bestEvaluationResults: EvaluationResults = evaluateModel( - firstCoordinateModel, - validationData, - evaluationSuite) + var i: Int = 1 + val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map() + // The optional model of previous coordinate + var prevModelOpt: Option[DatumScoringModel] = Option.empty - previousScores.persistRDD(StorageLevel.DISK_ONLY) + val evaluatorType: EvaluatorType = evaluationSuite.primaryEvaluator.evaluatorType + var bestEvaluationResults: EvaluationResults = null // - // Subsequent coordinates, first iteration + // First iteration // - updateSequence.tail.foreach { coordinateId => + updateSequence.foreach { coordinateId => - val newSummedScores = previousScores + summedScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - summedScores = newSummedScores +// summedScores = previousScores + summedScores val coordinate = coordinates(coordinateId) val newModel = trainOrFetchCoordinateModel( @@ -549,20 +466,19 @@ object CoordinateDescent { coordinate, coordinatesToTrain, initialModels.get(coordinateId), - Some(summedScores)) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) + prevModelOpt) + // persist the new model + persistModel(newModel, coordinateId, 1) currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + prevModelOpt = Option.apply(newModel) val evaluationModel = new GameModel(currentModels.toMap) val evaluationResults = evaluateModel(evaluationModel, validationData, evaluationSuite) // Log warning if adding a coordinate reduces the overall model performance - if (evaluatorType.betterThan(bestEvaluationResults.primaryEvaluation, evaluationResults.primaryEvaluation)) { + if (bestEvaluationResults != null + && evaluatorType.betterThan(bestEvaluationResults.primaryEvaluation, evaluationResults.primaryEvaluation)) { logger.info(s"Warning: adding model for coordinate '$coordinateId' reduces 
overall model performance") } @@ -573,37 +489,28 @@ object CoordinateDescent { // Subsequent coordinates, subsequent iterations // - bestModels = currentModels.toMap + var bestModels: Map[CoordinateId, DatumScoringModel] = currentModels.toMap - while (i <= iterations) { + while (i < iterations) { coordinatesToTrain.foreach { coordinateId => - val oldScores = currentScores(coordinateId) - val newSummedScores = summedScores - oldScores + previousScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - oldScores.unpersistRDD() - summedScores = newSummedScores +// summedScores = summedScores - oldScores + previousScores val coordinate = coordinates(coordinateId) val oldModelOpt = currentModels.get(coordinateId) - val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, Some(summedScores)) + val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, prevModelOpt) persistModel(newModel, coordinateId, i) + currentModels.put(coordinateId, newModel) + prevModelOpt = Option.apply(newModel) + // If the best GAME model doesn't have a model for this coordinate or it does but it's not the old model, // unpersist the old model. if (bestModels.get(coordinateId).forall(!_.eq(oldModelOpt.get))) { unpersistModel(oldModelOpt.get) } - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) - - currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores - val evaluationModel = new GameModel(currentModels.toMap) val evaluationResults = evaluateModel(evaluationModel, validationData, evaluationSuite) if (evaluatorType.betterThan(evaluationResults.primaryEvaluation, bestEvaluationResults.primaryEvaluation)) { @@ -625,10 +532,6 @@ object CoordinateDescent { i += 1 } - summedScores.unpersistRDD() - currentScores.foreach { case (_, scores) => - scores.unpersistRDD() - } currentModels.foreach { case (coordinateId, model) => // If the best GAME model doesn't have a model for this coordinate or it does but they don't match, unpersist it if (bestModels.get(coordinateId).forall(!_.eq(model))) { @@ -645,26 +548,28 @@ object CoordinateDescent { * @param coordinateId The ID of the single coordinate for which to train a new model * @param coordinate The [[Coordinate]] for which to train a new model * @param initialModelOpt An optional existing model to use for warm-start training - * @param validationDataAndEvaluationSuiteOpt An optional (validation data, set of evaluation metrics to compute) - * tuple + * @param validationOpt An optional validation data + * @param evaluationSuiteOpt An optional set of evaluation metrics to compute tuple * @param logger An implicit logger * @return A (new [[GameModel]], optional model [[EvaluationResults]]) tuple */ private def descendSingleCoordinate( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, initialModelOpt: Option[DatumScoringModel], - validationDataAndEvaluationSuiteOpt: Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)])( + validationOpt: Option[DataFrame], + evaluationSuiteOpt: Option[EvaluationSuite])( implicit logger: Logger): (GameModel, Option[EvaluationResults]) = { - val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, residualsOpt = None) + val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, prevModelOpt = None) persistModel(newModel, coordinateId, iteration = 1) - val evaluationResultsOpt = 
validationDataAndEvaluationSuiteOpt.map { case (validationData, evaluationSuite) => - evaluateModel(newModel, validationData, evaluationSuite) + val gameModel = new GameModel(Map(coordinateId -> newModel)) + val evaluationResultsOpt = validationOpt.map { case validationData => + evaluateModel(gameModel, validationData, evaluationSuiteOpt.get) } - (new GameModel(Map(coordinateId -> newModel)), evaluationResultsOpt) + (gameModel, evaluationResultsOpt) } } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala index bdf9bf2e..d22e750f 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala @@ -14,27 +14,14 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.Dataset -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import com.linkedin.photon.ml.optimization.OptimizationTracker /** * The optimization problem coordinate for a pre-trained model. * - * @tparam D The training dataset type - * @param dataset The training dataset */ -abstract class ModelCoordinate[D <: Dataset[D]](dataset: D) extends Coordinate(dataset) { - - /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - override protected[algorithm] def updateCoordinateWithDataset(dataset: D): Coordinate[D] = - throw new UnsupportedOperationException("Attempted to update model coordinate.") +abstract class ModelCoordinate extends Coordinate { /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. @@ -60,5 +47,5 @@ abstract class ModelCoordinate[D <: Dataset[D]](dataset: D) extends Coordinate(d * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores + override protected[algorithm] def updateOffset(model: DatumScoringModel) } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala new file mode 100644 index 00000000..826c7363 --- /dev/null +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala @@ -0,0 +1,25 @@ +/* + * Copyright 2017 LinkedIn Corp. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ */ +package com.linkedin.photon.ml.constants + +object DataConst { + val ID = "uniqueId" + val SCORE = "score" + + val MODEL_ID = "mId" + val MODEL_TYPE = "modelType" + val COEFFICIENTS = "coefficients" + val VARIANCES = "variances" +} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala deleted file mode 100644 index 311be33a..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data.scoring - -import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.GameDatum - -/** - * The class used to track scored data points throughout training. The score objects are scores only, with no additional - * information. - * - * @param scoresRdd The scores consist of (unique ID, score) pairs as explained above. - */ -protected[ml] class CoordinateDataScores(override val scoresRdd: RDD[(UniqueSampleId, Double)]) - extends DataScores[Double, CoordinateDataScores](scoresRdd) { - - /** - * Generic method to combine two [[CoordinateDataScores]] objects. - * - * @param op The operator to combine two [[CoordinateDataScores]] - * @param that The [[CoordinateDataScores]] instance to merge with this instance - * @return A merged [[CoordinateDataScores]] - */ - private def joinAndApply(op: (Double, Double) => Double, that: CoordinateDataScores): CoordinateDataScores = - // Use fullOuterJoin: it's possible for some data to not be scored by a model - new CoordinateDataScores( - this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapValues { case (thisScoreOpt, thatScoreOpt) => - op(thisScoreOpt.getOrElse(MathConst.DEFAULT_SCORE), thatScoreOpt.getOrElse(MathConst.DEFAULT_SCORE)) - }) - - /** - * The addition operation for [[CoordinateDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[CoordinateDataScores]] instance to add to this instance - * @return A new [[CoordinateDataScores]] instance encapsulating the accumulated values - */ - override def +(that: CoordinateDataScores): CoordinateDataScores = joinAndApply((a, b) => a + b, that) - - /** - * The minus operation for [[CoordinateDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[CoordinateDataScores]] instance to subtract from this instance - * @return A new [[CoordinateDataScores]] instance encapsulating the subtracted values - */ - override def -(that: CoordinateDataScores): CoordinateDataScores = joinAndApply((a, b) => a - b, that) - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. 
Defines under - * what circumstances this class can equal another class. - * - * @param other Some other object - * @return Whether this object can equal the other object - */ - override def canEqual(other: Any): Boolean = other.isInstanceOf[CoordinateDataScores] -} - -object CoordinateDataScores { - - /** - * A factory method to create a [[CoordinateDataScores]] object from an [[RDD]] of scores. - * - * @param scores The scores, consisting of (unique ID, score) pairs. - * @return A new [[CoordinateDataScores]] object - */ - def apply(scores: RDD[(UniqueSampleId, Double)]): CoordinateDataScores = new CoordinateDataScores(scores) - - /** - * Convert a [[GameDatum]] and a raw score into a score object. For [[CoordinateDataScores]] this is the raw score. - * - * @param datum The datum which was scored - * @param score The raw score for the datum - * @return The score object - */ - protected[ml] def toScore(datum: GameDatum, score: Double): Double = score -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala deleted file mode 100644 index eb140632..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data.scoring - -import scala.reflect.ClassTag - -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions -import org.apache.spark.storage.StorageLevel - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.spark.RDDLike - -/** - * A base class for tracking scored data points, where the scores are stored in an [[RDD]] which associates the unique - * ID of a data point with a score object. - * - * @param scoresRdd Data point scores, as described above - */ -abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( - val scoresRdd: RDD[(UniqueSampleId, T)]) - extends RDDLike { - - /** - * The addition operation for [[DataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[DataScores]] instance to add to this instance - * @return A new [[DataScores]] instance encapsulating the accumulated values - */ - def +(that: D): D - - /** - * The minus operation for [[DataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[DataScores]] instance to subtract from this instance - * @return A new [[DataScores]] instance encapsulating the subtracted values - */ - def -(that: D): D - - /** - * Get the Spark context for the distributed scores. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = scoresRdd.sparkContext - - /** - * Set the name of [[scoresRdd]]. 
- * - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[scoresRdd]] assigned - */ - override def setName(name: String): RDDLike = { - - scoresRdd.setName(name) - - this - } - - /** - * Set the storage level of [[scoresRdd]]. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[scoresRdd]] set - */ - override def persistRDD(storageLevel: StorageLevel): RDDLike = { - - if (!scoresRdd.getStorageLevel.isValid) scoresRdd.persist(storageLevel) - - this - } - - /** - * Mark [[scoresRdd]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[scoresRdd]] marked non-persistent - */ - override def unpersistRDD(): RDDLike = { - - if (scoresRdd.getStorageLevel.isValid) scoresRdd.unpersist() - - this - } - - /** - * Materialize [[scoresRdd]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). - * - * @return This object with [[scoresRdd]] materialized - */ - override def materialize(): RDDLike = { - - scoresRdd.count() - - this - } - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. Defines under - * what circumstances this class can equal another class. - * - * @param other Some other object - * @return Whether this object can equal the other object - */ - def canEqual(other: Any): Boolean = other.isInstanceOf[DataScores[T, D]] - - /** - * Compare two [[DataScores]]s objects. - * - * @param other Some other object - * @return True if the both [[DataScores]] objects have identical scores for each unique ID, false otherwise - */ - override def equals(other: Any): Boolean = other match { - - case that: DataScores[T, D] => - - val canEqual = this.canEqual(that) - lazy val areEqual = this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapPartitions { iterator => - - val areScoresEqual = iterator.forall { - case (_, (Some(thisScore), Some(thatScore))) => thisScore.equals(thatScore) - case _ => false - } - - Iterator.single(areScoresEqual) - } - .fold(true)(_ && _) - - canEqual && areEqual - - case _ => - false - } - - /** - * Returns a hash code value for the object. - * - * @return An [[Int]] hash code - */ - override def hashCode: Int = scoresRdd.hashCode() -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala deleted file mode 100644 index 798770e6..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- */ -package com.linkedin.photon.ml.data.scoring - -import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.GameDatum - -/** - * The class used to track scored data points throughout scoring and validation. The score objects are - * [[ScoredGameDatum]], full data points with score information. - * - * @param scoresRdd Data point scores, as described above - */ -class ModelDataScores(override val scoresRdd: RDD[(UniqueSampleId, ScoredGameDatum)]) - extends DataScores[ScoredGameDatum, ModelDataScores](scoresRdd) { - - /** - * Generic method to combine two [[ModelDataScores]] objects. - * - * @param op The operator to combine two [[ModelDataScores]] - * @param that The [[ModelDataScores]] instance to merge with this instance - * @return A merged [[ModelDataScores]] - */ - private def joinAndApply( - op: (ScoredGameDatum, ScoredGameDatum) => ScoredGameDatum, - that: ModelDataScores): ModelDataScores = - // Use fullOuterJoin: it's possible for some data to not be scored by a model - new ModelDataScores( - this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapValues { case (thisScoreOpt, thatScoreOpt) => - // Currently acceptable to drop op if one value is missing, since the currently existing operations are - // commutative and the default value is the 0 value - (thisScoreOpt, thatScoreOpt) match { - case (Some(thisScore), Some(thatScore)) => op(thisScore, thatScore) - case (Some(thisScore), None) => op(thisScore, thisScore.copy(score = MathConst.DEFAULT_SCORE)) - case (None, Some(thatScore)) => op(thatScore.copy(score = MathConst.DEFAULT_SCORE), thatScore) - } - }) - - /** - * The addition operation for [[ModelDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[ModelDataScores]] instance to add to this instance - * @return A new [[ModelDataScores]] instance encapsulating the accumulated values - */ - override def +(that: ModelDataScores): ModelDataScores = - joinAndApply((a, b) => a.copy(score = a.score + b.score), that) - - /** - * The minus operation for [[ModelDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[ModelDataScores]] instance to subtract from this instance - * @return A new [[ModelDataScores]] instance encapsulating the subtracted values - */ - override def -(that: ModelDataScores): ModelDataScores = - joinAndApply((a, b) => a.copy(score = a.score - b.score), that) - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. Defines under - * what circumstances this class can equal another class. - * - * @param other Some other object - * @return Whether this object can equal the other object - */ - override def canEqual(other: Any): Boolean = other.isInstanceOf[ModelDataScores] -} - -object ModelDataScores { - - /** - * A factory method to create a [[ModelDataScores]] object from an [[RDD]] of scores. - * - * @param scores The scores, consisting of (unique ID, scored datum) pairs. - * @return A new [[ModelDataScores]] object - */ - def apply(scores: RDD[(Long, ScoredGameDatum)]): ModelDataScores = new ModelDataScores(scores) - - /** - * Convert a [[GameDatum]] and a raw score into a score object. For [[CoordinateDataScores]] this is the raw score. 
- * - * @param datum The datum which was scored - * @param score The raw score for the datum - * @return The score object - */ - protected[ml] def toScore(datum: GameDatum, score: Double): ScoredGameDatum = datum.toScoredGameDatum(score) -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala index 38dc40c1..07ab0516 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala @@ -14,7 +14,7 @@ */ package com.linkedin.photon.ml.model -import breeze.linalg.{DenseVector, SparseVector, Vector, norm} +import breeze.linalg.{Vector, norm} import breeze.stats.meanAndVariance import com.linkedin.photon.ml.constants.MathConst diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala index 5b6aff04..e30d5029 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala @@ -14,12 +14,9 @@ */ package com.linkedin.photon.ml.model -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} import com.linkedin.photon.ml.util.Summarizable /** @@ -37,16 +34,20 @@ trait DatumScoringModel extends Summarizable { * * @note "score" = features * coefficients (Before link function in the case of logistic regression, for example) * @param dataPoints The dataset to score + * @param scoreField The field name of the score * @return The computed scores */ - def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores + def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame /** - * Compute the scores for the GAME dataset, and store the scores only. + * Accumulatively compute the scores for the GAME dataset. 
    *
-   * @note "score" = features * coefficients (Before link function in the case of logistic regression, for example)
+   * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example)
    * @param dataPoints The dataset to score
+   * @param scoreField The field name of the score
+   * @param accumulativeScoreField The field name of the accumulativeScore
    * @return The computed scores
    */
-  protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores
+  def computeScore(dataPoints: DataFrame, scoreField: String, accumulativeScoreField: String): DataFrame
+
 }
diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala
index 417ba0e2..d30c0692 100644
--- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala
+++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala
@@ -16,12 +16,11 @@ package com.linkedin.photon.ml.model
 
 import scala.collection.SortedMap
 
-import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 
 import com.linkedin.photon.ml.TaskType.TaskType
-import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId}
-import com.linkedin.photon.ml.data.GameDatum
-import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores}
+import com.linkedin.photon.ml.Types.CoordinateId
+import com.linkedin.photon.ml.constants.DataConst
 import com.linkedin.photon.ml.util.ClassUtils
 
 /**
@@ -29,12 +28,12 @@ import com.linkedin.photon.ml.util.ClassUtils
  *
  * @param gameModels A (modelName -> model) map containing the sub-models that make up the complete GAME model
  */
-class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) extends DatumScoringModel {
+class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) {
 
   // The model type should be consistent at construction time. However, copies of this object shouldn't need to call the
   // check again. Thus the value is lazy, so that anonymous classes can overwrite it without triggering a call to
   // determineModelType, but it's called immediately so that it's evaluated at construction time.
-  override lazy val modelType: TaskType = GameModel.determineModelType(gameModels)
+  lazy val modelType: TaskType = GameModel.determineModelType(gameModels)
   modelType
 
   /**
@@ -92,22 +91,26 @@ class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) e
    * Compute score, PRIOR to going through any link function, i.e. just compute a dot product of feature values
    * and model coefficients.
    *
-   * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired
-   *                   [[GameDatum]] object, referred to in the GAME code as the "unique id")
+   * @param dataPoints The dataset to score
    * @return The computed scores
    */
-  override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores =
-    gameModels.values.map(_.score(dataPoints)).reduce(_ + _)
+  def score(dataPoints: DataFrame): DataFrame = {
+
+    gameModels.foreach { case (coordinateId: CoordinateId, coordinateModel: DatumScoringModel) =>
+      val scoreName = s"${coordinateId}_score"
+      coordinateModel.computeScore(dataPoints, scoreName, DataConst.SCORE)
+    }
+
+    dataPoints
+  }
 
-  override protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores =
-    gameModels.values.map(_.scoreForCoordinateDescent(dataPoints)).reduce(_ + _)
 
   /**
    * Summarize this GAME model.
    *
    * @return A summary of the object in string representation
    */
-  override def toSummaryString: String = {
+  def toSummaryString: String = {
     gameModels.map { case (name, model) => s"Model name: $name, summary:\n${model.toSummaryString}\n" }.mkString("\n")
   }
 
diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala
index 5266e03a..4ee7b1d9 100644
--- a/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala
+++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala
@@ -162,7 +162,7 @@ abstract class Optimizer[-Function <: ObjectiveFunction](
   protected[ml] def optimize(
       objectiveFunction: Function,
       initialCoefficients: Vector[Double])(
-      data: objectiveFunction.Data): (Vector[Double], Double) = {
+      data: objectiveFunction.Data): (Vector[Double], OptimizationStatesTracker) = {
 
     val normalizedInitialCoefficients = normalizationContext.value.modelToTransformedSpace(initialCoefficients)
 
@@ -183,7 +183,7 @@ abstract class Optimizer[-Function <: ObjectiveFunction](
     statesTracker.convergenceReason = getConvergenceReason
 
     val currState = getCurrentState.get
-    (currState.coefficients, currState.loss)
+    (currState.coefficients, statesTracker)
   }
 
   /**
diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala
index 4d5e1221..ace58073 100644
--- a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala
+++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala
@@ -15,10 +15,9 @@
 package com.linkedin.photon.ml.sampling
 
 import java.util.Random
-
+import com.linkedin.photon.ml.Types.UniqueSampleId
 import org.apache.spark.rdd.RDD
 
-import com.linkedin.photon.ml.Types.UniqueSampleId
 import com.linkedin.photon.ml.constants.MathConst
 import com.linkedin.photon.ml.data.LabeledPoint
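
Below is a minimal usage sketch (not part of the patch) of the DataFrame-based scoring contract introduced above, assuming the patch is applied. Only DatumScoringModel.computeScore, DataConst.SCORE, and the per-coordinate "<coordinateId>_score" naming come from the diff; the model instance, Spark session setup, input path, and the "coordinate_score" column name are placeholders for illustration.

import org.apache.spark.sql.SparkSession

import com.linkedin.photon.ml.constants.DataConst
import com.linkedin.photon.ml.model.DatumScoringModel

object ComputeScoreSketch {

  // Score a GAME dataset with one trained coordinate model and inspect the per-coordinate
  // and accumulated score columns. The model instance and input path are hypothetical.
  def run(model: DatumScoringModel): Unit = {

    val spark = SparkSession.builder.appName("compute-score-sketch").getOrCreate()

    // Assumed input layout: one row per sample, carrying the feature columns the model expects.
    val gameDataset = spark.read.parquet("/path/to/game/dataset")

    // Per the new contract, the raw score is written to the given score field and the running
    // total across coordinates is kept under the accumulative score field (DataConst.SCORE).
    val scored = model.computeScore(gameDataset, "coordinate_score", DataConst.SCORE)

    scored.select("coordinate_score", DataConst.SCORE).show(10)
  }
}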