package com.twitter.timelines.data_processing.ml_util.aggregation_framework

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.DateParser
import com.twitter.scalding.RichDate
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.storehaus_internal.manhattan._
import com.twitter.storehaus_internal.util.ApplicationID
import com.twitter.storehaus_internal.util.DatasetName
import com.twitter.storehaus_internal.util.HDFSPath
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.batch.MillisecondBatcher
import com.twitter.summingbird_internal.runner.store_config._
import java.util.TimeZone

/**
 * Configuration common to all offline aggregate stores.
 *
 * @param outputHdfsPathPrefix HDFS prefix under which all output aggregate types are stored offline
 * @param dummyAppId Dummy Manhattan app id required by Summingbird (unused)
 * @param dummyDatasetPrefix Dummy Manhattan dataset prefix required by Summingbird (unused)
 * @param startDate Start date from which the Summingbird job begins computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
  outputHdfsPathPrefix: String,
  dummyAppId: String,
  dummyDatasetPrefix: String,
  startDate: String)

/**
 * A trait inherited by any object that defines an HDFS prefix to write
 * output data to. For example, Timelines has its own output prefix for
 * writing aggregates_v2 results; your team can create its own.
 */
trait OfflineStoreCommonConfig extends Serializable {
  /**
   * @param startDate Date to create the config for
   * @return OfflineAggregateStoreCommonConfig with all output config details populated
   */
  def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
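
// A minimal sketch of implementing OfflineStoreCommonConfig for a hypothetical
// team. The object name, HDFS prefix, app id, and dataset prefix below are
// illustrative placeholders, not production values.
object ExampleTeamOfflineStoreCommonConfig extends OfflineStoreCommonConfig {
  override def apply(startDate: String): OfflineAggregateStoreCommonConfig =
    OfflineAggregateStoreCommonConfig(
      outputHdfsPathPrefix = "/example_team/aggregates_v2",
      dummyAppId = "example_team_aggregates",
      dummyDatasetPrefix = "example_team_aggregates",
      startDate = startDate
    )
}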

/**
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date from which aggregates should be computed for this output store
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the KeyVal source
 */
abstract class OfflineAggregateStoreBase
    extends OfflineStoreOnlyConfig[ManhattanROConfig]
    with AggregateStore {

  override def name: String
  def startDate: String
  def commonConfig: OfflineStoreCommonConfig
  def batchesToKeep: Int
  def maxKvSourceFailures: Int

  val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig.apply(startDate)

  val manhattan: ManhattanROConfig = ManhattanROConfig(
    /* This is a sample config, will be replaced with production config later */
    HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}"),
    ApplicationID(datedCommonConfig.dummyAppId),
    DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1"),
    com.twitter.storehaus_internal.manhattan.Adama
  )

  // Each batch spans 24 hours, i.e. aggregates are computed in daily batches.
  val batcherSize: Int = 24
  val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)

  val startTime: RichDate =
    RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)

  val offline: ManhattanROConfig = manhattan
}
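
// A worked example of how the sample Manhattan config resolves: with the
// illustrative ExampleTeamOfflineStoreCommonConfig above and a hypothetical
// store name "user_aggregates", the interpolations yield
//   HDFSPath("/example_team/aggregates_v2/user_aggregates") and
//   DatasetName("example_team_aggregates_user_aggregates_1").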

/**
 * Defines an aggregates store which is composed of DataRecords.
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date from which aggregates should be computed for this output store
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the KeyVal source
 */
case class OfflineAggregateDataRecordStore(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val batchesToKeep: Int = 7,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase {

  def toOfflineAggregateDataRecordStoreWithDAL(
    dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
  ): OfflineAggregateDataRecordStoreWithDAL =
    OfflineAggregateDataRecordStoreWithDAL(
      name = name,
      startDate = startDate,
      commonConfig = commonConfig,
      dalDataset = dalDataset,
      maxKvSourceFailures = maxKvSourceFailures
    )
}
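
// A usage sketch (the name, date, and config object are hypothetical):
// constructing a plain offline DataRecord store with the example config above.
//
//   val store = OfflineAggregateDataRecordStore(
//     name = "user_aggregates",
//     startDate = "2023-01-01",
//     commonConfig = ExampleTeamOfflineStoreCommonConfig
//   )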

/**
 * A trait for stores that write their output through a KeyVal DAL dataset.
 */
trait withDALDataset {
  def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}

/**
 * Defines an aggregates store which is composed of DataRecords and writes using DAL.
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date from which aggregates should be computed for this output store
 * @param commonConfig Provider of other common configuration details
 * @param dalDataset The KeyValDALDataset for this output store
 * @param batchesToKeep Unused, kept for interface compatibility. You must define a separate
 *   Oxpecker retention policy to maintain the desired number of versions.
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the KeyVal source
 */
case class OfflineAggregateDataRecordStoreWithDAL(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
  override val batchesToKeep: Int = -1,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase
    with withDALDataset
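
// A usage sketch (illustrative): given a DAL dataset of matching key/value
// shape, the plain store from the earlier sketch can be converted to a
// DAL-backed store via the helper defined above. `exampleDalDataset` is a
// hypothetical KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]].
//
//   val dalStore = store.toOfflineAggregateDataRecordStoreWithDAL(exampleDalDataset)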