From d32006473eec48c9696553f898a95d3750796a18 Mon Sep 17 00:00:00 2001 From: Manjunath Davanam Date: Sun, 7 Nov 2021 21:01:47 +0530 Subject: [PATCH 01/32] Issue feat SB-27408: Initial commit of Base Archival Job Implementation --- .../analytics/archival/BaseArchivalJob.scala | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala new file mode 100644 index 000000000..090dbf257 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -0,0 +1,59 @@ +package org.sunbird.analytics.archival + +import com.datastax.spark.connector.cql.CassandraConnectorConf +import org.apache.spark.SparkContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.cassandra._ +import org.ekstep.analytics.framework.Level.ERROR +import org.ekstep.analytics.framework.conf.AppConf +import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} +import org.ekstep.analytics.framework.{FrameworkContext, IJob, JobConfig} +import org.sunbird.analytics.exhaust.BaseReportsJob + +trait BaseArchivalJob extends BaseReportsJob with IJob with Serializable { + + def main(config: String)(implicit sc: Option[SparkContext] = None, fc: Option[FrameworkContext] = None): Unit = { + implicit val className: String = getClassName; + JobLogger.init(jobName) + JobLogger.start(s"$jobName started executing - ver3", Option(Map("config" -> config, "model" -> jobName))) + implicit val jobConfig: JobConfig = JSONUtils.deserialize[JobConfig](config) + implicit val spark: SparkSession = openSparkSession(jobConfig) + implicit val frameworkContext: FrameworkContext = getReportingFrameworkContext() + try { + val res = CommonUtil.time(archive()); + JobLogger.end(s"$jobName completed execution", "SUCCESS", None) + } catch { + case ex: Exception => ex.printStackTrace() + JobLogger.log(ex.getMessage, None, ERROR); + JobLogger.end(jobName + " execution failed", "FAILED", Option(Map("model" -> jobName, "statusMsg" -> ex.getMessage))); + } + finally { + frameworkContext.closeContext(); + spark.close() + } + + + } + + def jobId: String + + def jobName: String + + def getReportPath: String + + def getReportKey: String + + def init()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + spark.setCassandraConf("LMSCluster", CassandraConnectorConf.ConnectionHostParam.option(AppConf.getConfig("sunbird.courses.cluster.host"))) + } + + def dataFilter(): Unit = {} + + def dateFormat(): String + + def getClassName: String + + def archive(): Unit = {} + + +} From 9a4cbe29117d0722c32dfcd975fb40a81759c526 Mon Sep 17 00:00:00 2001 From: utk14 Date: Mon, 15 Nov 2021 13:05:50 +0530 Subject: [PATCH 02/32] Issue SB-24793 feat: Assessment archived data:: Base Archival job Implementation --- .../archival/AssessmentArchivalJob.scala | 20 ++++ .../analytics/archival/BaseArchivalJob.scala | 102 +++++++++++++++--- .../util/ArchivalMetaDataStoreJob.scala | 94 ++++++++++++++++ .../src/test/resources/application.conf | 3 + .../archival/TestAsssessmentArchivalJob.scala | 51 +++++++++ .../analytics/util/EmbeddedPostgres.scala | 22 ++++ 6 files changed, 279 insertions(+), 13 deletions(-) create mode 100644 data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala create mode 
100644 data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala create mode 100644 data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala new file mode 100644 index 000000000..a71b5a52e --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -0,0 +1,20 @@ +package org.sunbird.analytics.archival + +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} + +object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { + + override def getClassName = "org.sunbird.analytics.archival.AssessmentArchivalJob" + override def jobName() = "AssessmentArchivalJob"; + override def jobId(): String = "assessment-archival"; + override def getReportPath() = "assessment-archival/"; + override def getReportKey() = "assessment"; + + override def processArchival(archivalTableData: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { + println("Process Archival") + + spark.emptyDataFrame + } + +} diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 090dbf257..83ef88583 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -2,15 +2,25 @@ package org.sunbird.analytics.archival import com.datastax.spark.connector.cql.CassandraConnectorConf import org.apache.spark.SparkContext -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.cassandra._ +import org.apache.spark.sql.types.StructType import org.ekstep.analytics.framework.Level.ERROR import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} import org.ekstep.analytics.framework.{FrameworkContext, IJob, JobConfig} import org.sunbird.analytics.exhaust.BaseReportsJob +import org.apache.spark.sql.functions._ +import org.joda.time.DateTime +import org.sunbird.analytics.archival.util.ArchivalMetaDataStoreJob -trait BaseArchivalJob extends BaseReportsJob with IJob with Serializable { +case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) + +case class Period(year: Int, weekOfYear: Int) +trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { + + private val partitionCols = List("batch_id", "year", "week_of_year") + val cassandraUrl = "org.apache.spark.sql.cassandra" def main(config: String)(implicit sc: Option[SparkContext] = None, fc: Option[FrameworkContext] = None): Unit = { implicit val className: String = getClassName; @@ -19,8 +29,9 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with Serializable { implicit val jobConfig: JobConfig = JSONUtils.deserialize[JobConfig](config) implicit val spark: SparkSession = openSparkSession(jobConfig) implicit val frameworkContext: 
FrameworkContext = getReportingFrameworkContext() + try { - val res = CommonUtil.time(archive()); + val res = CommonUtil.time(execute()); JobLogger.end(s"$jobName completed execution", "SUCCESS", None) } catch { case ex: Exception => ex.printStackTrace() @@ -35,25 +46,90 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with Serializable { } - def jobId: String + def init()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + spark.setCassandraConf("LMSCluster", CassandraConnectorConf.ConnectionHostParam.option(AppConf.getConfig("sunbird.courses.cluster.host"))) + } - def jobName: String +// def dataFilter(): Unit = {} +// def dateFormat(): String; + def getClassName: String; + + def execute()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + val modelParams = config.modelParams.getOrElse(Map[String, Option[AnyRef]]()); + val archivalRequest = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) + val archivalTable = archivalRequest.archivalTable + val archivalKeyspace = archivalRequest.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + + val batchId: String = archivalRequest.batchId.getOrElse("") + val date: String = archivalRequest.date.getOrElse("") + val mode: String = modelParams.getOrElse("mode","archive").asInstanceOf[String] + + println("modelParams: " + modelParams) + println("archival request: " + archivalRequest) + val archivalTableData: DataFrame = getArchivalData(archivalTable, archivalKeyspace,Option(batchId),Option(date)) + println("archivalTableData ") + archivalTableData.show(false) + + mode.toLowerCase() match { + case "archival" => + archiveData(archivalTableData, archivalRequest) + case "delete" => + deleteArchivedData(archivalTableData,archivalRequest) + } + } - def getReportPath: String + def archiveData(data: DataFrame, archivalRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + val requests = getRequests(jobId, archivalRequest.batchId) + println("requestLength: " + requests.length) + try { + if(requests.length == 0) { + val groupedDF = data.withColumn("updated_on", to_timestamp(col("updated_on"))) + .withColumn("year", year(col("updated_on"))) + .withColumn("week_of_year", weekofyear(col("updated_on"))) + .withColumn("question", to_json(col("question"))) + groupedDF.show(false) + val archiveBatchList = groupedDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + println("archiveBatchList: " + archiveBatchList.toString) + +// val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + } + } catch { + case ex: Exception => + ex.printStackTrace() + } + processArchival(data, archivalRequest) + } - def getReportKey: String + def deleteArchivedData(data: DataFrame, archivalRequest: Request): Unit = { - def init()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { - spark.setCassandraConf("LMSCluster", CassandraConnectorConf.ConnectionHostParam.option(AppConf.getConfig("sunbird.courses.cluster.host"))) } - def dataFilter(): Unit = {} + def processArchival(archivalTableData: DataFrame, archivalRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; - def dateFormat(): String + def getArchivalData(table: String, 
keyspace: String, batchId: Option[String], date: Option[String])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame = { + val archivalTableSettings = Map("table" -> table, "keyspace" -> keyspace, "cluster" -> "LMSCluster") + val archivalDBDF = loadData(archivalTableSettings, cassandraUrl, new StructType()) + val batchIdentifier = batchId.getOrElse(null) - def getClassName: String + if (batchIdentifier.nonEmpty) { + archivalDBDF.filter(col("batch_id") === batchIdentifier).persist() + } else { + archivalDBDF + } + } - def archive(): Unit = {} + def getWeekAndYearVal(date: String): Period = { + if (null != date && date.nonEmpty) { + val dt = new DateTime(date) + Period(year = dt.getYear, weekOfYear = dt.getWeekOfWeekyear) + } else { + Period(0, 0) + } + } + def jobId: String; + def jobName: String; + def getReportPath: String; + def getReportKey: String; } diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala new file mode 100644 index 000000000..fa88cc9b0 --- /dev/null +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -0,0 +1,94 @@ +package org.sunbird.analytics.archival.util + +import java.sql.{Connection, DriverManager, PreparedStatement, Timestamp} +import java.util.Properties + +import org.apache.commons.lang.StringUtils +import org.apache.spark.sql.{Encoders, SparkSession} +import org.apache.spark.sql.functions.{col, lit} +import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} +import org.ekstep.analytics.framework.Level.INFO +import org.ekstep.analytics.framework.conf.AppConf +import org.ekstep.analytics.framework.util.{CommonUtil, JobLogger} +import org.sunbird.analytics.archival.Request + +case class ArchivalRequest(request_id: String, batch_id: String, collection_id: String, resource_type: Option[String], job_id: String, + var archival_date: Option[Long],var completion_date: Option[Long],var archival_status: String,var deletion_status: String, + blob_url: Option[List[String]],var iteration: Option[Int], request_data: Option[String],var err_message: Option[String]) + +trait ArchivalMetaDataStoreJob { + + implicit val className: String = getClassName; + val connProperties: Properties = CommonUtil.getPostgresConnectionProps() + val db: String = AppConf.getConfig("postgres.db") + val url: String = AppConf.getConfig("postgres.url") + s"$db" + val requestsTable: String = AppConf.getConfig("postgres.table.archival_request") + val dbc: Connection = DriverManager.getConnection(url, connProperties.getProperty("user"), connProperties.getProperty("password")); + dbc.setAutoCommit(true); + + def getClassName(): String; + + def cleanUp() { + dbc.close(); + } + + def getRequests(jobId: String, batchId: Option[String])(implicit spark: SparkSession, fc: FrameworkContext): Array[ArchivalRequest] = { + println("jobid: " + jobId + " batchid: " + batchId) + val encoder = Encoders.product[ArchivalRequest] + val archivalConfigsDf = spark.read.jdbc(url, requestsTable, connProperties) + .where(col("job_id") === jobId && col("iteration") < 3) + println("archivalConfigDF:") + archivalConfigsDf.show(false) + + val filteredReportConfigDf = if (batchId.isDefined) { + val filteredArchivalConfig = archivalConfigsDf.filter(col("batch_id").equalTo(batchId.get)) + if (filteredArchivalConfig.count() > 0) filteredArchivalConfig else archivalConfigsDf + } else archivalConfigsDf + 
println("filteredtReportCOnfig: ") + filteredReportConfigDf.show(false) + JobLogger.log("fetched records count" + filteredReportConfigDf.count(), None, INFO) + val requests = filteredReportConfigDf.as[ArchivalRequest](encoder).collect() + requests + } + + def markArchivalRequestAsFailed(request: ArchivalRequest, failedMsg: String): ArchivalRequest = { + request.archival_status = "FAILED"; + request.archival_date = Option(System.currentTimeMillis()); + request.iteration = Option(request.iteration.getOrElse(0) + 1); + request.err_message = Option(failedMsg); + request + } + + def markDeletionRequestAsFailed(request: ArchivalRequest, failedMsg: String): ArchivalRequest = { + request.deletion_status = "FAILED"; + request.archival_date = Option(System.currentTimeMillis()); + request.iteration = Option(request.iteration.getOrElse(0) + 1); + request.err_message = Option(failedMsg); + request + } + + def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): Boolean = { + val insertQry = s"INSERT INTO $requestsTable (request_id, batch_id, collection_id, resource_type, job_id, archival_date, completion_date, archival_status, " + + s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)" + val updateQry = s"UPDATE $requestsTable SET iteration = ?, archival_status=?, blob_url=?, archival_date=?, completion_date=?, " + + s"err_message=?, request_data=? request_id=?"; + val pstmt: PreparedStatement = dbc.prepareStatement(updateQry); + pstmt.setString(1, request.request_id); + pstmt.setString(2, requestConfig.batchId.getOrElse("")); + pstmt.setString(3, requestConfig.collectionId.getOrElse("")); + pstmt.setString(4, request.resource_type.getOrElse("assessment")); + pstmt.setString(5, request.job_id); + pstmt.setTimestamp(6, if (request.archival_date.isDefined) new Timestamp(request.archival_date.get) else null); + pstmt.setTimestamp(7, if (request.completion_date.isDefined) new Timestamp(request.completion_date.get) else null); + pstmt.setString(8, request.archival_status); + pstmt.setString(9, request.deletion_status); + val blobURLs = request.blob_url.getOrElse(List()).toArray.asInstanceOf[Array[Object]]; + pstmt.setArray(10, dbc.createArrayOf("text", blobURLs)) + pstmt.setInt(11, request.iteration.getOrElse(0)) + pstmt.setString(12, request.request_data.getOrElse("[]")) + pstmt.setString(13, StringUtils.abbreviate(request.err_message.getOrElse(""), 300)); + + pstmt.execute() + } + +} diff --git a/data-products/src/test/resources/application.conf b/data-products/src/test/resources/application.conf index 8f53f38fb..43a6d29a0 100644 --- a/data-products/src/test/resources/application.conf +++ b/data-products/src/test/resources/application.conf @@ -199,3 +199,6 @@ uci.conversation.postgres.pass="postgres" uci.exhaust.store.prefix="src/test/resources/exhaust-reports/" uci.encryption.secret="123443957398423479784298247982789428fldkssd" // END OF UCI Related Job Configs + +//START of Archival Config +postgres.table.archival_request="archival_metadata" \ No newline at end of file diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala new file mode 100644 index 000000000..8c18e5a0a --- /dev/null +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -0,0 +1,51 @@ +package org.sunbird.analytics.archival + +import org.apache.spark.sql.SparkSession +import 
org.ekstep.analytics.framework.{FrameworkContext, JobConfig} +import org.ekstep.analytics.framework.util.JSONUtils +import org.scalamock.scalatest.MockFactory +import org.sunbird.analytics.exhaust.BaseReportsJob +import org.sunbird.analytics.job.report.BaseReportSpec +import org.sunbird.analytics.util.{EmbeddedCassandra, EmbeddedPostgresql} + +class TestAsssessmentArchivalJob extends BaseReportSpec with MockFactory with BaseReportsJob { + + implicit var spark: SparkSession = _ + + override def beforeAll(): Unit = { + spark = getSparkSession(); + super.beforeAll() + EmbeddedCassandra.loadData("src/test/resources/exhaust/report_data.cql") // Load test data in embedded cassandra server + EmbeddedPostgresql.start() + EmbeddedPostgresql.createArchivalRequestTable() + } + + override def afterAll() : Unit = { + super.afterAll() + EmbeddedCassandra.close() + EmbeddedPostgresql.close() + spark.close() + } + + "AssessmentArchivalJob" should "archive the batch which is not archived in past" in { + implicit val fc = new FrameworkContext() + + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-001","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig= JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute() + } + + it should "archive the batch which is archived in past" in { + implicit val fc = new FrameworkContext() + + EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('do_1130928636168192001667_batch-001', 'batch-001', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2020-10-19 05:58:18.666','2020-10-19 05:58:18.666','SUCCESS', '{\"reports/assessment-archival/batch-001/20-2019.csv.gz\"}', 1,'{\"batchId\": \"batch-001\"}', NULL);") + + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-001","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig= JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute() + } + +} diff --git a/data-products/src/test/scala/org/sunbird/analytics/util/EmbeddedPostgres.scala b/data-products/src/test/scala/org/sunbird/analytics/util/EmbeddedPostgres.scala index 3cb39ca9b..516591e3a 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/util/EmbeddedPostgres.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/util/EmbeddedPostgres.scala @@ -69,6 +69,28 @@ object EmbeddedPostgresql { execute(query) } + def createArchivalRequestTable(): Unit = { 
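+    // Mirrors the archival request metadata table read by ArchivalMetaDataStoreJob (configured via postgres.table.archival_request),
+    // so the tests can exercise request tracking against embedded Postgres.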
+ val tableName: String = "archival_metadata" + val query = + s""" + |CREATE TABLE IF NOT EXISTS $tableName ( + | request_id TEXT, + | batch_id TEXT, + | collection_id TEXT, + | resource_type TEXT, + | job_id TEXT, + | archival_date TIMESTAMP, + | completion_date TIMESTAMP, + | archival_status TEXT, + | deletion_status TEXT, + | blob_url TEXT[], + | iteration int, + | request_data json, + | err_message TEXT + |) + """.stripMargin + execute(query) + } def createConversationTable(): Unit = { val tableName: String = "bot" From d9c6e57cf689b4b182ea34943593ed54bb70f1c5 Mon Sep 17 00:00:00 2001 From: utk14 Date: Mon, 15 Nov 2021 13:22:21 +0530 Subject: [PATCH 03/32] Issue SB-24793 feat: Assessment archived data:: Base Archival job Implementation --- .../analytics/archival/BaseArchivalJob.scala | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 83ef88583..f66e11496 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -1,5 +1,7 @@ package org.sunbird.analytics.archival +import java.util.concurrent.atomic.AtomicInteger + import com.datastax.spark.connector.cql.CassandraConnectorConf import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SparkSession} @@ -8,18 +10,22 @@ import org.apache.spark.sql.types.StructType import org.ekstep.analytics.framework.Level.ERROR import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} -import org.ekstep.analytics.framework.{FrameworkContext, IJob, JobConfig} +import org.ekstep.analytics.framework.{FrameworkContext, IJob, JobConfig, Level} import org.sunbird.analytics.exhaust.BaseReportsJob +import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.apache.spark.sql.functions._ import org.joda.time.DateTime import org.sunbird.analytics.archival.util.ArchivalMetaDataStoreJob +case class Period(year: Int, weekOfYear: Int) + +case class BatchPartition(batchId: String, period: Period) case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) -case class Period(year: Int, weekOfYear: Int) trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { private val partitionCols = List("batch_id", "year", "week_of_year") + private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", "created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") val cassandraUrl = "org.apache.spark.sql.cassandra" def main(config: String)(implicit sc: Option[SparkContext] = None, fc: Option[FrameworkContext] = None): Unit = { @@ -89,9 +95,19 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor .withColumn("question", to_json(col("question"))) groupedDF.show(false) val archiveBatchList = groupedDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - println("archiveBatchList: " + archiveBatchList.toString) - -// val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => 
BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + println("archiveBatchList: " + archiveBatchList.head) + + val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + + val archivalStatus = batchesToArchive.flatMap(batches => { + val processingBatch = new AtomicInteger(batches._2.length) + // JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> batches._2.length))) + // Loop through the week_num & year batch partition + val res = for (batch <- batches._2.asInstanceOf[Array[BatchPartition]]) yield { + val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) + upload(filteredDF, batch) // Upload the archived files into blob store + } + }) } } catch { case ex: Exception => @@ -126,6 +142,17 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor Period(0, 0) } } + def upload(archivedData: DataFrame, + batch: BatchPartition)(implicit jobConfig: JobConfig): List[String] = { + val modelParams = jobConfig.modelParams.get + val reportPath: String = modelParams.getOrElse("reportPath", "archived-data/").asInstanceOf[String] + val container = AppConf.getConfig("cloud.container.reports") + val objectKey = AppConf.getConfig("course.metrics.cloud.objectKey") + val fileName = s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" + val storageConfig = getStorageConfig(jobConfig, objectKey) + JobLogger.log(s"Uploading reports to blob storage", None, Level.INFO) + archivedData.saveToBlobStore(storageConfig, "csv", s"$reportPath$fileName-${System.currentTimeMillis()}", Option(Map("header" -> "true", "codec" -> "org.apache.hadoop.io.compress.GzipCodec")), None, Some("csv.gz")) + } def jobId: String; def jobName: String; From a0cdd695e79d418265445770764b18b0be894864 Mon Sep 17 00:00:00 2001 From: utk14 Date: Tue, 7 Dec 2021 12:04:42 +0530 Subject: [PATCH 04/32] Issue SB-24793 feat: Assessment archived data implemetation --- .../archival/AssessmentArchivalJob.scala | 3 +- .../analytics/archival/BaseArchivalJob.scala | 58 +++++++++++++------ .../util/ArchivalMetaDataStoreJob.scala | 4 +- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index a71b5a52e..1c0ffc1cd 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -13,8 +13,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def processArchival(archivalTableData: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { println("Process Archival") - - spark.emptyDataFrame + generatePeriodInData(data = archivalTableData) } } diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index f66e11496..0e29c6fc3 100644 
--- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -15,12 +15,19 @@ import org.sunbird.analytics.exhaust.BaseReportsJob import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.apache.spark.sql.functions._ import org.joda.time.DateTime -import org.sunbird.analytics.archival.util.ArchivalMetaDataStoreJob +import org.sunbird.analytics.archival.util.{ArchivalMetaDataStoreJob, ArchivalRequest} case class Period(year: Int, weekOfYear: Int) case class BatchPartition(batchId: String, period: Period) case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) +case class ArchivalMetrics(batchId: Option[String], + period: Period, + totalArchivedRecords: Option[Long], + pendingWeeksOfYears: Option[Long], + totalDeletedRecords: Option[Long], + totalDistinctBatches: Long + ) trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { @@ -89,31 +96,48 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor println("requestLength: " + requests.length) try { if(requests.length == 0) { - val groupedDF = data.withColumn("updated_on", to_timestamp(col("updated_on"))) - .withColumn("year", year(col("updated_on"))) - .withColumn("week_of_year", weekofyear(col("updated_on"))) - .withColumn("question", to_json(col("question"))) - groupedDF.show(false) - val archiveBatchList = groupedDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + val dataDF = processArchival(data, archivalRequest) + dataDF.show(false) + val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() println("archiveBatchList: " + archiveBatchList.head) val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) - val archivalStatus = batchesToArchive.flatMap(batches => { - val processingBatch = new AtomicInteger(batches._2.length) - // JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> batches._2.length))) - // Loop through the week_num & year batch partition - val res = for (batch <- batches._2.asInstanceOf[Array[BatchPartition]]) yield { - val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) - upload(filteredDF, batch) // Upload the archived files into blob store - } - }) + val archivalStatus = archiveBatches(batchesToArchive, data, requests.head, archivalRequest) + } else { + for (request <- requests) yield { + if (request.archival_date) + } } } catch { case ex: Exception => ex.printStackTrace() } - processArchival(data, archivalRequest) + } + + def generatePeriodInData(data: DataFrame): DataFrame = { + data.withColumn("updated_on", to_timestamp(col("updated_on"))) + .withColumn("year", year(col("updated_on"))) + .withColumn("week_of_year", weekofyear(col("updated_on"))) + .withColumn("question", to_json(col("question"))) + } + + def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, 
archivalRequest: ArchivalRequest, request: Request)(implicit config: JobConfig): Unit = { + batchesToArchive.flatMap(batches => { + val processingBatch = new AtomicInteger(batches._2.length) + JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) + // Loop through the week_num & year batch partition + val res = for (batch <- batches._2.asInstanceOf[Array[BatchPartition]]) yield { + val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) + val urls = upload(filteredDF, batch) // Upload the archived files into blob store + //TO-DO: archival Request + markRequestAsSuccess(archivalRequest, request) + JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) + + } + JobLogger.log(s"${batches._1} is successfully archived", Some(Map("batch_id" -> batches._1)), Level.INFO) + res + }).toArray } def deleteArchivedData(data: DataFrame, archivalRequest: Request): Unit = { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index fa88cc9b0..b31195285 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -70,9 +70,7 @@ trait ArchivalMetaDataStoreJob { def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): Boolean = { val insertQry = s"INSERT INTO $requestsTable (request_id, batch_id, collection_id, resource_type, job_id, archival_date, completion_date, archival_status, " + s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)" - val updateQry = s"UPDATE $requestsTable SET iteration = ?, archival_status=?, blob_url=?, archival_date=?, completion_date=?, " + - s"err_message=?, request_data=? 
request_id=?"; - val pstmt: PreparedStatement = dbc.prepareStatement(updateQry); + val pstmt: PreparedStatement = dbc.prepareStatement(insertQry); pstmt.setString(1, request.request_id); pstmt.setString(2, requestConfig.batchId.getOrElse("")); pstmt.setString(3, requestConfig.collectionId.getOrElse("")); From 029125be01a299aaaed5dafe35ebd11a7215d4a1 Mon Sep 17 00:00:00 2001 From: utk14 Date: Tue, 7 Dec 2021 12:15:16 +0530 Subject: [PATCH 05/32] Issue SB-24793 feat: Assessment archived data implemetation --- .../scala/org/sunbird/analytics/archival/BaseArchivalJob.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 0e29c6fc3..a8ed32ff7 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -106,7 +106,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor val archivalStatus = archiveBatches(batchesToArchive, data, requests.head, archivalRequest) } else { for (request <- requests) yield { - if (request.archival_date) + //TO-DO: for each request } } } catch { From 78797d1f71422ca671c1a176a737f5ca99936815 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Wed, 8 Dec 2021 13:59:18 +0530 Subject: [PATCH 06/32] Issue #SB-27408 | Assessment archival to update existing requests --- .../analytics/archival/BaseArchivalJob.scala | 63 +++++++++++-------- .../util/ArchivalMetaDataStoreJob.scala | 39 +++++++++++- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index a8ed32ff7..e707fb661 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -69,46 +69,52 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def execute()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { val modelParams = config.modelParams.getOrElse(Map[String, Option[AnyRef]]()); - val archivalRequest = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) - val archivalTable = archivalRequest.archivalTable - val archivalKeyspace = archivalRequest.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + val requestConfig = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) + val archivalTable = requestConfig.archivalTable + val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val batchId: String = archivalRequest.batchId.getOrElse("") - val date: String = archivalRequest.date.getOrElse("") + val batchId: String = requestConfig.batchId.getOrElse("") + val date: String = requestConfig.date.getOrElse("") val mode: String = modelParams.getOrElse("mode","archive").asInstanceOf[String] println("modelParams: " + modelParams) - println("archival request: " + archivalRequest) + println("archival request: " + requestConfig) val archivalTableData: DataFrame = getArchivalData(archivalTable, 
archivalKeyspace,Option(batchId),Option(date)) println("archivalTableData ") archivalTableData.show(false) mode.toLowerCase() match { case "archival" => - archiveData(archivalTableData, archivalRequest) + archiveData(archivalTableData, requestConfig) case "delete" => - deleteArchivedData(archivalTableData,archivalRequest) + deleteArchivedData(archivalTableData,requestConfig) } } - def archiveData(data: DataFrame, archivalRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { - val requests = getRequests(jobId, archivalRequest.batchId) + def archiveData(data: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + val requests = getRequests(jobId, requestConfig.batchId) println("requestLength: " + requests.length) try { - if(requests.length == 0) { - val dataDF = processArchival(data, archivalRequest) - dataDF.show(false) - val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - println("archiveBatchList: " + archiveBatchList.head) - - val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) - - val archivalStatus = archiveBatches(batchesToArchive, data, requests.head, archivalRequest) - } else { - for (request <- requests) yield { - //TO-DO: for each request + var dataDF = processArchival(data, requestConfig) + if(requests.length > 0) { + for (request <- requests) { + // TODO: for each request + if (request.archival_status.equals("SUCCESS")) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data.get) + dataDF = dataDF.filter( + col("week_of_year").notEqual(request_data.get("week").get) && + col("year").notEqual(request_data.get("year").get) + ) + } } } + + val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + println("archiveBatchList: " + archiveBatchList.head) + + val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + + val archivalStatus = archiveBatches(batchesToArchive, dataDF, requestConfig) } catch { case ex: Exception => ex.printStackTrace() @@ -122,7 +128,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor .withColumn("question", to_json(col("question"))) } - def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, archivalRequest: ArchivalRequest, request: Request)(implicit config: JobConfig): Unit = { + def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit = { batchesToArchive.flatMap(batches => { val processingBatch = new AtomicInteger(batches._2.length) JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) @@ -130,8 +136,11 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor val res = for (batch <- batches._2.asInstanceOf[Array[BatchPartition]]) yield { val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) 
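+          // filteredDF holds one batch's records for a single (year, week_of_year) partition; each partition is archived and tracked as a separate request.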
val urls = upload(filteredDF, batch) // Upload the archived files into blob store - //TO-DO: archival Request - markRequestAsSuccess(archivalRequest, request) + + val collectionId = filteredDF.first().getAs[String]("course_id") + val archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) + //TO-DO: archival Request + markRequestAsSuccess(archivalRequest, requestConfig) JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) } @@ -144,7 +153,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor } - def processArchival(archivalTableData: DataFrame, archivalRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; + def processArchival(archivalTableData: DataFrame, archiveRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; def getArchivalData(table: String, keyspace: String, batchId: Option[String], date: Option[String])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame = { val archivalTableSettings = Map("table" -> table, "keyspace" -> keyspace, "cluster" -> "LMSCluster") @@ -175,7 +184,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor val fileName = s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" val storageConfig = getStorageConfig(jobConfig, objectKey) JobLogger.log(s"Uploading reports to blob storage", None, Level.INFO) - archivedData.saveToBlobStore(storageConfig, "csv", s"$reportPath$fileName-${System.currentTimeMillis()}", Option(Map("header" -> "true", "codec" -> "org.apache.hadoop.io.compress.GzipCodec")), None, Some("csv.gz")) + archivedData.saveToBlobStore(storageConfig, "csv", s"$reportPath$fileName-${System.currentTimeMillis()}", Option(Map("header" -> "true", "codec" -> "org.apache.hadoop.io.compress.GzipCodec")), None, fileExt=Some("csv.gz")) } def jobId: String; diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index b31195285..97558655b 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -1,15 +1,16 @@ package org.sunbird.analytics.archival.util -import java.sql.{Connection, DriverManager, PreparedStatement, Timestamp} +import java.security.MessageDigest +import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet, Timestamp} import java.util.Properties - import org.apache.commons.lang.StringUtils import org.apache.spark.sql.{Encoders, SparkSession} import org.apache.spark.sql.functions.{col, lit} import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} import org.ekstep.analytics.framework.Level.INFO import org.ekstep.analytics.framework.conf.AppConf -import org.ekstep.analytics.framework.util.{CommonUtil, JobLogger} +import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} +import org.ekstep.analytics.job.batch.VideoStreamingJob.JobRequest import org.sunbird.analytics.archival.Request case class ArchivalRequest(request_id: String, batch_id: String, collection_id: String, resource_type: Option[String], job_id: String, @@ -51,6 +52,38 @@ trait ArchivalMetaDataStoreJob { requests } + def getRequestID(collectionId: String, batchId: String, year: 
Int, week: Int): String = { + val requestComb = s"$collectionId:$batchId:$year:$week" + MessageDigest.getInstance("MD5").digest(requestComb.getBytes).map("%02X".format(_)).mkString + } + + def getRequest(collectionId: String, batchId: String, year: Int, week: Int): ArchivalRequest = { + val requestId = getRequestID(collectionId, batchId, year, week) + val archivalRequest = s"""select * from $requestsTable where request_id = $requestId""" + val pstmt: PreparedStatement = dbc.prepareStatement(archivalRequest); + val resultSet = pstmt.executeQuery() + + getArchivalRequest(resultSet) + } + + private def getArchivalRequest(resultSet: ResultSet): ArchivalRequest = { + ArchivalRequest( + resultSet.getString("request_id"), + resultSet.getString("batch_id"), + resultSet.getString("collection_id"), + Some(resultSet.getString("resource_type")), + resultSet.getString("job_id"), + Some(resultSet.getLong("archival_date")), + Some(resultSet.getLong("completion_date")), + resultSet.getString("archival_status"), + resultSet.getString("deletion_status"), + Some(resultSet.getArray("blob_url").asInstanceOf[List[String]]), + Some(resultSet.getInt("iteration")), + Some(resultSet.getString("request_data")), + Some(resultSet.getString("err_message")) + ) + } + def markArchivalRequestAsFailed(request: ArchivalRequest, failedMsg: String): ArchivalRequest = { request.archival_status = "FAILED"; request.archival_date = Option(System.currentTimeMillis()); From 862e306f104b0b4ecf7548f331d2e1412200b185 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Wed, 8 Dec 2021 19:20:18 +0530 Subject: [PATCH 07/32] Issue #SB-27408 | Assessment archival to create and update requests --- .../analytics/archival/BaseArchivalJob.scala | 43 ++++++++++----- .../util/ArchivalMetaDataStoreJob.scala | 54 +++++++++++++++---- .../archival/TestAsssessmentArchivalJob.scala | 6 +-- 3 files changed, 75 insertions(+), 28 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index e707fb661..797e160f7 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -100,7 +100,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor for (request <- requests) { // TODO: for each request if (request.archival_status.equals("SUCCESS")) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data.get) + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) dataDF = dataDF.filter( col("week_of_year").notEqual(request_data.get("week").get) && col("year").notEqual(request_data.get("year").get) @@ -110,11 +110,10 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor } val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - println("archiveBatchList: " + archiveBatchList.head) val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) - val archivalStatus = archiveBatches(batchesToArchive, dataDF, requestConfig) + archiveBatches(batchesToArchive, dataDF, requestConfig) } catch { case ex: Exception => ex.printStackTrace() @@ -129,24 +128,40 @@ trait BaseArchivalJob extends BaseReportsJob with 
IJob with ArchivalMetaDataStor } def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit = { - batchesToArchive.flatMap(batches => { + batchesToArchive.foreach(batches => { val processingBatch = new AtomicInteger(batches._2.length) JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) + // Loop through the week_num & year batch partition - val res = for (batch <- batches._2.asInstanceOf[Array[BatchPartition]]) yield { + batches._2.map((batch: BatchPartition) => { val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) - val urls = upload(filteredDF, batch) // Upload the archived files into blob store - val collectionId = filteredDF.first().getAs[String]("course_id") - val archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) - //TO-DO: archival Request - markRequestAsSuccess(archivalRequest, requestConfig) - JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) + var archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) + + if (archivalRequest != null) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(Request)) ++ Map[String, Int]( + "week" -> batch.period.weekOfYear, + "year"-> batch.period.year + ) + archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Some(getReportKey), jobId, null, null, null, null, null, Some(0), JSONUtils.serialize(request_data), null) + } + + try { + val urls = upload(filteredDF, batch) // Upload the archived files into blob store + archivalRequest.blob_url = Some(urls) + JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) + markRequestAsSuccess(archivalRequest, requestConfig) + } catch { + case ex: Exception => { + markArchivalRequestAsFailed(archivalRequest, ex.getLocalizedMessage) + } + } + }).foreach((archivalRequest: ArchivalRequest) => { + upsertRequest(archivalRequest) + }) - } JobLogger.log(s"${batches._1} is successfully archived", Some(Map("batch_id" -> batches._1)), Level.INFO) - res - }).toArray + }) } def deleteArchivedData(data: DataFrame, archivalRequest: Request): Unit = { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index 97558655b..aefe5486b 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -15,7 +15,7 @@ import org.sunbird.analytics.archival.Request case class ArchivalRequest(request_id: String, batch_id: String, collection_id: String, resource_type: Option[String], job_id: String, var archival_date: Option[Long],var completion_date: Option[Long],var archival_status: String,var deletion_status: String, - blob_url: Option[List[String]],var iteration: Option[Int], request_data: Option[String],var err_message: Option[String]) + var blob_url: Option[List[String]],var iteration: Option[Int], request_data: String, var err_message: Option[String]) trait ArchivalMetaDataStoreJob { @@ 
-59,11 +59,11 @@ trait ArchivalMetaDataStoreJob { def getRequest(collectionId: String, batchId: String, year: Int, week: Int): ArchivalRequest = { val requestId = getRequestID(collectionId, batchId, year, week) - val archivalRequest = s"""select * from $requestsTable where request_id = $requestId""" + val archivalRequest = s"""select * from $requestsTable where request_id = '$requestId' limit 1""" val pstmt: PreparedStatement = dbc.prepareStatement(archivalRequest); val resultSet = pstmt.executeQuery() - getArchivalRequest(resultSet) + if (resultSet.next()) getArchivalRequest(resultSet) else null } private def getArchivalRequest(resultSet: ResultSet): ArchivalRequest = { @@ -73,13 +73,13 @@ trait ArchivalMetaDataStoreJob { resultSet.getString("collection_id"), Some(resultSet.getString("resource_type")), resultSet.getString("job_id"), - Some(resultSet.getLong("archival_date")), - Some(resultSet.getLong("completion_date")), + Some(resultSet.getTimestamp("archival_date").getTime), + if (resultSet.getTimestamp("completion_date") != null) Some(resultSet.getTimestamp("completion_date").getTime) else null, resultSet.getString("archival_status"), resultSet.getString("deletion_status"), Some(resultSet.getArray("blob_url").asInstanceOf[List[String]]), Some(resultSet.getInt("iteration")), - Some(resultSet.getString("request_data")), + resultSet.getString("request_data"), Some(resultSet.getString("err_message")) ) } @@ -100,13 +100,15 @@ trait ArchivalMetaDataStoreJob { request } - def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): Boolean = { + def createRequest(request: ArchivalRequest) = { val insertQry = s"INSERT INTO $requestsTable (request_id, batch_id, collection_id, resource_type, job_id, archival_date, completion_date, archival_status, " + s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)" val pstmt: PreparedStatement = dbc.prepareStatement(insertQry); - pstmt.setString(1, request.request_id); - pstmt.setString(2, requestConfig.batchId.getOrElse("")); - pstmt.setString(3, requestConfig.collectionId.getOrElse("")); + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) + val requestId = getRequestID(request.collection_id, request.batch_id, request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int]) + pstmt.setString(1, requestId); + pstmt.setString(2, request.batch_id); + pstmt.setString(3, request.collection_id); pstmt.setString(4, request.resource_type.getOrElse("assessment")); pstmt.setString(5, request.job_id); pstmt.setTimestamp(6, if (request.archival_date.isDefined) new Timestamp(request.archival_date.get) else null); @@ -116,10 +118,40 @@ trait ArchivalMetaDataStoreJob { val blobURLs = request.blob_url.getOrElse(List()).toArray.asInstanceOf[Array[Object]]; pstmt.setArray(10, dbc.createArrayOf("text", blobURLs)) pstmt.setInt(11, request.iteration.getOrElse(0)) - pstmt.setString(12, request.request_data.getOrElse("[]")) + pstmt.setString(12, request.request_data) pstmt.setString(13, StringUtils.abbreviate(request.err_message.getOrElse(""), 300)); pstmt.execute() } + def upsertRequest(request: ArchivalRequest): Unit = { + if (request.request_id.isEmpty) { + createRequest(request) + } else { + updateRequest(request) + } + } + + def updateRequest(request: ArchivalRequest): Unit = { + val updateQry = s"UPDATE $requestsTable SET blob_url=?, iteration = ?, archival_date=?, completion_date=?, " + + s"archival_status=?, deletion_status=? 
WHERE request_id=?"; + val pstmt: PreparedStatement = dbc.prepareStatement(updateQry) + + val blobURLs = request.blob_url.getOrElse(List()).toArray.asInstanceOf[Array[Object]]; + pstmt.setArray(1, dbc.createArrayOf("text", blobURLs)) + pstmt.setInt(2, request.iteration.get); + pstmt.setTimestamp(3, if (request.archival_date.isDefined) new Timestamp(request.archival_date.get) else null); + pstmt.setTimestamp(4, if (request.completion_date.isDefined) new Timestamp(request.completion_date.get) else null); + pstmt.setString(5, request.archival_status); + pstmt.setString(6, request.deletion_status); + pstmt.setString(7, request.request_id); + + } + + def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): ArchivalRequest = { + request.archival_status = "SUCCESS"; + request.archival_date = Option(System.currentTimeMillis()) + request + } + } diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 8c18e5a0a..fe743c2f3 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -6,9 +6,9 @@ import org.ekstep.analytics.framework.util.JSONUtils import org.scalamock.scalatest.MockFactory import org.sunbird.analytics.exhaust.BaseReportsJob import org.sunbird.analytics.job.report.BaseReportSpec -import org.sunbird.analytics.util.{EmbeddedCassandra, EmbeddedPostgresql} +import org.sunbird.analytics.util.{BaseSpec, EmbeddedCassandra, EmbeddedPostgresql} -class TestAsssessmentArchivalJob extends BaseReportSpec with MockFactory with BaseReportsJob { +class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseReportsJob { implicit var spark: SparkSession = _ @@ -40,7 +40,7 @@ class TestAsssessmentArchivalJob extends BaseReportSpec with MockFactory with Ba implicit val fc = new FrameworkContext() EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") - EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('do_1130928636168192001667_batch-001', 'batch-001', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2020-10-19 05:58:18.666','2020-10-19 05:58:18.666','SUCCESS', '{\"reports/assessment-archival/batch-001/20-2019.csv.gz\"}', 1,'{\"batchId\": \"batch-001\"}', NULL);") + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('898DF47D8C9B72454C72D2C574DB2A38', 'batch-001', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2020-10-19 05:58:18.666','2020-10-19 05:58:18.666','FAILED', '{\"reports/assessment-archival/batch-001/20-2019.csv.gz\"}', 1,'{\"batchId\": \"batch-001\", \"week\": 43, \"year\": 2021}', NULL);") val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-001","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ 
core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig= JSONUtils.deserialize[JobConfig](strConfig) From 9f1567e7159bd372c838f06b04088f1044e8b665 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Thu, 9 Dec 2021 20:27:13 +0530 Subject: [PATCH 08/32] Issue #SB-27408 | Assessment archival test case and fixes added --- .../analytics/archival/BaseArchivalJob.scala | 21 ++- .../util/ArchivalMetaDataStoreJob.scala | 16 +-- .../resources/assessment-archival/data.cql | 12 ++ .../archival/TestAsssessmentArchivalJob.scala | 136 ++++++++++++++++-- 4 files changed, 156 insertions(+), 29 deletions(-) create mode 100644 data-products/src/test/resources/assessment-archival/data.cql diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 797e160f7..c4f69811f 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -98,12 +98,11 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor var dataDF = processArchival(data, requestConfig) if(requests.length > 0) { for (request <- requests) { - // TODO: for each request if (request.archival_status.equals("SUCCESS")) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) dataDF = dataDF.filter( - col("week_of_year").notEqual(request_data.get("week").get) && - col("year").notEqual(request_data.get("year").get) + col("batch_id").equalTo(request.batch_id) && + concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) ) } } @@ -138,17 +137,17 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor val collectionId = filteredDF.first().getAs[String]("course_id") var archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) - if (archivalRequest != null) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(Request)) ++ Map[String, Int]( + if (archivalRequest == null) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( "week" -> batch.period.weekOfYear, "year"-> batch.period.year ) - archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Some(getReportKey), jobId, null, null, null, null, null, Some(0), JSONUtils.serialize(request_data), null) + archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Option(getReportKey), jobId, None, None, null, null, None, Option(0), JSONUtils.serialize(request_data), None) } try { val urls = upload(filteredDF, batch) // Upload the archived files into blob store - archivalRequest.blob_url = Some(urls) + archivalRequest.blob_url = Option(urls) JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) markRequestAsSuccess(archivalRequest, requestConfig) } catch { @@ -190,10 +189,10 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor Period(0, 0) } } - def upload(archivedData: DataFrame, - batch: BatchPartition)(implicit jobConfig: JobConfig): List[String] = { - val modelParams = jobConfig.modelParams.get - val reportPath: String = modelParams.getOrElse("reportPath", 
"archived-data/").asInstanceOf[String] + + def upload(archivedData: DataFrame, batch: BatchPartition)(implicit jobConfig: JobConfig): List[String] = { + val blobConfig = jobConfig.modelParams.get("blobConfig").asInstanceOf[Map[String, AnyRef]] + val reportPath: String = blobConfig.getOrElse("reportPath", "archived-data/").asInstanceOf[String] val container = AppConf.getConfig("cloud.container.reports") val objectKey = AppConf.getConfig("course.metrics.cloud.objectKey") val fileName = s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index aefe5486b..16e59d3a0 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -10,7 +10,6 @@ import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} import org.ekstep.analytics.framework.Level.INFO import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} -import org.ekstep.analytics.job.batch.VideoStreamingJob.JobRequest import org.sunbird.analytics.archival.Request case class ArchivalRequest(request_id: String, batch_id: String, collection_id: String, resource_type: Option[String], job_id: String, @@ -71,16 +70,16 @@ trait ArchivalMetaDataStoreJob { resultSet.getString("request_id"), resultSet.getString("batch_id"), resultSet.getString("collection_id"), - Some(resultSet.getString("resource_type")), + Option(resultSet.getString("resource_type")), resultSet.getString("job_id"), - Some(resultSet.getTimestamp("archival_date").getTime), - if (resultSet.getTimestamp("completion_date") != null) Some(resultSet.getTimestamp("completion_date").getTime) else null, + Option(resultSet.getTimestamp("archival_date").getTime), + if (resultSet.getTimestamp("completion_date") != null) Option(resultSet.getTimestamp("completion_date").getTime) else None, resultSet.getString("archival_status"), resultSet.getString("deletion_status"), - Some(resultSet.getArray("blob_url").asInstanceOf[List[String]]), - Some(resultSet.getInt("iteration")), + if (resultSet.getArray("blob_url") != null) Option(resultSet.getArray("blob_url").getArray().asInstanceOf[Array[String]].toList) else None, + Option(resultSet.getInt("iteration")), resultSet.getString("request_data"), - Some(resultSet.getString("err_message")) + Option(resultSet.getString("err_message")) ) } @@ -102,7 +101,7 @@ trait ArchivalMetaDataStoreJob { def createRequest(request: ArchivalRequest) = { val insertQry = s"INSERT INTO $requestsTable (request_id, batch_id, collection_id, resource_type, job_id, archival_date, completion_date, archival_status, " + - s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)" + s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?::json,?)" val pstmt: PreparedStatement = dbc.prepareStatement(insertQry); val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) val requestId = getRequestID(request.collection_id, request.batch_id, request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int]) @@ -146,6 +145,7 @@ trait ArchivalMetaDataStoreJob { pstmt.setString(6, request.deletion_status); pstmt.setString(7, 
request.request_id); + pstmt.execute() } def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): ArchivalRequest = { diff --git a/data-products/src/test/resources/assessment-archival/data.cql b/data-products/src/test/resources/assessment-archival/data.cql new file mode 100644 index 000000000..a99dd6e8f --- /dev/null +++ b/data-products/src/test/resources/assessment-archival/data.cql @@ -0,0 +1,12 @@ +-- Week 48 +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-001', 'do_1128870328040161281204', 'attempt-001', '10', 10, 10, 1638357693000, 1638357693000); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-003', 'do_112876961957437440179', 'attempt-001', '10', 10, 10, 1638357693000, 1638357693000); + + +-- Week 49 -- +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, question, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-001', 'do_1128870328040161281204', 'attempt-002', '20', 20, 20, [{id: 'do_213019475454476288155', assess_ts: '2020-06-18T18:15:56.490+0000', max_score: 1, score: 1, type: 'mcq', title: 'testQuestiontextandformula', resvalues: [{'1': '{"text":"A=\\\\pi r^2\n"}'}], params: [{'1': '{"text":"A=\\\\pi r^2\n"}'}, {'2': '{"text":"no\n"}'}, {'answer': '{"correct":["1"]}'}], description: 'testQuestiontextandformula', duration: 1.0}, {id: 'do_213019970118279168165', assess_ts: '2020-06-18T18:15:56.490+0000', max_score: 1, score: 1, type: 'mcq', title: 'test with formula', resvalues: [{'1': '{"text":"1\nA=\\\\pi r^2A=\\\\pi r^2\n"}'}], params: [{'1': '{"text":"1\nA=\\\\pi r^2A=\\\\pi r^2\n"}'}, {'2': '{"text":"2\n"}'}, {'answer': '{"correct":["1"]}'}], description: '', duration: 1.0}, {id: 'do_213019972814823424168', assess_ts: '2020-06-18T18:15:56.490+0000', max_score: 1, score: 0.33, type: 'mtf', title: 'Copy of - Match the following:\n\nx=\\frac{-b\\pm\\sqrt{b^2-4ac}}{2a}\nArrange the following equations in correct order.\n', resvalues: [{'lhs': '[{"1":"{\"text\":\"A=\\\\\\\\pi r^2\\n\"}"},{"2":"{\"text\":\"\\\\\\\\frac{4}{3}\\\\\\\\pi r^3\\n\"}"},{"3":"{\"text\":\"a^n\\\\\\\\times a^m=a^{n+m}\\n\"}"}]'}, {'rhs': '[{"1":"{\"text\":\"Volume of sphere\\n\"}"},{"2":"{\"text\":\"Area of Circle\\n\"}"},{"3":"{\"text\":\"Product Rule\\n\"}"}]'}], params: [{'lhs': '[{"1":"{\"text\":\"A=\\\\\\\\pi r^2\\n\"}"},{"2":"{\"text\":\"\\\\\\\\frac{4}{3}\\\\\\\\pi r^3\\n\"}"},{"3":"{\"text\":\"a^n\\\\\\\\times a^m=a^{n+m}\\n\"}"}]'}, {'rhs': '[{"1":"{\"text\":\"Volume of sphere\\n\"}"},{"2":"{\"text\":\"Product Rule\\n\"}"},{"3":"{\"text\":\"Area of Circle\\n\"}"}]'}, {'answer': '{"lhs":["1","2","3"],"rhs":["3","1","2"]}'}], description: '', duration: 2.0}, {id: 'do_2130256513760624641171', assess_ts: '2020-06-18T18:15:56.490+0000', max_score: 10, score: 10, type: 'mcq', title: '2 +2 is..? 
mark ia 10\n', resvalues: [{'1': '{"text":"4\n"}'}], params: [{'1': '{"text":"4\n"}'}, {'2': '{"text":"3\n"}'}, {'3': '{"text":"8\n"}'}, {'4': '{"text":"10\n"}'}, {'answer': '{"correct":["1"]}'}], description: '', duration: 12.0}], 1639052254823); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-002', 'do_1128870328040161281204', 'attempt-002', '10', 10, 10, 1639052254823); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-003', 'do_112876961957437440179', 'attempt-002', '10', 10, 10, 1639052254823); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, updated_on) VALUES ('do_11306040245271756813015', 'batch-021', 'user-008', 'do_112876961957437440179', 'attempt-002', '10', 10, 10, 1639052254823); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, updated_on) VALUES ('do_11306040245271756813015', 'batch-021', 'user-010', 'do_11307593493010022418', 'attempt-002', '15', 15, 15, 1639052254823); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, updated_on) VALUES ('do_112835334818643968148', 'batch-031', 'user-014', 'do_11307593493010022418', 'attempt-002', '15', 15, 15, 1639052254823); diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index fe743c2f3..8edd02417 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -1,25 +1,32 @@ package org.sunbird.analytics.archival import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.col import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} -import org.ekstep.analytics.framework.util.JSONUtils +import org.ekstep.analytics.framework.util.{HadoopFileUtil, JSONUtils} import org.scalamock.scalatest.MockFactory import org.sunbird.analytics.exhaust.BaseReportsJob -import org.sunbird.analytics.job.report.BaseReportSpec import org.sunbird.analytics.util.{BaseSpec, EmbeddedCassandra, EmbeddedPostgresql} class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseReportsJob { + val outputLocation = "src/test/resources/reports/assessment-archived-data" implicit var spark: SparkSession = _ override def beforeAll(): Unit = { spark = getSparkSession(); super.beforeAll() - EmbeddedCassandra.loadData("src/test/resources/exhaust/report_data.cql") // Load test data in embedded cassandra server + EmbeddedCassandra.loadData("src/test/resources/assessment-archival/data.cql") // Load test data in embedded cassandra server EmbeddedPostgresql.start() EmbeddedPostgresql.createArchivalRequestTable() } + override def afterEach(): Unit = { + super.afterEach() + EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") + new HadoopFileUtil().delete(spark.sparkContext.hadoopConfiguration, outputLocation) + } + 
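  // A minimal sketch of how the request_id values asserted in these specs could be recomputed,
  // assuming the getRequestID logic from ArchivalMetaDataStoreJob (an MD5 hex digest over
  // "collectionId:batchId:year:week"); the helper name below is illustrative only.
  def expectedRequestId(collectionId: String, batchId: String, year: Int, week: Int): String = {
    val key = s"$collectionId:$batchId:$year:$week"
    java.security.MessageDigest.getInstance("MD5").digest(key.getBytes).map("%02X".format(_)).mkString
  }
  // e.g. expectedRequestId("do_1130928636168192001667", "batch-011", 2021, 48)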
override def afterAll() : Unit = { super.afterAll() EmbeddedCassandra.close() @@ -29,23 +36,132 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo "AssessmentArchivalJob" should "archive the batch which is not archived in past" in { implicit val fc = new FrameworkContext() + val batchId = "batch-011" - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-001","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" - implicit val jobConfig= JSONUtils.deserialize[JobConfig](strConfig) + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() + + val batch011Results = spark.read.format("csv").option("header", "true") + .load(s"$outputLocation/$batchId/2021*.csv.gz") + + batch011Results.count() should be (5) + + val user1 = batch011Results.filter(col("user_id") === "user-001") + user1.count() should be (2) + + val user1attempt1 = user1.filter(col("attempt_id") === "attempt-001").first + user1attempt1.getAs[String]("course_id") should be ("do_1130928636168192001667") + user1attempt1.getAs[String]("content_id") should be ("do_1128870328040161281204") + user1attempt1.getAs[String]("last_attempted_on") should be ("2021-12-01T16:51:33.000+05:30") + user1attempt1.getAs[String]("grand_total") should be ("10") + user1attempt1.getAs[String]("total_max_score") should be ("10.0") + user1attempt1.getAs[String]("total_score") should be ("10.0") + user1attempt1.getAs[String]("question") should be ("[]") + user1attempt1.getAs[String]("updated_on") should be ("2021-12-01T16:51:33.000+05:30") + + val user1attempt2 = user1.filter(col("attempt_id") === "attempt-002").first + + user1attempt2.getAs[String]("course_id") should be ("do_1130928636168192001667") + user1attempt2.getAs[String]("content_id") should be ("do_1128870328040161281204") + user1attempt2.getAs[String]("last_attempted_on") should be (null) + user1attempt2.getAs[String]("grand_total") should be ("20") + user1attempt2.getAs[String]("total_max_score") should be ("20.0") + user1attempt2.getAs[String]("total_score") should be ("20.0") + val questionsList = JSONUtils.deserialize[List[Map[String, AnyRef]]](user1attempt2.getAs[String]("question")) + questionsList.size should be (4) + + user1attempt2.getAs[String]("updated_on") should be ("2021-12-09T17:47:34.823+05:30") + + val user2Result = batch011Results.filter(col("user_id") === "user-002") + user2Result.count() should be (1) + + val user3Result = batch011Results.filter(col("user_id") === "user-003") + user3Result.count() should be (2) + + val 
archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + archivalRequests.size should be (2) + + archivalRequests.map(ar => ar.request_id).toList should contain allElementsOf List("F08614119F64BC55B14CBE49B10B6730", "949887DE6364A07AE1BB5A04504368F9") + archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011") + archivalRequests.map(ar => ar.collection_id).toList.distinct should contain allElementsOf List("do_1130928636168192001667") + archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") + archivalRequests.map(ar => ar.blob_url.get).toList.head.head should include ("src/test/resources/reports/assessment-archived-data/batch-011/2021") + archivalRequests.map(ar => ar.iteration.get).toList.distinct should contain allElementsOf List(0) + archivalRequests.map(ar => ar.err_message.get).toList.distinct should contain allElementsOf List("") } - it should "archive the batch which is archived in past" in { + it should "archive the batch which is failed to archive in past" in { implicit val fc = new FrameworkContext() + val batchId = "batch-011" - EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") - EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('898DF47D8C9B72454C72D2C574DB2A38', 'batch-001', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2020-10-19 05:58:18.666','2020-10-19 05:58:18.666','FAILED', '{\"reports/assessment-archival/batch-001/20-2019.csv.gz\"}', 1,'{\"batchId\": \"batch-001\", \"week\": 43, \"year\": 2021}', NULL);") + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'FAILED', null, 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-001","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" - implicit val jobConfig= JSONUtils.deserialize[JobConfig](strConfig) + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() + + 
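    // The run above should pick up the week-48 request seeded as FAILED and retry it: the
    // assertions below expect all five batch-011 rows to be archived again and the existing
    // request row to be flipped to SUCCESS with a fresh blob_url.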
val batch011Results = spark.read.format("csv").option("header", "true") + .load(s"$outputLocation/$batchId/2021*.csv.gz") + + batch011Results.count() should be (5) + + val user1 = batch011Results.filter(col("user_id") === "user-001") + user1.count() should be (2) + + val user2Result = batch011Results.filter(col("user_id") === "user-002") + user2Result.count() should be (1) + + val user3Result = batch011Results.filter(col("user_id") === "user-003") + user3Result.count() should be (2) + + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + archivalRequests.size should be (2) + + val failedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, 2021, 48) + + failedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") + failedRequest.archival_status should be ("SUCCESS") + failedRequest.blob_url.get.head should include ("src/test/resources/reports/assessment-archived-data/batch-011/2021") + } + + it should "skip archival for the batch which is archived in past" in { + implicit val fc = new FrameworkContext() + val batchId = "batch-011" + + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") + + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute() + + val batch011Results = spark.read.format("csv").option("header", "true") + .load(s"$outputLocation/$batchId/2021*.csv.gz") + + batch011Results.count() should be (3) + + val user1 = batch011Results.filter(col("user_id") === "user-001") + user1.count() should be (1) + val user1attempt2 = user1.filter(col("attempt_id") === "attempt-002").first + + user1attempt2.getAs[String]("course_id") should be ("do_1130928636168192001667") + user1attempt2.getAs[String]("content_id") should be ("do_1128870328040161281204") + user1attempt2.getAs[String]("last_attempted_on") should be (null) + user1attempt2.getAs[String]("grand_total") should be ("20") + user1attempt2.getAs[String]("total_max_score") should be ("20.0") + user1attempt2.getAs[String]("total_score") should be ("20.0") + + val user2Result = batch011Results.filter(col("user_id") === "user-002") + user2Result.count() should be (1) + + val user3Result = batch011Results.filter(col("user_id") === "user-003") + user3Result.count() should be (1) + + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + archivalRequests.size should be (2) } } From 0de04e7231896f0cd865c60697f8b8f7ee19dbc4 Mon Sep 17 00:00:00 2001 
From: kumarks1122 Date: Fri, 10 Dec 2021 13:00:59 +0530 Subject: [PATCH 09/32] Issue #SB-27408 | Assessment archival Base and sub class changes --- .../archival/AssessmentArchivalJob.scala | 111 +++++++++++++++++- .../analytics/archival/BaseArchivalJob.scala | 107 ++--------------- .../archival/TestAsssessmentArchivalJob.scala | 6 +- 3 files changed, 117 insertions(+), 107 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 1c0ffc1cd..4a5edc367 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -1,19 +1,120 @@ package org.sunbird.analytics.archival +import org.apache.spark.sql.functions.{col, concat, lit, to_json, to_timestamp, weekofyear, year} +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} -import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} +import org.ekstep.analytics.framework.conf.AppConf +import org.ekstep.analytics.framework.util.{JSONUtils, JobLogger} +import org.ekstep.analytics.framework.{FrameworkContext, JobConfig, Level} +import org.sunbird.analytics.archival.util.ArchivalRequest + +import java.util.concurrent.atomic.AtomicInteger object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { + private val partitionCols = List("batch_id", "year", "week_of_year") + private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", "created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") + override def getClassName = "org.sunbird.analytics.archival.AssessmentArchivalJob" - override def jobName() = "AssessmentArchivalJob"; - override def jobId(): String = "assessment-archival"; - override def getReportPath() = "assessment-archival/"; - override def getReportKey() = "assessment"; + override def jobName = "AssessmentArchivalJob"; + override def jobId: String = "assessment-archival"; + override def getReportPath = "assessment-archival/"; + override def getReportKey = "assessment"; override def processArchival(archivalTableData: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { println("Process Archival") generatePeriodInData(data = archivalTableData) } + override def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + val requests = getRequests(jobId, requestConfig.batchId) + + val archivalTable = requestConfig.archivalTable + val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + val batchId: String = requestConfig.batchId.getOrElse("") + val date: String = requestConfig.date.getOrElse("") + + var data = loadData(Map("table" -> archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) + + data = if (batchId.nonEmpty) { + data.filter(col("batch_id") === batchId).persist() + } else { + data + } + + println("requestLength: " + requests.length) + try { + var dataDF = processArchival(data, requestConfig) + if(requests.length > 0) { + for (request <- requests) { + if (request.archival_status.equals("SUCCESS")) { + val request_data = JSONUtils.deserialize[Map[String, 
AnyRef]](request.request_data) + dataDF = dataDF.filter( + col("batch_id").equalTo(request.batch_id) && + concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) + ) + } + } + } + + val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + + val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + + archiveBatches(batchesToArchive, dataDF, requestConfig) + } catch { + case ex: Exception => + ex.printStackTrace() + } + } + + override def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit = { + batchesToArchive.foreach(batches => { + val processingBatch = new AtomicInteger(batches._2.length) + JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) + + // Loop through the week_num & year batch partition + batches._2.map((batch: BatchPartition) => { + val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) + val collectionId = filteredDF.first().getAs[String]("course_id") + var archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) + + if (archivalRequest == null) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( + "week" -> batch.period.weekOfYear, + "year"-> batch.period.year + ) + archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Option(getReportKey), jobId, None, None, null, null, None, Option(0), JSONUtils.serialize(request_data), None) + } + + try { + val urls = upload(filteredDF, batch) // Upload the archived files into blob store + archivalRequest.blob_url = Option(urls) + JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) + markRequestAsSuccess(archivalRequest, requestConfig) + } catch { + case ex: Exception => { + markArchivalRequestAsFailed(archivalRequest, ex.getLocalizedMessage) + } + } + }).foreach((archivalRequest: ArchivalRequest) => { + upsertRequest(archivalRequest) + }) + + JobLogger.log(s"${batches._1} is successfully archived", Some(Map("batch_id" -> batches._1)), Level.INFO) + }) + } + + def deleteArchivedData(archivalRequest: Request): Unit = { + + } + + def generatePeriodInData(data: DataFrame): DataFrame = { + data.withColumn("updated_on", to_timestamp(col("updated_on"))) + .withColumn("year", year(col("updated_on"))) + .withColumn("week_of_year", weekofyear(col("updated_on"))) + .withColumn("question", to_json(col("question"))) + } + + } diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index c4f69811f..1795a141b 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -31,8 +31,6 @@ case class ArchivalMetrics(batchId: Option[String], trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { - 
private val partitionCols = List("batch_id", "year", "week_of_year") - private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", "created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") val cassandraUrl = "org.apache.spark.sql.cassandra" def main(config: String)(implicit sc: Option[SparkContext] = None, fc: Option[FrameworkContext] = None): Unit = { @@ -64,123 +62,34 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor } // def dataFilter(): Unit = {} -// def dateFormat(): String; + def getClassName: String; def execute()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { val modelParams = config.modelParams.getOrElse(Map[String, Option[AnyRef]]()); val requestConfig = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) - val archivalTable = requestConfig.archivalTable - val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - - val batchId: String = requestConfig.batchId.getOrElse("") - val date: String = requestConfig.date.getOrElse("") val mode: String = modelParams.getOrElse("mode","archive").asInstanceOf[String] - println("modelParams: " + modelParams) - println("archival request: " + requestConfig) - val archivalTableData: DataFrame = getArchivalData(archivalTable, archivalKeyspace,Option(batchId),Option(date)) - println("archivalTableData ") - archivalTableData.show(false) - mode.toLowerCase() match { case "archival" => - archiveData(archivalTableData, requestConfig) + archiveData(requestConfig) case "delete" => - deleteArchivedData(archivalTableData,requestConfig) + deleteArchivedData(requestConfig) } } - def archiveData(data: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { - val requests = getRequests(jobId, requestConfig.batchId) - println("requestLength: " + requests.length) - try { - var dataDF = processArchival(data, requestConfig) - if(requests.length > 0) { - for (request <- requests) { - if (request.archival_status.equals("SUCCESS")) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - dataDF = dataDF.filter( - col("batch_id").equalTo(request.batch_id) && - concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) - ) - } - } - } - - val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - - val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) - - archiveBatches(batchesToArchive, dataDF, requestConfig) - } catch { - case ex: Exception => - ex.printStackTrace() - } + def arhivalDateFormat(combinationList: List[String]): String = { + combinationList.mkString("-") } - def generatePeriodInData(data: DataFrame): DataFrame = { - data.withColumn("updated_on", to_timestamp(col("updated_on"))) - .withColumn("year", year(col("updated_on"))) - .withColumn("week_of_year", weekofyear(col("updated_on"))) - .withColumn("question", to_json(col("question"))) - } + def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit - def archiveBatches(batchesToArchive: Map[String, 
Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit = { - batchesToArchive.foreach(batches => { - val processingBatch = new AtomicInteger(batches._2.length) - JobLogger.log(s"Started Processing to archive the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) - - // Loop through the week_num & year batch partition - batches._2.map((batch: BatchPartition) => { - val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) - val collectionId = filteredDF.first().getAs[String]("course_id") - var archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) - - if (archivalRequest == null) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( - "week" -> batch.period.weekOfYear, - "year"-> batch.period.year - ) - archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Option(getReportKey), jobId, None, None, null, null, None, Option(0), JSONUtils.serialize(request_data), None) - } - - try { - val urls = upload(filteredDF, batch) // Upload the archived files into blob store - archivalRequest.blob_url = Option(urls) - JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) - markRequestAsSuccess(archivalRequest, requestConfig) - } catch { - case ex: Exception => { - markArchivalRequestAsFailed(archivalRequest, ex.getLocalizedMessage) - } - } - }).foreach((archivalRequest: ArchivalRequest) => { - upsertRequest(archivalRequest) - }) - - JobLogger.log(s"${batches._1} is successfully archived", Some(Map("batch_id" -> batches._1)), Level.INFO) - }) - } + def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit - def deleteArchivedData(data: DataFrame, archivalRequest: Request): Unit = { - - } + def deleteArchivedData(archivalRequest: Request): Unit def processArchival(archivalTableData: DataFrame, archiveRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; - def getArchivalData(table: String, keyspace: String, batchId: Option[String], date: Option[String])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame = { - val archivalTableSettings = Map("table" -> table, "keyspace" -> keyspace, "cluster" -> "LMSCluster") - val archivalDBDF = loadData(archivalTableSettings, cassandraUrl, new StructType()) - val batchIdentifier = batchId.getOrElse(null) - - if (batchIdentifier.nonEmpty) { - archivalDBDF.filter(col("batch_id") === batchIdentifier).persist() - } else { - archivalDBDF - } - } - def getWeekAndYearVal(date: String): Period = { if (null != date && date.nonEmpty) { val dt = new DateTime(date) diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 8edd02417..5d3d23e68 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -80,7 +80,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val user3Result = 
batch011Results.filter(col("user_id") === "user-003") user3Result.count() should be (2) - val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) archivalRequests.map(ar => ar.request_id).toList should contain allElementsOf List("F08614119F64BC55B14CBE49B10B6730", "949887DE6364A07AE1BB5A04504368F9") @@ -117,7 +117,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val user3Result = batch011Results.filter(col("user_id") === "user-003") user3Result.count() should be (2) - val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) val failedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, 2021, 48) @@ -160,7 +160,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val user3Result = batch011Results.filter(col("user_id") === "user-003") user3Result.count() should be (1) - val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId(), Option(batchId)) + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) } From 4d5cab220d86d8c8fbe0fa6493182f8eb927dab0 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Fri, 10 Dec 2021 13:20:14 +0530 Subject: [PATCH 10/32] Issue #SB-27408 | Assessment archival changes added --- .../archival/AssessmentArchivalJob.scala | 18 ++++++++---------- .../analytics/archival/BaseArchivalJob.scala | 19 ++++++++++++------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 4a5edc367..0f08ef4ce 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -26,7 +26,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { generatePeriodInData(data = archivalTableData) } - override def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { + override def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { val requests = getRequests(jobId, requestConfig.batchId) val archivalTable = requestConfig.archivalTable @@ -65,11 +65,12 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } catch { case ex: Exception => ex.printStackTrace() + List() } } - override def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit = { - batchesToArchive.foreach(batches => { + override def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] = { + batchesToArchive.flatMap(batches => { val processingBatch = new AtomicInteger(batches._2.length) JobLogger.log(s"Started Processing to archive the data", 
Some(Map("batch_id" -> batches._1, "total_part_files_to_archive" -> processingBatch))) @@ -97,16 +98,13 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { markArchivalRequestAsFailed(archivalRequest, ex.getLocalizedMessage) } } - }).foreach((archivalRequest: ArchivalRequest) => { - upsertRequest(archivalRequest) }) - - JobLogger.log(s"${batches._1} is successfully archived", Some(Map("batch_id" -> batches._1)), Level.INFO) - }) + }).toList } - def deleteArchivedData(archivalRequest: Request): Unit = { - + def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest] = { + // TODO: Deletion feature + List() } def generatePeriodInData(data: DataFrame): DataFrame = { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 1795a141b..1805418be 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -68,25 +68,30 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def execute()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { val modelParams = config.modelParams.getOrElse(Map[String, Option[AnyRef]]()); val requestConfig = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) + val mode: String = modelParams.getOrElse("mode","archive").asInstanceOf[String] - mode.toLowerCase() match { + val archivalRequests = mode.toLowerCase() match { case "archival" => archiveData(requestConfig) case "delete" => deleteArchivedData(requestConfig) } + + for (archivalRequest <- archivalRequests) { + upsertRequest(archivalRequest) + } } - def arhivalDateFormat(combinationList: List[String]): String = { - combinationList.mkString("-") + def archivalFormat(batch: BatchPartition): String = { + s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" } - def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit + def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] - def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): Unit + def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] - def deleteArchivedData(archivalRequest: Request): Unit + def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest] def processArchival(archivalTableData: DataFrame, archiveRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; @@ -104,7 +109,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor val reportPath: String = blobConfig.getOrElse("reportPath", "archived-data/").asInstanceOf[String] val container = AppConf.getConfig("cloud.container.reports") val objectKey = AppConf.getConfig("course.metrics.cloud.objectKey") - val fileName = s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" + val fileName = archivalFormat(batch) val storageConfig = getStorageConfig(jobConfig, objectKey) JobLogger.log(s"Uploading reports to blob storage", None, Level.INFO) 
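    // With the settings in this job, the archived partition is written as gzipped CSV under an
    // object key of the form "<reportPath><batchId>/<year>-<weekOfYear>-<currentTimeMillis>"
    // (csv.gz extension); the returned URL is what later lands on the archival request as blob_url.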
archivedData.saveToBlobStore(storageConfig, "csv", s"$reportPath$fileName-${System.currentTimeMillis()}", Option(Map("header" -> "true", "codec" -> "org.apache.hadoop.io.compress.GzipCodec")), None, fileExt=Some("csv.gz")) From a60875fe0473df02c1478f92282e7c0efe46d44c Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Fri, 10 Dec 2021 15:17:04 +0530 Subject: [PATCH 11/32] Issue #SB-27408 | Assessment archival changes added --- .../archival/AssessmentArchivalJob.scala | 17 +++++++---------- .../analytics/archival/BaseArchivalJob.scala | 1 + 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 0f08ef4ce..2eb3171b5 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -16,14 +16,14 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", "created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") override def getClassName = "org.sunbird.analytics.archival.AssessmentArchivalJob" - override def jobName = "AssessmentArchivalJob"; - override def jobId: String = "assessment-archival"; - override def getReportPath = "assessment-archival/"; - override def getReportKey = "assessment"; + override def jobName = "AssessmentArchivalJob" + override def jobId: String = "assessment-archival" + override def getReportPath = "assessment-archival/" + override def getReportKey = "assessment" + override def dateColumn = "updated_on" override def processArchival(archivalTableData: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { - println("Process Archival") - generatePeriodInData(data = archivalTableData) + generatePeriodInData(data = archivalTableData) } override def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { @@ -42,7 +42,6 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { data } - println("requestLength: " + requests.length) try { var dataDF = processArchival(data, requestConfig) if(requests.length > 0) { @@ -108,11 +107,9 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } def generatePeriodInData(data: DataFrame): DataFrame = { - data.withColumn("updated_on", to_timestamp(col("updated_on"))) + data.withColumn("updated_on", to_timestamp(col(dateColumn))) .withColumn("year", year(col("updated_on"))) .withColumn("week_of_year", weekofyear(col("updated_on"))) .withColumn("question", to_json(col("question"))) } - - } diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 1805418be..d7208b2d7 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -32,6 +32,7 @@ case class ArchivalMetrics(batchId: Option[String], trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { val 
cassandraUrl = "org.apache.spark.sql.cassandra" + def dateColumn: String def main(config: String)(implicit sc: Option[SparkContext] = None, fc: Option[FrameworkContext] = None): Unit = { implicit val className: String = getClassName; From 78ee4333ba284056d411d0abcf0675f85f0154f9 Mon Sep 17 00:00:00 2001 From: utk14 Date: Mon, 13 Dec 2021 12:11:06 +0530 Subject: [PATCH 12/32] Issue SB-24793 feat: Review comments resolved --- .../archival/AssessmentArchivalJob.scala | 42 ++++++------------- .../analytics/archival/BaseArchivalJob.scala | 24 ++++++++--- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 0f08ef4ce..143b507b4 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -21,47 +21,29 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def getReportPath = "assessment-archival/"; override def getReportKey = "assessment"; - override def processArchival(archivalTableData: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { - println("Process Archival") - generatePeriodInData(data = archivalTableData) - } - - override def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { - val requests = getRequests(jobId, requestConfig.batchId) + override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { - val archivalTable = requestConfig.archivalTable val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val batchId: String = requestConfig.batchId.getOrElse("") - val date: String = requestConfig.date.getOrElse("") + val batchId: String = requestConfig.batchId.getOrElse(null) + val collId: String = requestConfig.collectionId.getOrElse(null) + val date: String = requestConfig.date.getOrElse(null) - var data = loadData(Map("table" -> archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) + var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = if (batchId.nonEmpty) { + data = if (batchId.nonEmpty && collId.nonEmpty) { + data.filter(col("batch_id") === batchId && col("course_id") === collId).persist() + } else if (batchId.nonEmpty) { data.filter(col("batch_id") === batchId).persist() } else { data } - - println("requestLength: " + requests.length) try { - var dataDF = processArchival(data, requestConfig) - if(requests.length > 0) { - for (request <- requests) { - if (request.archival_status.equals("SUCCESS")) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - dataDF = dataDF.filter( - col("batch_id").equalTo(request.batch_id) && - concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) - ) - } - } - } - - val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - + val dataDF = generatePeriodInData(data) + 
val filteredDF = dataFilter(requests, dataDF) + val archiveBatchList = filteredDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) - archiveBatches(batchesToArchive, dataDF, requestConfig) + archiveBatches(batchesToArchive, filteredDF, requestConfig) } catch { case ex: Exception => ex.printStackTrace() diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 1805418be..589799518 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -15,6 +15,7 @@ import org.sunbird.analytics.exhaust.BaseReportsJob import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.apache.spark.sql.functions._ import org.joda.time.DateTime +import org.sunbird.analytics.archival.AssessmentArchivalJob.{getRequests, jobId} import org.sunbird.analytics.archival.util.{ArchivalMetaDataStoreJob, ArchivalRequest} case class Period(year: Int, weekOfYear: Int) @@ -61,19 +62,32 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor spark.setCassandraConf("LMSCluster", CassandraConnectorConf.ConnectionHostParam.option(AppConf.getConfig("sunbird.courses.cluster.host"))) } -// def dataFilter(): Unit = {} + def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { + var filteredDF = dataDF + for (request <- requests) { + if (request.archival_status.equals("SUCCESS")) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) + filteredDF = dataDF.filter( + col("batch_id").equalTo(request.batch_id) && + concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) + ) + } + } + filteredDF + }; def getClassName: String; def execute()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { val modelParams = config.modelParams.getOrElse(Map[String, Option[AnyRef]]()); val requestConfig = JSONUtils.deserialize[Request](JSONUtils.serialize(modelParams.getOrElse("request", Request).asInstanceOf[Map[String,AnyRef]])) - val mode: String = modelParams.getOrElse("mode","archive").asInstanceOf[String] + val requests = getRequests(jobId, requestConfig.batchId) + val archivalRequests = mode.toLowerCase() match { case "archival" => - archiveData(requestConfig) + archiveData(requestConfig, requests) case "delete" => deleteArchivedData(requestConfig) } @@ -87,14 +101,12 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" } - def archiveData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] + def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest] - def 
processArchival(archivalTableData: DataFrame, archiveRequest: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame; - def getWeekAndYearVal(date: String): Period = { if (null != date && date.nonEmpty) { val dt = new DateTime(date) From 8c39db14fa1d2984db0d70408413cde4f32879fb Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 13 Dec 2021 14:19:25 +0530 Subject: [PATCH 13/32] Issue #SB-27408 | Test case fixes added --- .../sunbird/analytics/archival/AssessmentArchivalJob.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index d04439483..a6bb71567 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -28,8 +28,8 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val batchId: String = requestConfig.batchId.getOrElse(null) - val collId: String = requestConfig.collectionId.getOrElse(null) + val batchId: String = requestConfig.batchId.getOrElse("") + val collId: String = requestConfig.collectionId.getOrElse("") val date: String = requestConfig.date.getOrElse(null) var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) From e9f71dd05d8f889f6721e04bf87bdfd93507826b Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 13 Dec 2021 16:53:11 +0530 Subject: [PATCH 14/32] Issue #SB-27408 | PR Review changes added --- .../analytics/archival/AssessmentArchivalJob.scala | 12 ++++++------ .../analytics/archival/BaseArchivalJob.scala | 4 ---- .../archival/util/ArchivalMetaDataStoreJob.scala | 14 +++++++------- .../archival/TestAsssessmentArchivalJob.scala | 2 +- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index a6bb71567..3bfee11a2 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -1,6 +1,6 @@ package org.sunbird.analytics.archival -import org.apache.spark.sql.functions.{col, concat, lit, to_json, to_timestamp, weekofyear, year} +import org.apache.spark.sql.functions.{col, to_json, to_timestamp, weekofyear, year} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} import org.ekstep.analytics.framework.conf.AppConf @@ -29,13 +29,13 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) val batchId: String = requestConfig.batchId.getOrElse("") - val collId: String = requestConfig.collectionId.getOrElse("") + val collectionId: String = requestConfig.collectionId.getOrElse("") val date: String = 
requestConfig.date.getOrElse(null) var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = if (batchId.nonEmpty && collId.nonEmpty) { - data.filter(col("batch_id") === batchId && col("course_id") === collId).persist() + data = if (batchId.nonEmpty && collectionId.nonEmpty) { + data.filter(col("batch_id") === batchId && col("course_id") === collectionId).persist() } else if (batchId.nonEmpty) { data.filter(col("batch_id") === batchId).persist() } else { @@ -65,7 +65,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { batches._2.map((batch: BatchPartition) => { val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) val collectionId = filteredDF.first().getAs[String]("course_id") - var archivalRequest = getRequest(collectionId, batch.batchId, batch.period.year, batch.period.weekOfYear) + var archivalRequest:ArchivalRequest = getRequest(collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) if (archivalRequest == null) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( @@ -78,7 +78,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { try { val urls = upload(filteredDF, batch) // Upload the archived files into blob store archivalRequest.blob_url = Option(urls) - JobLogger.log(s"Data is archived and Processing the remaining part files ", None, Level.INFO) + JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(Map("remaining_part_files_to_archive" -> processingBatch.decrementAndGet())), Level.INFO) markRequestAsSuccess(archivalRequest, requestConfig) } catch { case ex: Exception => { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index bd139d6ba..4aba5db42 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -1,12 +1,9 @@ package org.sunbird.analytics.archival -import java.util.concurrent.atomic.AtomicInteger - import com.datastax.spark.connector.cql.CassandraConnectorConf import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.cassandra._ -import org.apache.spark.sql.types.StructType import org.ekstep.analytics.framework.Level.ERROR import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} @@ -15,7 +12,6 @@ import org.sunbird.analytics.exhaust.BaseReportsJob import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.apache.spark.sql.functions._ import org.joda.time.DateTime -import org.sunbird.analytics.archival.AssessmentArchivalJob.{getRequests, jobId} import org.sunbird.analytics.archival.util.{ArchivalMetaDataStoreJob, ArchivalRequest} case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) diff --git 
a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index 16e59d3a0..5c4f99668 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -5,8 +5,8 @@ import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet, Timest import java.util.Properties import org.apache.commons.lang.StringUtils import org.apache.spark.sql.{Encoders, SparkSession} -import org.apache.spark.sql.functions.{col, lit} -import org.ekstep.analytics.framework.{FrameworkContext, JobConfig} +import org.apache.spark.sql.functions.col +import org.ekstep.analytics.framework.FrameworkContext import org.ekstep.analytics.framework.Level.INFO import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} @@ -51,13 +51,13 @@ trait ArchivalMetaDataStoreJob { requests } - def getRequestID(collectionId: String, batchId: String, year: Int, week: Int): String = { - val requestComb = s"$collectionId:$batchId:$year:$week" + def getRequestID(collectionId: String, batchId: String, partitionCols: List[Int]): String = { + val requestComb = s"$collectionId:$batchId:" + partitionCols.mkString(":") MessageDigest.getInstance("MD5").digest(requestComb.getBytes).map("%02X".format(_)).mkString } - def getRequest(collectionId: String, batchId: String, year: Int, week: Int): ArchivalRequest = { - val requestId = getRequestID(collectionId, batchId, year, week) + def getRequest(collectionId: String, batchId: String, partitionCols: List[Int]): ArchivalRequest = { + val requestId = getRequestID(collectionId, batchId, partitionCols) val archivalRequest = s"""select * from $requestsTable where request_id = '$requestId' limit 1""" val pstmt: PreparedStatement = dbc.prepareStatement(archivalRequest); val resultSet = pstmt.executeQuery() @@ -104,7 +104,7 @@ trait ArchivalMetaDataStoreJob { s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?::json,?)" val pstmt: PreparedStatement = dbc.prepareStatement(insertQry); val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - val requestId = getRequestID(request.collection_id, request.batch_id, request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int]) + val requestId = getRequestID(request.collection_id, request.batch_id, List(request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int])) pstmt.setString(1, requestId); pstmt.setString(2, request.batch_id); pstmt.setString(3, request.collection_id); diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 5d3d23e68..792ef833a 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -120,7 +120,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) - val failedRequest = 
AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, 2021, 48) + val failedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 48)) failedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") failedRequest.archival_status should be ("SUCCESS") From 970b85719616ebab3ea5c35777c1aff6397989fa Mon Sep 17 00:00:00 2001 From: utk14 Date: Mon, 13 Dec 2021 17:01:21 +0530 Subject: [PATCH 15/32] Issue SB-24793 feat: Review comments resolved --- .../archival/AssessmentArchivalJob.scala | 27 ++++++++++++-- .../analytics/archival/BaseArchivalJob.scala | 37 +++---------------- .../util/ArchivalMetaDataStoreJob.scala | 4 -- 3 files changed, 28 insertions(+), 40 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index a6bb71567..20433019e 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -10,11 +10,11 @@ import org.sunbird.analytics.archival.util.ArchivalRequest import java.util.concurrent.atomic.AtomicInteger -case class Period(year: Int, weekOfYear: Int) -case class BatchPartition(batchId: String, period: Period) - object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { + case class Period(year: Int, weekOfYear: Int) + case class BatchPartition(batchId: String, period: Period) + private val partitionCols = List("batch_id", "year", "week_of_year") private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", "created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") @@ -25,6 +25,25 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def getReportKey = "assessment" override def dateColumn = "updated_on" + override def archivalFormat(batch: Map[String,AnyRef]): String = { + val formatDetails = JSONUtils.deserialize[BatchPartition](JSONUtils.serialize(batch)) + s"${formatDetails.batchId}/${formatDetails.period.year}-${formatDetails.period.weekOfYear}" + } + + override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { + var filteredDF = dataDF + for (request <- requests) { + if (request.archival_status.equals("SUCCESS")) { + val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) + filteredDF = dataDF.filter( + col("batch_id").equalTo(request.batch_id) && + concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) + ) + } + } + filteredDF + } + override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) @@ -76,7 +95,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } try { - val urls = upload(filteredDF, batch) // Upload the archived files into blob store + val urls = upload(filteredDF, Map("batchId" -> batch.batchId, "period"-> Map("year" -> batch.period.year, "weekOfYear" -> batch.period.weekOfYear))) // Upload the archived files into blob store archivalRequest.blob_url = Option(urls) JobLogger.log(s"Data is 
archived and Processing the remaining part files ", None, Level.INFO) markRequestAsSuccess(archivalRequest, requestConfig) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index bd139d6ba..a8bd3a206 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -15,7 +15,7 @@ import org.sunbird.analytics.exhaust.BaseReportsJob import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.apache.spark.sql.functions._ import org.joda.time.DateTime -import org.sunbird.analytics.archival.AssessmentArchivalJob.{getRequests, jobId} +import org.sunbird.analytics.archival.AssessmentArchivalJob.Period import org.sunbird.analytics.archival.util.{ArchivalMetaDataStoreJob, ArchivalRequest} case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) @@ -45,8 +45,6 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor frameworkContext.closeContext(); spark.close() } - - } def init()(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): Unit = { @@ -71,16 +69,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor } } - def getWeekAndYearVal(date: String): Period = { - if (null != date && date.nonEmpty) { - val dt = new DateTime(date) - Period(year = dt.getYear, weekOfYear = dt.getWeekOfWeekyear) - } else { - Period(0, 0) - } - } - - def upload(archivedData: DataFrame, batch: BatchPartition)(implicit jobConfig: JobConfig): List[String] = { + def upload(archivedData: DataFrame, batch: Map[String,AnyRef])(implicit jobConfig: JobConfig): List[String] = { val blobConfig = jobConfig.modelParams.get("blobConfig").asInstanceOf[Map[String, AnyRef]] val reportPath: String = blobConfig.getOrElse("reportPath", "archived-data/").asInstanceOf[String] val container = AppConf.getConfig("cloud.container.reports") @@ -91,20 +80,6 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor archivedData.saveToBlobStore(storageConfig, "csv", s"$reportPath$fileName-${System.currentTimeMillis()}", Option(Map("header" -> "true", "codec" -> "org.apache.hadoop.io.compress.GzipCodec")), None, fileExt=Some("csv.gz")) } - def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { - var filteredDF = dataDF - for (request <- requests) { - if (request.archival_status.equals("SUCCESS")) { - val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - filteredDF = dataDF.filter( - col("batch_id").equalTo(request.batch_id) && - concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) - ) - } - } - filteredDF - }; - // Overriding methods START: def jobId: String; def jobName: String; @@ -114,11 +89,9 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest]; def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest]; + def archivalFormat(batch: Map[String,AnyRef]): String; + def dataFilter(requests: 
Array[ArchivalRequest], dataDF: DataFrame): DataFrame; - def archivalFormat(batch: BatchPartition): String = { - s"${batch.batchId}/${batch.period.year}-${batch.period.weekOfYear}" - - //Overriding methods END: - } + //Overriding methods END: } diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index 16e59d3a0..b5d0e9ecf 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -33,18 +33,14 @@ trait ArchivalMetaDataStoreJob { } def getRequests(jobId: String, batchId: Option[String])(implicit spark: SparkSession, fc: FrameworkContext): Array[ArchivalRequest] = { - println("jobid: " + jobId + " batchid: " + batchId) val encoder = Encoders.product[ArchivalRequest] val archivalConfigsDf = spark.read.jdbc(url, requestsTable, connProperties) .where(col("job_id") === jobId && col("iteration") < 3) - println("archivalConfigDF:") - archivalConfigsDf.show(false) val filteredReportConfigDf = if (batchId.isDefined) { val filteredArchivalConfig = archivalConfigsDf.filter(col("batch_id").equalTo(batchId.get)) if (filteredArchivalConfig.count() > 0) filteredArchivalConfig else archivalConfigsDf } else archivalConfigsDf - println("filteredtReportCOnfig: ") filteredReportConfigDf.show(false) JobLogger.log("fetched records count" + filteredReportConfigDf.count(), None, INFO) val requests = filteredReportConfigDf.as[ArchivalRequest](encoder).collect() From 0401987037f91891750b454db52573cf2db23206 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 13 Dec 2021 19:44:32 +0530 Subject: [PATCH 16/32] Issue #SB-27408 | Archival Metrics changes added --- .../archival/AssessmentArchivalJob.scala | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 77fa289fc..c1de69db8 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -1,6 +1,6 @@ package org.sunbird.analytics.archival -import org.apache.spark.sql.functions.{col, to_json, to_timestamp, weekofyear, year} +import org.apache.spark.sql.functions.{col, concat, lit, to_json, to_timestamp, weekofyear, year} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} import org.ekstep.analytics.framework.conf.AppConf @@ -13,9 +13,14 @@ import java.util.concurrent.atomic.AtomicInteger object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { case class Period(year: Int, weekOfYear: Int) - case class BatchPartition(batchId: String, period: Period) - - private val partitionCols = List("batch_id", "year", "week_of_year") + case class BatchPartition(collectionId: String, batchId: String, period: Period) + case class ArchivalMetrics(batch: BatchPartition, + totalArchivedRecords: Option[Long], + pendingWeeksOfYears: Option[Long], + totalDeletedRecords: Option[Long] + ) + + private val partitionCols = List("course_id", "batch_id", "year", "week_of_year") private val columnWithOrder = List("course_id", "batch_id", "user_id", "content_id", "attempt_id", 
"created_on", "grand_total", "last_attempted_on", "total_max_score", "total_score", "updated_on", "question") override def getClassName = "org.sunbird.analytics.archival.AssessmentArchivalJob" @@ -53,19 +58,19 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = if (batchId.nonEmpty && collectionId.nonEmpty) { - data.filter(col("batch_id") === batchId && col("course_id") === collectionId).persist() - } else if (batchId.nonEmpty) { - data.filter(col("batch_id") === batchId).persist() - } else { - data + if(collectionId.nonEmpty) { + data = data.filter(col("course_id") === collectionId) + } + + if (batchId.nonEmpty) { + data = data.filter(col("batch_id") === batchId).persist() } try { val dataDF = generatePeriodInData(data) val filteredDF = dataFilter(requests, dataDF) val archiveBatchList = filteredDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], Period(f.get(1).asInstanceOf[Int], f.get(2).asInstanceOf[Int]))).groupBy(_.batchId) + val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], f.get(1).asInstanceOf[String], Period(f.get(2).asInstanceOf[Int], f.get(3).asInstanceOf[Int]))).groupBy(_.batchId) archiveBatches(batchesToArchive, filteredDF, requestConfig) } catch { @@ -82,7 +87,12 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { // Loop through the week_num & year batch partition batches._2.map((batch: BatchPartition) => { - val filteredDF = data.filter(col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear).select(columnWithOrder.head, columnWithOrder.tail: _*) + val filteredDF = data.filter( + col("course_id") === batch.collectionId && + col("batch_id") === batch.batchId && + col("year") === batch.period.year && + col("week_of_year") === batch.period.weekOfYear + ).select(columnWithOrder.head, columnWithOrder.tail: _*) val collectionId = filteredDF.first().getAs[String]("course_id") var archivalRequest:ArchivalRequest = getRequest(collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) @@ -97,7 +107,8 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { try { val urls = upload(filteredDF, Map("batchId" -> batch.batchId, "period"-> Map("year" -> batch.period.year, "weekOfYear" -> batch.period.weekOfYear))) // Upload the archived files into blob store archivalRequest.blob_url = Option(urls) - JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(Map("remaining_part_files_to_archive" -> processingBatch.decrementAndGet())), Level.INFO) + val metrics = ArchivalMetrics(batch, pendingWeeksOfYears = Some(processingBatch.getAndDecrement()), totalArchivedRecords = Some(filteredDF.count()), totalDeletedRecords = None) + JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) markRequestAsSuccess(archivalRequest, requestConfig) } catch { case ex: Exception => { From 6b3c6273cba2ef3b1017fa1ba2d298cadb849303 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 13 Dec 2021 20:05:49 +0530 Subject: [PATCH 17/32] Issue #SB-27408 | Fixes added --- 
.../sunbird/analytics/archival/AssessmentArchivalJob.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index c1de69db8..e38b2f437 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -93,15 +93,14 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear ).select(columnWithOrder.head, columnWithOrder.tail: _*) - val collectionId = filteredDF.first().getAs[String]("course_id") - var archivalRequest:ArchivalRequest = getRequest(collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) + var archivalRequest:ArchivalRequest = getRequest(batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) if (archivalRequest == null) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( "week" -> batch.period.weekOfYear, "year"-> batch.period.year ) - archivalRequest = ArchivalRequest("", batch.batchId, collectionId, Option(getReportKey), jobId, None, None, null, null, None, Option(0), JSONUtils.serialize(request_data), None) + archivalRequest = ArchivalRequest("", batch.batchId, batch.collectionId, Option(getReportKey), jobId, None, None, null, null, None, Option(0), JSONUtils.serialize(request_data), None) } try { From 6ae0f3ffc2740a0f2ab21f409c9a4567afecfd33 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Tue, 14 Dec 2021 14:46:29 +0530 Subject: [PATCH 18/32] Issue #SB-27408 | Testcase Fixes added --- .../archival/AssessmentArchivalJob.scala | 8 +++++--- .../analytics/archival/BaseArchivalJob.scala | 2 ++ .../resources/assessment-archival/data.cql | 4 ++-- .../archival/TestAsssessmentArchivalJob.scala | 19 +++++++++++-------- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index e38b2f437..17bcfbbc9 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -32,7 +32,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def archivalFormat(batch: Map[String,AnyRef]): String = { val formatDetails = JSONUtils.deserialize[BatchPartition](JSONUtils.serialize(batch)) - s"${formatDetails.batchId}/${formatDetails.period.year}-${formatDetails.period.weekOfYear}" + s"${formatDetails.batchId}_${formatDetails.collectionId}/${formatDetails.period.year}-${formatDetails.period.weekOfYear}" } override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { @@ -92,7 +92,9 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { col("batch_id") === batch.batchId && col("year") === batch.period.year && col("week_of_year") === batch.period.weekOfYear - ).select(columnWithOrder.head, columnWithOrder.tail: _*) + ).withColumn("last_attempted_on", tsToLongUdf(col("last_attempted_on"))) + .withColumn("updated_on", 
tsToLongUdf(col("updated_on"))) + .select(columnWithOrder.head, columnWithOrder.tail: _*) var archivalRequest:ArchivalRequest = getRequest(batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) if (archivalRequest == null) { @@ -104,7 +106,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } try { - val urls = upload(filteredDF, Map("batchId" -> batch.batchId, "period"-> Map("year" -> batch.period.year, "weekOfYear" -> batch.period.weekOfYear))) // Upload the archived files into blob store + val urls = upload(filteredDF, JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(batch))) // Upload the archived files into blob store archivalRequest.blob_url = Option(urls) val metrics = ArchivalMetrics(batch, pendingWeeksOfYears = Some(processingBatch.getAndDecrement()), totalArchivedRecords = Some(filteredDF.count()), totalDeletedRecords = None) JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 9b4b5c8aa..cfa06ff09 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -4,6 +4,7 @@ import com.datastax.spark.connector.cql.CassandraConnectorConf import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.cassandra._ +import org.apache.spark.sql.functions.udf import org.ekstep.analytics.framework.Level.ERROR import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{CommonUtil, JSONUtils, JobLogger} @@ -88,4 +89,5 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor //Overriding methods END: + val tsToLongUdf = udf[java.lang.Long, java.sql.Timestamp]((ts: java.sql.Timestamp) => if (ts != null) ts.getTime else null) } diff --git a/data-products/src/test/resources/assessment-archival/data.cql b/data-products/src/test/resources/assessment-archival/data.cql index a99dd6e8f..93503d2db 100644 --- a/data-products/src/test/resources/assessment-archival/data.cql +++ b/data-products/src/test/resources/assessment-archival/data.cql @@ -1,6 +1,6 @@ -- Week 48 -INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-001', 'do_1128870328040161281204', 'attempt-001', '10', 10, 10, 1638357693000, 1638357693000); -INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-003', 'do_112876961957437440179', 'attempt-001', '10', 10, 10, 1638357693000, 1638357693000); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-001', 'do_1128870328040161281204', 'attempt-001', '10', 10, 10, 1638357693200, 1638357693000); +INSERT INTO sunbird_courses.assessment_aggregator (course_id, batch_id, user_id, content_id, attempt_id, grand_total, 
total_max_score, total_score, last_attempted_on, updated_on) VALUES ('do_1130928636168192001667', 'batch-011', 'user-003', 'do_112876961957437440179', 'attempt-001', '10', 10, 10, 1638357693200, 1638357693000); -- Week 49 -- diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 792ef833a..0ed7d8e9e 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -37,6 +37,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo "AssessmentArchivalJob" should "archive the batch which is not archived in past" in { implicit val fc = new FrameworkContext() val batchId = "batch-011" + val courseId = "do_1130928636168192001667" val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) @@ -44,7 +45,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo AssessmentArchivalJob.execute() val batch011Results = spark.read.format("csv").option("header", "true") - .load(s"$outputLocation/$batchId/2021*.csv.gz") + .load(s"$outputLocation/${batchId}_${courseId}/2021*.csv.gz") batch011Results.count() should be (5) @@ -54,12 +55,12 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val user1attempt1 = user1.filter(col("attempt_id") === "attempt-001").first user1attempt1.getAs[String]("course_id") should be ("do_1130928636168192001667") user1attempt1.getAs[String]("content_id") should be ("do_1128870328040161281204") - user1attempt1.getAs[String]("last_attempted_on") should be ("2021-12-01T16:51:33.000+05:30") + user1attempt1.getAs[String]("last_attempted_on") should be ("1638357693200") user1attempt1.getAs[String]("grand_total") should be ("10") user1attempt1.getAs[String]("total_max_score") should be ("10.0") user1attempt1.getAs[String]("total_score") should be ("10.0") user1attempt1.getAs[String]("question") should be ("[]") - user1attempt1.getAs[String]("updated_on") should be ("2021-12-01T16:51:33.000+05:30") + user1attempt1.getAs[String]("updated_on") should be ("1638357693000") val user1attempt2 = user1.filter(col("attempt_id") === "attempt-002").first @@ -72,7 +73,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val questionsList = JSONUtils.deserialize[List[Map[String, AnyRef]]](user1attempt2.getAs[String]("question")) questionsList.size should be (4) - user1attempt2.getAs[String]("updated_on") should be ("2021-12-09T17:47:34.823+05:30") + user1attempt2.getAs[String]("updated_on") should be ("1639052254823") val user2Result = batch011Results.filter(col("user_id") === "user-002") user2Result.count() should be (1) @@ -87,7 +88,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo 
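// The assertion changes in this hunk (and the ones above) follow from two changes in this patch:
//  - archivalFormat now writes each partition to "<batchId>_<collectionId>/<year>-<weekOfYear>",
//    so the expected blob paths include both ids, e.g. "batch-011_do_1130928636168192001667/2021-...";
//  - tsToLongUdf converts last_attempted_on and updated_on to epoch milliseconds before upload,
//    so the CSV assertions compare against values such as "1638357693000" rather than ISO-8601 dates.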
archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011") archivalRequests.map(ar => ar.collection_id).toList.distinct should contain allElementsOf List("do_1130928636168192001667") archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") - archivalRequests.map(ar => ar.blob_url.get).toList.head.head should include ("src/test/resources/reports/assessment-archived-data/batch-011/2021") + archivalRequests.map(ar => ar.blob_url.get).toList.head.head should include (s"src/test/resources/reports/assessment-archived-data/${batchId}_${courseId}/2021") archivalRequests.map(ar => ar.iteration.get).toList.distinct should contain allElementsOf List(0) archivalRequests.map(ar => ar.err_message.get).toList.distinct should contain allElementsOf List("") } @@ -95,6 +96,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo it should "archive the batch which is failed to archive in past" in { implicit val fc = new FrameworkContext() val batchId = "batch-011" + val courseId = "do_1130928636168192001667" EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'FAILED', null, 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") @@ -104,7 +106,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo AssessmentArchivalJob.execute() val batch011Results = spark.read.format("csv").option("header", "true") - .load(s"$outputLocation/$batchId/2021*.csv.gz") + .load(s"$outputLocation/${batchId}_${courseId}/2021*.csv.gz") batch011Results.count() should be (5) @@ -124,12 +126,13 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo failedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") failedRequest.archival_status should be ("SUCCESS") - failedRequest.blob_url.get.head should include ("src/test/resources/reports/assessment-archived-data/batch-011/2021") + failedRequest.blob_url.get.head should include (s"src/test/resources/reports/assessment-archived-data/${batchId}_${courseId}/2021") } it should "skip archival for the batch which is archived in past" in { implicit val fc = new FrameworkContext() val batchId = "batch-011" + val courseId = "do_1130928636168192001667" EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") @@ -139,7 +142,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo AssessmentArchivalJob.execute() val batch011Results = spark.read.format("csv").option("header", "true") - .load(s"$outputLocation/$batchId/2021*.csv.gz") + .load(s"$outputLocation/${batchId}_${courseId}/2021*.csv.gz") batch011Results.count() should be (3) From 22c88a5f8af9f07707fcf5c9a7ee7a6c28b3d6b0 Mon Sep 17 
00:00:00 2001 From: kumarks1122 Date: Tue, 14 Dec 2021 18:08:41 +0530 Subject: [PATCH 19/32] Issue #SB-27408 | Testcase Fixes added --- .../audit/TestScoreMetricMigrationJob.scala | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/data-products/src/test/scala/org/sunbird/analytics/audit/TestScoreMetricMigrationJob.scala b/data-products/src/test/scala/org/sunbird/analytics/audit/TestScoreMetricMigrationJob.scala index cc92f9b09..9b139a985 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/audit/TestScoreMetricMigrationJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/audit/TestScoreMetricMigrationJob.scala @@ -40,16 +40,7 @@ class TestScoreMetricMigrationJob extends BaseSpec with MockFactory { result.head.get(0).asInstanceOf[Map[String, Int]]("max_score:do_112876961957437440179") should be(10) result.head.get(2).asInstanceOf[Seq[String]].size should be (2) - val aggDetail = JSONUtils.deserialize[Map[String, AnyRef]](result.head.get(2).asInstanceOf[Seq[String]].head) - - aggDetail("max_score") should be(10.0) - aggDetail("score") should be(10.0) - aggDetail("type") should be(jobConfig.modelParams.get.get("metricsType").get.toString) - aggDetail("attempt_id") should be("attempat-001") - aggDetail("content_id") should be("do_112876961957437440110") - aggDetail("attempted_on") should be(1634810023) - - result.head.get(2).asInstanceOf[Seq[String]](1) should be("""{"max_score":10.0,"score":10.0,"type":"attempt_metrics","attempt_id":"attempat-001","content_id":"do_112876961957437440179"}""") + result.head.get(2).asInstanceOf[Seq[String]] should contain allElementsOf List("""{"max_score":10.0,"score":10.0,"type":"attempt_metrics","attempt_id":"attempat-001","content_id":"do_112876961957437440110","attempted_on":1634810023}""", """{"max_score":10.0,"score":10.0,"type":"attempt_metrics","attempt_id":"attempat-001","content_id":"do_112876961957437440179"}""") val result2 = res.filter(col("context_id") === "cb:batch-001") .filter(col("activity_id") === "do_11306040245271756813015") @@ -57,8 +48,8 @@ class TestScoreMetricMigrationJob extends BaseSpec with MockFactory { .select("agg_details").collect() result2.head.get(0).asInstanceOf[Seq[String]].size should be (2) - result2.head.get(0).asInstanceOf[Seq[String]].head should be("""{"max_score":15.0,"score":15.0,"type":"attempt_metrics","attempt_id":"attempat-001","content_id":"do_11307593493010022418"}""") - result2.head.get(0).asInstanceOf[Seq[String]](1) should be("""{"max_score":15.0,"score":10.0,"type":"attempt_metrics","attempt_id":"attempat-002","content_id":"do_11307593493010022418"}""") + + result2.head.get(0).asInstanceOf[Seq[String]] should contain allElementsOf List("""{"max_score":15.0,"score":15.0,"type":"attempt_metrics","attempt_id":"attempat-001","content_id":"do_11307593493010022418"}""", """{"max_score":15.0,"score":10.0,"type":"attempt_metrics","attempt_id":"attempat-002","content_id":"do_11307593493010022418"}""") ScoreMetricMigrationJob.updatedTable(res, ScoreMetricMigrationJob.userActivityAggDBSettings) val result3 = res.filter(col("context_id") === "cb:batch-001") From c69817e447963c8d0d7a1e740ec375c4a9554a28 Mon Sep 17 00:00:00 2001 From: utk14 Date: Wed, 15 Dec 2021 10:10:46 +0530 Subject: [PATCH 20/32] Issue SB-24793 feat: Added batchfilters and search query support --- .../archival/AssessmentArchivalJob.scala | 52 ++++++++++++++----- .../analytics/archival/BaseArchivalJob.scala | 2 +- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git 
a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 77fa289fc..affbebec9 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -2,14 +2,22 @@ package org.sunbird.analytics.archival import org.apache.spark.sql.functions.{col, to_json, to_timestamp, weekofyear, year} import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} import org.ekstep.analytics.framework.conf.AppConf -import org.ekstep.analytics.framework.util.{JSONUtils, JobLogger} +import org.ekstep.analytics.framework.util.{JSONUtils, JobLogger, RestUtil} import org.ekstep.analytics.framework.{FrameworkContext, JobConfig, Level} import org.sunbird.analytics.archival.util.ArchivalRequest - import java.util.concurrent.atomic.AtomicInteger +import org.apache.spark.sql.functions._ +import org.sunbird.analytics.util.Constants + +import scala.collection.immutable.List + +case class CollectionDetails(result: Result) +case class Result(content: List[CollectionInfo]) +case class CollectionInfo(identifier: String) + object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { case class Period(year: Int, weekOfYear: Int) @@ -47,19 +55,11 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val batchId: String = requestConfig.batchId.getOrElse("") - val collectionId: String = requestConfig.collectionId.getOrElse("") val date: String = requestConfig.date.getOrElse(null) var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = if (batchId.nonEmpty && collectionId.nonEmpty) { - data.filter(col("batch_id") === batchId && col("course_id") === collectionId).persist() - } else if (batchId.nonEmpty) { - data.filter(col("batch_id") === batchId).persist() - } else { - data - } + data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query) try { val dataDF = generatePeriodInData(data) @@ -75,6 +75,34 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } } + def validateBatch(data: DataFrame,batchid: Option[String], collectionid: Option[String], batchFilters: Option[List[String]], searchFilter: Option[Map[String, AnyRef]]) (implicit spark: SparkSession, fc: FrameworkContext) ={ + implicit val sqlContext = new SQLContext(spark.sparkContext) + import sqlContext.implicits._ + + val filteredDF = if(batchid.isDefined && collectionid.isDefined) { + data.filter(col("batch_id") === batchid.get && col("course_id") === collectionid.get).persist() + } else if (batchFilters.isDefined) { + val batch = batchFilters.get.toDF() + data.join(batch, Seq("batch_id"), "inner") + } else { + JobLogger.log("Neither batchId nor batchFilters present", None, Level.INFO) + data + } + if (searchFilter.isDefined) { + val res = searchContent(searchFilter.get) + 
filteredDF.join(res, col("course_id") === col("identifier"), "inner") + } else filteredDF + + } + + def searchContent(searchFilter: Map[String, AnyRef])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame = { + // TODO: Handle limit and do a recursive search call + val apiURL = Constants.COMPOSITE_SEARCH_URL + val request = JSONUtils.serialize(searchFilter) + val response = RestUtil.post[CollectionDetails](apiURL, request).result.content + spark.createDataFrame(response).select("identifier") + } + def archiveBatches(batchesToArchive: Map[String, Array[BatchPartition]], data: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] = { batchesToArchive.flatMap(batches => { val processingBatch = new AtomicInteger(batches._2.length) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 9b4b5c8aa..d92eae115 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -12,7 +12,7 @@ import org.sunbird.analytics.exhaust.BaseReportsJob import org.ekstep.analytics.framework.util.DatasetUtil.extensions import org.sunbird.analytics.archival.util.{ArchivalMetaDataStoreJob, ArchivalRequest} -case class Request(archivalTable: String, keyspace: Option[String], query: Option[String] = Option(""), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), date: Option[String] = Option("")) +case class Request(archivalTable: String, keyspace: Option[String], query: Option[Map[String, AnyRef]] = Option(Map()), batchId: Option[String] = Option(""), collectionId: Option[String]=Option(""), batchFilters: Option[List[String]]=Option(List()), date: Option[String] = Option("")) trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStoreJob with Serializable { From 0af8df33066024972a33b45ac72d7dbe9da11242 Mon Sep 17 00:00:00 2001 From: utk14 Date: Wed, 15 Dec 2021 23:49:00 +0530 Subject: [PATCH 21/32] Issue SB-24793 feat: Added batchfilters and search query support --- .../analytics/archival/TestAsssessmentArchivalJob.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 0ed7d8e9e..57e6bc948 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -39,7 +39,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val batchId = "batch-011" val courseId = "do_1130928636168192001667" - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= 
"""{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() @@ -100,7 +100,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'FAILED', null, 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() @@ -136,7 +136,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","query":"{}","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday 
'+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() From 5c2aa2f902a348f7675ed68aeeb3f03ea526162e Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Tue, 21 Dec 2021 17:54:15 +0530 Subject: [PATCH 22/32] Issue #SB-27408 | Deletion function added --- .../archival/AssessmentArchivalJob.scala | 100 ++++++++++++++---- .../analytics/archival/BaseArchivalJob.scala | 6 +- .../util/ArchivalMetaDataStoreJob.scala | 8 +- .../archival/TestAsssessmentArchivalJob.scala | 4 + 4 files changed, 91 insertions(+), 27 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index c417656d5..185698c02 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -1,5 +1,6 @@ package org.sunbird.analytics.archival +import com.datastax.spark.connector.{SomeColumns, toRDDFunctions} import org.apache.spark.sql.functions.{col, concat, lit, to_json, to_timestamp, weekofyear, year} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} @@ -7,8 +8,8 @@ import org.ekstep.analytics.framework.conf.AppConf import org.ekstep.analytics.framework.util.{JSONUtils, JobLogger, RestUtil} import org.ekstep.analytics.framework.{FrameworkContext, JobConfig, Level} import org.sunbird.analytics.archival.util.ArchivalRequest -import java.util.concurrent.atomic.AtomicInteger +import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.sql.functions._ import org.sunbird.analytics.util.Constants @@ -43,43 +44,47 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { s"${formatDetails.batchId}_${formatDetails.collectionId}/${formatDetails.period.year}-${formatDetails.period.weekOfYear}" } - override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { + override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame, toDelete: Boolean): DataFrame = { var filteredDF = dataDF for (request <- requests) { if (request.archival_status.equals("SUCCESS")) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - filteredDF = dataDF.filter( - col("batch_id").equalTo(request.batch_id) && - concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data.get("year").get + "-" + request_data.get("week").get) - ) + filteredDF = dataDF.filter(col("batch_id").equalTo(request.batch_id)) + filteredDF = if(toDelete) { + filteredDF.filter(concat(col("year"), lit("-"), col("week_of_year")) === lit(request_data("year") + "-" + request_data("week"))) + } else { + filteredDF.filter(concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data("year") + "-" + 
request_data("week"))) + } } } filteredDF } override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { - - val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val date: String = requestConfig.date.getOrElse(null) - - var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query) - try { - val dataDF = generatePeriodInData(data) - val filteredDF = dataFilter(requests, dataDF) - val archiveBatchList = filteredDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], f.get(1).asInstanceOf[String], Period(f.get(2).asInstanceOf[Int], f.get(3).asInstanceOf[Int]))).groupBy(_.batchId) + val dataDF = prepareData(requestConfig) + val filteredDF = dataFilter(requests, dataDF, false) - archiveBatches(batchesToArchive, filteredDF, requestConfig) + archiveBatches(getBatchPartitions(filteredDF), filteredDF, requestConfig) } catch { case ex: Exception => ex.printStackTrace() + JobLogger.log("archiveData: Exception with error message = " + ex.getMessage, None, Level.ERROR) List() } } - def validateBatch(data: DataFrame,batchid: Option[String], collectionid: Option[String], batchFilters: Option[List[String]], searchFilter: Option[Map[String, AnyRef]]) (implicit spark: SparkSession, fc: FrameworkContext) ={ + def prepareData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { + val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + val date: String = requestConfig.date.getOrElse(null) + + var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) + data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query) + + generatePeriodInData(data) + } + + def validateBatch(data: DataFrame,batchid: Option[String], collectionid: Option[String], batchFilters: Option[List[String]], searchFilter: Option[Map[String, AnyRef]])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame ={ implicit val sqlContext = new SQLContext(spark.sparkContext) import sqlContext.implicits._ @@ -137,9 +142,10 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { archivalRequest.blob_url = Option(urls) val metrics = ArchivalMetrics(batch, pendingWeeksOfYears = Some(processingBatch.getAndDecrement()), totalArchivedRecords = Some(filteredDF.count()), totalDeletedRecords = None) JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) - markRequestAsSuccess(archivalRequest, requestConfig) + markArchivalRequestAsSuccess(archivalRequest, requestConfig) } catch { case ex: Exception => { + JobLogger.log("archiveBatch: Exception with error message = " + ex.getLocalizedMessage, Some(batch), Level.ERROR) markArchivalRequestAsFailed(archivalRequest, ex.getLocalizedMessage) } } @@ -147,9 +153,57 @@ object AssessmentArchivalJob extends optional.Application with 
BaseArchivalJob { }).toList } - def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest] = { - // TODO: Deletion feature - List() + def getBatchPartitions(dataDF: DataFrame): Map[String, Array[BatchPartition]] = { + val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], f.get(1).asInstanceOf[String], Period(f.get(2).asInstanceOf[Int], f.get(3).asInstanceOf[Int]))).groupBy(_.batchId) + } + + override def deleteArchivedData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { + try { + val dataDF = prepareData(requestConfig) + val filteredDF = dataFilter(requests, dataDF, true) + + deleteBatches(getBatchPartitions(filteredDF), filteredDF, requestConfig) + } catch { + case ex: Exception => + ex.printStackTrace() + JobLogger.log("deleteArchivedData: Exception with error message = " + ex.getMessage, None, Level.ERROR) + List() + } + } + + def deleteBatches(batchesToDelete: Map[String, Array[BatchPartition]], dataDf: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] = { + batchesToDelete.flatMap(batches => { + val processingBatch = new AtomicInteger(batches._2.length) + JobLogger.log(s"Started Processing to delete the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_delete" -> processingBatch))) + + // Loop through the week_num & year batch partition + batches._2.map((batch: BatchPartition) => { + val archivalRequest:ArchivalRequest = getRequest(batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) + + try { + val archivedData = dataDf.filter( + col("course_id") === batch.collectionId && + col("batch_id") === batch.batchId && + col("year") === batch.period.year && + col("week_of_year") === batch.period.weekOfYear + ).select("course_id", "batch_id", "user_id", "content_id", "attempt_id") + val totalArchivedRecords: Long = archivedData.count + JobLogger.log(s"Deleting $totalArchivedRecords archived records only, for the year ${batch.period.year} and week of year ${batch.period.weekOfYear} from the DB ", None, Level.INFO) + + archivedData.rdd.deleteFromCassandra(AppConf.getConfig("sunbird.courses.keyspace"), AppConf.getConfig("sunbird.courses.assessment.table"), keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id")) + val metrics = ArchivalMetrics(batch, pendingWeeksOfYears = None, totalArchivedRecords = Some(totalArchivedRecords), totalDeletedRecords = Some(totalArchivedRecords)) + + JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) + markDeletionRequestAsSuccess(archivalRequest, requestConfig) + } catch { + case ex: Exception => { + JobLogger.log("deleteBatch: Exception with error message = " + ex.getLocalizedMessage, Some(batch), Level.ERROR) + markDeletionRequestAsFailed(archivalRequest, ex.getLocalizedMessage) + } + } + }) + }).toList } def generatePeriodInData(data: DataFrame): DataFrame = { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 8808ecc12..0ec4d6328 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -57,7 +57,7 @@ 
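// Context for the hunk below: execute() now passes the previously fetched archival requests to the
// delete path as well, so both modes reuse the same request metadata. A rough sketch of the dispatch
// and of the deletion step, both taken from elsewhere in this patch:
//
//   val archivalRequests = mode.toLowerCase() match {
//     case "archival" => archiveData(requestConfig, requests)
//     case "delete"   => deleteArchivedData(requestConfig, requests)
//   }
//
//   // Deletion removes only rows whose (year, week_of_year) partition was already archived,
//   // deleting by the assessment table's key columns:
//   archivedData.rdd.deleteFromCassandra(
//     AppConf.getConfig("sunbird.courses.keyspace"),
//     AppConf.getConfig("sunbird.courses.assessment.table"),
//     keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id"))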
trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor case "archival" => archiveData(requestConfig, requests) case "delete" => - deleteArchivedData(requestConfig) + deleteArchivedData(requestConfig, requests) } for (archivalRequest <- archivalRequests) { upsertRequest(archivalRequest) @@ -83,9 +83,9 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def getClassName: String; def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest]; - def deleteArchivedData(archivalRequest: Request): List[ArchivalRequest]; + def deleteArchivedData(archivalRequest: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest]; def archivalFormat(batch: Map[String,AnyRef]): String; - def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame; + def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame, isArchived: Boolean): DataFrame; //Overriding methods END: diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index 65dde3887..51f4e620b 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -144,10 +144,16 @@ trait ArchivalMetaDataStoreJob { pstmt.execute() } - def markRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): ArchivalRequest = { + def markArchivalRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): ArchivalRequest = { request.archival_status = "SUCCESS"; request.archival_date = Option(System.currentTimeMillis()) request } + def markDeletionRequestAsSuccess(request: ArchivalRequest, requestConfig: Request): ArchivalRequest = { + request.deletion_status = "SUCCESS"; + request.completion_date = Option(System.currentTimeMillis()) + request + } + } diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 57e6bc948..acce66c4e 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -24,6 +24,9 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo override def afterEach(): Unit = { super.afterEach() EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") + } + + override def beforeEach(): Unit = { new HadoopFileUtil().delete(spark.sparkContext.hadoopConfiguration, outputLocation) } @@ -31,6 +34,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo super.afterAll() EmbeddedCassandra.close() EmbeddedPostgresql.close() + new HadoopFileUtil().delete(spark.sparkContext.hadoopConfiguration, outputLocation) spark.close() } From 8b5a2b5d694f12af22918e53833fdfd17103d7cb Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Wed, 22 Dec 2021 17:53:29 +0530 Subject: [PATCH 23/32] Issue #SB-27408 | Deleting file from blob changes added --- .../archival/AssessmentArchivalJob.scala | 110 ++++++++---------- .../analytics/archival/BaseArchivalJob.scala | 2 +- 
2 files changed, 48 insertions(+), 64 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 185698c02..eb43b495c 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -11,6 +11,7 @@ import org.sunbird.analytics.archival.util.ArchivalRequest import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.sql.functions._ +import org.sunbird.analytics.exhaust.util.ExhaustUtil import org.sunbird.analytics.util.Constants import scala.collection.immutable.List @@ -44,17 +45,15 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { s"${formatDetails.batchId}_${formatDetails.collectionId}/${formatDetails.period.year}-${formatDetails.period.weekOfYear}" } - override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame, toDelete: Boolean): DataFrame = { + override def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame = { var filteredDF = dataDF for (request <- requests) { if (request.archival_status.equals("SUCCESS")) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - filteredDF = dataDF.filter(col("batch_id").equalTo(request.batch_id)) - filteredDF = if(toDelete) { - filteredDF.filter(concat(col("year"), lit("-"), col("week_of_year")) === lit(request_data("year") + "-" + request_data("week"))) - } else { - filteredDF.filter(concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data("year") + "-" + request_data("week"))) - } + filteredDF = dataDF.filter( + col("batch_id").equalTo(request.batch_id) && + concat(col("year"), lit("-"), col("week_of_year")) =!= lit(request_data("year") + "-" + request_data("week")) + ) } } filteredDF @@ -62,10 +61,19 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { override def archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { try { - val dataDF = prepareData(requestConfig) - val filteredDF = dataFilter(requests, dataDF, false) + val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + val date: String = requestConfig.date.getOrElse(null) + + var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) + data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query) + + val dataDF = generatePeriodInData(data) + val filteredDF = dataFilter(requests, dataDF) - archiveBatches(getBatchPartitions(filteredDF), filteredDF, requestConfig) + val archiveBatchList = filteredDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() + val batchesToArchive: Map[String, Array[BatchPartition]] = archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], f.get(1).asInstanceOf[String], Period(f.get(2).asInstanceOf[Int], f.get(3).asInstanceOf[Int]))).groupBy(_.batchId) + + archiveBatches(batchesToArchive, filteredDF, requestConfig) } catch { case ex: Exception => ex.printStackTrace() @@ -74,16 +82,6 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } } - 
def prepareData(requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): DataFrame = { - val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) - val date: String = requestConfig.date.getOrElse(null) - - var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType()) - data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query) - - generatePeriodInData(data) - } - def validateBatch(data: DataFrame,batchid: Option[String], collectionid: Option[String], batchFilters: Option[List[String]], searchFilter: Option[Map[String, AnyRef]])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame ={ implicit val sqlContext = new SQLContext(spark.sparkContext) import sqlContext.implicits._ @@ -153,57 +151,43 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { }).toList } - def getBatchPartitions(dataDF: DataFrame): Map[String, Array[BatchPartition]] = { - val archiveBatchList = dataDF.groupBy(partitionCols.head, partitionCols.tail: _*).count().collect() - archiveBatchList.map(f => BatchPartition(f.get(0).asInstanceOf[String], f.get(1).asInstanceOf[String], Period(f.get(2).asInstanceOf[Int], f.get(3).asInstanceOf[Int]))).groupBy(_.batchId) + def loadArchivedData(batch: BatchPartition)(implicit spark: SparkSession, fc: FrameworkContext, jobConfig: JobConfig): DataFrame = { + val azureFetcherConfig = jobConfig.modelParams.get("assessmentFetcherConfig").asInstanceOf[Map[String, AnyRef]] + + val store = azureFetcherConfig("store").asInstanceOf[String] + val format:String = azureFetcherConfig.getOrElse("format", "csv").asInstanceOf[String] + val filePath = azureFetcherConfig.getOrElse("filePath", "archival-data/").asInstanceOf[String] + val container = azureFetcherConfig.getOrElse("container", "reports").asInstanceOf[String] + + ExhaustUtil.getArchivedData(store, filePath, container, Map("batchId" -> batch.batchId, "collectionId"-> batch.collectionId, "year" -> batch.period.year, "weekNum" -> batch.period.weekOfYear), Option(format)) } override def deleteArchivedData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { - try { - val dataDF = prepareData(requestConfig) - val filteredDF = dataFilter(requests, dataDF, true) - - deleteBatches(getBatchPartitions(filteredDF), filteredDF, requestConfig) - } catch { - case ex: Exception => - ex.printStackTrace() - JobLogger.log("deleteArchivedData: Exception with error message = " + ex.getMessage, None, Level.ERROR) - List() - } + requests.filter(r => r.archival_status.equals("SUCCESS")).map((request: ArchivalRequest) => { + deleteBatch(requestConfig, request) + }).toList } - def deleteBatches(batchesToDelete: Map[String, Array[BatchPartition]], dataDf: DataFrame, requestConfig: Request)(implicit config: JobConfig): List[ArchivalRequest] = { - batchesToDelete.flatMap(batches => { - val processingBatch = new AtomicInteger(batches._2.length) - JobLogger.log(s"Started Processing to delete the data", Some(Map("batch_id" -> batches._1, "total_part_files_to_delete" -> processingBatch))) + def deleteBatch(requestConfig: Request, request: ArchivalRequest)(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): ArchivalRequest = { + try { + val request_data = 
JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) + val batchPartition = BatchPartition(request.collection_id, request.batch_id, Period(request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int])) + val archivedData = loadArchivedData(batchPartition).select("course_id", "batch_id", "user_id", "content_id", "attempt_id") - // Loop through the week_num & year batch partition - batches._2.map((batch: BatchPartition) => { - val archivalRequest:ArchivalRequest = getRequest(batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) + val totalArchivedRecords: Long = archivedData.count + JobLogger.log(s"Deleting $totalArchivedRecords archived records only, for the year ${batchPartition.period.year} and week of year ${batchPartition.period.weekOfYear} from the DB ", None, Level.INFO) - try { - val archivedData = dataDf.filter( - col("course_id") === batch.collectionId && - col("batch_id") === batch.batchId && - col("year") === batch.period.year && - col("week_of_year") === batch.period.weekOfYear - ).select("course_id", "batch_id", "user_id", "content_id", "attempt_id") - val totalArchivedRecords: Long = archivedData.count - JobLogger.log(s"Deleting $totalArchivedRecords archived records only, for the year ${batch.period.year} and week of year ${batch.period.weekOfYear} from the DB ", None, Level.INFO) - - archivedData.rdd.deleteFromCassandra(AppConf.getConfig("sunbird.courses.keyspace"), AppConf.getConfig("sunbird.courses.assessment.table"), keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id")) - val metrics = ArchivalMetrics(batch, pendingWeeksOfYears = None, totalArchivedRecords = Some(totalArchivedRecords), totalDeletedRecords = Some(totalArchivedRecords)) + archivedData.rdd.deleteFromCassandra(AppConf.getConfig("sunbird.courses.keyspace"), AppConf.getConfig("sunbird.courses.assessment.table"), keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id")) + val metrics = ArchivalMetrics(batchPartition, pendingWeeksOfYears = None, totalArchivedRecords = Some(totalArchivedRecords), totalDeletedRecords = Some(totalArchivedRecords)) - JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) - markDeletionRequestAsSuccess(archivalRequest, requestConfig) - } catch { - case ex: Exception => { - JobLogger.log("deleteBatch: Exception with error message = " + ex.getLocalizedMessage, Some(batch), Level.ERROR) - markDeletionRequestAsFailed(archivalRequest, ex.getLocalizedMessage) - } - } - }) - }).toList + JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) + markDeletionRequestAsSuccess(request, requestConfig) + } catch { + case ex: Exception => { + JobLogger.log("deleteBatch: Exception with error message = " + ex.getLocalizedMessage, Some(request), Level.ERROR) + markDeletionRequestAsFailed(request, ex.getLocalizedMessage) + } + } } def generatePeriodInData(data: DataFrame): DataFrame = { diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 0ec4d6328..476b4af9f 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -85,7 +85,7 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def 
archiveData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest]; def deleteArchivedData(archivalRequest: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest]; def archivalFormat(batch: Map[String,AnyRef]): String; - def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame, isArchived: Boolean): DataFrame; + def dataFilter(requests: Array[ArchivalRequest], dataDF: DataFrame): DataFrame; //Overriding methods END: From 97f5cbe58d287888a087a9f8ca331967ce5932a5 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Tue, 28 Dec 2021 18:12:35 +0530 Subject: [PATCH 24/32] Issue #SB-27408 | Testcases for Deleting files added --- .../archival/AssessmentArchivalJob.scala | 16 ++-- .../analytics/archival/BaseArchivalJob.scala | 1 - .../util/ArchivalMetaDataStoreJob.scala | 5 +- .../archival/TestAsssessmentArchivalJob.scala | 83 +++++++++++++++++++ 4 files changed, 95 insertions(+), 10 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index eb43b495c..67ea84035 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -152,18 +152,18 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { } def loadArchivedData(batch: BatchPartition)(implicit spark: SparkSession, fc: FrameworkContext, jobConfig: JobConfig): DataFrame = { - val azureFetcherConfig = jobConfig.modelParams.get("assessmentFetcherConfig").asInstanceOf[Map[String, AnyRef]] + val blobConfig = jobConfig.modelParams.get("blobConfig").asInstanceOf[Map[String, AnyRef]] - val store = azureFetcherConfig("store").asInstanceOf[String] - val format:String = azureFetcherConfig.getOrElse("format", "csv").asInstanceOf[String] - val filePath = azureFetcherConfig.getOrElse("filePath", "archival-data/").asInstanceOf[String] - val container = azureFetcherConfig.getOrElse("container", "reports").asInstanceOf[String] + val store = blobConfig("store").asInstanceOf[String] + val format:String = blobConfig.getOrElse("blobExt", "csv.gz").asInstanceOf[String] + val filePath = blobConfig.getOrElse("reportPath", "assessment-archived-data/").asInstanceOf[String] + val container = blobConfig.getOrElse("container", "reports").asInstanceOf[String] ExhaustUtil.getArchivedData(store, filePath, container, Map("batchId" -> batch.batchId, "collectionId"-> batch.collectionId, "year" -> batch.period.year, "weekNum" -> batch.period.weekOfYear), Option(format)) } override def deleteArchivedData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = { - requests.filter(r => r.archival_status.equals("SUCCESS")).map((request: ArchivalRequest) => { + requests.filter(r => r.archival_status.equals("SUCCESS") && r.deletion_status != "SUCCESS").map((request: ArchivalRequest) => { deleteBatch(requestConfig, request) }).toList } @@ -177,7 +177,9 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { val totalArchivedRecords: Long = archivedData.count JobLogger.log(s"Deleting $totalArchivedRecords archived records only, for the year ${batchPartition.period.year} and 
week of year ${batchPartition.period.weekOfYear} from the DB ", None, Level.INFO) - archivedData.rdd.deleteFromCassandra(AppConf.getConfig("sunbird.courses.keyspace"), AppConf.getConfig("sunbird.courses.assessment.table"), keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id")) + val archivalKeyspace = requestConfig.keyspace.getOrElse(AppConf.getConfig("sunbird.courses.keyspace")) + + archivedData.rdd.deleteFromCassandra(archivalKeyspace, requestConfig.archivalTable, keyColumns = SomeColumns("course_id", "batch_id", "user_id", "content_id", "attempt_id")) val metrics = ArchivalMetrics(batchPartition, pendingWeeksOfYears = None, totalArchivedRecords = Some(totalArchivedRecords), totalDeletedRecords = Some(totalArchivedRecords)) JobLogger.log(s"Data is archived and Processing the remaining part files ", Some(metrics), Level.INFO) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala index 476b4af9f..faf8e05e5 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/BaseArchivalJob.scala @@ -67,7 +67,6 @@ trait BaseArchivalJob extends BaseReportsJob with IJob with ArchivalMetaDataStor def upload(archivedData: DataFrame, batch: Map[String,AnyRef])(implicit jobConfig: JobConfig): List[String] = { val blobConfig = jobConfig.modelParams.get("blobConfig").asInstanceOf[Map[String, AnyRef]] val reportPath: String = blobConfig.getOrElse("reportPath", "archived-data/").asInstanceOf[String] - val container = AppConf.getConfig("cloud.container.reports") val objectKey = AppConf.getConfig("course.metrics.cloud.objectKey") val fileName = archivalFormat(batch) val storageConfig = getStorageConfig(jobConfig, objectKey) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index 51f4e620b..f24852819 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -129,7 +129,7 @@ trait ArchivalMetaDataStoreJob { def updateRequest(request: ArchivalRequest): Unit = { val updateQry = s"UPDATE $requestsTable SET blob_url=?, iteration = ?, archival_date=?, completion_date=?, " + - s"archival_status=?, deletion_status=? WHERE request_id=?"; + s"archival_status=?, deletion_status=?, err_message=? 
WHERE request_id=?"; val pstmt: PreparedStatement = dbc.prepareStatement(updateQry) val blobURLs = request.blob_url.getOrElse(List()).toArray.asInstanceOf[Array[Object]]; @@ -139,7 +139,8 @@ trait ArchivalMetaDataStoreJob { pstmt.setTimestamp(4, if (request.completion_date.isDefined) new Timestamp(request.completion_date.get) else null); pstmt.setString(5, request.archival_status); pstmt.setString(6, request.deletion_status); - pstmt.setString(7, request.request_id); + pstmt.setString(7, request.err_message.getOrElse("")); + pstmt.setString(8, request.request_id); pstmt.execute() } diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index acce66c4e..89c06a90f 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -171,4 +171,87 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo archivalRequests.size should be (2) } + it should "delete the archived records based on blob files" in { + implicit val fc = new FrameworkContext() + val batchId = "batch-011" + val courseId = "do_1130928636168192001667" + + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute()(spark, fc, jobConfig) + + val batch011Results = spark.read.format("csv").option("header", "true") + .load(s"$outputLocation/${batchId}_${courseId}/2021*.csv.gz") + + batch011Results.count() should be (5) + + val cassData = spark.read.format("org.apache.spark.sql.cassandra").options(Map("table" -> "assessment_aggregator", "keyspace" -> "sunbird_courses")).load() + + cassData.filter(col("batch_id") === batchId).count() should be (5) + + val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) + archivalRequests.size should be (2) + + archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") + archivalRequests.map(ar => ar.deletion_status).toList.distinct should contain allElementsOf List(null) + + val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + + val delJobConfig = JSONUtils.deserialize[JobConfig](delStrConfig) + + AssessmentArchivalJob.execute()(spark, fc, delJobConfig) + + val delCassData = 
spark.read.format("org.apache.spark.sql.cassandra").options(Map("table" -> "assessment_aggregator", "keyspace" -> "sunbird_courses")).load() + + delCassData.filter(col("batch_id") === batchId).count() should be (0) + + val deletionRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) + deletionRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") + deletionRequests.map(ar => ar.deletion_status).toList.distinct should contain allElementsOf List("SUCCESS") + } + + it should "not delete the records the if the blob file is not available" in { + implicit val fc = new FrameworkContext() + val batchId = "batch-011" + val courseId = "do_1130928636168192001667" + + // Week 48 records are processed will not be processed for archival again + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") + + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute()(spark, fc, jobConfig) + + val batch011Results = spark.read.format("csv").option("header", "true") + .load(s"$outputLocation/${batchId}_${courseId}/2021*.csv.gz") + + batch011Results.count() should be (3) + + val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + + val delJobConfig = JSONUtils.deserialize[JobConfig](delStrConfig) + + AssessmentArchivalJob.execute()(spark, fc, delJobConfig) + + val delCassData = spark.read.format("org.apache.spark.sql.cassandra").options(Map("table" -> "assessment_aggregator", "keyspace" -> "sunbird_courses")).load() + + delCassData.filter(col("batch_id") === batchId).count() should be (2) + + val skippedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 48)) + + skippedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") + skippedRequest.archival_status should be ("SUCCESS") + skippedRequest.deletion_status should be ("FAILED") + skippedRequest.err_message.get should include("Path does not exist") + + val deletionRequest = 
AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 49)) + + deletionRequest.request_id should be ("F08614119F64BC55B14CBE49B10B6730") + deletionRequest.archival_status should be ("SUCCESS") + deletionRequest.deletion_status should be ("SUCCESS") + + } + } From 466584e3bc3bf21dbdcc1f923b0c1cd41f38bcd4 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Wed, 29 Dec 2021 12:37:46 +0530 Subject: [PATCH 25/32] Issue #SB-27408 | Testcases fixes added --- .../analytics/archival/TestAsssessmentArchivalJob.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 89c06a90f..c23f997e1 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -16,23 +16,23 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo override def beforeAll(): Unit = { spark = getSparkSession(); super.beforeAll() - EmbeddedCassandra.loadData("src/test/resources/assessment-archival/data.cql") // Load test data in embedded cassandra server EmbeddedPostgresql.start() EmbeddedPostgresql.createArchivalRequestTable() } override def afterEach(): Unit = { super.afterEach() + EmbeddedCassandra.close() EmbeddedPostgresql.execute(s"TRUNCATE archival_metadata") } override def beforeEach(): Unit = { + EmbeddedCassandra.loadData("src/test/resources/assessment-archival/data.cql") // Load test data in embedded cassandra server new HadoopFileUtil().delete(spark.sparkContext.hadoopConfiguration, outputLocation) } override def afterAll() : Unit = { super.afterAll() - EmbeddedCassandra.close() EmbeddedPostgresql.close() new HadoopFileUtil().delete(spark.sparkContext.hadoopConfiguration, outputLocation) spark.close() From 200269453aa18c614e39e2cdffa84e234f9e3892 Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 3 Jan 2022 11:01:32 +0530 Subject: [PATCH 26/32] Issue #SB-27408 | Reqest ID changes added --- .../archival/AssessmentArchivalJob.scala | 2 +- .../util/ArchivalMetaDataStoreJob.scala | 10 +++--- .../archival/TestAsssessmentArchivalJob.scala | 32 +++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index 67ea84035..babd970b5 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -125,7 +125,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { ).withColumn("last_attempted_on", tsToLongUdf(col("last_attempted_on"))) .withColumn("updated_on", tsToLongUdf(col("updated_on"))) .select(columnWithOrder.head, columnWithOrder.tail: _*) - var archivalRequest:ArchivalRequest = getRequest(batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) + var archivalRequest:ArchivalRequest = getRequest(jobId, batch.collectionId, batch.batchId, List(batch.period.year, batch.period.weekOfYear)) if (archivalRequest == null) { val request_data = JSONUtils.deserialize[Map[String, AnyRef]](JSONUtils.serialize(requestConfig)) ++ Map[String, Int]( 
diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala index f24852819..5aecb42bf 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala @@ -47,13 +47,13 @@ trait ArchivalMetaDataStoreJob { requests } - def getRequestID(collectionId: String, batchId: String, partitionCols: List[Int]): String = { - val requestComb = s"$collectionId:$batchId:" + partitionCols.mkString(":") + def getRequestID(jobId: String, collectionId: String, batchId: String, partitionCols: List[Int]): String = { + val requestComb = s"$jobId:$collectionId:$batchId:" + partitionCols.mkString(":") MessageDigest.getInstance("MD5").digest(requestComb.getBytes).map("%02X".format(_)).mkString } - def getRequest(collectionId: String, batchId: String, partitionCols: List[Int]): ArchivalRequest = { - val requestId = getRequestID(collectionId, batchId, partitionCols) + def getRequest(jobId: String, collectionId: String, batchId: String, partitionCols: List[Int]): ArchivalRequest = { + val requestId = getRequestID(jobId, collectionId, batchId, partitionCols) val archivalRequest = s"""select * from $requestsTable where request_id = '$requestId' limit 1""" val pstmt: PreparedStatement = dbc.prepareStatement(archivalRequest); val resultSet = pstmt.executeQuery() @@ -100,7 +100,7 @@ trait ArchivalMetaDataStoreJob { s"deletion_status, blob_url, iteration, request_data, err_message) VALUES (?,?,?,?,?,?,?,?,?,?,?,?::json,?)" val pstmt: PreparedStatement = dbc.prepareStatement(insertQry); val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data) - val requestId = getRequestID(request.collection_id, request.batch_id, List(request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int])) + val requestId = getRequestID(request.job_id, request.collection_id, request.batch_id, List(request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int])) pstmt.setString(1, requestId); pstmt.setString(2, request.batch_id); pstmt.setString(3, request.collection_id); diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index c23f997e1..55004dfed 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -43,7 +43,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val batchId = "batch-011" val courseId = "do_1130928636168192001667" - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= 
"""{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() @@ -88,7 +88,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) - archivalRequests.map(ar => ar.request_id).toList should contain allElementsOf List("F08614119F64BC55B14CBE49B10B6730", "949887DE6364A07AE1BB5A04504368F9") + archivalRequests.map(ar => ar.request_id).toList should contain allElementsOf List("2A04B5AF40E2E249EBB63530F19656F7", "AC0F439E287263DB49D54004DAA4644B") archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011") archivalRequests.map(ar => ar.collection_id).toList.distinct should contain allElementsOf List("do_1130928636168192001667") archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") @@ -102,9 +102,9 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val batchId = "batch-011" val courseId = "do_1130928636168192001667" - EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'FAILED', null, 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") + EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('2A04B5AF40E2E249EBB63530F19656F7', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'FAILED', null, 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": 
"do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() @@ -126,9 +126,9 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId)) archivalRequests.size should be (2) - val failedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 48)) + val failedRequest = AssessmentArchivalJob.getRequest(AssessmentArchivalJob.jobId, "do_1130928636168192001667", batchId, List(2021, 48)) - failedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") + failedRequest.request_id should be ("AC0F439E287263DB49D54004DAA4644B") failedRequest.archival_status should be ("SUCCESS") failedRequest.blob_url.get.head should include (s"src/test/resources/reports/assessment-archived-data/${batchId}_${courseId}/2021") } @@ -140,7 +140,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute() @@ -176,7 +176,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val batchId = "batch-011" val courseId = "do_1130928636168192001667" - val strConfig= 
"""{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute()(spark, fc, jobConfig) @@ -196,7 +196,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS") archivalRequests.map(ar => ar.deletion_status).toList.distinct should contain allElementsOf List(null) - val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" val delJobConfig = JSONUtils.deserialize[JobConfig](delStrConfig) @@ -217,9 +217,9 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo val courseId = "do_1130928636168192001667" // Week 48 records are processed will not be processed for archival again - EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('949887DE6364A07AE1BB5A04504368F9', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") + 
EmbeddedPostgresql.execute("INSERT INTO archival_metadata (request_id, batch_id, collection_id , resource_type , job_id , archival_date, completion_date, archival_status, blob_url, iteration,request_data , err_message ) VALUES ('AC0F439E287263DB49D54004DAA4644B', 'batch-011', 'do_1130928636168192001667', 'assessment', 'assessment-archival','2021-12-09 05:58:18.666', null,'SUCCESS', '{\"reports/assessment-archival/batch-011/2021-48.csv.gz\"}', 1,'{\"batchId\": \"batch-011\", \"week\": 48, \"year\": 2021}', NULL);") - val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) AssessmentArchivalJob.execute()(spark, fc, jobConfig) @@ -229,7 +229,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo batch011Results.count() should be (3) - val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + val delStrConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"delete","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","collectionId": "do_1130928636168192001667","date":"2021-11-01"},"blobConfig":{"store":"local","blobExt":"csv.gz","reportPath":"src/test/resources/reports/assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" val delJobConfig = JSONUtils.deserialize[JobConfig](delStrConfig) @@ -239,16 +239,16 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo delCassData.filter(col("batch_id") === batchId).count() should be (2) - val skippedRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 48)) + val skippedRequest = AssessmentArchivalJob.getRequest(AssessmentArchivalJob.jobId, "do_1130928636168192001667", 
batchId, List(2021, 48)) - skippedRequest.request_id should be ("949887DE6364A07AE1BB5A04504368F9") + skippedRequest.request_id should be ("AC0F439E287263DB49D54004DAA4644B") skippedRequest.archival_status should be ("SUCCESS") skippedRequest.deletion_status should be ("FAILED") skippedRequest.err_message.get should include("Path does not exist") - val deletionRequest = AssessmentArchivalJob.getRequest("do_1130928636168192001667", batchId, List(2021, 49)) + val deletionRequest = AssessmentArchivalJob.getRequest(AssessmentArchivalJob.jobId, "do_1130928636168192001667", batchId, List(2021, 49)) - deletionRequest.request_id should be ("F08614119F64BC55B14CBE49B10B6730") + deletionRequest.request_id should be ("2A04B5AF40E2E249EBB63530F19656F7") deletionRequest.archival_status should be ("SUCCESS") deletionRequest.deletion_status should be ("SUCCESS") From c456bf3b7adaae424b9a9084be1b56aee7afd27b Mon Sep 17 00:00:00 2001 From: kumarks1122 Date: Mon, 3 Jan 2022 11:50:12 +0530 Subject: [PATCH 27/32] Issue #SB-27408 | validation changes added --- .../archival/AssessmentArchivalJob.scala | 3 +-- .../archival/TestAsssessmentArchivalJob.scala | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala index babd970b5..97b3c8909 100644 --- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala +++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala @@ -92,8 +92,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob { val batch = batchFilters.get.toDF() data.join(batch, Seq("batch_id"), "inner") } else { - JobLogger.log("Neither batchId nor batchFilters present", None, Level.INFO) - data + throw new Exception("Either batchId or batchFilters should present") } if (searchFilter.isDefined) { val res = searchContent(searchFilter.get) diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala index 55004dfed..1e38edd85 100644 --- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala +++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala @@ -254,4 +254,19 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo } + it should "not create archival requests if request is invalid" in { + implicit val fc = new FrameworkContext() + val batchId = "batch-011" + val courseId = "do_1130928636168192001667" + + // Collection ID should be present to be valid request + val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}""" + implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig) + + AssessmentArchivalJob.execute() + + val archivalRequests = 
AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, None)
+    archivalRequests.size should be (0)
+  }
+}

From a0f555c1b32284f3d1468d195ff2c9f009a1ec61 Mon Sep 17 00:00:00 2001
From: kumarks1122
Date: Mon, 3 Jan 2022 12:24:32 +0530
Subject: [PATCH 28/32] Issue #SB-27408 | validation changes added

---
 .../archival/AssessmentArchivalJob.scala | 29 +++++++------------
 .../archival/TestAsssessmentArchivalJob.scala | 2 --
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
index 97b3c8909..0c76ec68e 100644
--- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
+++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
@@ -65,7 +65,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
     val date: String = requestConfig.date.getOrElse(null)
 
     var data = loadData(Map("table" -> requestConfig.archivalTable, "keyspace" -> archivalKeyspace, "cluster" -> "LMSCluster"), cassandraUrl, new StructType())
-    data = validateBatch(data, requestConfig.batchId, requestConfig.collectionId, requestConfig.batchFilters, requestConfig.query)
+    data = validateBatches(data, requestConfig)
 
     val dataDF = generatePeriodInData(data)
     val filteredDF = dataFilter(requests, dataDF)
@@ -82,20 +82,20 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
     }
   }
 
-  def validateBatch(data: DataFrame,batchid: Option[String], collectionid: Option[String], batchFilters: Option[List[String]], searchFilter: Option[Map[String, AnyRef]])(implicit spark: SparkSession, fc: FrameworkContext): DataFrame ={
+  def validateBatches(data: DataFrame, requestConfig: Request)(implicit spark: SparkSession, fc: FrameworkContext): DataFrame ={
     implicit val sqlContext = new SQLContext(spark.sparkContext)
     import sqlContext.implicits._
 
-    val filteredDF = if(batchid.isDefined && collectionid.isDefined) {
-      data.filter(col("batch_id") === batchid.get && col("course_id") === collectionid.get).persist()
-    } else if (batchFilters.isDefined) {
-      val batch = batchFilters.get.toDF()
+    val filteredDF = if(requestConfig.batchId.isDefined && requestConfig.collectionId.isDefined) {
+      data.filter(col("batch_id") === requestConfig.batchId.get && col("course_id") === requestConfig.collectionId.get).persist()
+    } else if (requestConfig.batchFilters.isDefined) {
+      val batch = requestConfig.batchFilters.get.toDF()
       data.join(batch, Seq("batch_id"), "inner")
     } else {
       throw new Exception("Either batchId or batchFilters should present")
     }
-    if (searchFilter.isDefined) {
-      val res = searchContent(searchFilter.get)
+    if (requestConfig.query.isDefined) {
+      val res = searchContent(requestConfig.query.get)
       filteredDF.join(res, col("course_id") === col("identifier"), "inner")
     } else filteredDF
 
@@ -150,15 +150,8 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
     }).toList
   }
 
-  def loadArchivedData(batch: BatchPartition)(implicit spark: SparkSession, fc: FrameworkContext, jobConfig: JobConfig): DataFrame = {
-    val blobConfig = jobConfig.modelParams.get("blobConfig").asInstanceOf[Map[String, AnyRef]]
-
-    val store = blobConfig("store").asInstanceOf[String]
-    val format:String = blobConfig.getOrElse("blobExt", "csv.gz").asInstanceOf[String]
-    val filePath = blobConfig.getOrElse("reportPath", "assessment-archived-data/").asInstanceOf[String]
-    val container = blobConfig.getOrElse("container", "reports").asInstanceOf[String]
-
-    ExhaustUtil.getArchivedData(store, filePath, container, Map("batchId" -> batch.batchId, "collectionId"-> batch.collectionId, "year" -> batch.period.year, "weekNum" -> batch.period.weekOfYear), Option(format))
+  def loadArchivedData(request: ArchivalRequest)(implicit spark: SparkSession, fc: FrameworkContext, jobConfig: JobConfig): DataFrame = {
+    ExhaustUtil.fetch(request.blob_url.get.head, "csv")
   }
 
   override def deleteArchivedData(requestConfig: Request, requests: Array[ArchivalRequest])(implicit spark: SparkSession, fc: FrameworkContext, config: JobConfig): List[ArchivalRequest] = {
@@ -171,7 +164,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
     try {
       val request_data = JSONUtils.deserialize[Map[String, AnyRef]](request.request_data)
       val batchPartition = BatchPartition(request.collection_id, request.batch_id, Period(request_data("year").asInstanceOf[Int], request_data("week").asInstanceOf[Int]))
-      val archivedData = loadArchivedData(batchPartition).select("course_id", "batch_id", "user_id", "content_id", "attempt_id")
+      val archivedData = loadArchivedData(request).select("course_id", "batch_id", "user_id", "content_id", "attempt_id")
 
       val totalArchivedRecords: Long = archivedData.count
       JobLogger.log(s"Deleting $totalArchivedRecords archived records only, for the year ${batchPartition.period.year} and week of year ${batchPartition.period.weekOfYear} from the DB ", None, Level.INFO)
diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
index 1e38edd85..fcbb4d44b 100644
--- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
+++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
@@ -256,8 +256,6 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
 
   it should "not create archival requests if request is invalid" in {
     implicit val fc = new FrameworkContext()
 
-    val batchId = "batch-011"
-    val courseId = "do_1130928636168192001667"
     // Collection ID should be present to be valid request
     val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}"""

From bc75436f85df944c0f83e08960df731c266f3af2 Mon Sep 17 00:00:00 2001
From: kumarks1122
Date: Mon, 3 Jan 2022 12:41:21 +0530
Subject: [PATCH 29/32] Issue #SB-27408 | validation testcases added

---
 .../archival/AssessmentArchivalJob.scala | 2 +-
 .../archival/TestAsssessmentArchivalJob.scala | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
index 0c76ec68e..cce737b64 100644
--- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
+++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
@@ -89,7 +89,7 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
     val filteredDF = if(requestConfig.batchId.isDefined && requestConfig.collectionId.isDefined) {
       data.filter(col("batch_id") === requestConfig.batchId.get && col("course_id") === requestConfig.collectionId.get).persist()
     } else if (requestConfig.batchFilters.isDefined) {
-      val batch = requestConfig.batchFilters.get.toDF()
+      val batch = requestConfig.batchFilters.get.toDF("batch_id")
       data.join(batch, Seq("batch_id"), "inner")
     } else {
       throw new Exception("Either batchId or batchFilters should present")
diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
index fcbb4d44b..ff9a562ba 100644
--- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
+++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
@@ -85,7 +85,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
     val user3Result = batch011Results.filter(col("user_id") === "user-003")
     user3Result.count() should be (2)
 
-    val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, Option(batchId))
+    val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, None)
     archivalRequests.size should be (2)
 
     archivalRequests.map(ar => ar.request_id).toList should contain allElementsOf List("2A04B5AF40E2E249EBB63530F19656F7", "AC0F439E287263DB49D54004DAA4644B")
@@ -97,6 +97,22 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
     archivalRequests.map(ar => ar.err_message.get).toList.distinct should contain allElementsOf List("")
   }
 
+  it should "archive the multiple batches which is not archived in past" in {
+    implicit val fc = new FrameworkContext()
+    val batchId = "batch-011"
+    val courseId = "do_1130928636168192001667"
+
+    val strConfig = """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchFilters":["batch-011", "batch-021"],"date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}"""
+    implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig)
+
+    AssessmentArchivalJob.execute()
+
+    val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, None)
+    archivalRequests.size should be (3)
+
+    archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011", "batch-021")
+  }
+
   it should "archive the batch which is failed to archive in past" in {
     implicit val fc = new FrameworkContext()
     val batchId = "batch-011"

From 8ff5c20aa80a813130d8bce0c6750eda60754964 Mon Sep 17 00:00:00 2001
From: kumarks1122
Date: Tue, 4 Jan 2022 15:10:54 +0530
Subject: [PATCH 30/32] Issue #SB-27408 | All batches archival function changes added

---
 .../analytics/archival/AssessmentArchivalJob.scala | 3 ++-
 .../archival/TestAsssessmentArchivalJob.scala | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
index cce737b64..8f2a003d3 100644
--- a/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
+++ b/data-products/src/main/scala/org/sunbird/analytics/archival/AssessmentArchivalJob.scala
@@ -92,7 +92,8 @@ object AssessmentArchivalJob extends optional.Application with BaseArchivalJob {
       val batch = requestConfig.batchFilters.get.toDF("batch_id")
       data.join(batch, Seq("batch_id"), "inner")
     } else {
-      throw new Exception("Either batchId or batchFilters should present")
+      JobLogger.log("Neither batchId nor batchFilters present", None, Level.INFO)
+      data
     }
     if (requestConfig.query.isDefined) {
       val res = searchContent(requestConfig.query.get)
diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
index ff9a562ba..e5c2cf280 100644
--- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
+++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
@@ -270,17 +270,20 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
 
   }
 
-  it should "not create archival requests if request is invalid" in {
+  it should "Archiva all batches if neither batchid nor batchfilters present" in {
     implicit val fc = new FrameworkContext()
 
-    // Collection ID should be present to be valid request
-    val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","batchId":"batch-011","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}"""
+    val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}"""
     implicit val jobConfig = JSONUtils.deserialize[JobConfig](strConfig)
 
     AssessmentArchivalJob.execute()
 
     val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, None)
-    archivalRequests.size should be (0)
+    archivalRequests.size should be (4)
+
+    archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS")
+
+    archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011", "batch-021", "batch-031")
   }
 
 }

From f41099ada93ca5e9d844baeb85cd6044e4edf2f3 Mon Sep 17 00:00:00 2001
From: kumarks1122
Date: Tue, 4 Jan 2022 15:52:54 +0530
Subject: [PATCH 31/32] Issue #SB-27408 | Testcases fixes added

---
 .../sunbird/analytics/archival/TestAsssessmentArchivalJob.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
index e5c2cf280..dce8333ab 100644
--- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
+++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
@@ -279,7 +279,6 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
     AssessmentArchivalJob.execute()
 
     val archivalRequests = AssessmentArchivalJob.getRequests(AssessmentArchivalJob.jobId, None)
-    archivalRequests.size should be (4)
     archivalRequests.map(ar => ar.archival_status).toList.distinct should contain allElementsOf List("SUCCESS")
     archivalRequests.map(ar => ar.batch_id).toList.distinct should contain allElementsOf List("batch-011", "batch-021", "batch-031")
   }

From b298133d0777b77968904012771f2c3df470afde Mon Sep 17 00:00:00 2001
From: kumarks1122
Date: Tue, 4 Jan 2022 16:44:47 +0530
Subject: [PATCH 32/32] Issue #SB-27408 | PR Review changes added

---
 .../analytics/archival/util/ArchivalMetaDataStoreJob.scala | 2 +-
 .../sunbird/analytics/archival/TestAsssessmentArchivalJob.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala
index 5aecb42bf..d20b44a38 100644
--- a/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala
+++ b/data-products/src/main/scala/org/sunbird/analytics/archival/util/ArchivalMetaDataStoreJob.scala
@@ -42,7 +42,7 @@ trait ArchivalMetaDataStoreJob {
       if (filteredArchivalConfig.count() > 0) filteredArchivalConfig else archivalConfigsDf
     } else archivalConfigsDf
 
-    JobLogger.log("fetched records count" + filteredReportConfigDf.count(), None, INFO)
+    JobLogger.log("fetched records count: " + filteredReportConfigDf.count(), None, INFO)
     val requests = filteredReportConfigDf.as[ArchivalRequest](encoder).collect()
     requests
   }
diff --git a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
index dce8333ab..80269d03e 100644
--- a/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
+++ b/data-products/src/test/scala/org/sunbird/analytics/archival/TestAsssessmentArchivalJob.scala
@@ -270,7 +270,7 @@ class TestAsssessmentArchivalJob extends BaseSpec with MockFactory with BaseRepo
 
   }
 
-  it should "Archiva all batches if neither batchid nor batchfilters present" in {
+  it should "Archive all batches if neither batchid nor batchfilters present" in {
     implicit val fc = new FrameworkContext()
 
     val strConfig= """{"search":{"type":"none"},"model":"org.sunbird.analytics.job.report.$job_name","modelParams":{"mode":"archival","request":{"archivalTable":"assessment_aggregator","date":"2021-11-01"},"blobConfig":{"store":"azure","blobExt":"csv.gz","reportPath":"assessment-archived-data/","container":"reports"},"sparkCassandraConnectionHost":"{{ core_cassandra_host }}","fromDate":"$(date --date yesterday '+%Y-%m-%d')","toDate":"$(date --date yesterday '+%Y-%m-%d')"},"parallelization":8,"appName":"$job_name"}"""