Skip to content

Commit 081318d

Browse files
authored
[WX-1835] Scheduled logging for list of groups experiencing quota exhaustion (#7539)
1 parent 6f1f9e5 commit 081318d

File tree

5 files changed

+62
-16
lines changed

5 files changed

+62
-16
lines changed

backend/src/main/scala/cromwell/backend/standard/GroupMetricsActor.scala

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,34 +11,63 @@ import cromwell.database.sql.SqlConverters.OffsetDateTimeToSystemTimestamp
1111
import cromwell.database.sql.tables.GroupMetricsEntry
1212

1313
import java.time.OffsetDateTime
14+
import scala.concurrent.Future
15+
import scala.concurrent.duration.FiniteDuration
1416
import scala.util.{Failure, Success}
1517

16-
class GroupMetricsActor(engineDbInterface: EngineSqlDatabase, quotaExhaustionThresholdInMins: Long)
17-
extends Actor
18+
class GroupMetricsActor(engineDbInterface: EngineSqlDatabase,
19+
quotaExhaustionThresholdInMins: Long,
20+
loggingInterval: FiniteDuration
21+
) extends Actor
1822
with ActorLogging {
1923

2024
implicit val ec: MessageDispatcher = context.system.dispatchers.lookup(Dispatcher.EngineDispatcher)
2125

26+
log.info(
27+
s"${this.getClass.getSimpleName} configured to log groups experiencing quota exhaustion at interval of ${loggingInterval.toString()}."
28+
)
29+
// initial schedule for logging exhausted groups
30+
context.system.scheduler.scheduleOnce(loggingInterval)(self ! LogQuotaExhaustedGroups)
31+
2232
override def receive: Receive = {
2333
case RecordGroupQuotaExhaustion(group) =>
2434
val groupMetricsEntry = GroupMetricsEntry(group, OffsetDateTime.now.toSystemTimestamp)
2535
engineDbInterface.recordGroupMetricsEntry(groupMetricsEntry)
2636
()
2737
case GetQuotaExhaustedGroups =>
2838
val respondTo: ActorRef = sender()
29-
30-
// for a group in the GROUP_METRICS_ENTRY table, if the 'quota_exhaustion_detected' timestamp hasn't
31-
// been updated in last X minutes it is no longer experiencing cloud quota exhaustion
32-
val currentTimestampMinusDelay = OffsetDateTime.now().minusMinutes(quotaExhaustionThresholdInMins)
33-
engineDbInterface.getQuotaExhaustedGroups(currentTimestampMinusDelay.toSystemTimestamp) onComplete {
39+
getQuotaExhaustedGroups() onComplete {
3440
case Success(quotaExhaustedGroups) => respondTo ! GetQuotaExhaustedGroupsSuccess(quotaExhaustedGroups.toList)
3541
case Failure(exception) => respondTo ! GetQuotaExhaustedGroupsFailure(exception.getMessage)
3642
}
43+
case LogQuotaExhaustedGroups =>
44+
getQuotaExhaustedGroups() onComplete {
45+
case Success(quotaExhaustedGroups) =>
46+
log.info(
47+
s"Hog groups currently experiencing quota exhaustion: ${quotaExhaustedGroups.length}. Group IDs: [${quotaExhaustedGroups.toList
48+
.mkString(", ")}]."
49+
)
50+
case Failure(exception) =>
51+
log.info(
52+
s"Something went wrong when fetching quota exhausted groups for logging. Will retry in ${loggingInterval
53+
.toString()}. Exception: ${exception.getMessage}"
54+
)
55+
}
56+
// schedule next logging
57+
context.system.scheduler.scheduleOnce(loggingInterval)(self ! LogQuotaExhaustedGroups)
58+
()
3759
case other =>
3860
log.error(
3961
s"Programmer Error: Unexpected message ${other.toPrettyElidedString(1000)} received by ${this.self.path.name}."
4062
)
4163
}
64+
65+
private def getQuotaExhaustedGroups(): Future[Seq[String]] = {
66+
// for a group in the GROUP_METRICS_ENTRY table, if the 'quota_exhaustion_detected' timestamp hasn't
67+
// been updated in the last X minutes, it is no longer experiencing cloud quota exhaustion
68+
val currentTimestampMinusDelay = OffsetDateTime.now().minusMinutes(quotaExhaustionThresholdInMins)
69+
engineDbInterface.getQuotaExhaustedGroups(currentTimestampMinusDelay.toSystemTimestamp)
70+
}
4271
}
4372

4473
object GroupMetricsActor {
@@ -47,12 +76,17 @@ object GroupMetricsActor {
4776
sealed trait GroupMetricsActorMessage
4877
case class RecordGroupQuotaExhaustion(group: String) extends GroupMetricsActorMessage
4978
case object GetQuotaExhaustedGroups extends GroupMetricsActorMessage
79+
case object LogQuotaExhaustedGroups extends GroupMetricsActorMessage
5080

5181
// Responses
5282
sealed trait GetQuotaExhaustedGroupsResponse
5383
case class GetQuotaExhaustedGroupsSuccess(quotaExhaustedGroups: List[String]) extends GetQuotaExhaustedGroupsResponse
5484
case class GetQuotaExhaustedGroupsFailure(errorMsg: String) extends GetQuotaExhaustedGroupsResponse
5585

56-
def props(engineDbInterface: EngineSqlDatabase, quotaExhaustionThresholdInMins: Long): Props =
57-
Props(new GroupMetricsActor(engineDbInterface, quotaExhaustionThresholdInMins)).withDispatcher(EngineDispatcher)
86+
def props(engineDbInterface: EngineSqlDatabase,
87+
quotaExhaustionThresholdInMins: Long,
88+
loggingInterval: FiniteDuration
89+
): Props =
90+
Props(new GroupMetricsActor(engineDbInterface, quotaExhaustionThresholdInMins, loggingInterval))
91+
.withDispatcher(EngineDispatcher)
5892
}

backend/src/test/scala/cromwell/backend/standard/GroupMetricsActorSpec.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class GroupMetricsActorSpec extends AnyFlatSpec with Matchers {
4747

4848
it should "receive new quota exhaustion message and call database function" in {
4949
val db = databaseInterface()
50-
val mockGroupMetricsActor = TestActorRef(GroupMetricsActor.props(db, 15))
50+
val mockGroupMetricsActor = TestActorRef(GroupMetricsActor.props(db, 15, 5.minutes))
5151

5252
mockGroupMetricsActor.tell(RecordGroupQuotaExhaustion(testHogGroup), TestProbe().ref)
5353

@@ -58,7 +58,7 @@ class GroupMetricsActorSpec extends AnyFlatSpec with Matchers {
5858

5959
it should "respond with groups in quota exhaustion" in {
6060
val db = databaseInterface()
61-
val mockGroupMetricsActor = TestActorRef(GroupMetricsActor.props(db, 15))
61+
val mockGroupMetricsActor = TestActorRef(GroupMetricsActor.props(db, 15, 5.minutes))
6262
val requestActor = TestProbe()
6363

6464
mockGroupMetricsActor.tell(GetQuotaExhaustedGroups, requestActor.ref)

core/src/main/resources/reference.conf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ system {
277277
# threshold (in minutes) after which a group in GROUP_METRICS_ENTRY table is no longer considered to be
278278
# actively experiencing quota exhaustion
279279
threshold-minutes = 15
280+
# interval at which the list of groups in active quota exhaustion state is logged
281+
logging-interval = 5 minutes
280282
}
281283

282284
workflow-heartbeats {

engine/src/main/scala/cromwell/server/CromwellRootActor.scala

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,14 @@ abstract class CromwellRootActor(terminator: CromwellTerminator,
210210
systemConfig.as[Option[Boolean]]("quota-exhaustion-job-start-control.enabled").getOrElse(false)
211211
private lazy val quotaExhaustionThresholdInMins: Long =
212212
systemConfig.as[Option[Long]]("quota-exhaustion-job-start-control.threshold-minutes").getOrElse(15)
213+
private lazy val quotaExhaustionLoggingInterval: FiniteDuration =
214+
systemConfig.as[Option[FiniteDuration]]("quota-exhaustion-job-start-control.logging-interval").getOrElse(5.minutes)
213215
private lazy val groupMetricsActor: ActorRef =
214216
context.actorOf(
215-
GroupMetricsActor.props(EngineServicesStore.engineDatabaseInterface, quotaExhaustionThresholdInMins)
217+
GroupMetricsActor.props(EngineServicesStore.engineDatabaseInterface,
218+
quotaExhaustionThresholdInMins,
219+
quotaExhaustionLoggingInterval
220+
)
216221
)
217222
private lazy val groupMetricsActorForJTDA: Option[ActorRef] =
218223
if (quotaExhaustionJobControlEnabled) Option(groupMetricsActor) else None

engine/src/test/scala/cromwell/engine/workflow/tokens/JobTokenDispenserActorSpec.scala

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@ package cromwell.engine.workflow.tokens
33
import akka.actor.{ActorRef, PoisonPill, Props}
44
import akka.testkit.{ImplicitSender, TestActorRef, TestProbe}
55
import cromwell.backend.standard.GroupMetricsActor
6-
import cromwell.backend.standard.GroupMetricsActor.{GetQuotaExhaustedGroups, GetQuotaExhaustedGroupsSuccess}
6+
import cromwell.backend.standard.GroupMetricsActor.{
7+
GetQuotaExhaustedGroups,
8+
GetQuotaExhaustedGroupsSuccess,
9+
LogQuotaExhaustedGroups
10+
}
711
import cromwell.core.JobToken.JobTokenType
812
import cromwell.core.{HogGroup, TestKitSuite}
913
import cromwell.engine.workflow.tokens.DynamicRateLimiter.{Rate, TokensAvailable}
@@ -572,8 +576,9 @@ object JobTokenDispenserActorSpec {
572576
val LimitedTo5Tokens: JobTokenType = limitedTokenType(5)
573577
}
574578

575-
class TestGroupMetricsActorForJTDA extends GroupMetricsActor(engineDatabaseInterface, 15) {
576-
override def receive: Receive = { case GetQuotaExhaustedGroups =>
577-
sender() ! GetQuotaExhaustedGroupsSuccess(List(quotaExhaustedHogGroup.value))
579+
class TestGroupMetricsActorForJTDA extends GroupMetricsActor(engineDatabaseInterface, 15, 10.minutes) {
580+
override def receive: Receive = {
581+
case GetQuotaExhaustedGroups => sender() ! GetQuotaExhaustedGroupsSuccess(List(quotaExhaustedHogGroup.value))
582+
case LogQuotaExhaustedGroups => ()
578583
}
579584
}

0 commit comments

Comments
 (0)