diff --git a/README.md b/README.md index b3d2ded..f8f4c5b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ An autoscaler for Jitsi instances (`jibri`, `sip-jibri`, `jigasi`, `JVB`, `nomad * as a Droplet in Digital Ocean * custom deployment model -The autoscaler manages multiple `groups` of instances, each having a `type` (`jibri`, `sip-jibri`, `jigasi`, `JVB`, `nomad`, `whisper`) and being deployed in a specific `cloud` (`oracle`, `digitalocean`, `custom`). +The autoscaler manages multiple `groups` of instances, each having a `type` (`stress` or `availability`) and being deployed in a specific `cloud` (`oracle`, `digitalocean`, `custom`). The autoscaler knows the Jitsi instances status and communicates with them via the [jitsi-autoscaler-sidecar](https://github.com/jitsi/jitsi-autoscaler-sidecar), which needs to be co-located on each Jitsi instance. The sidecar periodically checks in with the autoscaler via a REST call and sends its status. diff --git a/src/app.ts b/src/app.ts index c7c23c5..8e07410 100644 --- a/src/app.ts +++ b/src/app.ts @@ -630,7 +630,9 @@ app.put( body('options.scaleDownPeriodsCount').optional().isInt({ min: 0 }).withMessage('Value must be positive'), body('instanceType').custom(async (value) => { if (!(await validator.supportedInstanceType(value))) { - throw new Error('Instance type not supported. Use jvb, jigasi, nomad, jibri, whisper or sip-jibri instead'); + throw new Error( + 'Instance type not supported. 
Use stress, availability, jvb, jigasi, nomad, jibri, whisper or sip-jibri instead', + ); } return true; }), diff --git a/src/autoscaler.ts b/src/autoscaler.ts index 6cf4586..e4c32da 100644 --- a/src/autoscaler.ts +++ b/src/autoscaler.ts @@ -192,7 +192,8 @@ export default class AutoscaleProcessor { switch (group.type) { case 'jibri': case 'sip-jibri': - // in the jibri case only scale up if value (available count) is below threshold + case 'availability': + // in the availability case only scale up if value (available count) is below threshold return ( (count < group.scalingOptions.maxDesired && value < group.scalingOptions.scaleUpThreshold) || count < group.scalingOptions.minDesired @@ -200,12 +201,9 @@ export default class AutoscaleProcessor { case 'jigasi': case 'nomad': case 'JVB': - // in the case of JVB scale up only if value (average stress level) is above or equal to threshhold - return ( - (count < group.scalingOptions.maxDesired && value >= group.scalingOptions.scaleUpThreshold) || - count < group.scalingOptions.minDesired - ); case 'whisper': + case 'stress': + // in the case of stress scale up only if value (average stress level) is above or equal to threshold return ( (count < group.scalingOptions.maxDesired && value >= group.scalingOptions.scaleUpThreshold) || count < group.scalingOptions.minDesired @@ -218,14 +216,15 @@ export default class AutoscaleProcessor { switch (group.type) { case 'jibri': case 'sip-jibri': - // in the jibri case only scale up if value (available count) is above threshold + case 'availability': + // in the availability case only scale down if value (available count) is above threshold return count > group.scalingOptions.minDesired && value > group.scalingOptions.scaleDownThreshold; case 'jigasi': case 'nomad': - case 'JVB': - // in the case of JVB scale down only if value (average stress level) is below threshhold - return count > group.scalingOptions.minDesired && value < group.scalingOptions.scaleDownThreshold; case 'whisper': + 
case 'JVB': + case 'stress': + // in the case of stress scale down only if value (average stress level) is below threshold return count > group.scalingOptions.minDesired && value < group.scalingOptions.scaleDownThreshold; } diff --git a/src/group_report.ts b/src/group_report.ts index 8a9b1cd..339e941 100644 --- a/src/group_report.ts +++ b/src/group_report.ts @@ -4,7 +4,7 @@ import { CloudInstance } from './cloud_manager'; import ShutdownManager from './shutdown_manager'; import MetricsLoop from './metrics_loop'; import ReconfigureManager from './reconfigure_manager'; -import { InstanceGroup, InstanceState, JibriStatusState } from './instance_store'; +import { InstanceGroup, InstanceState, JibriStatusState, StressStatus } from './instance_store'; export interface InstanceReport { instanceId: string; @@ -148,6 +148,7 @@ export default class GroupReportGenerator { switch (group.type) { case 'jibri': case 'sip-jibri': + case 'availability': if (instanceReport.scaleStatus == JibriStatusState.Idle) { groupReport.availableCount++; } @@ -162,6 +163,7 @@ export default class GroupReportGenerator { case 'nomad': case 'whisper': case 'JVB': + case 'stress': // @TODO: implement JVB instance counting break; } @@ -209,57 +211,50 @@ export default class GroupReportGenerator { } else if (instanceState.status.provisioning) { instanceReport.scaleStatus = 'PROVISIONING'; } else { + let stats: StressStatus; switch (group.type) { case 'jibri': case 'sip-jibri': + case 'availability': instanceReport.scaleStatus = 'SIDECAR_RUNNING'; if (instanceState.status.jibriStatus && instanceState.status.jibriStatus.busyStatus) { instanceReport.scaleStatus = instanceState.status.jibriStatus.busyStatus.toString(); } break; case 'nomad': - // @TODO: convert nomad stats into more explict statuses - instanceReport.scaleStatus = 'SIDECAR_RUNNING'; - if (instanceState.status.nomadStatus && instanceState.status.nomadStatus.allocatedCPU > 1000) { - instanceReport.scaleStatus = 'IN USE'; - } - if ( - 
instanceState.status.jigasiStatus && - !instanceState.status.nomadStatus.eligibleForScheduling - ) { - instanceReport.scaleStatus = 'GRACEFUL SHUTDOWN'; - } - break; case 'jigasi': - // @TODO: convert Jigasi stats into more explict statuses - instanceReport.scaleStatus = 'ONLINE'; - if (instanceState.status.jigasiStatus && instanceState.status.jigasiStatus.participants) { - instanceReport.scaleStatus = 'IN USE'; - } - if (instanceState.status.jigasiStatus && instanceState.status.jigasiStatus.graceful_shutdown) { - instanceReport.scaleStatus = 'GRACEFUL SHUTDOWN'; - } - break; case 'JVB': - // @TODO: convert JVB stats into more explict statuses - instanceReport.scaleStatus = 'ONLINE'; - if (instanceState.status.jvbStatus && instanceState.status.jvbStatus.participants) { - instanceReport.scaleStatus = 'IN USE'; - } - if (instanceState.status.jvbStatus && instanceState.status.jvbStatus.graceful_shutdown) { - instanceReport.scaleStatus = 'GRACEFUL SHUTDOWN'; - } - break; case 'whisper': + case 'stress': + stats = ( + (instanceState.status.stats + ? instanceState.status.stats + : instanceState.status.jvbStatus + ? instanceState.status.jvbStatus + : instanceState.status.jigasiStatus + ? instanceState.status.jigasiStatus + : instanceState.status.whisperStatus + ? instanceState.status.whisperStatus + : instanceState.status.nomadStatus + ? 
instanceState.status.nomadStatus + : null) + ); + instanceReport.scaleStatus = 'ONLINE'; - if (instanceState.status.whisperStatus && instanceState.status.whisperStatus.connections) { - instanceReport.scaleStatus = 'IN USE'; - } - if ( - instanceState.status.whisperStatus && - instanceState.status.whisperStatus.graceful_shutdown - ) { - instanceReport.scaleStatus = 'GRACEFUL SHUTDOWN'; + if (stats) { + instanceReport.scaleStatus = 'SIDECAR_RUNNING'; + if (stats.participants) { + instanceReport.scaleStatus = 'IN USE'; + } + if (stats.connections) { + instanceReport.scaleStatus = 'IN USE'; + } + if (stats.allocatedCPU > 1000) { + instanceReport.scaleStatus = 'IN USE'; + } + if (stats.graceful_shutdown) { + instanceReport.scaleStatus = 'GRACEFUL SHUTDOWN'; + } } break; } diff --git a/src/instance_launcher.ts b/src/instance_launcher.ts index 92f40c5..a124a5c 100644 --- a/src/instance_launcher.ts +++ b/src/instance_launcher.ts @@ -6,7 +6,7 @@ import * as promClient from 'prom-client'; import ShutdownManager from './shutdown_manager'; import Audit from './audit'; import MetricsLoop from './metrics_loop'; -import { InstanceDetails, InstanceGroup, InstanceState, JibriStatusState } from './instance_store'; +import { InstanceDetails, InstanceGroup, InstanceState, JibriStatusState, StressStatus } from './instance_store'; const instancesLaunchedCounter = new promClient.Counter({ name: 'autoscaling_instance_launched_total', @@ -182,102 +182,37 @@ export default class InstanceLauncher { return true; } - getJigasisForScaleDown( - ctx: Context, - group: InstanceGroup, - unprotectedInstances: InstanceState[], - desiredScaleDownQuantity: number, - ): InstanceDetails[] { - // first sort by participant count - unprotectedInstances.sort((a, b) => { - const aParticipants = a.status.jigasiStatus ? a.status.jigasiStatus.participants : 0; - const bParticipants = b.status.jigasiStatus ? 
b.status.jigasiStatus.participants : 0; - return aParticipants - bParticipants; - }); - const actualScaleDownQuantity = Math.min(desiredScaleDownQuantity, unprotectedInstances.length); - if (actualScaleDownQuantity < desiredScaleDownQuantity) { - ctx.logger.error( - '[Launcher] Nr of Jigasi instances in group for scale down is less than desired scale down quantity', - { groupName: group.name, actualScaleDownQuantity, desiredScaleDownQuantity }, - ); - } - // Try to not scale down the running instances unless needed - // This is needed in case of scale up problems, when we should terminate the provisioning instances first - let listOfInstancesForScaleDown = this.getProvisioningOrWithoutStatusInstances(unprotectedInstances); - if (listOfInstancesForScaleDown.length < actualScaleDownQuantity) { - listOfInstancesForScaleDown = listOfInstancesForScaleDown.concat( - this.getRunningInstances(unprotectedInstances), - ); - } - - // now return first N instances, least loaded first - return listOfInstancesForScaleDown.slice(0, actualScaleDownQuantity); - } - - getNomadsForScaleDown( - ctx: Context, - group: InstanceGroup, - unprotectedInstances: InstanceState[], - desiredScaleDownQuantity: number, - ): InstanceDetails[] { - // first sort by participant count - unprotectedInstances.sort((a, b) => { - const aAllocatedCPU = a.status.nomadStatus ? a.status.nomadStatus.allocatedCPU : 0; - const bAllocatedCPU = b.status.nomadStatus ? 
b.status.nomadStatus.allocatedCPU : 0; - return aAllocatedCPU - bAllocatedCPU; - }); - const actualScaleDownQuantity = Math.min(desiredScaleDownQuantity, unprotectedInstances.length); - if (actualScaleDownQuantity < desiredScaleDownQuantity) { - ctx.logger.error( - '[Launcher] Nr of Nomad instances in group for scale down is less than desired scale down quantity', - { groupName: group.name, actualScaleDownQuantity, desiredScaleDownQuantity }, - ); - } - // Try to not scale down the running instances unless needed - // This is needed in case of scale up problems, when we should terminate the provisioning instances first - let listOfInstancesForScaleDown = this.getProvisioningOrWithoutStatusInstances(unprotectedInstances); - if (listOfInstancesForScaleDown.length < actualScaleDownQuantity) { - listOfInstancesForScaleDown = listOfInstancesForScaleDown.concat( - this.getRunningInstances(unprotectedInstances), - ); - } - - // now return first N instances, least loaded first - return listOfInstancesForScaleDown.slice(0, actualScaleDownQuantity); - } - getJVBsForScaleDown( - ctx: Context, - group: InstanceGroup, - unprotectedInstances: InstanceState[], - desiredScaleDownQuantity: number, - ): InstanceDetails[] { - // first sort by participant count - unprotectedInstances.sort((a, b) => { - const aParticipants = a.status.jvbStatus ? a.status.jvbStatus.participants : 0; - const bParticipants = b.status.jvbStatus ? 
b.status.jvbStatus.participants : 0; - return aParticipants - bParticipants; - }); - const actualScaleDownQuantity = Math.min(desiredScaleDownQuantity, unprotectedInstances.length); - if (actualScaleDownQuantity < desiredScaleDownQuantity) { - ctx.logger.error( - '[Launcher] Nr of JVB instances in group for scale down is less than desired scale down quantity', - { groupName: group.name, actualScaleDownQuantity, desiredScaleDownQuantity }, - ); - } - // Try to not scale down the running instances unless needed - // This is needed in case of scale up problems, when we should terminate the provisioning instances first - let listOfInstancesForScaleDown = this.getProvisioningOrWithoutStatusInstances(unprotectedInstances); - if (listOfInstancesForScaleDown.length < actualScaleDownQuantity) { - listOfInstancesForScaleDown = listOfInstancesForScaleDown.concat( - this.getRunningInstances(unprotectedInstances), - ); - } + getStatusMetricForScaleDown(state: InstanceState): number { + // pull stats from the first available status + const stats: StressStatus = ( + (state.status.stats + ? state.status.stats + : state.status.jvbStatus + ? state.status.jvbStatus + : state.status.jigasiStatus + ? state.status.jigasiStatus + : state.status.nomadStatus + ? state.status.nomadStatus + : state.status.whisperStatus + ? state.status.whisperStatus + : null) + ); - // now return first N instances, least loaded first - return listOfInstancesForScaleDown.slice(0, actualScaleDownQuantity); + // use specific provided values or fall back on stress level + return stats + ? stats.participants !== undefined + ? stats.participants + : stats.allocatedCPU !== undefined + ? stats.allocatedCPU + : stats.connections !== undefined + ? stats.connections + : stats.stress_level !== undefined + ? 
stats.stress_level + : 0 + : 0; } - getWhisperForScaleDown( + getInstancesForScaleDownByStress( ctx: Context, group: InstanceGroup, unprotectedInstances: InstanceState[], @@ -285,14 +220,12 @@ export default class InstanceLauncher { ): InstanceDetails[] { // first sort by participant count unprotectedInstances.sort((a, b) => { - const aConnections = a.status.whisperStatus ? a.status.whisperStatus.connections : 0; - const bConnections = b.status.whisperStatus ? b.status.whisperStatus.connections : 0; - return aConnections - bConnections; + return this.getStatusMetricForScaleDown(a) - this.getStatusMetricForScaleDown(b); }); const actualScaleDownQuantity = Math.min(desiredScaleDownQuantity, unprotectedInstances.length); if (actualScaleDownQuantity < desiredScaleDownQuantity) { ctx.logger.error( - '[Launcher] Nr of whisper instances in group for scale down is less than desired scale down quantity', + '[Launcher] Nr of instances in group for scale down is less than desired scale down quantity', { groupName: group.name, actualScaleDownQuantity, desiredScaleDownQuantity }, ); } @@ -309,7 +242,7 @@ export default class InstanceLauncher { return listOfInstancesForScaleDown.slice(0, actualScaleDownQuantity); } - getJibrisForScaleDown( + getInstancesForScaleDownByAvailability( ctx: Context, group: InstanceGroup, unprotectedInstances: InstanceState[], @@ -366,15 +299,8 @@ export default class InstanceLauncher { switch (group.type) { case 'jibri': case 'sip-jibri': - listOfInstancesForScaleDown = this.getJibrisForScaleDown( - ctx, - group, - unprotectedInstances, - desiredScaleDownQuantity, - ); - break; - case 'jigasi': - listOfInstancesForScaleDown = this.getJigasisForScaleDown( + case 'availability': + listOfInstancesForScaleDown = this.getInstancesForScaleDownByAvailability( ctx, group, unprotectedInstances, @@ -382,23 +308,11 @@ export default class InstanceLauncher { ); break; case 'nomad': - listOfInstancesForScaleDown = this.getNomadsForScaleDown( - ctx, - group, - 
unprotectedInstances, - desiredScaleDownQuantity, - ); - break; + case 'jigasi': case 'JVB': - listOfInstancesForScaleDown = this.getJVBsForScaleDown( - ctx, - group, - unprotectedInstances, - desiredScaleDownQuantity, - ); - break; case 'whisper': - listOfInstancesForScaleDown = this.getWhisperForScaleDown( + case 'stress': + listOfInstancesForScaleDown = this.getInstancesForScaleDownByStress( ctx, group, unprotectedInstances, @@ -428,7 +342,8 @@ export default class InstanceLauncher { private getProvisioningOrWithoutStatusInstances(instanceStates: InstanceState[]): InstanceDetails[] { const states = instanceStates.filter((instanceState) => { return ( - (!instanceState.status.jibriStatus && + (!instanceState.status.stats && + !instanceState.status.jibriStatus && !instanceState.status.jvbStatus && !instanceState.status.jigasiStatus && !instanceState.status.nomadStatus) || @@ -441,7 +356,8 @@ export default class InstanceLauncher { private getRunningInstances(instanceStates: InstanceState[]): InstanceDetails[] { const states = instanceStates.filter((instanceState) => { return ( - (instanceState.status.jibriStatus || + (instanceState.status.stats || + instanceState.status.jibriStatus || instanceState.status.jvbStatus || instanceState.status.jigasiStatus || instanceState.status.nomadStatus) && diff --git a/src/instance_store.ts b/src/instance_store.ts index 8ef956e..ac344ed 100644 --- a/src/instance_store.ts +++ b/src/instance_store.ts @@ -40,6 +40,7 @@ export interface InstanceGroupTags { export interface NomadStatus { stress_level: number; + graceful_shutdown?: boolean; totalCPU: number; eligibleForScheduling: boolean; allocatedCPU: number; @@ -100,6 +101,14 @@ export interface JibriHealth { healthStatus: JibriHealthState; } +export interface StressStatus { + stress_level: number; + graceful_shutdown: boolean; + participants?: number; + connections?: number; + allocatedCPU?: number; +} + export interface JVBStatus { stress_level: number; muc_clients_configured: 
number; @@ -119,6 +128,7 @@ export interface WhisperStatus { export interface InstanceStatus { provisioning: boolean; jibriStatus?: JibriStatus; + stats?: StressStatus; jvbStatus?: JVBStatus; jigasiStatus?: JigasiStatus; nomadStatus?: NomadStatus; diff --git a/src/instance_tracker.ts b/src/instance_tracker.ts index 4b24cf4..795d5a1 100644 --- a/src/instance_tracker.ts +++ b/src/instance_tracker.ts @@ -9,10 +9,8 @@ import InstanceStore, { InstanceState, JibriStatus, JibriStatusState, - JigasiStatus, - JVBStatus, NomadStatus, - WhisperStatus, + StressStatus, } from './instance_store'; /* eslint-disable */ @@ -105,20 +103,20 @@ export class InstanceTracker { switch (report.instance.instanceType) { case 'jibri': case 'sip-jibri': + case 'availability': jibriStatusReport = report.stats; instanceState.status.jibriStatus = jibriStatusReport.status; break; case 'jigasi': - instanceState.status.jigasiStatus = report.stats; - break; - case 'nomad': - instanceState.status.nomadStatus = this.nomadStatusFromStats(report.stats); - break; case 'JVB': - instanceState.status.jvbStatus = report.stats; - break; case 'whisper': - instanceState.status.whisperStatus = report.stats; + case 'stress': + instanceState.status.stats = report.stats; + break; + case 'nomad': + instanceState.status.stats = ( + this.nomadStatusFromStats(report.stats) + ); break; } } @@ -148,6 +146,7 @@ export class InstanceTracker { return { totalCPU, stress_level: nomadStats['nomad.client.allocated.cpu'] / totalCPU, + graceful_shutdown: nomadLabels['node_scheduling_eligibility'] != 'eligible', eligibleForScheduling: nomadLabels['node_scheduling_eligibility'] == 'eligible', allocatedCPU: nomadStats['nomad.client.allocated.cpu'], allocatedMemory: nomadStats['nomad.client.allocated.memory'], @@ -186,35 +185,15 @@ export class InstanceTracker { // If Jibri is not up, the available metric is tracked with value 0 break; case 'jigasi': - if (!state.status.jigasiStatus) { - // If Jigasi is not up or is in graceful 
shutdown, we should not use it to compute average stress level across the group - trackMetric = false; - } else if (state.status.jigasiStatus.stress_level) { - metricValue = state.status.jigasiStatus.stress_level; - } - break; case 'nomad': - if (!state.status.nomadStatus) { - // If nomad node is not up or is in graceful shutdown, we should not use it to compute average stress level across the group - trackMetric = false; - } else if (state.status.nomadStatus.stress_level) { - metricValue = state.status.nomadStatus.stress_level; - } - break; case 'JVB': - if (!state.status.jvbStatus) { - // If JVB is not up or is in graceful shutdown, we should not use it to compute average stress level across the group - trackMetric = false; - } else if (state.status.jvbStatus.stress_level) { - metricValue = state.status.jvbStatus.stress_level; - } - break; case 'whisper': - if (!state.status.whisperStatus) { - // If whisper is not up or is in graceful shutdown, we should not use it to compute average stress level across the group + case 'stress': + // If node is not up or is in graceful shutdown, we should not use it to compute average stress level across the group + if (!state.status.stats || state.status.stats.stress_level == undefined) { trackMetric = false; - } else if (state.status.whisperStatus.stress_level) { - metricValue = state.status.whisperStatus.stress_level; + } else { + metricValue = state.status.stats.stress_level; } break; } @@ -243,12 +222,13 @@ export class InstanceTracker { switch (group.type) { case 'jibri': case 'sip-jibri': + case 'availability': return this.getAvailableMetricPerPeriod(ctx, metricInventoryPerPeriod, periodCount); case 'nomad': case 'jigasi': case 'JVB': - return this.getAverageMetricPerPeriod(ctx, metricInventoryPerPeriod, periodCount); case 'whisper': + case 'stress': return this.getAverageMetricPerPeriod(ctx, metricInventoryPerPeriod, periodCount); } return; @@ -452,7 +432,8 @@ export class InstanceTracker { // check whether jigasi, JVB 
or whisper reports graceful shutdown, treat as if sidecar has acknowledge shutdown command if ( state.status && - ((state.status.jvbStatus && state.status.jvbStatus.graceful_shutdown) || + ((state.status.stats && state.status.stats.graceful_shutdown) || + (state.status.jvbStatus && state.status.jvbStatus.graceful_shutdown) || (state.status.jigasiStatus && state.status.jigasiStatus.graceful_shutdown) || (state.status.whisperStatus && state.status.whisperStatus.graceful_shutdown) || (state.status.nomadStatus && !state.status.nomadStatus.eligibleForScheduling))