Skip to content

Commit

Permalink
fix: Missing duration metrics if controller restart (#6815)
Browse files Browse the repository at this point in the history
Signed-off-by: Saravanan Balasubramanian <[email protected]>
  • Loading branch information
sarabala1979 committed Sep 28, 2021
1 parent 1fe754e commit 78cd691
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 6 deletions.
13 changes: 7 additions & 6 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,13 @@ func (woc *wfOperationCtx) operate(ctx context.Context) {
woc.preExecutionNodePhases[node.ID] = node.Phase
}

if woc.execWf.Spec.Metrics != nil {
realTimeScope := map[string]func() float64{common.GlobalVarWorkflowDuration: func() float64 {
return time.Since(woc.wf.Status.StartedAt.Time).Seconds()
}}
woc.computeMetrics(woc.execWf.Spec.Metrics.Prometheus, woc.globalParams, realTimeScope, true)
}

if woc.wf.Status.Phase == wfv1.WorkflowUnknown {
woc.markWorkflowRunning(ctx)
err := woc.createPDBResource(ctx)
Expand All @@ -266,12 +273,6 @@ func (woc *wfOperationCtx) operate(ctx context.Context) {
woc.requeueAfter(time.Until(*woc.workflowDeadline))
}

if woc.execWf.Spec.Metrics != nil {
realTimeScope := map[string]func() float64{common.GlobalVarWorkflowDuration: func() float64 {
return time.Since(woc.wf.Status.StartedAt.Time).Seconds()
}}
woc.computeMetrics(woc.execWf.Spec.Metrics.Prometheus, woc.globalParams, realTimeScope, true)
}
woc.wf.Status.EstimatedDuration = woc.estimateWorkflowDuration()
} else {
woc.workflowDeadline = woc.getWorkflowDeadline()
Expand Down
148 changes: 148 additions & 0 deletions workflow/controller/operator_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,3 +586,151 @@ func TestProcessedRetryNode(t *testing.T) {
assert.NoError(t, err)
assert.Contains(t, metricErrorCounterString, `value:1`)
}

// suspendWfWithMetrics is a Workflow manifest used by
// TestControllerRestartWithRunningWorkflow. Its spec declares a realtime
// Prometheus gauge named exec_duration_gauge whose value is
// {{workflow.duration}} and whose "name" label is model_a. Its status is
// already populated: the workflow phase is Running with a startedAt
// timestamp, the "build" step has Succeeded, and the "approve" suspend node
// is still Running.
var suspendWfWithMetrics = `apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
name: suspend-template-qndm5
spec:
entrypoint: suspend
metrics:
prometheus:
- gauge:
realtime: true
value: '{{workflow.duration}}'
help: Duration gauge by name
labels:
- key: name
value: model_a
name: exec_duration_gauge
templates:
- name: suspend
steps:
- - name: build
template: whalesay
- - name: approve
template: approve
- - name: delay
template: delay
- - name: release
template: whalesay
- name: approve
suspend: {}
- name: delay
suspend:
duration: "20"
- container:
args:
- hello world
command:
- cowsay
image: docker/whalesay
name: ""
name: whalesay
ttlStrategy:
secondsAfterCompletion: 600
status:
conditions:
- status: "False"
type: PodRunning
finishedAt: null
nodes:
suspend-template-qndm5:
children:
- suspend-template-qndm5-343839516
displayName: suspend-template-qndm5
finishedAt: null
id: suspend-template-qndm5
name: suspend-template-qndm5
phase: Running
progress: 1/1
startedAt: "2021-09-28T12:23:10Z"
templateName: suspend
templateScope: local/suspend-template-qndm5
type: Steps
suspend-template-qndm5-343839516:
boundaryID: suspend-template-qndm5
children:
- suspend-template-qndm5-2823755246
displayName: '[0]'
finishedAt: "2021-09-28T12:23:20Z"
id: suspend-template-qndm5-343839516
name: suspend-template-qndm5[0]
phase: Succeeded
progress: 1/1
resourcesDuration:
cpu: 6
memory: 3
startedAt: "2021-09-28T12:23:10Z"
templateScope: local/suspend-template-qndm5
type: StepGroup
suspend-template-qndm5-2823755246:
boundaryID: suspend-template-qndm5
children:
- suspend-template-qndm5-3632002577
displayName: build
finishedAt: "2021-09-28T12:23:16Z"
hostNodeName: kind-control-plane
id: suspend-template-qndm5-2823755246
name: suspend-template-qndm5[0].build
outputs:
exitCode: "0"
phase: Succeeded
progress: 1/1
resourcesDuration:
cpu: 6
memory: 3
startedAt: "2021-09-28T12:23:10Z"
templateName: whalesay
templateScope: local/suspend-template-qndm5
type: Pod
suspend-template-qndm5-3456849218:
boundaryID: suspend-template-qndm5
displayName: approve
finishedAt: null
id: suspend-template-qndm5-3456849218
name: suspend-template-qndm5[1].approve
phase: Running
startedAt: "2021-09-28T12:23:20Z"
templateName: approve
templateScope: local/suspend-template-qndm5
type: Suspend
suspend-template-qndm5-3632002577:
boundaryID: suspend-template-qndm5
children:
- suspend-template-qndm5-3456849218
displayName: '[1]'
finishedAt: null
id: suspend-template-qndm5-3632002577
name: suspend-template-qndm5[1]
phase: Running
startedAt: "2021-09-28T12:23:20Z"
templateScope: local/suspend-template-qndm5
type: StepGroup
phase: Running
progress: 1/1
resourcesDuration:
cpu: 6
memory: 3
startedAt: "2021-09-28T12:23:10Z"
`

// TestControllerRestartWithRunningWorkflow checks that the workflow-level
// custom metric is registered when operate runs against a workflow whose
// stored status is already Running (the suspendWfWithMetrics fixture), rather
// than only when the workflow is first started.
//
// The test creates the workflow through the fake clientset, runs a single
// operate pass, then looks the metric up by its descriptor and asserts that
// its rendered form carries the fixture's "model_a" label value.
func TestControllerRestartWithRunningWorkflow(t *testing.T) {
	cancel, controller := newController()
	defer cancel()

	ctx := context.Background()
	wf := v1alpha1.MustUnmarshalWorkflow(suspendWfWithMetrics)
	wfcset := controller.wfclientset.ArgoprojV1alpha1().Workflows("")
	_, err := wfcset.Create(ctx, wf, metav1.CreateOptions{})
	if !assert.NoError(t, err) {
		// Nothing meaningful can be asserted if the fixture was not stored.
		return
	}

	woc := newWorkflowOperationCtx(wf, controller)
	woc.operate(ctx)

	metricDesc := wf.Spec.Metrics.Prometheus[0].GetDesc()
	metric := controller.metrics.GetCustomMetric(metricDesc)
	if !assert.NotNil(t, metric) {
		// Stop here: rendering a missing metric would only obscure the failure.
		return
	}

	// Check the error before inspecting the rendered value.
	metricString, err := getMetricStringValue(metric)
	if !assert.NoError(t, err) {
		return
	}
	assert.Contains(t, metricString, `model_a`)
}

0 comments on commit 78cd691

Please sign in to comment.