Added readiness and liveness probes #8488
base: develop
Changes from all commits: 1621d16, d12ad5b, b240baf, f77ee89, 92c36ae, 1106a39
New file: a Django management command implementing the worker probe (@@ -0,0 +1,29 @@):

```python
import os
import platform
from datetime import datetime
from django.core.management.base import BaseCommand, CommandError
from django.conf import settings
from rq.worker import Worker
import django_rq


class Command(BaseCommand):
    help = "Check worker liveness in specified queues"

    def add_arguments(self, parser):
        parser.add_argument("queue_names", nargs="+", type=str)

    def handle(self, *args, **options):
        allowed_queue_names = list(q.value for q in settings.CVAT_QUEUES)
        hostname = platform.node()
        for queue_name in options["queue_names"]:
            if queue_name not in allowed_queue_names:
                raise CommandError(f"Queue {queue_name} is not defined")

            queue = django_rq.get_queue(queue_name)

            workers = [w for w in Worker.all(queue.connection) if queue.name in w.queue_names() and w.hostname == hostname]

            if len(workers) != int(os.getenv("NUMPROCS", 1)) or \
                not all((datetime.now() - w.last_heartbeat).seconds < w.worker_ttl for w in workers):
                raise CommandError(f"Unhealthy workers in the {queue_name} queue")
```

Review comments:

- On the `allowed_queue_names` check: Seems like you could replace this with …
- On `queue = django_rq.get_queue(queue_name)`: If you just need the queue for the connection, it would make more sense to get the connection directly (…
- On the heartbeat comparison: Use `total_seconds()` instead of `.seconds`. The `.seconds` attribute of a `timedelta` only reflects the seconds component within the current day (0–86399) and ignores whole days, so a worker that has been silent for more than a day could still pass the check. Apply this diff to fix the issue:

  ```diff
  - not all((datetime.now() - w.last_heartbeat).seconds < w.worker_ttl for w in workers):
  + not all((datetime.now() - w.last_heartbeat).total_seconds() < w.worker_ttl for w in workers):
  ```

- Follow-up (SpecLad): The rabbit is correct here, though IMO a better fix would be …
- Author reply: @SpecLad You're absolutely right! Using

  ```python
  not all(datetime.now() - w.last_heartbeat < timedelta(seconds=w.worker_ttl) for w in workers):
  ```

  Thanks for the suggestion! (^_^)/
- On the final `CommandError`: It would be useful to print what exactly is wrong here (expected number vs actual number, etc.).
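The `.seconds` pitfall flagged in the review is easy to demonstrate in isolation. A minimal sketch (the timestamps are made up; the 420-second TTL mirrors rq's default worker TTL):

```python
from datetime import datetime, timedelta

# A worker whose last heartbeat was over a day ago: clearly stale.
last_heartbeat = datetime(2024, 1, 1, 12, 0, 0)
now = datetime(2024, 1, 2, 12, 0, 5)  # 1 day and 5 seconds later
worker_ttl = 420  # seconds

elapsed = now - last_heartbeat

# .seconds holds only the sub-day remainder of the delta (0..86399),
# so the full day is silently discarded and the stale worker looks healthy.
print(elapsed.seconds)                       # 5
print(elapsed.seconds < worker_ttl)          # True  (wrong: worker looks alive)

# .total_seconds() accounts for the whole duration.
print(elapsed.total_seconds())               # 86405.0
print(elapsed.total_seconds() < worker_ttl)  # False (correct: worker is stale)

# Comparing timedelta objects directly, as suggested in the review thread,
# avoids the unit conversion entirely.
print(elapsed < timedelta(seconds=worker_ttl))  # False
```

The direct `timedelta` comparison is arguably the clearest of the three, since it never leaves the duration type.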
Server deployment template, @@ -65,6 +65,20 @@ spec:

```yaml
          {{- end }}
          ports:
          - containerPort: 8080
          {{- if $localValues.readinessProbe.enabled }}
          readinessProbe:
            httpGet:
              path: /api/server/about
              port: 8080
            {{- toYaml (omit $localValues.readinessProbe "enabled") | nindent 12 }}
          {{- end }}
          {{- if $localValues.livenessProbe.enabled }}
          livenessProbe:
            httpGet:
              path: /api/server/health/
              port: 8080
            {{- toYaml (omit $localValues.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
```

Review comments on lines +68 to +81:

- 💡 Codebase verification: Missing timing parameters in probe configurations. The implementation of both readiness and liveness probes looks good, but appropriate timing parameters (such as `initialDelaySeconds`, `periodSeconds`, `timeoutSeconds`, `successThreshold`, and `failureThreshold`) should be set for each probe; these are crucial for fine-tuning probe behavior and avoiding premature restarts or false positives/negatives. Please update each probe configuration to include the necessary timing parameters. The following scripts were used to verify which probe definitions are missing them:

  ```shell
  # Check for probe timing parameters in values.yaml
  rg --type yaml 'readinessProbe:|livenessProbe:' -A 10 values.yaml

  # Search for readinessProbe and livenessProbe across all YAML files
  rg --type yaml '(readinessProbe:|livenessProbe:)' --multiline

  # Check for timing parameters in all readinessProbe and livenessProbe definitions
  TIMING_PARAMS=("initialDelaySeconds" "periodSeconds" "timeoutSeconds" "successThreshold" "failureThreshold")
  probes=$(rg --type yaml '(readinessProbe:|livenessProbe:)' --multiline)
  echo "$probes" | while read -r line; do
    file=$(echo "$line" | cut -d':' -f1)
    probe=$(echo "$line" | cut -d':' -f2 | tr -d ' ')
    echo "Checking $probe in $file"
    for param in "${TIMING_PARAMS[@]}"; do
      if ! rg --type yaml "$param:" "$file" >/dev/null; then
        echo "  - Missing $param in $probe"
      fi
    done
  done
  ```

- (SpecLad) This liveness probe is IMO not suitable. I think that we should follow the documentation here: …

```yaml
          volumeMounts:
          {{- if not .Values.cvat.backend.disableDistinctCachePerService }}
          - mountPath: /home/django/data/cache
```
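The timing parameters the review asks for directly determine how long an unhealthy container keeps running before kubelet acts. A rough back-of-the-envelope sketch (this ignores `timeoutSeconds` and probe execution time, so it is an approximation of kubelet's behavior, not a specification):

```python
def max_seconds_until_restart(initial_delay: int, period: int, failure_threshold: int) -> int:
    # kubelet marks a container as failed only after `failure_threshold`
    # consecutive probe failures, spaced `period` seconds apart; before the
    # first probe runs it also waits `initial_delay` seconds.
    return initial_delay + failure_threshold * period

# Values from this PR's server livenessProbe settings
# (initialDelaySeconds: 60, periodSeconds: 15, failureThreshold: 10).
print(max_seconds_until_restart(60, 15, 10))  # 210

# failureThreshold defaults to 3 in Kubernetes when left unset, so the
# server readinessProbe above (15/15, no threshold) works out to roughly:
print(max_seconds_until_restart(15, 15, 3))   # 60
```

This kind of arithmetic is worth doing per service: a worst case of 210 seconds may be fine for a stateful server but far too slow, or too aggressive, for a worker.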
Worker deployment template (queues passed to `workerprobe`: notifications, cleaning), @@ -60,8 +60,17 @@ spec:

```yaml
          {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
          ports:
          - containerPort: 8080
          {{- if .Values.cvat.backend.worker.livenessProbe.enabled }}
          livenessProbe:
            exec:
              command:
                - python
                - manage.py
                - workerprobe
                - notifications
                - cleaning
            {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
```

Review comment on lines +63 to +73 (🛠️ Refactor suggestion): Consider differentiating between readiness and liveness probes. While the addition of the liveness probe is beneficial, using the same command for both probes might not be ideal: the liveness probe should check if the application is alive, while the readiness probe checks if it's ready to accept traffic. Consider implementing a simpler check for the liveness probe that doesn't depend on the worker tasks; for example, a simple endpoint that returns a 200 OK status if the application is running. This would prevent unnecessary restarts if the worker tasks are temporarily unavailable. Example:

```yaml
livenessProbe:
  httpGet:
    path: /healthz
    port: 8000
  initialDelaySeconds: {{ .Values.cvat.backend.worker.livenessProbe.initialDelaySeconds }}
  periodSeconds: {{ .Values.cvat.backend.worker.livenessProbe.periodSeconds }}
  timeoutSeconds: {{ .Values.cvat.backend.worker.livenessProbe.timeoutSeconds }}
  successThreshold: {{ .Values.cvat.backend.worker.livenessProbe.successThreshold }}
  failureThreshold: {{ .Values.cvat.backend.worker.livenessProbe.failureThreshold }}
```

This assumes you've implemented a …

```yaml
          volumeMounts:
          {{- if not .Values.cvat.backend.disableDistinctCachePerService }}
          - mountPath: /home/django/data/cache
```
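The reason `manage.py workerprobe …` works as an exec probe command is kubelet's contract for exec probes: exit status 0 is success, anything else is failure, and Django exits non-zero when a management command raises `CommandError`. A small sketch of that contract (the subprocesses here stand in for the probe command):

```python
import subprocess
import sys

# Stand-ins for a healthy and an unhealthy probe run: the first exits 0,
# the second exits 1, the way `manage.py` does on CommandError.
healthy = subprocess.run([sys.executable, "-c", "raise SystemExit(0)"])
unhealthy = subprocess.run([sys.executable, "-c", "raise SystemExit(1)"])

print(healthy.returncode)    # 0 -> kubelet counts the probe as passed
print(unhealthy.returncode)  # 1 -> kubelet counts the probe as failed
```

Anything the command prints goes into the probe's failure message in the pod events, which is why the review also asks the command to print what exactly is wrong.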
Worker deployment template (analytics_reports queue), @@ -61,6 +61,16 @@ spec:

```yaml
          {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
          {{- if .Values.cvat.backend.worker.livenessProbe.enabled }}
          livenessProbe:
            exec:
              command:
                - python
                - manage.py
                - workerprobe
                - analytics_reports
            {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
```

Review comment: This section is duplicated everywhere. Can you factor it out into a template?

```yaml
          {{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }}
          volumeMounts:
          {{- toYaml . | nindent 10 }}
```
Worker deployment template (export queue), @@ -61,6 +61,16 @@ spec:

```yaml
          {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
          {{- if .Values.cvat.backend.worker.livenessProbe.enabled }}
          livenessProbe:
            exec:
              command:
                - python
                - manage.py
                - workerprobe
                - export
            {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
```

Review comment on lines +64 to +73 (🛠️ Refactor suggestion): Consider differentiating between readiness and liveness probes. The liveness probe is well-implemented and follows the same pattern as the readiness probe; however, using the same command for both might not be ideal. Consider the following suggestions: … Example:

```yaml
livenessProbe:
  exec:
    command:
      - python
      - manage.py
      - workerprobe
      - export
      - --liveness  # Add a parameter to distinguish liveness check
```

This differentiation will provide more accurate health monitoring for your Kubernetes deployment.

```yaml
          volumeMounts:
          {{- if not .Values.cvat.backend.disableDistinctCachePerService }}
          - mountPath: /home/django/data/cache
```
Kvrocks deployment template, @@ -59,6 +59,25 @@ spec:

```yaml
          {{- with .Values.cvat.kvrocks.additionalEnv }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
          # https://github.com/apache/kvrocks/blob/unstable/Dockerfile
          {{- if .Values.cvat.kvrocks.readinessProbe.enabled }}
          readinessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)' || exit 1
            {{- toYaml (omit .Values.cvat.kvrocks.readinessProbe "enabled") | nindent 12 }}
          {{- end }}
          {{- if .Values.cvat.kvrocks.livenessProbe.enabled }}
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)' || exit 1
            {{- toYaml (omit .Values.cvat.kvrocks.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
```

Review comments:

- On the probe command: I don't think …
- On lines +72 to +80 (🛠️ Refactor suggestion): Consider differentiating the liveness probe from the readiness probe. While the liveness probe implementation is functional, it's identical to the readiness probe, and this approach might not be ideal: … Consider implementing a simpler liveness probe that checks if the kvrocks process is running, rather than checking the Redis server's response. For example:

  ```yaml
  livenessProbe:
    exec:
      command:
        - /bin/sh
        - -c
        - pgrep kvrocks || exit 1
    {{- toYaml (omit .Values.cvat.kvrocks.livenessProbe "enabled") | nindent 12 }}
  ```

  This change would make the liveness probe less likely to cause unnecessary restarts while still detecting if the kvrocks process has crashed.

```yaml
          volumeMounts:
          - name: {{ .Release.Name }}-kvrocks-data
            mountPath: /var/lib/kvrocks/data
```
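The intent of the `grep -E '(PONG|NOAUTH)'` check above can be made explicit by mirroring it in plain Python: the probe passes when kvrocks answers at all, even with an authentication error, since either reply proves the server is up and responding. A small sketch (`probe_ok` is an illustrative helper, not part of the chart):

```python
import re

# Same alternation as the shell pipeline's grep -E '(PONG|NOAUTH)':
# a normal PONG or a NOAUTH error both count as "alive".
ALIVE = re.compile(r"PONG|NOAUTH")

def probe_ok(reply: str) -> bool:
    """Return True when a redis-cli PING reply indicates a live server."""
    return ALIVE.search(reply) is not None

print(probe_ok("PONG"))                             # True
print(probe_ok("NOAUTH Authentication required."))  # True
print(probe_ok(""))                                 # False: no reply, probe fails
print(probe_ok("Could not connect to Redis"))       # False
```

Accepting `NOAUTH` is the design choice being debated in the thread above: it keeps the probe working when auth is enabled, at the cost of not verifying that commands can actually be executed.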
OPA deployment template, @@ -53,6 +53,18 @@ spec:

```yaml
          env:
          {{- toYaml . | nindent 10 }}
          {{- end }}
          {{- if .Values.cvat.opa.readinessProbe.enabled }}
          readinessProbe:
            tcpSocket:
              port: 8181
            {{- toYaml (omit .Values.cvat.opa.readinessProbe "enabled") | nindent 12 }}
          {{- end }}
          {{- if .Values.cvat.opa.livenessProbe.enabled }}
          livenessProbe:
            tcpSocket:
              port: 8181
            {{- toYaml (omit .Values.cvat.opa.livenessProbe "enabled") | nindent 12 }}
          {{- end }}
          {{- with .Values.cvat.opa.additionalVolumeMounts }}
          volumeMounts:
          {{- toYaml . | nindent 10 }}
```

Review comment on the `tcpSocket` probe: OPA has an actual health endpoint, so perhaps you could use that?
values.yaml, @@ -42,7 +42,21 @@ cvat:

```yaml
    additionalEnv: []
    additionalVolumes: []
    additionalVolumeMounts: []
    readinessProbe:
      enabled: true
      periodSeconds: 15
      initialDelaySeconds: 15
    livenessProbe:
      enabled: true
      periodSeconds: 15
      failureThreshold: 10
      initialDelaySeconds: 60
    worker:
      livenessProbe:
        enabled: true
        periodSeconds: 120
        initialDelaySeconds: 30
        timeoutSeconds: 10
      export:
        replicas: 2
        labels: {}
```

@@ -172,6 +186,10 @@ cvat:

```yaml
    #   - mountPath: /tmp
    #     name: tmp
    #     subPath: test
    readinessProbe:
      enabled: true
    livenessProbe:
      enabled: true
```

Review comments on lines +189 to +192:

- Frontend probe configuration needs more specific parameters. While enabling readiness and liveness probes for the frontend is a good practice, the current configuration lacks specific parameters, which may lead to using default values that might not be optimal for a frontend service. Here's a suggested configuration:

  ```yaml
  readinessProbe:
    enabled: true
    httpGet:
      path: /
      port: 80
    periodSeconds: 10
    initialDelaySeconds: 30
    failureThreshold: 3
  livenessProbe:
    enabled: true
    httpGet:
      path: /
      port: 80
    periodSeconds: 30
    initialDelaySeconds: 60
    failureThreshold: 3
  ```

  These settings provide a more tailored approach for a frontend service, with appropriate paths, ports, and timing parameters.
- (SpecLad) Why don't you have other options here, like in the other probe settings?

```yaml
    service:
      type: ClusterIP
      ports:
```
@@ -216,6 +234,14 @@ cvat:

```yaml
    #     name: tmp
    #     subPath: test
    composeCompatibleServiceName: true # Sets service name to opa in order to be compatible with Docker Compose. Necessary because changing IAM_OPA_DATA_URL via environment variables in current images. Hinders multiple deployment due to duplicate name
    readinessProbe:
      enabled: true
      periodSeconds: 15
      initialDelaySeconds: 15
    livenessProbe:
      enabled: true
      periodSeconds: 15
      initialDelaySeconds: 15
    service:
      type: ClusterIP
      ports:
```

@@ -266,6 +292,14 @@ cvat:

```yaml
    #   - mountPath: /tmp
    #     name: tmp
    #     subPath: test
    readinessProbe:
      enabled: true
      periodSeconds: 10
      initialDelaySeconds: 30
    livenessProbe:
      enabled: true
      periodSeconds: 10
      initialDelaySeconds: 30
    defaultStorage:
      enabled: true
      # storageClassName: default
```
Python test utilities, @@ -240,11 +240,26 @@ def kube_restore_clickhouse_db():

```python
def docker_restore_redis_inmem():
    docker_exec_redis_inmem(["redis-cli", "-e", "flushall"])
    docker_exec_redis_inmem(
        [
            "sh",
            "-c",
            "for p in rq:finished:* rq:job:* rq:wip:* rq:finished:* rq:failed:*; "
            'do redis-cli -e --scan --pattern "$p" | xargs -r redis-cli -e del ; done',
        ]
    )


def kube_restore_redis_inmem():
    kube_exec_redis_inmem(["sh", "-c", 'redis-cli -e -a "${REDIS_PASSWORD}" flushall'])
    kube_exec_redis_inmem(
        [
            "sh",
            "-c",
            "for p in rq:finished:* rq:job:* rq:wip:* rq:finished:* rq:failed:*; "
            'do redis-cli -e -a "${REDIS_PASSWORD}" --scan --pattern "$p" | '
            'xargs -r redis-cli -e -a "${REDIS_PASSWORD}" del ; done',
        ]
    )


def docker_restore_redis_ondisk():
```

Review comment on the pattern-deletion loop: Could you explain what you did here?
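The shell loop in the fixture deletes only keys matching the rq bookkeeping patterns rather than flushing everything. Its selection logic can be sketched in Python with `fnmatch`, whose globbing is close to (though not identical to) redis `--scan --pattern` semantics; the key names below are illustrative:

```python
from fnmatch import fnmatch

# Patterns cleaned between tests, as in the fixture above (the original
# list also repeats rq:finished:*, which is harmless but redundant).
PATTERNS = ["rq:finished:*", "rq:job:*", "rq:wip:*", "rq:failed:*"]

def keys_to_delete(keys: list[str], patterns: list[str] = PATTERNS) -> list[str]:
    """Return the keys that the fixture's scan/del loop would remove."""
    return [k for k in keys if any(fnmatch(k, p) for p in patterns)]

keys = ["rq:job:abc123", "rq:wip:export", "rq:workers", "session:42"]
print(keys_to_delete(keys))  # ['rq:job:abc123', 'rq:wip:export']
```

Using `--scan` rather than `KEYS` matters in the real fixture: `SCAN` iterates incrementally instead of blocking the server on one large keyspace walk, and `xargs -r` skips the `DEL` entirely when a pattern matches nothing.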
Final review comment: I think you also need to add an `__init__.py` to `commands`.