diff --git a/cvat/apps/health/management/__init__.py b/cvat/apps/health/management/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/cvat/apps/health/management/commands/workerprobe.py b/cvat/apps/health/management/commands/workerprobe.py
new file mode 100644
index 00000000000..07225e4612b
--- /dev/null
+++ b/cvat/apps/health/management/commands/workerprobe.py
@@ -0,0 +1,69 @@
+"""Management command used as a liveness probe for RQ worker processes."""
+
+import os
+import platform
+from datetime import datetime, timezone
+
+import django_rq
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+from rq.worker import Worker
+
+
+class Command(BaseCommand):
+    """Fail unless the expected workers on this host are alive for each queue."""
+
+    help = "Check worker liveness in specified queues"
+
+    def add_arguments(self, parser):
+        """Register the positional list of queue names to check."""
+        parser.add_argument("queue_names", nargs="+", type=str)
+
+    @staticmethod
+    def _is_alive(worker, now):
+        """Return True if the worker's last heartbeat is younger than its TTL.
+
+        ``now`` must be a timezone-aware UTC datetime.
+        """
+        last_heartbeat = worker.last_heartbeat
+        if last_heartbeat is None:
+            # Registered, but no heartbeat recorded: treat as not alive.
+            return False
+        if last_heartbeat.tzinfo is None:
+            # NOTE(review): RQ stores heartbeats in UTC, and older RQ releases
+            # parse them into naive datetimes - confirm for the pinned version.
+            last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)
+        # total_seconds() covers the whole interval; timedelta.seconds wraps
+        # around every 24 hours and would make a long-dead worker look fresh.
+        return (now - last_heartbeat).total_seconds() < worker.worker_ttl
+
+    def handle(self, *args, **options):
+        """Check every requested queue.
+
+        Raises:
+            CommandError: if a queue name is not listed in
+                ``settings.CVAT_QUEUES``, if the number of workers registered
+                from this host differs from ``NUMPROCS`` (default 1), or if
+                any of them has a stale heartbeat.
+        """
+        allowed_queue_names = {q.value for q in settings.CVAT_QUEUES}
+        hostname = platform.node()
+        expected_workers = int(os.getenv("NUMPROCS", 1))
+
+        for queue_name in options["queue_names"]:
+            if queue_name not in allowed_queue_names:
+                raise CommandError(f"Queue {queue_name} is not defined")
+
+            queue = django_rq.get_queue(queue_name)
+            # Only workers registered from this host count.
+            workers = [
+                w
+                for w in Worker.all(queue.connection)
+                if queue.name in w.queue_names() and w.hostname == hostname
+            ]
+
+            now = datetime.now(timezone.utc)
+            if len(workers) != expected_workers or not all(
+                self._is_alive(w, now) for w in workers
+            ):
+                raise CommandError(f"Unhealthy workers in the {queue_name} queue")
diff --git a/helm-chart/Chart.yaml b/helm-chart/Chart.yaml
index 3d556c034b0..6ec9c7e7a33 100644
--- a/helm-chart/Chart.yaml
+++ b/helm-chart/Chart.yaml
@@ -16,7 +16,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.13.2
+version: 0.14.0

 # This is the version number of the application being deployed.
This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/helm-chart/templates/cvat_backend/server/deployment.yml b/helm-chart/templates/cvat_backend/server/deployment.yml index a90358cd954..0759ef31711 100644 --- a/helm-chart/templates/cvat_backend/server/deployment.yml +++ b/helm-chart/templates/cvat_backend/server/deployment.yml @@ -65,6 +65,20 @@ spec: {{- end }} ports: - containerPort: 8080 + {{- if $localValues.readinessProbe.enabled }} + readinessProbe: + httpGet: + path: /api/server/about + port: 8080 + {{- toYaml (omit $localValues.readinessProbe "enabled") | nindent 12 }} + {{- end }} + {{- if $localValues.livenessProbe.enabled }} + livenessProbe: + httpGet: + path: /api/server/health/ + port: 8080 + {{- toYaml (omit $localValues.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: {{- if not .Values.cvat.backend.disableDistinctCachePerService }} - mountPath: /home/django/data/cache diff --git a/helm-chart/templates/cvat_backend/utils/deployment.yml b/helm-chart/templates/cvat_backend/utils/deployment.yml index 15229fafbd2..3527c7b2006 100644 --- a/helm-chart/templates/cvat_backend/utils/deployment.yml +++ b/helm-chart/templates/cvat_backend/utils/deployment.yml @@ -60,8 +60,17 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . 
| nindent 10 }} {{- end }} - ports: - - containerPort: 8080 + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - notifications + - cleaning + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: {{- if not .Values.cvat.backend.disableDistinctCachePerService }} - mountPath: /home/django/data/cache diff --git a/helm-chart/templates/cvat_backend/worker_analyticsreports/deployment.yml b/helm-chart/templates/cvat_backend/worker_analyticsreports/deployment.yml index ffc4997081e..888396342fe 100644 --- a/helm-chart/templates/cvat_backend/worker_analyticsreports/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_analyticsreports/deployment.yml @@ -61,6 +61,16 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . | nindent 10 }} {{- end }} + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - analytics_reports + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} {{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }} volumeMounts: {{- toYaml . | nindent 10 }} diff --git a/helm-chart/templates/cvat_backend/worker_annotation/deployment.yml b/helm-chart/templates/cvat_backend/worker_annotation/deployment.yml index dedb86976e9..6ec1caabc6f 100644 --- a/helm-chart/templates/cvat_backend/worker_annotation/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_annotation/deployment.yml @@ -60,6 +60,16 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . 
| nindent 10 }} {{- end }} + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - annotation + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: {{- if not .Values.cvat.backend.disableDistinctCachePerService }} - mountPath: /home/django/data/cache diff --git a/helm-chart/templates/cvat_backend/worker_export/deployment.yml b/helm-chart/templates/cvat_backend/worker_export/deployment.yml index e96723c394b..be3db6f22c3 100644 --- a/helm-chart/templates/cvat_backend/worker_export/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_export/deployment.yml @@ -61,6 +61,16 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . | nindent 10 }} {{- end }} + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - export + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: {{- if not .Values.cvat.backend.disableDistinctCachePerService }} - mountPath: /home/django/data/cache diff --git a/helm-chart/templates/cvat_backend/worker_import/deployment.yml b/helm-chart/templates/cvat_backend/worker_import/deployment.yml index b96b6b9b64a..4c391963e02 100644 --- a/helm-chart/templates/cvat_backend/worker_import/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_import/deployment.yml @@ -60,6 +60,16 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . 
| nindent 10 }} {{- end }} + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - import + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: {{- if not .Values.cvat.backend.disableDistinctCachePerService }} - mountPath: /home/django/data/cache diff --git a/helm-chart/templates/cvat_backend/worker_qualityreports/deployment.yml b/helm-chart/templates/cvat_backend/worker_qualityreports/deployment.yml index 663dc7bc097..5d12d9647fd 100644 --- a/helm-chart/templates/cvat_backend/worker_qualityreports/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_qualityreports/deployment.yml @@ -60,6 +60,16 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . | nindent 10 }} {{- end }} + {{- if .Values.cvat.backend.worker.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - python + - manage.py + - workerprobe + - quality_reports + {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }} + {{- end }} {{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }} volumeMounts: {{- toYaml . | nindent 10 }} diff --git a/helm-chart/templates/cvat_backend/worker_webhooks/deployment.yml b/helm-chart/templates/cvat_backend/worker_webhooks/deployment.yml index 9a6cc4a9775..4e0e4f24cb6 100644 --- a/helm-chart/templates/cvat_backend/worker_webhooks/deployment.yml +++ b/helm-chart/templates/cvat_backend/worker_webhooks/deployment.yml @@ -60,6 +60,17 @@ spec: {{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }} {{- toYaml . 
| nindent 10 }}
           {{- end }}
+
+          {{- if .Values.cvat.backend.worker.livenessProbe.enabled }}
+          livenessProbe:
+            exec:
+              command:
+                - python
+                - manage.py
+                - workerprobe
+                - webhooks
+            {{- toYaml (omit .Values.cvat.backend.worker.livenessProbe "enabled") | nindent 12 }}
+          {{- end }}
           {{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }}
           volumeMounts:
           {{- toYaml . | nindent 10 }}
diff --git a/helm-chart/templates/cvat_frontend/deployment.yml b/helm-chart/templates/cvat_frontend/deployment.yml
index e5e2105f845..f0fb0dfe2f8 100644
--- a/helm-chart/templates/cvat_frontend/deployment.yml
+++ b/helm-chart/templates/cvat_frontend/deployment.yml
@@ -45,6 +45,24 @@ spec:
           {{- toYaml . | nindent 10 }}
           {{- end }}
+          {{- /* Probes are rendered outside the additionalVolumeMounts "with" block: inside it "." is rebound and .Values is unreachable. */}}
+          {{- /* "with omit ..." skips the extra settings when only "enabled" is set; toYaml of an empty map would print "{}". */}}
+          {{- if .Values.cvat.frontend.readinessProbe.enabled }}
+          readinessProbe:
+            tcpSocket:
+              port: 80
+            {{- with omit .Values.cvat.frontend.readinessProbe "enabled" }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- end }}
+          {{- if .Values.cvat.frontend.livenessProbe.enabled }}
+          livenessProbe:
+            tcpSocket:
+              port: 80
+            {{- with omit .Values.cvat.frontend.livenessProbe "enabled" }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- end }}
           {{- with .Values.cvat.frontend.additionalVolumeMounts }}
           volumeMounts:
           {{- toYaml . | nindent 10 }}
           {{- end }}
diff --git a/helm-chart/templates/cvat_kvrocks/statefulset.yml b/helm-chart/templates/cvat_kvrocks/statefulset.yml
index 1d18a237e16..d5d8ccad9cd 100644
--- a/helm-chart/templates/cvat_kvrocks/statefulset.yml
+++ b/helm-chart/templates/cvat_kvrocks/statefulset.yml
@@ -59,6 +59,25 @@ spec:
           {{- with .Values.cvat.kvrocks.additionalEnv }}
           {{- toYaml .
| nindent 10 }} {{- end }} + #https://github.com/apache/kvrocks/blob/unstable/Dockerfile + {{- if .Values.cvat.kvrocks.readinessProbe.enabled }} + readinessProbe: + exec: + command: + - /bin/sh + - -c + - ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)' || exit 1 + {{- toYaml (omit .Values.cvat.kvrocks.readinessProbe "enabled") | nindent 12 }} + {{- end }} + {{- if .Values.cvat.kvrocks.livenessProbe.enabled }} + livenessProbe: + exec: + command: + - /bin/sh + - -c + - ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)' || exit 1 + {{- toYaml (omit .Values.cvat.kvrocks.livenessProbe "enabled") | nindent 12 }} + {{- end }} volumeMounts: - name: {{ .Release.Name }}-kvrocks-data mountPath: /var/lib/kvrocks/data diff --git a/helm-chart/templates/cvat_opa/deployment.yml b/helm-chart/templates/cvat_opa/deployment.yml index 3827484b92f..1b0ae199995 100644 --- a/helm-chart/templates/cvat_opa/deployment.yml +++ b/helm-chart/templates/cvat_opa/deployment.yml @@ -53,6 +53,18 @@ spec: env: {{- toYaml . | nindent 10 }} {{- end }} + {{- if .Values.cvat.opa.readinessProbe.enabled }} + readinessProbe: + tcpSocket: + port: 8181 + {{- toYaml (omit .Values.cvat.opa.readinessProbe "enabled") | nindent 12 }} + {{- end }} + {{- if .Values.cvat.opa.livenessProbe.enabled }} + livenessProbe: + tcpSocket: + port: 8181 + {{- toYaml (omit .Values.cvat.opa.livenessProbe "enabled") | nindent 12 }} + {{- end }} {{- with .Values.cvat.opa.additionalVolumeMounts }} volumeMounts: {{- toYaml . 
| nindent 10 }} diff --git a/helm-chart/values.yaml b/helm-chart/values.yaml index 91e4493258f..963c328b615 100644 --- a/helm-chart/values.yaml +++ b/helm-chart/values.yaml @@ -42,7 +42,21 @@ cvat: additionalEnv: [] additionalVolumes: [] additionalVolumeMounts: [] + readinessProbe: + enabled: true + periodSeconds: 15 + initialDelaySeconds: 15 + livenessProbe: + enabled: true + periodSeconds: 15 + failureThreshold: 10 + initialDelaySeconds: 60 worker: + livenessProbe: + enabled: true + periodSeconds: 120 + initialDelaySeconds: 30 + timeoutSeconds: 10 export: replicas: 2 labels: {} @@ -172,6 +186,10 @@ cvat: # - mountPath: /tmp # name: tmp # subPath: test + readinessProbe: + enabled: true + livenessProbe: + enabled: true service: type: ClusterIP ports: @@ -216,6 +234,14 @@ cvat: # name: tmp # subPath: test composeCompatibleServiceName: true # Sets service name to opa in order to be compatible with Docker Compose. Necessary because changing IAM_OPA_DATA_URL via environment variables in current images. 
Hinders multiple deployment due to duplicate name
+    readinessProbe:
+      enabled: true
+      periodSeconds: 15
+      initialDelaySeconds: 15
+    livenessProbe:
+      enabled: true
+      periodSeconds: 15
+      initialDelaySeconds: 15
     service:
       type: ClusterIP
       ports:
@@ -266,6 +292,14 @@ cvat:
     # - mountPath: /tmp
     #   name: tmp
     #   subPath: test
+    readinessProbe:
+      enabled: true
+      periodSeconds: 10
+      initialDelaySeconds: 30
+    livenessProbe:
+      enabled: true
+      periodSeconds: 10
+      initialDelaySeconds: 30
     defaultStorage:
       enabled: true
       # storageClassName: default
diff --git a/supervisord/utils.conf b/supervisord/utils.conf
index 92ce24991e4..1271e6eef53 100644
--- a/supervisord/utils.conf
+++ b/supervisord/utils.conf
@@ -24,13 +24,14 @@ command=%(ENV_HOME)s/wait_for_deps.sh
     -i 30 --path %(ENV_HOME)s
 environment=VECTOR_EVENT_HANDLER="SynchronousLogstashHandler"
 numprocs=1
+autorestart=true

 [program:rqworker-notifications]
 command=%(ENV_HOME)s/wait_for_deps.sh
     python3 %(ENV_HOME)s/manage.py rqworker -v 3 notifications
     --worker-class cvat.rqworker.DefaultWorker
 environment=VECTOR_EVENT_HANDLER="SynchronousLogstashHandler",CVAT_POSTGRES_APPLICATION_NAME="cvat:worker:notifications"
-numprocs=1
+numprocs=%(ENV_NUMPROCS)s
 autorestart=true

 [program:rqworker-cleaning]
diff --git a/tests/python/shared/fixtures/init.py b/tests/python/shared/fixtures/init.py
index 84e18110b0d..c9d755442a8 100644
--- a/tests/python/shared/fixtures/init.py
+++ b/tests/python/shared/fixtures/init.py
@@ -240,11 +240,28 @@ def kube_restore_clickhouse_db():


 def docker_restore_redis_inmem():
-    docker_exec_redis_inmem(["redis-cli", "-e", "flushall"])
+    # Delete only the RQ job-related keys; unlike "flushall", other keys are kept.
+    docker_exec_redis_inmem(
+        [
+            "sh",
+            "-c",
+            "for p in rq:finished:* rq:job:* rq:wip:* rq:failed:*; "
+            'do redis-cli -e --scan --pattern "$p" | xargs -r redis-cli -e del ; done',
+        ]
+    )


 def kube_restore_redis_inmem():
-    kube_exec_redis_inmem(["sh", "-c", 'redis-cli -e -a "${REDIS_PASSWORD}" flushall'])
+    # Delete only the RQ job-related keys; unlike "flushall", other keys are kept.
+    kube_exec_redis_inmem(
+        [
+            "sh",
+            "-c",
+            "for p in rq:finished:* rq:job:* rq:wip:* rq:failed:*; "
+            'do redis-cli -e -a "${REDIS_PASSWORD}" --scan --pattern "$p" | '
+            'xargs -r redis-cli -e -a "${REDIS_PASSWORD}" del ; done',
+        ]
+    )


 def docker_restore_redis_ondisk():