Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gateway / scheduler refactor prototype #1082

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ build-notebook:

build-ray-node:
docker build -t $(rayNodeImageName):$(version) --build-arg TARGETARCH=$(arch) -f Dockerfile-ray-node .
docker build -t $(rayNodeImageName):$(version)-py310 --build-arg TARGETARCH=$(arch) --build-arg IMAGE_PY_VERSION=py310 -f Dockerfile-ray-node .
docker build -t $(rayNodeImageName):$(version)-py39 --build-arg TARGETARCH=$(arch) --build-arg IMAGE_PY_VERSION=py39 -f Dockerfile-ray-node .
docker build -t $(rayNodeImageName):$(version)-py38 --build-arg TARGETARCH=$(arch) --build-arg IMAGE_PY_VERSION=py38 -f Dockerfile-ray-node .

build-gateway:
docker build -t $(gatewayImageName):$(version) -f ./gateway/Dockerfile .
Expand Down
144 changes: 17 additions & 127 deletions charts/quantum-serverless/charts/gateway/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ spec:
- name: gateway-pv-storage
persistentVolumeClaim:
claimName: {{ .Values.cos.claimName }}
- name: ray-cluster-template
configMap:
name: rayclustertemplate
containers:
- name: {{ .Chart.Name }}
securityContext:
Expand All @@ -81,6 +84,8 @@ spec:
volumeMounts:
- mountPath: "/usr/src/app/media/"
name: gateway-pv-storage
- mountPath: "/tmp/templates/"
name: ray-cluster-template
resources:
{{- toYaml .Values.resources | nindent 12 }}
env:
Expand All @@ -93,8 +98,6 @@ spec:
key: {{ .Values.secrets.secretKey.key }}
- name: SITE_HOST
value: {{ .Values.application.siteHost | quote }}
- name: RAY_HOST
value: {{ .Values.application.rayHost | quote }}
- name: SETTINGS_AUTH_MECHANISM
value: {{ .Values.application.auth.mechanism | quote }}
- name: SETTINGS_TOKEN_AUTH_URL
Expand All @@ -111,6 +114,12 @@ spec:
value: {{ .Values.application.auth.keycloak.realm | quote }}
- name: SETTINGS_KEYCLOAK_CLIENT_SECRET
value: {{ .Values.application.auth.keycloak.clientSecret | quote }}
- name: RAY_CLUSTER_WORKER_REPLICAS
value: {{ .Values.application.ray.replicas | quote }}
- name: RAY_CLUSTER_WORKER_MIN_REPLICAS
value: {{ .Values.application.ray.minReplicas | quote }}
- name: RAY_CLUSTER_WORKER_MAX_REPLICAS
value: {{ .Values.application.ray.maxReplicas | quote }}
{{- if .Values.application.superuser.enable }}
- name: DJANGO_SUPERUSER_USERNAME
valueFrom:
Expand Down Expand Up @@ -169,123 +178,16 @@ spec:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.databasePassword }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: scheduler
labels:
{{- include "scheduler.labels" . | nindent 4 }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: 1
{{- end }}
selector:
matchLabels:
{{- include "scheduler.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "scheduler.selectorLabels" . | nindent 8 }}
spec:
volumes:
- name: gateway-pv-storage
persistentVolumeClaim:
claimName: {{ .Values.cos.claimName }}
- name: ray-cluster-template
configMap:
name: rayclustertemplate
serviceAccountName: {{ include "gateway.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: waitpostresql
image: actions/pg_isready
command: ['sh', '-c', 'until pg_isready -U ${DATABASE_USER} -d "dbname=${DATABASE_NAME}" -h ${DATABASE_HOST} -p ${DATABASE_PORT}; do echo waiting for myservice; sleep 2; done']
env:
- name: DATABASE_HOST
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.host }}
- name: DATABASE_PORT
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.port }}
- name: DATABASE_NAME
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.databaseName }}
- name: DATABASE_USER
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.userName }}
containers:
- name: gateway-scheduler
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command: ["./scripts/scheduler.sh"]
volumeMounts:
- mountPath: "/usr/src/app/media/"
name: gateway-pv-storage
- mountPath: "/tmp/templates/"
name: ray-cluster-template
env:
- name: DEBUG
value: {{ .Values.application.debug | quote }}
- name: DJANGO_SECRET_KEY
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.secretKey.name }}
key: {{ .Values.secrets.secretKey.key }}
- name: DATABASE_HOST
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.host }}
- name: DATABASE_PORT
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.port }}
- name: DATABASE_NAME
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.databaseName }}
- name: DATABASE_USER
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.userName }}
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.secrets.servicePsql.name }}
key: {{ .Values.secrets.servicePsql.key.databasePassword }}
- name: RAY_KUBERAY_NAMESPACE
value: {{ .Release.Namespace }}
- name: RAY_NODE_IMAGE
value: {{ .Values.application.ray.nodeImage | quote }}
- name: RAY_NODE_IMAGE_PY38
value: {{ .Values.application.ray.nodeImage_py38 | quote }}
- name: RAY_NODE_IMAGE_PY39
value: {{ .Values.application.ray.nodeImage_py39 | quote }}
- name: RAY_NODE_IMAGE_PY310
value: {{ .Values.application.ray.nodeImage_py310 | quote }}
- name: LIMITS_JOBS_PER_USER
value: {{ .Values.application.limits.maxJobsPerUser | quote }}
- name: LIMITS_MAX_CLUSTERS
Expand All @@ -294,18 +196,6 @@ spec:
- name: RAY_CLUSTER_NO_DELETE_ON_COMPLETE
value: "True"
{{- end }}
- name: OTEL_ENABLED
value: {{ .Values.application.ray.openTelemetryCollector.enabled | quote }}
- name: OTEL_SERVICE_NAME
value: "Gateway"
- name: OTEL_TRACES_EXPORTER
value: console,otlp
- name: OTEL_METRICS_EXPORTER
value: console
- name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
value: {{ .Values.application.ray.openTelemetryCollector.host }}:{{ .Values.application.ray.openTelemetryCollector.port }}
- name: OTEL_EXPORTER_OTLP_TRACES_INSECURE
value: {{ .Values.application.ray.openTelemetryCollector.insecure | quote }}
- name: PROGRAM_TIMEOUT
value: {{ .Values.application.limits.programTimeoutDays | quote }}
{{- with .Values.nodeSelector }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ data:
{{- if .Values.application.ray.scrapeWithPrometheus }}
headServiceAnnotations:
prometheus.io/scrape: "true"
{{- end }}
{{- end }}
enableInTreeAutoscaling: {{`{{ auto_scaling }}`}}
headGroupSpec:
rayStartParams:
dashboard-host: 0.0.0.0
Expand Down Expand Up @@ -68,7 +69,7 @@ data:
{{- end }}
affinity:
containers:
- image: {{ .Values.application.ray.nodeImage | quote }}
- image: {{`{{ node_image }}`| quote }}
imagePullPolicy: IfNotPresent
name: ray-head
ports:
Expand Down Expand Up @@ -193,11 +194,11 @@ data:
claimName: {{ .Values.cos.claimName }}
workerGroupSpecs:
- groupName: g
maxReplicas: {{ .Values.application.ray.maxReplicas }}
minReplicas: {{ .Values.application.ray.minReplicas }}
maxReplicas: {{`{{ max_workers }}`}}
minReplicas: {{`{{ min_workers }}`}}
rayStartParams:
block: 'true'
replicas: {{ .Values.application.ray.replicas }}
replicas: {{`{{ workers }}`}}
template:
{{- if .Values.application.ray.scrapeWithPrometheus }}
metadata:
Expand Down Expand Up @@ -299,7 +300,7 @@ data:
- name: OT_ENABLED
value: {{ .Values.application.ray.openTelemetryCollector.enabled | quote }}
{{- end }}
image: {{ .Values.application.ray.nodeImage | quote}}
image: {{`{{ node_image }}`| quote}}
imagePullPolicy: IfNotPresent
name: ray-worker
resources:
Expand Down
3 changes: 3 additions & 0 deletions charts/quantum-serverless/charts/gateway/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ application:
enable: true
ray:
nodeImage: "icr.io/quantum-public/quantum-serverless-ray-node:0.7.1-py39"
nodeImage_py38: "icr.io/quantum-public/quantum-serverless-ray-node:0.7.1-py38"
nodeImage_py39: "icr.io/quantum-public/quantum-serverless-ray-node:0.7.1-py39"
nodeImage_py310: "icr.io/quantum-public/quantum-serverless-ray-node:0.7.1-py310"
cpu: 2
memory: 2
replicas: 1
Expand Down
4 changes: 3 additions & 1 deletion client/quantum_serverless/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
QuantumServerlessException
get_auto_discovered_provider
"""

# pylint: disable=W0404
from importlib_metadata import version as metadata_version, PackageNotFoundError

from .core import (
Expand All @@ -36,6 +36,8 @@
RayProvider,
LocalProvider,
save_result,
set_status,
Configuration,
)
from .quantum_serverless import (
QuantumServerless,
Expand Down
3 changes: 2 additions & 1 deletion client/quantum_serverless/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@
LocalJobClient,
Job,
save_result,
set_status,
Configuration,
)
from .pattern import (
QiskitPattern,
Program,
ProgramStorage,
ProgramRepository,
download_and_unpack_artifact,
Expand Down
Loading