Add ray experiments

redhat-et · May 25, 2023 · 268d5a2 · 268d5a2
1 parent 49662f3
commit 268d5a2
Show file tree

Hide file tree

Showing 2 changed files with 952 additions and 0 deletions.
diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml
@@ -0,0 +1,155 @@
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper
+metadata:
+  name: finetuneflan
+  namespace: default
+spec:
+  priority: 9
+  resources:
+    GenericItems:
+    - custompodresources:
+      - limits:
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 0
+        replicas: 1
+        requests:
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 0
+      - limits:
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 1
+        replicas: 2
+        requests:
+          cpu: 1
+          memory: 2G
+          nvidia.com/gpu: 1
+      generictemplate:
+        apiVersion: ray.io/v1alpha1
+        kind: RayCluster
+        metadata:
+          labels:
+            appwrapper.mcad.ibm.com: finetuneflan
+            controller-tools.k8s.io: '1.0'
+          name: finetuneflan
+          namespace: default
+        spec:
+          autoscalerOptions:
+            idleTimeoutSeconds: 60
+            imagePullPolicy: Always
+            resources:
+              limits:
+                cpu: 500m
+                memory: 512Mi
+              requests:
+                cpu: 500m
+                memory: 512Mi
+            upscalingMode: Default
+          enableInTreeAutoscaling: false
+          headGroupSpec:
+            rayStartParams:
+              block: 'true'
+              dashboard-host: 0.0.0.0
+              num-gpus: '0'
+            serviceType: ClusterIP
+            template:
+              spec:
+                containers:
+                - env:
+                  - name: MY_POD_IP
+                    valueFrom:
+                      fieldRef:
+                        fieldPath: status.podIP
+                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+                  imagePullPolicy: Always
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command:
+                        - /bin/sh
+                        - -c
+                        - ray stop
+                  name: ray-head
+                  ports:
+                  - containerPort: 6379
+                    name: gcs
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  resources:
+                    limits:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 0
+                    requests:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 0
+          rayVersion: 1.12.0
+          workerGroupSpecs:
+          - groupName: small-group-finetuneflan
+            maxReplicas: 2
+            minReplicas: 2
+            rayStartParams:
+              block: 'true'
+              num-gpus: '1'
+            replicas: 2
+            template:
+              metadata:
+                annotations:
+                  key: value
+                labels:
+                  key: value
+              spec:
+                containers:
+                - env:
+                  - name: MY_POD_IP
+                    valueFrom:
+                      fieldRef:
+                        fieldPath: status.podIP
+                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command:
+                        - /bin/sh
+                        - -c
+                        - ray stop
+                  name: machine-learning
+                  resources:
+                    limits:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 1
+                    requests:
+                      cpu: 1
+                      memory: 2G
+                      nvidia.com/gpu: 1
+                initContainers:
+                - command:
+                  - sh
+                  - -c
+                  - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
+                    do echo waiting for myservice; sleep 2; done
+                  image: busybox:1.28
+                  name: init-myservice
+      replicas: 1
+    - generictemplate:
+        apiVersion: route.openshift.io/v1
+        kind: Route
+        metadata:
+          labels:
+            odh-ray-cluster-service: finetuneflan-head-svc
+          name: ray-dashboard-finetuneflan
+          namespace: default
+        spec:
+          port:
+            targetPort: dashboard
+          to:
+            kind: Service
+            name: finetuneflan-head-svc
+      replica: 1
+    Items: []