Skip to content

Commit

Permalink
Add ray experiments
Browse files Browse the repository at this point in the history
remove creds
  • Loading branch information
Shreyanand committed May 25, 2023
1 parent 49662f3 commit 2ce1e71
Show file tree
Hide file tree
Showing 2 changed files with 952 additions and 0 deletions.
155 changes: 155 additions & 0 deletions notebooks/ray-experiments/finetuneflan.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
---
# MCAD AppWrapper bundling two generic items:
#   1. a RayCluster named "finetuneflan" (one CPU-only head pod plus a worker
#      group of two pods that each request one GPU), and
#   2. an OpenShift Route targeting the "dashboard" port of the
#      finetuneflan-head-svc Service.
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: finetuneflan
  namespace: default
spec:
  priority: 9
  resources:
    GenericItems:
      - custompodresources:
          # Same figures as the ray-head container's resources below.
          - limits:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
            replicas: 1
            requests:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
          # Same figures as the worker container's resources below
          # (two replicas, one GPU each).
          - limits:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 1
            replicas: 2
            requests:
              cpu: 1
              memory: 2G
              nvidia.com/gpu: 1
        generictemplate:
          apiVersion: ray.io/v1alpha1
          kind: RayCluster
          metadata:
            labels:
              appwrapper.mcad.ibm.com: finetuneflan
              controller-tools.k8s.io: '1.0'
            name: finetuneflan
            namespace: default
          spec:
            # Autoscaler sidecar settings; in-tree autoscaling itself is
            # switched off by enableInTreeAutoscaling below.
            autoscalerOptions:
              idleTimeoutSeconds: 60
              imagePullPolicy: Always
              resources:
                limits:
                  cpu: 500m
                  memory: 512Mi
                requests:
                  cpu: 500m
                  memory: 512Mi
              upscalingMode: Default
            enableInTreeAutoscaling: false
            headGroupSpec:
              rayStartParams:
                block: 'true'
                dashboard-host: '0.0.0.0'
                # The head pod is CPU-only.
                num-gpus: '0'
              serviceType: ClusterIP
              template:
                spec:
                  containers:
                    - env:
                        - name: MY_POD_IP
                          valueFrom:
                            fieldRef:
                              fieldPath: status.podIP
                      image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                      imagePullPolicy: Always
                      lifecycle:
                        # Stop Ray before the container is terminated.
                        preStop:
                          exec:
                            command:
                              - /bin/sh
                              - -c
                              - ray stop
                      name: ray-head
                      ports:
                        - containerPort: 6379
                          name: gcs
                        # Port name referenced by the Route's targetPort.
                        - containerPort: 8265
                          name: dashboard
                        - containerPort: 10001
                          name: client
                      resources:
                        limits:
                          cpu: 2
                          memory: 8G
                          nvidia.com/gpu: 0
                        requests:
                          cpu: 2
                          memory: 8G
                          nvidia.com/gpu: 0
            # Matches the Ray release named in the container image tag
            # (ray2.1.0). The previous value, 1.12.0, is the PyTorch version
            # from that same tag, not a Ray version.
            rayVersion: '2.1.0'
            workerGroupSpecs:
              - groupName: small-group-finetuneflan
                # min == max == replicas: the group is pinned at two workers.
                maxReplicas: 2
                minReplicas: 2
                rayStartParams:
                  block: 'true'
                  num-gpus: '1'
                replicas: 2
                template:
                  metadata:
                    # NOTE(review): "key: value" looks like template
                    # placeholder data; confirm whether it is needed.
                    annotations:
                      key: value
                    labels:
                      key: value
                  spec:
                    containers:
                      - env:
                          - name: MY_POD_IP
                            valueFrom:
                              fieldRef:
                                fieldPath: status.podIP
                        image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                        lifecycle:
                          # Stop Ray before the container is terminated.
                          preStop:
                            exec:
                              command:
                                - /bin/sh
                                - -c
                                - ray stop
                        name: machine-learning
                        resources:
                          limits:
                            cpu: 2
                            memory: 8G
                            nvidia.com/gpu: 1
                          requests:
                            cpu: 1
                            memory: 2G
                            nvidia.com/gpu: 1
                    initContainers:
                      # Blocks worker start-up until the DNS name
                      # $RAY_IP.<pod namespace>.svc.cluster.local resolves.
                      # RAY_IP is not set in this manifest; presumably it is
                      # injected by the Ray operator (TODO confirm).
                      - command:
                          - sh
                          - -c
                          - >-
                            until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
                            do echo waiting for myservice; sleep 2; done
                        image: busybox:1.28
                        name: init-myservice
        replicas: 1
      - generictemplate:
          apiVersion: route.openshift.io/v1
          kind: Route
          metadata:
            labels:
              odh-ray-cluster-service: finetuneflan-head-svc
            name: ray-dashboard-finetuneflan
            namespace: default
          spec:
            port:
              targetPort: dashboard
            # The target Service is not defined in this file; presumably it is
            # created for the RayCluster head by the operator (TODO confirm).
            to:
              kind: Service
              name: finetuneflan-head-svc
        # Was "replica"; spelled "replicas" to match the RayCluster item above.
        replicas: 1
    Items: []
Loading

0 comments on commit 2ce1e71

Please sign in to comment.