# A unique identifier for the head node and workers of this cluster.
cluster_name: spacebeaver
# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 10
# The autoscaler will scale up the cluster faster with a higher upscaling speed.
# E.g., if the workload requires more nodes, the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed * currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
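# A typical lifecycle with the Ray cluster launcher CLI (a sketch; it assumes
# `ray` is installed locally and kubectl's current context points at the
# target Kubernetes cluster):
#   ray up raycluster.yaml        # create or update the cluster
#   ray attach raycluster.yaml    # open a shell on the head pod
#   ray down raycluster.yaml      # tear everything back down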
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes
  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true
  # Namespace to use for all resources created.
  namespace: spacebeaver
  # ServiceAccount created by the autoscaler for the head node pod that it
  # runs in. If this field isn't provided, the head pod config below must
  # contain a user-created service account with the proper permissions.
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: autoscaler
  # Role created by the autoscaler for the head node pod that it runs in.
  # If this field isn't provided, the role referenced in
  # autoscaler_role_binding must exist and have at least these permissions.
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: autoscaler
    rules:
      - apiGroups: [""]
        resources: ["pods", "pods/status", "pods/exec", "services", "certificates"]
        verbs: ["get", "watch", "list", "create", "delete", "patch"]
  # RoleBinding created by the autoscaler for the head node pod that it runs
  # in. If this field isn't provided, the head pod config below must contain
  # a user-created service account with the proper permissions.
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: autoscaler
    subjects:
      - kind: ServiceAccount
        name: autoscaler
    roleRef:
      kind: Role
      name: autoscaler
      apiGroup: rbac.authorization.k8s.io
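  # To check that the RBAC above took effect (a sketch, assuming the
  # resources were created in the spacebeaver namespace as configured):
  #   kubectl -n spacebeaver auth can-i create pods \
  #     --as=system:serviceaccount:spacebeaver:autoscaler
  # should print "yes" once the Role and RoleBinding exist.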
  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: ray-ray-head
      spec:
        # This selector must match the head node pod's labels below.
        selector:
          component: ray-ray-head
        ports:
          - name: client
            protocol: TCP
            port: 10001
            targetPort: 10001
          - name: dashboard
            protocol: TCP
            port: 8265
            targetPort: 8265
          - name: ray-serve
            protocol: TCP
            port: 8000
            targetPort: 8000
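# To reach the head node services from a workstation (a sketch; 8265 is the
# dashboard and 10001 the Ray Client port declared above):
#   kubectl -n spacebeaver port-forward service/ray-ray-head 8265:8265 10001:10001
# and then, from Python on that workstation:
#   import ray
#   ray.init("ray://127.0.0.1:10001")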
# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
available_node_types:
  worker_node:
    min_workers: 0
    max_workers: 10
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: ray-ray-worker-
      # See https://github.com/ray-project/ray/issues/21706
      resources: {"CPU": 2, "GPU": 0, "memory": "8g"}
      # The autoscaler doesn't pick the CPU count up from the resources
      # above, so it has to be set here as well.
      rayStartParams:
        num-cpus: 2
      spec:
        serviceAccount: podsa
        serviceAccountName: podsa
        restartPolicy: Never
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: holdenk/messaging-backend:0.1a
            command: ["/bin/bash", "-c", "--"]
            args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
            envFrom:
              - configMapRef:
                  name: spacebeaver-web-config
              - secretRef:
                  name: spacebeaver-secret
              - secretRef:
                  name: spacebeaver-ray-secret
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
              - name: POD_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.podIP
            # This volume allocates shared memory for Ray to use for its
            # plasma object store. If you do not provide this, Ray will fall
            # back to /tmp, which causes slowdowns if it is not a
            # shared-memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 2
                memory: 8G
              limits:
                cpu: 2
                memory: 9G
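  # Once a worker joins, the resources declared above should be visible from
  # any node (a sketch; run it from the head pod, for example):
  #   python -c "import ray; ray.init(address='auto'); print(ray.cluster_resources())"
  # The output should include the CPU/GPU/memory totals the autoscaler
  # registered for each pod.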
  head_node:
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: ray-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
          component: ray-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: autoscaler
        restartPolicy: OnFailure
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared-memory volume.
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: holdenk/messaging-backend:0.1a
            # Do not change this command - it keeps the pod alive until it is
            # explicitly killed.
            command: ["/bin/bash", "-c", "--"]
            args: ['trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;']
            ports:
              - containerPort: 6379  # Redis port
              - containerPort: 10001  # Used by Ray Client
              - containerPort: 8265  # Used by Ray Dashboard
            envFrom:
              - configMapRef:
                  name: spacebeaver-web-config
              - secretRef:
                  name: spacebeaver-secret
              - secretRef:
                  name: spacebeaver-ray-secret
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
              - name: POD_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.podIP
            # This mount exposes the shared-memory volume declared above at
            # /dev/shm inside the container.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 2
                memory: 8G
              limits:
                cpu: 2
                memory: 8G
# Commands to start Ray on the head node. You don't need to change these.
# Note that dashboard-host is set to 0.0.0.0 so that Kubernetes can port
# forward to the dashboard.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
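# Two convenient ways to watch the cluster once the head is up (a sketch,
# using the standard cluster launcher commands):
#   ray monitor raycluster.yaml            # tail the autoscaler logs
#   ray exec raycluster.yaml 'ray status'  # one-shot view of nodes/resources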
# Commands to start Ray on worker nodes. Normally you wouldn't need to change
# these, but we do here to work around limitations in Ray's CPU discovery on
# modern Kubernetes.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
# If we didn't have the packages pre-installed in the container, we could
# bootstrap them here.
#head_setup_commands: ["pip install aiosmtpd 'protobuf<4.0.0,>=3.15.3' 'Django>=3.2.14' psycopg2-binary"]
#worker_setup_commands: ["pip install aiosmtpd 'protobuf<4.0.0,>=3.15.3' 'Django>=3.2.14' psycopg2-binary"]
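# A quick end-to-end check once the cluster is up (a sketch; <head-pod> is
# the generated ray-ray-head-* pod name, and it assumes Ray Serve is
# available in the holdenk/messaging-backend image):
#   kubectl -n spacebeaver exec -it <head-pod> -- python
#   >>> from ray import serve
#   >>> serve.start(detached=True, http_options={"host": "0.0.0.0"})
# after which the 'ray-serve' port (8000) on the ray-ray-head service should
# accept HTTP traffic.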