# A unique identifier for the head node and workers of this cluster.
cluster_name: spacebeaver
# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 10
# The autoscaler will scale up the cluster faster with a higher upscaling speed.
# E.g., if the workload requires more nodes, the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed * currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
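# A typical lifecycle with the Ray cluster launcher CLI (a sketch; it assumes
# `ray` is installed locally and kubectl's current context points at the
# target Kubernetes cluster):
#   ray up raycluster.yaml        # create or update the cluster
#   ray attach raycluster.yaml    # open a shell on the head pod
#   ray down raycluster.yaml      # tear everything back down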
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes
  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true
  # Namespace to use for all resources created.
  namespace: spacebeaver
  # ServiceAccount created by the autoscaler for the head node pod that it
  # runs in. If this field isn't provided, the head pod config below must
  # contain a user-created service account with the proper permissions.
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: autoscaler
  # Role created by the autoscaler for the head node pod that it runs in.
  # If this field isn't provided, the role referenced in
  # autoscaler_role_binding must exist and have at least these permissions.
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: autoscaler
    rules:
      - apiGroups: [""]
        resources: ["pods", "pods/status", "pods/exec", "services", "certificates"]
        verbs: ["get", "watch", "list", "create", "delete", "patch"]
  # RoleBinding created by the autoscaler for the head node pod that it runs
  # in. If this field isn't provided, the head pod config below must contain
  # a user-created service account with the proper permissions.
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: autoscaler
    subjects:
      - kind: ServiceAccount
        name: autoscaler
    roleRef:
      kind: Role
      name: autoscaler
      apiGroup: rbac.authorization.k8s.io
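  # To check that the RBAC above took effect (a sketch, assuming the
  # resources were created in the spacebeaver namespace as configured):
  #   kubectl -n spacebeaver auth can-i create pods \
  #     --as=system:serviceaccount:spacebeaver:autoscaler
  # should print "yes" once the Role and RoleBinding exist.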
  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: ray-ray-head
      spec:
        # This selector must match the head node pod's labels below.
        selector:
          component: ray-ray-head
        ports:
          - name: client
            protocol: TCP
            port: 10001
            targetPort: 10001
          - name: dashboard
            protocol: TCP
            port: 8265
            targetPort: 8265
          - name: ray-serve
            protocol: TCP
            port: 8000
            targetPort: 8000
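# To reach the head node services from a workstation (a sketch; 8265 is the
# dashboard and 10001 the Ray Client port declared above):
#   kubectl -n spacebeaver port-forward service/ray-ray-head 8265:8265 10001:10001
# and then, from Python on that workstation:
#   import ray
#   ray.init("ray://127.0.0.1:10001")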
# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
available_node_types:
  worker_node:
    min_workers: 0
    max_workers: 10
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: ray-ray-worker-
      # See https://github.com/ray-project/ray/issues/21706
      resources: {"CPU": 2, "GPU": 0, "memory": "8g"}
      # The autoscaler doesn't pick the CPU count up from the resources
      # above, so it has to be set here as well.
      rayStartParams:
        num-cpus: 2
      spec:
        serviceAccount: podsa
        serviceAccountName: podsa
        restartPolicy: Never
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: holdenk/messaging-backend:0.1a
            command: ["/bin/bash", "-c", "--"]
            args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
            envFrom:
              - configMapRef:
                  name: spacebeaver-web-config
              - secretRef:
                  name: spacebeaver-secret
              - secretRef:
                  name: spacebeaver-ray-secret
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
              - name: POD_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.podIP
            # This volume allocates shared memory for Ray to use for its
            # plasma object store. If you do not provide this, Ray will fall
            # back to /tmp, which causes slowdowns if it is not a
            # shared-memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 2
                memory: 8G
              limits:
                cpu: 2
                memory: 9G
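  # Once a worker joins, the resources declared above should be visible from
  # any node (a sketch; run it from the head pod, for example):
  #   python -c "import ray; ray.init(address='auto'); print(ray.cluster_resources())"
  # The output should include the CPU/GPU/memory totals the autoscaler
  # registered for each pod.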
  head_node:
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: ray-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
          component: ray-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: autoscaler
        restartPolicy: OnFailure
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared-memory volume.
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: holdenk/messaging-backend:0.1a
            # Do not change this command - it keeps the pod alive until it is
            # explicitly killed.
            command: ["/bin/bash", "-c", "--"]
            args: ['trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;']
            ports:
              - containerPort: 6379  # Redis port
              - containerPort: 10001  # Used by Ray Client
              - containerPort: 8265  # Used by Ray Dashboard
            envFrom:
              - configMapRef:
                  name: spacebeaver-web-config
              - secretRef:
                  name: spacebeaver-secret
              - secretRef:
                  name: spacebeaver-ray-secret
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
              - name: POD_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.podIP
            # This mount exposes the shared-memory volume declared above at
            # /dev/shm inside the container.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 2
                memory: 8G
              limits:
                cpu: 2
                memory: 8G
# Commands to start Ray on the head node. You don't need to change these.
# Note that dashboard-host is set to 0.0.0.0 so that Kubernetes can port
# forward to the dashboard.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
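# Two convenient ways to watch the cluster once the head is up (a sketch,
# using the standard cluster launcher commands):
#   ray monitor raycluster.yaml            # tail the autoscaler logs
#   ray exec raycluster.yaml 'ray status'  # one-shot view of nodes/resources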
# Commands to start Ray on worker nodes. Normally you wouldn't need to change
# these, but we do here to work around limitations in Ray's CPU discovery on
# modern Kubernetes.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
# If we didn't have the packages pre-installed in the container, we could
# bootstrap them here.
#head_setup_commands: ["pip install aiosmtpd 'protobuf<4.0.0,>=3.15.3' 'Django>=3.2.14' psycopg2-binary"]
#worker_setup_commands: ["pip install aiosmtpd 'protobuf<4.0.0,>=3.15.3' 'Django>=3.2.14' psycopg2-binary"]
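# A quick end-to-end check once the cluster is up (a sketch; <head-pod> is
# the generated ray-ray-head-* pod name, and it assumes Ray Serve is
# available in the holdenk/messaging-backend image):
#   kubectl -n spacebeaver exec -it <head-pod> -- python
#   >>> from ray import serve
#   >>> serve.start(detached=True, http_options={"host": "0.0.0.0"})
# after which the 'ray-serve' port (8000) on the ray-ray-head service should
# accept HTTP traffic.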