Commit 35bff37

add public Docker setup for ray cluster

Signed-off-by: Jack Luar <[email protected]>
1 parent 2eca2ac

7 files changed: +268 −18 lines changed
tools/AutoTuner/.gitignore (+4)

@@ -10,3 +10,7 @@ __pycache__/
 # Autotuner env
 autotuner_env
 .env
+
+# Ray distributed
+public.yaml
+private.yaml
tools/AutoTuner/distributed/.env.sample (+2)

@@ -0,0 +1,2 @@
+DOCKERHUB_USERNAME={{DOCKERHUB_USERNAME}}
+DOCKERHUB_PASSWORD={{DOCKERHUB_PASSWORD}}
tools/AutoTuner/distributed/Dockerfile (+10)

@@ -0,0 +1,10 @@
+ARG BASE_TAG
+FROM openroad/flow-ubuntu22.04-builder:${BASE_TAG:-latest}
+
+# Install AT required packages
+RUN rm -rf ~/.cache/pip
+RUN pip3 cache purge
+RUN pip3 install --no-cache-dir -r /OpenROAD-flow-scripts/tools/AutoTuner/requirements.txt
+
+# ORFS installation dir
+WORKDIR /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner

tools/AutoTuner/distributed/Makefile (+39)

@@ -0,0 +1,39 @@
+.PHONY: clean
+include .env
+export
+
+init:
+	@echo "Setting up environment..."
+	@../installer.sh
+
+clean:
+	@echo "Cleaning up old images"
+	@docker rmi orfs-autotuner:latest
+
+base:
+	@echo "Building base image..."
+	@cd ../../../ && ./build_openroad.sh
+
+docker:
+	@echo "Building docker image..."
+	@export BASE_TAG=$(shell cd ../../../ && ./etc/DockerTag.sh -dev) && \
+	echo "Base image tag: $$BASE_TAG" && \
+	docker build -t orfs-autotuner:latest -f Dockerfile . --build-arg BASE_TAG=$$BASE_TAG && \
+	docker tag orfs-autotuner:latest orfs-autotuner:$$BASE_TAG
+
+upload:
+	@echo "Uploading docker image..."
+	@docker login -u $(DOCKERHUB_USERNAME) -p $(DOCKERHUB_PASSWORD)
+	@export BASE_TAG=$(shell cd ../../../ && ./etc/DockerTag.sh -dev) && \
+	echo "Base image: $$BASE_TAG" && \
+	docker tag orfs-autotuner:latest ${DOCKERHUB_USERNAME}/orfs-autotuner:$$BASE_TAG && \
+	docker push ${DOCKERHUB_USERNAME}/orfs-autotuner:$$BASE_TAG
+	@docker logout
+
+up:
+	@echo "Starting Ray cluster..."
+	@. .venv/bin/activate && ray up -y public.yaml
+
+down:
+	@echo "Stopping Ray cluster..."
+	@. .venv/bin/activate && ray down -y public.yaml

tools/AutoTuner/distributed/NOTES.md (+26)

@@ -0,0 +1,26 @@
+1) Set up two AT instances on the same internal network
+2) Install the requirements:
+
+```
+sudo apt-get install -y python3-pip python3-venv
+python3 -m venv .venv
+. .venv/bin/activate && pip install ray[tune]
+```
+
+3) Common setup script
+   - `at_distributed.sh`
+
+4) Worker script
+   - `at_worker.py`
+   - `mkdir -p /tmp/owo && touch /tmp/owo/abc`
+
+5) Benchmark file transfers (run on the worker)
+   - Observation: `sync_dir` only ensures the files are in sync, so a neat feature is that only file diffs are transferred.
+   - You do not have to create the destination directory; `sync_dir` does that for you.
+   - `max_size_bytes` is capped at 1 GiB, so the restriction has to be lifted manually if needed.
+   - The bottleneck appears to start at transfers of 1 GiB and above:
+   - `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=100` creates a 100 MB file (time taken: 2.21 ± 0.56 s)
+   - `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=1000` creates a 1 GB file (time taken: 8.90 ± 0.65 s)
+   - `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=5000` creates a 5 GB file (time taken: 54.92 ± 1.05 s)

tools/AutoTuner/distributed/README.md (+41 −18)

@@ -1,26 +1,49 @@
-1) Setup two AT instances on same internal network
-2) Setup the requirements
+# Ray Cluster Setup on Google Cloud Platform (GCP)
 
+This tutorial covers the setup of Ray Clusters on GCP. Ray Clusters are a way to
+run compute-intensive jobs (e.g. AutoTuner) on a distributed set of nodes spawned
+automatically. For more information on Ray Clusters, refer to the [Ray documentation](https://docs.ray.io/en/latest/cluster/getting-started.html).
+
+To run AutoTuner jobs on a Ray Cluster, we first have to install ORFS onto the
+GCP nodes.
+
+There are two ways to set up ORFS on a Ray Cluster, namely:
+- [Public](#public-cluster-setup): Upload a Docker image to Dockerhub (or any public Docker registry).
+- [Private](#private-cluster-setup): Upload local code and re-compile on the nodes.
+
+## Prerequisites
+
+Make sure the AutoTuner prerequisites are installed. To do so, refer to the installation script.
+
+```bash
+make init
 ```
-sudo apt-get install -y python3-pip python3-venv
-python3 -m venv .venv
-.venv/bin/activate && pip install ray[tune]
 
+## Public cluster setup
+
+1. Set up `.env` with your Docker registry username/password. Also, set up the `public.yaml`
+file according to your desired specifications.
+
+```bash
+cp .env.sample .env
+cp public.yaml.template public.yaml
 ```
 
-3) Common setup script
-- `at_distributed.sh`
+2. Run the following commands to build, tag and upload the public image:
+
+```bash
+make clean
+make base
+make docker
+make upload
+```
 
-4) Worker script
-- `at_worker.py`
-- `mkdir -p /tmp/owo && touch /tmp/owo/abc`
+3. Launch your cluster as follows:
+
+```bash
+make up
+```
 
+## Private cluster setup
 
-5) Benchmark file transfers (do on worker)
-- Observation: sync_dir just makes sure the files are sync-ed. So neat feature is that only file diffs are transffered.
-- You do not have to create the dest_dir, sync_dir does that for you.
-- `max_size_bytes` is limited to 1GiB. So we have to lift up the restriction manually if needed.
-- Bottleneck seems to start at 1GiB transfers and above
-- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=100` - creates 100MB file. (Time taken: 2.21 ± 0.56)
-- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=1000` - creates 1Gb file. (Time taken: 8.90 ± 0.65)
-- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=5000` - creates 5Gb file. (Time taken: 54.92 ± 1.05)
+Coming soon.
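Once `make up` succeeds, the standard Ray cluster-launcher CLI (installed in the `.venv` created earlier) can be pointed at the same `public.yaml` to interact with the running cluster. A few illustrative commands, not part of this commit:

```shell
# Open an interactive shell on the head node
ray attach public.yaml

# Run a one-off command on the head node, e.g. check autoscaler state
ray exec public.yaml "ray status"

# Tail the autoscaler logs while the cluster scales up/down
ray monitor public.yaml
```

These complement the `make up` / `make down` targets in the Makefile, which wrap `ray up` and `ray down`.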
tools/AutoTuner/distributed/public.yaml.template (+146)

@@ -0,0 +1,146 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: default
+
+# The maximum number of worker nodes to launch in addition to the head
+# node.
+max_workers: 2
+
+# The autoscaler will scale up the cluster faster with higher upscaling speed.
+# E.g., if the task requires adding more nodes then autoscaler will gradually
+# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
+# This number should be > 0.
+upscaling_speed: 1.0
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "orfs-autotuner:latest"
+    container_name: "ray_container"
+    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
+    # if no cached version is present.
+    pull_before_run: false
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: gcp
+    region: us-west1
+    availability_zone: us-west1-a
+    project_id: foss-fpga-tools-ext-openroad
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+
+available_node_types:
+    ray_head_default:
+        resources: {"CPU": 2}
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                    diskSizeGb: 50
+                    sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922
+    ray_worker_small:
+        # The minimum number of worker nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 1
+        # The maximum number of worker nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The resources provided by this node type.
+        resources: {"CPU": 2}
+        node_config:
+            machineType: n1-standard-2
+            disks:
+              - boot: true
+                autoDelete: true
+                type: PERSISTENT
+                initializeParams:
+                    diskSizeGb: 50
+                    sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922
+            # scheduling:
+            #   - preemptible: true
+            # Un-comment this to launch workers with the Service Account of the Head Node
+            # serviceAccounts:
+            #   - email: ray-autoscaler-sa-v1@<project_id>.iam.gserviceaccount.com
+            #     scopes:
+            #       - https://www.googleapis.com/auth/cloud-platform
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray_head_default
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# Patterns for files to exclude when running rsync up or rsync down
+rsync_exclude:
+    - "**/.git"
+    - "**/.git/**"
+
+# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
+# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
+# as a value, the behavior will match git's behavior for finding and using .gitignore files.
+rsync_filter:
+    - ".gitignore"
+
+initialization_commands:
+    - curl -fsSL https://get.docker.com -o get-docker.sh
+    - sudo sh get-docker.sh
+    - sudo usermod -aG docker $USER
+    - sudo systemctl restart docker -f
+
+# List of shell commands to run to set up nodes.
+setup_commands: []
+    # Note: if you're developing Ray, you probably want to create a Docker image that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
+    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
+    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install google-api-python-client==1.7.8
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - >-
+      ray start
+      --head
+      --port=6379
+      --object-manager-port=8076
+      --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - >-
+      ray start
+      --address=$RAY_HEAD_IP:6379
+      --object-manager-port=8076
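Before running `ray up` on a copied `public.yaml`, it can be worth sanity-checking the edited file. A minimal sketch using PyYAML (a Ray dependency); the `check_cluster_config` helper and the chosen fields are illustrative, not part of this commit:

```python
import yaml


def check_cluster_config(text):
    """Parse a Ray cluster config and surface a few fields worth eyeballing."""
    cfg = yaml.safe_load(text)
    # The head node type must be one of the declared node types.
    assert cfg["head_node_type"] in cfg["available_node_types"], \
        "head_node_type not found in available_node_types"
    return {
        "cluster_name": cfg["cluster_name"],
        "max_workers": cfg["max_workers"],
        "docker_image": cfg["docker"]["image"],
    }


# Usage:
# with open("public.yaml") as f:
#     print(check_cluster_config(f.read()))
```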
