From 2ce1e718d6c11462d0dd87ec93ff5ceaa2e7d277 Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Thu, 25 May 2023 14:56:49 +0000 Subject: [PATCH 1/5] Add ray experiments remove creds --- notebooks/ray-experiments/finetuneflan.yaml | 155 ++++ notebooks/ray-experiments/ray-flantune.ipynb | 797 +++++++++++++++++++ 2 files changed, 952 insertions(+) create mode 100644 notebooks/ray-experiments/finetuneflan.yaml create mode 100644 notebooks/ray-experiments/ray-flantune.ipynb diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml new file mode 100644 index 0000000..2cee801 --- /dev/null +++ b/notebooks/ray-experiments/finetuneflan.yaml @@ -0,0 +1,155 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: finetuneflan + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 1 + replicas: 2 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 1 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: finetuneflan + controller-tools.k8s.io: '1.0' + name: finetuneflan + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: 
ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + rayVersion: 1.12.0 + workerGroupSpecs: + - groupName: small-group-finetuneflan + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '1' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 1 + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: finetuneflan-head-svc + name: ray-dashboard-finetuneflan + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: finetuneflan-head-svc + replica: 1 + Items: [] diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb new file mode 100644 index 0000000..4275c59 --- /dev/null +++ b/notebooks/ray-experiments/ray-flantune.ipynb @@ -0,0 +1,797 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bbc21043", + "metadata": {}, + "source": [ + "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." 
+ ] + }, + { + "cell_type": "markdown", + "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa", + "metadata": {}, + "source": [ + "To Do: I tried adding the flan code in the interactive notebook but hit some errors. They need to be resolved to see if we can run the training in a distributed manner. The bitsandbytes package doesn't work because of CUDA and Pytorch version." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", + "from codeflare_sdk.cluster.auth import TokenAuthentication" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "614daa0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. 
You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create authentication object for oc user permissions\n", + "auth = TokenAuthentication(\n", + " token = \"XX\",\n", + " server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n", + " skip_tls=False\n", + ")\n", + "auth.login()" + ] + }, + { + "cell_type": "markdown", + "id": "bc27f84c", + "metadata": {}, + "source": [ + "Once again, let's start by running through the same cluster setup as before:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0f4bc870-091f-4e11-9642-cba145710159", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to: finetuneflan.yaml\n" + ] + } + ], + "source": [ + "# Create and configure our cluster object (and appwrapper)\n", + "cluster = Cluster(ClusterConfiguration(\n", + " name='finetuneflan',\n", + " namespace='default',\n", + " min_worker=2,\n", + " max_worker=2,\n", + " min_cpus=1,\n", + " max_cpus=2,\n", + " min_memory=2,\n", + " max_memory=8,\n", + " gpu=1,\n", + " instascale=False,\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for requested resources to be set up...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m 
2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:225\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 225\u001b[0m \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# Bring up the cluster\n", + "cluster.up()\n", + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "df71c1ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                     🚀 CodeFlare Cluster Details 🚀                     \n",
+       "                                                                         \n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   Name                                                              │ \n",
+       " │   finetuneflan                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   URI: ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   Dashboard🔗                                                       │ \n",
+       " │                                                                     │ \n",
+       " │                      Cluster Resources                              │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │          │ \n",
+       " │   │            │  │                                      │          │ \n",
+       " │   │  2    2    │  │  2~8         1           1           │          │ \n",
+       " │   │            │  │                                      │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mfinetuneflan\u001b[0m Inactive โŒ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=510497;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", + " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", + " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 
\u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "RayCluster(name='finetuneflan', status=, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "markdown", + "id": "33663f47", + "metadata": {}, + "source": [ + "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n", + "\n", + "Using the SDK, we can get both the Ray cluster URI and dashboard URI:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c1719bca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n", + "ray://finetuneflan-head-svc.default.svc:10001\n" + ] + } + ], + "source": [ + "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n", + "ray_cluster_uri = 
cluster.cluster_uri()\n", + "print(ray_dashboard_uri)\n", + "print(ray_cluster_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "2a2aca6a", + "metadata": {}, + "source": [ + "Now we can connect directly to our Ray cluster via the Ray python client:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "300146dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ray cluster is up and running: True\n" + ] + } + ], + "source": [ + "#before proceeding make sure the cluster exists and the uri is not empty\n", + "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", + "\n", + "import ray\n", + "from ray.air.config import ScalingConfig\n", + "\n", + "# reset the ray context in case there's already one. \n", + "ray.shutdown()\n", + "# establish connection to ray cluster\n", + "\n", + "#install additionall libraries that will be required for model training\n", + "runtime_env = {\"pip\": [\"transformers\",\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " \"pyarrow<7.0.0\",\n", + " \"accelerate\",\n", + " \"bitsandbytes\",\n", + " \"loralib\",\n", + " \"py7zr\",\n", + " \"tensorboard\",\n", + " \"peft\"], \n", + " \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n", + "\n", + "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n", + "\n", + "print(\"Ray cluster is up and running: \", ray.is_initialized())" + ] + }, + { + "cell_type": "markdown", + "id": "9711030b", + "metadata": {}, + "source": [ + "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1b36e0d9", + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def train_fn():\n", + " from datasets import load_dataset\n", + " import transformers\n", + " from 
transformers import AutoTokenizer, TrainingArguments\n", + " from transformers import AutoModelForSequenceClassification\n", + " import numpy as np\n", + " from datasets import load_metric\n", + " import ray\n", + " from ray import tune\n", + " from ray.train.huggingface import HuggingFaceTrainer\n", + " \n", + " from datasets import load_dataset, concatenate_datasets\n", + " from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", + " from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType\n", + "\n", + " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n", + "\n", + " #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " \n", + " dataset = load_dataset(\"samsum\")\n", + "\n", + " print(f\"Train dataset size: {len(dataset['train'])}\")\n", + " print(f\"Test dataset size: {len(dataset['test'])}\")\n", + " \n", + " #### COMPUTE MAX SEQ LEN ##########\n", + " # The maximum total input sequence length after tokenization.\n", + " # Sequences longer than this will be truncated, sequences shorter will be padded.\n", + " conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n", + "\n", + " \n", + " tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n", + " truncation=True),\n", + " batched=True,\n", + " remove_columns=[\"dialogue\", \"summary\"])\n", + " \n", + " input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n", + " # take 85 percentile of max length for better utilization\n", + " max_source_length = int(np.percentile(input_lengths, 85))\n", + " print(f\"Max source length: {max_source_length}\")\n", + "\n", + " # The maximum total sequence length for target text after tokenization.\n", + " # Sequences longer than this will be truncated, sequences shorter will be padded.\"\n", + " tokenized_targets = conc_dataset.map(lambda x: 
tokenizer(x[\"dialogue\"],\n", + " truncation=True),\n", + " batched=True,\n", + " remove_columns=[\"dialogue\", \"summary\"]) \n", + " target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n", + " # take 90 percentile of max length for better utilization\n", + " max_target_length = int(np.percentile(target_lengths, 90))\n", + " print(f\"Max target length: {max_target_length}\")\n", + " \n", + " #### PREPROCESS DATA ##########\n", + " \n", + " def preprocess_function(sample,padding=\"max_length\"):\n", + " # add prefix to the input for t5\n", + " inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n", + "\n", + " # tokenize inputs\n", + " model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n", + "\n", + " # Tokenize targets with the `text_target` keyword argument\n", + " labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n", + "\n", + " # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n", + " # padding in the loss.\n", + " if padding == \"max_length\":\n", + " labels[\"input_ids\"] = [\n", + " [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n", + " ]\n", + "\n", + " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", + " return model_inputs\n", + "\n", + " tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n", + " print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n", + "\n", + " ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n", + " ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n", + "\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return 
metric.compute(predictions=predictions, references=labels)\n", + "\n", + " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", + " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n", + " model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n", + " lora_config = LoraConfig(\n", + " r=16,\n", + " lora_alpha=32,\n", + " target_modules=[\"q\", \"v\"],\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=TaskType.SEQ_2_SEQ_LM\n", + " )\n", + " # prepare int-8 model for training\n", + " model = prepare_model_for_int8_training(model)\n", + "\n", + " # add LoRA adaptor\n", + " model = get_peft_model(model, lora_config)\n", + " model.print_trainable_parameters()\n", + " \n", + " from transformers import DataCollatorForSeq2Seq\n", + "\n", + " # we want to ignore tokenizer pad token in the loss\n", + " label_pad_token_id = -100\n", + " # Data collator\n", + " data_collator = DataCollatorForSeq2Seq(\n", + " tokenizer,\n", + " model=model,\n", + " label_pad_token_id=label_pad_token_id,\n", + " pad_to_multiple_of=8\n", + " )\n", + " \n", + " output_dir=\"/tmp/flan/test\"\n", + "\n", + " # Define training args\n", + " training_args = Seq2SeqTrainingArguments(\n", + " output_dir=output_dir,\n", + " auto_find_batch_size=True,\n", + " learning_rate=1e-3, # higher learning rate\n", + " num_train_epochs=5,\n", + " logging_dir=f\"{output_dir}/logs\",\n", + " logging_strategy=\"steps\",\n", + " logging_steps=500,\n", + " save_strategy=\"no\",\n", + " report_to=\"tensorboard\",\n", + " )\n", + "\n", + " trainer = Seq2SeqTrainer(model=model,\n", + " args=training_args,\n", + " data_collator=data_collator,\n", + " train_dataset=tokenized_dataset[\"train\"])\n", + " \n", + " return trainer\n", + "\n", + " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", + "\n", + " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface 
Trainer. Both have the same method signature. \n", + " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", + " trainer = HuggingFaceTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " scaling_config=scaling_config,\n", + " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " )\n", + " result = trainer.fit()" + ] + }, + { + "cell_type": "markdown", + "id": "d4d8fd65", + "metadata": {}, + "source": [ + "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5901d958", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ===================================BUG REPORT===================================\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Welcome to bitsandbytes. 
For bug reports, please run\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m python -m bitsandbytes\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ================================================================================\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m warn(\"The installed version of bitsandbytes was compiled without GPU support. 
\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m To disable this warning, you can either:\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n", + " 0%| | 0/3 [00:00 2\u001b[0m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m 101\u001b[0m \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 432\u001b[0m op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m 460\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m 464\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 465\u001b[0m ):\n\u001b[1;32m 466\u001b[0m size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n", + 
"\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n self.setup(copy.deepcopy(self.config))\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n setup_kwargs[k] = parameter_registry.get(prefix + k)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n table = _memory_mapped_arrow_table_from_file(path)\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n memory_mapped_stream = pa.memory_map(filename)\n File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_be877_00000]\n", + "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tINFO tune.py:777 -- Total run time: 5.50 seconds (5.39 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "#call the above cell as a remote ray function\n", + "ray.get(train_fn.remote())" + ] + }, + { + "cell_type": "markdown", + "id": "5af8cd32", + "metadata": {}, + "source": [ + "Once complete, we can bring our Ray cluster down and clean up:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'cluster' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cluster\u001b[38;5;241m.\u001b[39mdown()\n", + "\u001b[0;31mNameError\u001b[0m: name 'cluster' is not defined" + ] + } + ], + "source": [ + "cluster.down()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d41b90e", + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.14", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.14" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f2b4f6759870080cc9b532b07a44f90ce251919a Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Thu, 13 Jul 2023 17:07:12 +0000 Subject: [PATCH 2/5] Add recent changes --- notebooks/ray-experiments/finetuneflan.yaml | 40 +++++++++++++++++++- notebooks/ray-experiments/ray-flantune.ipynb | 39 +++++++++---------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml index 2cee801..dafc03c 100644 --- a/notebooks/ray-experiments/finetuneflan.yaml +++ b/notebooks/ray-experiments/finetuneflan.yaml @@ -1,6 +1,8 @@ apiVersion: mcad.ibm.com/v1beta1 kind: AppWrapper metadata: + labels: + orderedinstance: m5.xlarge_g4dn.xlarge name: finetuneflan namespace: default spec: @@ -56,12 +58,29 @@ spec: serviceType: ClusterIP template: spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: finetuneflan + operator: In + values: + - finetuneflan containers: - env: - name: MY_POD_IP valueFrom: fieldRef: fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 imagePullPolicy: Always lifecycle: @@ -88,7 +107,8 @@ spec: cpu: 2 memory: 8G 
nvidia.com/gpu: 0 - rayVersion: 1.12.0 + imagePullSecrets: [] + rayVersion: 2.1.0 workerGroupSpecs: - groupName: small-group-finetuneflan maxReplicas: 2 @@ -104,12 +124,29 @@ spec: labels: key: value spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: finetuneflan + operator: In + values: + - finetuneflan containers: - env: - name: MY_POD_IP valueFrom: fieldRef: fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 lifecycle: preStop: @@ -128,6 +165,7 @@ spec: cpu: 1 memory: 2G nvidia.com/gpu: 1 + imagePullSecrets: [] initContainers: - command: - sh diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb index 4275c59..bac7a90 100644 --- a/notebooks/ray-experiments/ray-flantune.ipynb +++ b/notebooks/ray-experiments/ray-flantune.ipynb @@ -48,7 +48,7 @@ "source": [ "# Create authentication object for oc user permissions\n", "auth = TokenAuthentication(\n", - " token = \"XX\",\n", + " token = \"XX\",\n", " server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n", " skip_tls=False\n", ")\n", @@ -89,7 +89,8 @@ " min_memory=2,\n", " max_memory=8,\n", " gpu=1,\n", - " instascale=False,\n", + " instascale=True,\n", + " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n", "))" ] }, @@ -120,7 +121,13 @@ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Bring up the 
cluster\u001b[39;00m\n\u001b[1;32m 2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:225\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 225\u001b[0m \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:221\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 219\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ready:\n\u001b[0;32m--> 221\u001b[0m status, ready \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprint_to_console\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m==\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN:\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28mprint\u001b[39m(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWARNING: Current cluster status is unknown, have you run cluster.up yet?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 225\u001b[0m )\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:160\u001b[0m, in \u001b[0;36mCluster.status\u001b[0;34m(self, print_to_console)\u001b[0m\n\u001b[1;32m 158\u001b[0m status \u001b[38;5;241m=\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# check the app wrapper status\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m appwrapper \u001b[38;5;241m=\u001b[39m \u001b[43m_app_wrapper_status\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnamespace\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper:\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 163\u001b[0m AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING,\n\u001b[1;32m 164\u001b[0m AppWrapperStatus\u001b[38;5;241m.\u001b[39mCOMPLETED,\n\u001b[1;32m 165\u001b[0m 
AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING_HOLD_COMPLETION,\n\u001b[1;32m 166\u001b[0m ]:\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:330\u001b[0m, in \u001b[0;36m_app_wrapper_status\u001b[0;34m(name, namespace)\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mproject(namespace), oc\u001b[38;5;241m.\u001b[39mtimeout(\u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m):\n\u001b[0;32m--> 330\u001b[0m cluster \u001b[38;5;241m=\u001b[39m \u001b[43moc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mappwrapper/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mname\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 331\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mOpenShiftPythonException \u001b[38;5;28;01mas\u001b[39;00m osp: \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 332\u001b[0m msg \u001b[38;5;241m=\u001b[39m osp\u001b[38;5;241m.\u001b[39mmsg\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:403\u001b[0m, in \u001b[0;36mSelector.object\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mobject\u001b[39m(\u001b[38;5;28mself\u001b[39m, ignore_not_found\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 395\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 
396\u001b[0m \u001b[38;5;124;03m Returns a single APIObject that represents the selected resource. If multiple\u001b[39;00m\n\u001b[1;32m 397\u001b[0m \u001b[38;5;124;03m resources are being selected an exception will be thrown (use objects() when\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[38;5;124;03m :return: A Model of the selected resource.\u001b[39;00m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 403\u001b[0m objs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobjects\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_not_found:\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:423\u001b[0m, in \u001b[0;36mSelector.objects\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;124;03mReturns a python list of APIObject objects that represent the selected resources. 
An\u001b[39;00m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;124;03mempty is returned if nothing is selected.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[38;5;124;03m:return: A list of Model objects representing the receiver's selected resources.\u001b[39;00m\n\u001b[1;32m 420\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapiobject\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m APIObject\n\u001b[0;32m--> 423\u001b[0m obj \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43mignore_not_found\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_not_found\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 425\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 426\u001b[0m api_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(obj)\u001b[38;5;241m.\u001b[39melements(\u001b[38;5;28mcls\u001b[39m)\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:380\u001b[0m, in \u001b[0;36mSelector.object_json\u001b[0;34m(self, ignore_not_found)\u001b[0m\n\u001b[1;32m 377\u001b[0m cmd_args\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--ignore-not-found\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 379\u001b[0m r \u001b[38;5;241m=\u001b[39m Result(verb)\n\u001b[0;32m--> 380\u001b[0m r\u001b[38;5;241m.\u001b[39madd_action(\u001b[43moc_action\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mall_namespaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_namespaces\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcmd_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcmd_args\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 381\u001b[0m r\u001b[38;5;241m.\u001b[39mfail_if(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to read object\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 383\u001b[0m \u001b[38;5;66;03m# --ignore-not-found returns an empty string instead of an error if nothing is found\u001b[39;00m\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/action.py:363\u001b[0m, in \u001b[0;36moc_action\u001b[0;34m(context, verb, cmd_args, all_namespaces, no_namespace, namespace, references, stdin_obj, stdin_str, last_attempt, **kwargs)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# ignore\u001b[39;00m\n\u001b[0;32m--> 363\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m period \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(\u001b[38;5;241m1\u001b[39m, period \u001b[38;5;241m+\u001b[39m period) \u001b[38;5;66;03m# Poll fast at first, but slow down to 1/sec over time\u001b[39;00m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;66;03m# See note in paramiko flow on decoding\u001b[39;00m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } @@ -133,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "df71c1ed", "metadata": {}, "outputs": [ @@ -169,7 +176,7 @@ " โ”‚ โ”‚ \n", " โ”‚ \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001 โ”‚ \n", " โ”‚ โ”‚ \n", - " โ”‚ 
\u001b]8;id=510497;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ \u001b]8;id=991912;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", " โ”‚ โ”‚ \n", " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", @@ -190,7 +197,7 @@ "RayCluster(name='finetuneflan', status=, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -737,22 +744,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'cluster' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cluster\u001b[38;5;241m.\u001b[39mdown()\n", - "\u001b[0;31mNameError\u001b[0m: name 'cluster' is not defined" - ] - } - ], + "outputs": [], "source": [ "cluster.down()" ] @@ -770,7 +765,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.14", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -784,7 +779,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.14" + "version": "3.8.13" }, "vscode": { "interpreter": { From 
1bcaf5b84dcbb774c9a62fb28b762135ec9bf5ee Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Thu, 10 Aug 2023 20:29:08 +0000 Subject: [PATCH 3/5] Add initial ray experiments --- .../ray-flan-interactive.ipynb | 3499 +++++++++++++++++ 1 file changed, 3499 insertions(+) create mode 100644 notebooks/ray-experiments/ray-flan-interactive.ipynb diff --git a/notebooks/ray-experiments/ray-flan-interactive.ipynb b/notebooks/ray-experiments/ray-flan-interactive.ipynb new file mode 100644 index 0000000..858894c --- /dev/null +++ b/notebooks/ray-experiments/ray-flan-interactive.ipynb @@ -0,0 +1,3499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bbc21043", + "metadata": {}, + "source": [ + "# Fine tune Flan T5 model using the Codeflare stack and Ray distribution" + ] + }, + { + "cell_type": "markdown", + "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa", + "metadata": {}, + "source": [ + "This notebook fine tunes the flan T5 model with a summarization dataset. It first uses Instascale to add required machines to the Openshift cluster and then uses Codeflare stack to spawn up a ray cluster. Then it uses Ray train api to distribute the training job over multiple nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", + "from codeflare_sdk.cluster.auth import TokenAuthentication" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a066b71b-4967-4d03-8601-c2afb2d0b507", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.1.0'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check ray version: it should match the worker's ray version\n", + "import ray\n", + "ray.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "614daa0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. 
You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create authentication object for oc user permissions\n", + "auth = TokenAuthentication(\n", + " token = \"xxx\",\n", + " server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n", + " skip_tls=False\n", + ")\n", + "auth.login()" + ] + }, + { + "cell_type": "markdown", + "id": "bc27f84c", + "metadata": {}, + "source": [ + "Once again, let's start by running through the same cluster setup as before:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0f4bc870-091f-4e11-9642-cba145710159", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to: finetuneflan.yaml\n" + ] + } + ], + "source": [ + "# Create and configure our cluster object (and appwrapper)\n", + "cluster = Cluster(ClusterConfiguration(\n", + " name='finetuneflan',\n", + " namespace='default',\n", + " min_worker=2,\n", + " max_worker=2,\n", + " min_cpus=1,\n", + " max_cpus=2,\n", + " min_memory=8,\n", + " max_memory=24,\n", + " gpu=1,\n", + " instascale=True,\n", + " machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for requested resources to be set up...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [14], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m 
\u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m 2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:229\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 229\u001b[0m \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 230\u001b[0m time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m 231\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# Bring up the cluster\n", + "cluster.up()\n", + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "df71c1ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                     🚀 CodeFlare Cluster Details 🚀                     \n",
+       "                                                                         \n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   Name                                                              │ \n",
+       " │   finetuneflan                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   URI: ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   Dashboard🔗                                                       │ \n",
+       " │                                                                     │ \n",
+       " │                      Cluster Resources                              │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │  Min  Max  │  │  Memory      CPU         GPU         │          │ \n",
+       " │   │            │  │                                      │          │ \n",
+       " │   │  2    2    │  │  8~24        1           1           │          │ \n",
+       " │   │            │  │                                      │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mfinetuneflan\u001b[0m Inactive โŒ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=384441;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", + " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", + " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m8~24 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 
\u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "RayCluster(name='finetuneflan', status=, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=24, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "markdown", + "id": "33663f47", + "metadata": {}, + "source": [ + "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n", + "\n", + "Using the SDK, we can get both the Ray cluster URI and dashboard URI:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c1719bca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n", + "ray://finetuneflan-head-svc.default.svc:10001\n" + ] + } + ], + "source": [ + "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n", + "ray_cluster_uri = 
cluster.cluster_uri()\n", + "print(ray_dashboard_uri)\n", + "print(ray_cluster_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "2a2aca6a", + "metadata": {}, + "source": [ + "Now we can connect directly to our Ray cluster via the Ray python client:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "300146dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ray cluster is up and running: True\n" + ] + } + ], + "source": [ + "#before proceeding make sure the cluster exists and the uri is not empty\n", + "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", + "\n", + "import ray\n", + "from ray.air.config import ScalingConfig\n", + "\n", + "# reset the ray context in case there's already one. \n", + "ray.shutdown()\n", + "# establish connection to ray cluster\n", + "\n", + "#install additionall libraries that will be required for model training\n", + "runtime_env = {\"pip\": [\"transformers\",\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " \"pyarrow<7.0.0\",\n", + " \"accelerate\",\n", + " \"loralib\",\n", + " \"py7zr\",\n", + " \"tensorboard\",\n", + " \"peft\"], \n", + " \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n", + "\n", + "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n", + "\n", + "print(\"Ray cluster is up and running: \", ray.is_initialized())" + ] + }, + { + "cell_type": "markdown", + "id": "9711030b", + "metadata": {}, + "source": [ + "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1b36e0d9", + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def train_fn():\n", + " from datasets import load_dataset\n", + " import transformers\n", + " from transformers import 
AutoTokenizer, TrainingArguments\n", + " from transformers import AutoModelForSequenceClassification\n", + " import numpy as np\n", + " from datasets import load_metric\n", + " import ray\n", + " from ray import tune\n", + " from ray.train.huggingface import HuggingFaceTrainer\n", + " from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n", + " from datasets import load_dataset, concatenate_datasets\n", + " from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", + " from peft import LoraConfig, get_peft_model, TaskType #, prepare_model_for_int8_training\n", + "\n", + " model_name = \"google/flan-t5-xl\"\n", + "\n", + " #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " \n", + " dataset = load_dataset(\"samsum\")\n", + "\n", + " print(f\"Train dataset size: {len(dataset['train'])}\")\n", + " print(f\"Test dataset size: {len(dataset['test'])}\")\n", + " \n", + " #### COMPUTE MAX SEQ LEN ##########\n", + " # The maximum total input sequence length after tokenization.\n", + " # Sequences longer than this will be truncated, sequences shorter will be padded.\n", + " conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n", + "\n", + " \n", + " tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n", + " truncation=True),\n", + " batched=True,\n", + " remove_columns=[\"dialogue\", \"summary\"])\n", + " \n", + " input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n", + " # take 85 percentile of max length for better utilization\n", + " max_source_length = int(np.percentile(input_lengths, 85))\n", + " print(f\"Max source length: {max_source_length}\")\n", + "\n", + " # The maximum total sequence length for target text after tokenization.\n", + " # Sequences longer than this will be truncated, sequences shorter will be padded.\"\n", + " tokenized_targets = conc_dataset.map(lambda 
x: tokenizer(x[\"dialogue\"],\n", + " truncation=True),\n", + " batched=True,\n", + " remove_columns=[\"dialogue\", \"summary\"]) \n", + " target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n", + " # take 90 percentile of max length for better utilization\n", + " max_target_length = int(np.percentile(target_lengths, 90))\n", + " print(f\"Max target length: {max_target_length}\")\n", + " \n", + " #### PREPROCESS DATA ##########\n", + " \n", + " def preprocess_function(sample,padding=\"max_length\"):\n", + " # add prefix to the input for t5\n", + " inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n", + "\n", + " # tokenize inputs\n", + " model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n", + "\n", + " # Tokenize targets with the `text_target` keyword argument\n", + " labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n", + "\n", + " # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n", + " # padding in the loss.\n", + " if padding == \"max_length\":\n", + " labels[\"input_ids\"] = [\n", + " [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n", + " ]\n", + "\n", + " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", + " return model_inputs\n", + "\n", + " tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n", + " print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n", + "\n", + " ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n", + " ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n", + "\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return 
metric.compute(predictions=predictions, references=labels)\n", + " \n", + " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", + " model_name = \"google/flan-t5-xl\"\n", + " model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map=\"auto\")\n", + " lora_config = LoraConfig(\n", + " r=16,\n", + " lora_alpha=32,\n", + " target_modules=[\"q\", \"v\"],\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=TaskType.SEQ_2_SEQ_LM\n", + " )\n", + " # prepare int-8 model for training\n", + " #model = prepare_model_for_int8_training(model)\n", + "\n", + " # add LoRA adaptor\n", + " model = get_peft_model(model, lora_config)\n", + " model.print_trainable_parameters()\n", + " \n", + " from transformers import DataCollatorForSeq2Seq\n", + "\n", + " # we want to ignore tokenizer pad token in the loss\n", + " label_pad_token_id = -100\n", + " # Data collator\n", + " data_collator = DataCollatorForSeq2Seq(\n", + " tokenizer,\n", + " model=model,\n", + " label_pad_token_id=label_pad_token_id,\n", + " pad_to_multiple_of=8\n", + " )\n", + " \n", + " output_dir=\"/tmp/flan/test\"\n", + "\n", + " # Define training args\n", + " training_args = Seq2SeqTrainingArguments(\n", + " output_dir=output_dir,\n", + " auto_find_batch_size=True,\n", + " learning_rate=1e-3, # higher learning rate\n", + " num_train_epochs=5,\n", + " logging_dir=f\"{output_dir}/logs\",\n", + " logging_strategy=\"steps\",\n", + " logging_steps=500,\n", + " save_strategy=\"no\",\n", + " report_to=\"tensorboard\",\n", + " )\n", + "\n", + " trainer = Seq2SeqTrainer(model=model,\n", + " args=training_args,\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset)\n", + " \n", + " return trainer\n", + "\n", + " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", + "\n", + " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. 
Both have the same method signature. \n", + " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", + " trainer = HuggingFaceTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " scaling_config=scaling_config,\n", + " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " )\n", + " result = trainer.fit()\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "id": "d4d8fd65", + "metadata": {}, + "source": [ + "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5901d958", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading (โ€ฆ)okenizer_config.json: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 2.54k/2.54k [00:00<00:00, 767kB/s]\n", + "Downloading spiece.model: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 792k/792k [00:00<00:00, 99.4MB/s]\n", + "Downloading (โ€ฆ)/main/tokenizer.json: 0%| | 0.00/2.42M [00:00 2\u001b[0m r \u001b[38;5;241m=\u001b[39m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m 101\u001b[0m \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 432\u001b[0m op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n", + "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m 460\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m 464\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 465\u001b[0m ):\n\u001b[1;32m 466\u001b[0m size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n", + "\u001b[0;31mRayTaskError(RuntimeError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=4614, ip=10.128.28.7)\n File \"/tmp/ipykernel_14249/2624701892.py\", line 150, in train_fn\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 360, in fit\n raise result.error\nray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\nray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.\n\tclass_name: RayTrainWorker\n\tactor_id: 537034a8f0299b4acc1e1f4e05000000\n\tpid: 378\n\tnamespace: 79e19797-9a9d-4359-9e7e-135e143c02c0\n\tip: 10.128.30.22\nThe actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. 
(3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 355, in train\n raise skipped from exception_cause(skipped)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 325, in entrypoint\n return self._trainable_func(\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 475, in _trainable_func\n super()._trainable_func(self._merged_config, reporter, checkpoint_dir)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 651, in _trainable_func\n output = fn()\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 390, in train_func\n trainer.training_loop()\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 371, in training_loop\n self._report(training_iterator)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 320, in _report\n for results in training_iterator:\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 225, in __next__\n next_results = self._run_with_error_handling(self._fetch_next_result)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 188, in _run_with_error_handling\n return func()\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 257, in _fetch_next_result\n results = self._backend_executor.get_next_results()\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 390, in get_next_results\n results = 
self.get_with_failure_handling(futures)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 483, in get_with_failure_handling\n self._increment_failures()\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 533, in _increment_failures\n raise exc.with_traceback(None) from self._last_failure\nRuntimeError: Training has failed after 1 attempts. You can change the number of max failure attempts by setting the `max_retries` arg in your `Trainer`." + ] + } + ], + "source": [ + "#call the above cell as a remote ray function\n", + "r = ray.get(train_fn.remote())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c69fb6a5-173b-4564-bd20-49fcd6aebd64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install --upgrade ray peft accelerate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25819219-0317-43e5-bc31-d1fddd1fe897", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from ray.train.huggingface.transformers.transformers_checkpoint import TransformersCheckpoint\n", + "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", + "from peft import PeftModel, PeftConfig\n", + "\n", + "model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-large')\n", + "tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')\n", + "\n", + "checkpoint = TransformersCheckpoint.from_checkpoint(r.checkpoint)\n", + "\n", + "model_output_dir = '../../models/raytune'\n", + "checkpoint.to_directory(model_output_dir)\n", + "\n", + "# Load the Lora model\n", + "model = PeftModel.from_pretrained(model, model_output_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "5af8cd32", + "metadata": {}, + "source": [ + "Once complete, we can bring our Ray cluster down and clean up:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f995319e-17a1-4e1c-80bb-5cd1014e719a", + 
"metadata": {}, + "outputs": [], + "source": [ + "# To do next:\n", + "# - train on ROSA data and add inference code\n", + "# - train a higher param model\n", + "# - Add bitsandbytes" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.down()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d41b90e", + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From af7e2179a4a4a1e8ecfae44d660567f73c00957a Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Thu, 10 Aug 2023 20:30:32 +0000 Subject: [PATCH 4/5] Add initial ray experiments --- .../ray-flan-interactive.ipynb | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/notebooks/ray-experiments/ray-flan-interactive.ipynb b/notebooks/ray-experiments/ray-flan-interactive.ipynb index 858894c..a63f5ec 100644 --- a/notebooks/ray-experiments/ray-flan-interactive.ipynb +++ b/notebooks/ray-experiments/ray-flan-interactive.ipynb @@ -51,6 +51,17 @@ "ray.__version__" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2fae774b-1cbb-4548-88bd-841ca0d3b0c7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get packages for loading the model in this environment\n", + "#!pip install --upgrade ray peft accelerate" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -3391,18 +3402,6 
@@ "r = ray.get(train_fn.remote())" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "c69fb6a5-173b-4564-bd20-49fcd6aebd64", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install --upgrade ray peft accelerate" - ] - }, { "cell_type": "code", "execution_count": null, @@ -3421,6 +3420,7 @@ "\n", "checkpoint = TransformersCheckpoint.from_checkpoint(r.checkpoint)\n", "\n", + "# Save model in a directory\n", "model_output_dir = '../../models/raytune'\n", "checkpoint.to_directory(model_output_dir)\n", "\n", From fdbfae4845e472fa39012401d85678b3f289379d Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Thu, 10 Aug 2023 20:33:10 +0000 Subject: [PATCH 5/5] Add initial ray experiments --- notebooks/ray-experiments/ray-flantune.ipynb | 792 ------------------- 1 file changed, 792 deletions(-) delete mode 100644 notebooks/ray-experiments/ray-flantune.ipynb diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb deleted file mode 100644 index bac7a90..0000000 --- a/notebooks/ray-experiments/ray-flantune.ipynb +++ /dev/null @@ -1,792 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "bbc21043", - "metadata": {}, - "source": [ - "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development." - ] - }, - { - "cell_type": "markdown", - "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa", - "metadata": {}, - "source": [ - "To Do: I tried adding the flan code in the interactive notebook but hit some errors. They need to be resolved to see if we can run the training in a distributed manner. The bitsandbytes package doesn't work because of CUDA and Pytorch version." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", - "from codeflare_sdk.cluster.auth import TokenAuthentication" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "614daa0c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create authentication object for oc user permissions\n", - "auth = TokenAuthentication(\n", - " token = \"sha256~Z29WoRM5bMsxVgZpJ5uX9XtB-qPZzdOuGo9upSvpc98\",\n", - " server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n", - " skip_tls=False\n", - ")\n", - "auth.login()" - ] - }, - { - "cell_type": "markdown", - "id": "bc27f84c", - "metadata": {}, - "source": [ - "Once again, let's start by running through the same cluster setup as before:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: finetuneflan.yaml\n" - ] - } - ], - "source": [ - "# Create and configure our cluster object (and appwrapper)\n", - "cluster = Cluster(ClusterConfiguration(\n", - " name='finetuneflan',\n", - " namespace='default',\n", - " min_worker=2,\n", - " max_worker=2,\n", - " min_cpus=1,\n", - " max_cpus=2,\n", - " min_memory=2,\n", - " max_memory=8,\n", - " gpu=1,\n", - " instascale=True,\n", - " machine_types=[\"m5.xlarge\", 
\"g4dn.xlarge\"]\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waiting for requested resources to be set up...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m 2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:221\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 219\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ready:\n\u001b[0;32m--> 221\u001b[0m status, ready \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprint_to_console\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m==\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN:\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28mprint\u001b[39m(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWARNING: Current cluster status is unknown, have you run cluster.up 
yet?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 225\u001b[0m )\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:160\u001b[0m, in \u001b[0;36mCluster.status\u001b[0;34m(self, print_to_console)\u001b[0m\n\u001b[1;32m 158\u001b[0m status \u001b[38;5;241m=\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# check the app wrapper status\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m appwrapper \u001b[38;5;241m=\u001b[39m \u001b[43m_app_wrapper_status\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnamespace\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper:\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 163\u001b[0m AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING,\n\u001b[1;32m 164\u001b[0m AppWrapperStatus\u001b[38;5;241m.\u001b[39mCOMPLETED,\n\u001b[1;32m 165\u001b[0m AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING_HOLD_COMPLETION,\n\u001b[1;32m 166\u001b[0m ]:\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:330\u001b[0m, in \u001b[0;36m_app_wrapper_status\u001b[0;34m(name, namespace)\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mproject(namespace), oc\u001b[38;5;241m.\u001b[39mtimeout(\u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m):\n\u001b[0;32m--> 330\u001b[0m cluster 
\u001b[38;5;241m=\u001b[39m \u001b[43moc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mappwrapper/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mname\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 331\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mOpenShiftPythonException \u001b[38;5;28;01mas\u001b[39;00m osp: \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 332\u001b[0m msg \u001b[38;5;241m=\u001b[39m osp\u001b[38;5;241m.\u001b[39mmsg\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:403\u001b[0m, in \u001b[0;36mSelector.object\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mobject\u001b[39m(\u001b[38;5;28mself\u001b[39m, ignore_not_found\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 395\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 396\u001b[0m \u001b[38;5;124;03m Returns a single APIObject that represents the selected resource. 
If multiple\u001b[39;00m\n\u001b[1;32m 397\u001b[0m \u001b[38;5;124;03m resources are being selected an exception will be thrown (use objects() when\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[38;5;124;03m :return: A Model of the selected resource.\u001b[39;00m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 403\u001b[0m objs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobjects\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_not_found:\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:423\u001b[0m, in \u001b[0;36mSelector.objects\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;124;03mReturns a python list of APIObject objects that represent the selected resources. 
An\u001b[39;00m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;124;03mempty is returned if nothing is selected.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[38;5;124;03m:return: A list of Model objects representing the receiver's selected resources.\u001b[39;00m\n\u001b[1;32m 420\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapiobject\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m APIObject\n\u001b[0;32m--> 423\u001b[0m obj \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43mignore_not_found\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_not_found\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 425\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 426\u001b[0m api_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(obj)\u001b[38;5;241m.\u001b[39melements(\u001b[38;5;28mcls\u001b[39m)\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:380\u001b[0m, in \u001b[0;36mSelector.object_json\u001b[0;34m(self, ignore_not_found)\u001b[0m\n\u001b[1;32m 377\u001b[0m cmd_args\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--ignore-not-found\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 379\u001b[0m r \u001b[38;5;241m=\u001b[39m Result(verb)\n\u001b[0;32m--> 380\u001b[0m r\u001b[38;5;241m.\u001b[39madd_action(\u001b[43moc_action\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mall_namespaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_namespaces\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcmd_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcmd_args\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 381\u001b[0m r\u001b[38;5;241m.\u001b[39mfail_if(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to read object\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 383\u001b[0m \u001b[38;5;66;03m# --ignore-not-found returns an empty string instead of an error if nothing is found\u001b[39;00m\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/action.py:363\u001b[0m, in \u001b[0;36moc_action\u001b[0;34m(context, verb, cmd_args, all_namespaces, no_namespace, namespace, references, stdin_obj, stdin_str, last_attempt, **kwargs)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# ignore\u001b[39;00m\n\u001b[0;32m--> 363\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m period \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(\u001b[38;5;241m1\u001b[39m, period \u001b[38;5;241m+\u001b[39m period) \u001b[38;5;66;03m# Poll fast at first, but slow down to 1/sec over time\u001b[39;00m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;66;03m# See note in paramiko flow on decoding\u001b[39;00m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "# Bring up the cluster\n", - "cluster.up()\n", - "cluster.wait_ready()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "df71c1ed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
                     ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€                     \n",
-       "                                                                         \n",
-       " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n",
-       " โ”‚   Name                                                              โ”‚ \n",
-       " โ”‚   finetuneflan                                        Inactive โŒ   โ”‚ \n",
-       " โ”‚                                                                     โ”‚ \n",
-       " โ”‚   URI: ray://finetuneflan-head-svc.default.svc:10001                โ”‚ \n",
-       " โ”‚                                                                     โ”‚ \n",
-       " โ”‚   Dashboard๐Ÿ”—                                                       โ”‚ \n",
-       " โ”‚                                                                     โ”‚ \n",
-       " โ”‚                      Cluster Resources                              โ”‚ \n",
-       " โ”‚   โ•ญโ”€ Workers โ”€โ”€โ•ฎ  โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ          โ”‚ \n",
-       " โ”‚   โ”‚  Min  Max  โ”‚  โ”‚  Memory      CPU         GPU         โ”‚          โ”‚ \n",
-       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚          โ”‚ \n",
-       " โ”‚   โ”‚  2    2    โ”‚  โ”‚  2~8         1           1           โ”‚          โ”‚ \n",
-       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚          โ”‚ \n",
-       " โ”‚   โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ  โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ          โ”‚ \n",
-       " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", - " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", - " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", - " โ”‚ \u001b[1;4mfinetuneflan\u001b[0m Inactive โŒ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001 โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b]8;id=991912;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", - " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", - " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 
\u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", - " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "RayCluster(name='finetuneflan', status=, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster.details()" - ] - }, - { - "cell_type": "markdown", - "id": "33663f47", - "metadata": {}, - "source": [ - "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n", - "\n", - "Using the SDK, we can get both the Ray cluster URI and dashboard URI:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c1719bca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n", - "ray://finetuneflan-head-svc.default.svc:10001\n" - ] - } - ], - "source": [ - "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n", - "ray_cluster_uri = 
cluster.cluster_uri()\n", - "print(ray_dashboard_uri)\n", - "print(ray_cluster_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "2a2aca6a", - "metadata": {}, - "source": [ - "Now we can connect directly to our Ray cluster via the Ray python client:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "300146dc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ray cluster is up and running: True\n" - ] - } - ], - "source": [ - "#before proceeding make sure the cluster exists and the uri is not empty\n", - "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", - "\n", - "import ray\n", - "from ray.air.config import ScalingConfig\n", - "\n", - "# reset the ray context in case there's already one. \n", - "ray.shutdown()\n", - "# establish connection to ray cluster\n", - "\n", - "#install additionall libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\",\n", - " \"datasets\",\n", - " \"evaluate\",\n", - " \"pyarrow<7.0.0\",\n", - " \"accelerate\",\n", - " \"bitsandbytes\",\n", - " \"loralib\",\n", - " \"py7zr\",\n", - " \"tensorboard\",\n", - " \"peft\"], \n", - " \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n", - "\n", - "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n", - "\n", - "print(\"Ray cluster is up and running: \", ray.is_initialized())" - ] - }, - { - "cell_type": "markdown", - "id": "9711030b", - "metadata": {}, - "source": [ - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "1b36e0d9", - "metadata": {}, - "outputs": [], - "source": [ - "@ray.remote\n", - "def train_fn():\n", - " from datasets import load_dataset\n", - " import transformers\n", - " from 
transformers import AutoTokenizer, TrainingArguments\n", - " from transformers import AutoModelForSequenceClassification\n", - " import numpy as np\n", - " from datasets import load_metric\n", - " import ray\n", - " from ray import tune\n", - " from ray.train.huggingface import HuggingFaceTrainer\n", - " \n", - " from datasets import load_dataset, concatenate_datasets\n", - " from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", - " from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType\n", - "\n", - " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n", - "\n", - " #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n", - " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - " \n", - " dataset = load_dataset(\"samsum\")\n", - "\n", - " print(f\"Train dataset size: {len(dataset['train'])}\")\n", - " print(f\"Test dataset size: {len(dataset['test'])}\")\n", - " \n", - " #### COMPUTE MAX SEQ LEN ##########\n", - " # The maximum total input sequence length after tokenization.\n", - " # Sequences longer than this will be truncated, sequences shorter will be padded.\n", - " conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n", - "\n", - " \n", - " tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n", - " truncation=True),\n", - " batched=True,\n", - " remove_columns=[\"dialogue\", \"summary\"])\n", - " \n", - " input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n", - " # take 85 percentile of max length for better utilization\n", - " max_source_length = int(np.percentile(input_lengths, 85))\n", - " print(f\"Max source length: {max_source_length}\")\n", - "\n", - " # The maximum total sequence length for target text after tokenization.\n", - " # Sequences longer than this will be truncated, sequences shorter will be padded.\"\n", - " tokenized_targets = conc_dataset.map(lambda x: 
tokenizer(x[\"dialogue\"],\n", - " truncation=True),\n", - " batched=True,\n", - " remove_columns=[\"dialogue\", \"summary\"]) \n", - " target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n", - " # take 90 percentile of max length for better utilization\n", - " max_target_length = int(np.percentile(target_lengths, 90))\n", - " print(f\"Max target length: {max_target_length}\")\n", - " \n", - " #### PREPROCESS DATA ##########\n", - " \n", - " def preprocess_function(sample,padding=\"max_length\"):\n", - " # add prefix to the input for t5\n", - " inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n", - "\n", - " # tokenize inputs\n", - " model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n", - "\n", - " # Tokenize targets with the `text_target` keyword argument\n", - " labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n", - "\n", - " # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n", - " # padding in the loss.\n", - " if padding == \"max_length\":\n", - " labels[\"input_ids\"] = [\n", - " [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n", - " ]\n", - "\n", - " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", - " return model_inputs\n", - "\n", - " tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n", - " print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n", - "\n", - " ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n", - " ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n", - "\n", - " def compute_metrics(eval_pred):\n", - " metric = load_metric(\"accuracy\")\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return 
metric.compute(predictions=predictions, references=labels)\n", - "\n", - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", - " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n", - " model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n", - " lora_config = LoraConfig(\n", - " r=16,\n", - " lora_alpha=32,\n", - " target_modules=[\"q\", \"v\"],\n", - " lora_dropout=0.05,\n", - " bias=\"none\",\n", - " task_type=TaskType.SEQ_2_SEQ_LM\n", - " )\n", - " # prepare int-8 model for training\n", - " model = prepare_model_for_int8_training(model)\n", - "\n", - " # add LoRA adaptor\n", - " model = get_peft_model(model, lora_config)\n", - " model.print_trainable_parameters()\n", - " \n", - " from transformers import DataCollatorForSeq2Seq\n", - "\n", - " # we want to ignore tokenizer pad token in the loss\n", - " label_pad_token_id = -100\n", - " # Data collator\n", - " data_collator = DataCollatorForSeq2Seq(\n", - " tokenizer,\n", - " model=model,\n", - " label_pad_token_id=label_pad_token_id,\n", - " pad_to_multiple_of=8\n", - " )\n", - " \n", - " output_dir=\"/tmp/flan/test\"\n", - "\n", - " # Define training args\n", - " training_args = Seq2SeqTrainingArguments(\n", - " output_dir=output_dir,\n", - " auto_find_batch_size=True,\n", - " learning_rate=1e-3, # higher learning rate\n", - " num_train_epochs=5,\n", - " logging_dir=f\"{output_dir}/logs\",\n", - " logging_strategy=\"steps\",\n", - " logging_steps=500,\n", - " save_strategy=\"no\",\n", - " report_to=\"tensorboard\",\n", - " )\n", - "\n", - " trainer = Seq2SeqTrainer(model=model,\n", - " args=training_args,\n", - " data_collator=data_collator,\n", - " train_dataset=tokenized_dataset[\"train\"])\n", - " \n", - " return trainer\n", - "\n", - " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", - "\n", - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface 
Trainer. Both have the same method signature. \n", - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", - " trainer = HuggingFaceTrainer(\n", - " trainer_init_per_worker=trainer_init_per_worker,\n", - " scaling_config=scaling_config,\n", - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", - " )\n", - " result = trainer.fit()" - ] - }, - { - "cell_type": "markdown", - "id": "d4d8fd65", - "metadata": {}, - "source": [ - "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "5901d958", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ===================================BUG REPORT===================================\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Welcome to bitsandbytes. 
For bug reports, please run\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m python -m bitsandbytes\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ================================================================================\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m warn(\"The installed version of bitsandbytes was compiled without GPU support. 
\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m To disable this warning, you can either:\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n", - " 0%| | 0/3 [00:00 2\u001b[0m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m 101\u001b[0m \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 432\u001b[0m op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n", - "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m 460\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m 464\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 465\u001b[0m ):\n\u001b[1;32m 466\u001b[0m size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n", - 
"\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n self.setup(copy.deepcopy(self.config))\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n setup_kwargs[k] = parameter_registry.get(prefix + k)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n table = _memory_mapped_arrow_table_from_file(path)\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n memory_mapped_stream = pa.memory_map(filename)\n File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_be877_00000]\n", - "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tINFO tune.py:777 -- Total run time: 5.50 seconds (5.39 seconds for the tuning loop).\n" - ] - } - ], - "source": [ - "#call the above cell as a remote ray function\n", - "ray.get(train_fn.remote())" - ] - }, - { - "cell_type": "markdown", - "id": "5af8cd32", - "metadata": {}, - "source": [ - "Once complete, we can bring our Ray cluster down and clean up:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.down()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d41b90e", - "metadata": {}, - "outputs": [], - "source": [ - "auth.logout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}