# finetuneflan.yaml
# AppWrapper (MCAD) that provisions a Ray cluster — 1 CPU-only head pod and
# 2 single-GPU worker pods — and exposes the Ray dashboard via an OpenShift Route.
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: finetuneflan
  namespace: default
spec:
  priority: 9
  resources:
    GenericItems:
    - custompodresources:
      # Head pod: CPU only.
      - limits:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 0
        replicas: 1
        requests:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 0
      # Worker pods: one GPU each.
      - limits:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 1
        replicas: 2
        requests:
          cpu: 1
          memory: 2G
          nvidia.com/gpu: 1
      generictemplate:
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            appwrapper.mcad.ibm.com: finetuneflan
            controller-tools.k8s.io: '1.0'
          name: finetuneflan
          namespace: default
        spec:
          autoscalerOptions:
            idleTimeoutSeconds: 60
            imagePullPolicy: Always
            resources:
              limits:
                cpu: 500m
                memory: 512Mi
              requests:
                cpu: 500m
                memory: 512Mi
            upscalingMode: Default
          enableInTreeAutoscaling: false
          headGroupSpec:
            rayStartParams:
              block: 'true'
              dashboard-host: 0.0.0.0
              num-gpus: '0'
            serviceType: ClusterIP
            template:
              spec:
                containers:
                - env:
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                  imagePullPolicy: Always
                  lifecycle:
                    preStop:
                      exec:
                        command:
                        - /bin/sh
                        - -c
                        - ray stop
                  name: ray-head
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  resources:
                    limits:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 0
                    requests:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 0
          # FIX: must match the Ray version baked into the image above
          # (tag says ray2.1.0; the original declared 1.12.0, which mismatched).
          rayVersion: 2.1.0
          workerGroupSpecs:
          - groupName: small-group-finetuneflan
            maxReplicas: 2
            minReplicas: 2
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            replicas: 2
            template:
              metadata:
                annotations:
                  key: value
                labels:
                  key: value
              spec:
                containers:
                - env:
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                  lifecycle:
                    preStop:
                      exec:
                        command:
                        - /bin/sh
                        - -c
                        - ray stop
                  name: machine-learning
                  resources:
                    limits:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 1
                    requests:
                      cpu: 1
                      memory: 2G
                      nvidia.com/gpu: 1
                # Wait for the head service DNS entry before starting the worker.
                initContainers:
                - command:
                  - sh
                  - -c
                  - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
                    do echo waiting for myservice; sleep 2; done
                  image: busybox:1.28
                  name: init-myservice
      replicas: 1
    - generictemplate:
        apiVersion: route.openshift.io/v1
        kind: Route
        metadata:
          labels:
            odh-ray-cluster-service: finetuneflan-head-svc
          name: ray-dashboard-finetuneflan
          namespace: default
        spec:
          port:
            targetPort: dashboard
          to:
            kind: Service
            name: finetuneflan-head-svc
      # FIX: key is 'replicas' (the original said 'replica', inconsistent with
      # the first GenericItem and not a recognized field).
      replicas: 1
    Items: []
# Authenticate to the OpenShift cluster and bring up the Ray cluster defined
# by the CodeFlare SDK (writes/uses finetuneflan.yaml as the AppWrapper).
import os

# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

# SECURITY FIX: the original notebook committed a live OpenShift bearer token
# ("sha256~26Kf2-...") and cluster URL in plain text. That token must be
# revoked. Credentials are now read from the environment instead of being
# hard-coded.
auth = TokenAuthentication(
    token=os.environ["OC_TOKEN"],
    server=os.environ["OC_SERVER"],
    skip_tls=False,
)
auth.login()

# Create and configure our cluster object (and appwrapper):
# 2 workers, each with 1-2 CPUs, 2-8G memory, and 1 GPU; no InstaScale.
cluster = Cluster(ClusterConfiguration(
    name='finetuneflan',
    namespace='default',
    min_worker=2,
    max_worker=2,
    min_cpus=1,
    max_cpus=2,
    min_memory=2,
    max_memory=8,
    gpu=1,
    instascale=False,
))

# Bring up the cluster and block until the requested resources are ready.
# NOTE(review): the captured run was interrupted here (KeyboardInterrupt)
# while waiting — confirm the AppWrapper is actually schedulable on the
# target cluster before relying on wait_ready().
cluster.up()
cluster.wait_ready()

# Print a summary table (name, URI, dashboard link, worker specs).
cluster.details()
# Using the SDK, get both the Ray cluster URI and dashboard URI.
ray_dashboard_uri = cluster.cluster_dashboard_uri()
ray_cluster_uri = cluster.cluster_uri()
print(ray_dashboard_uri)
print(ray_cluster_uri)

# before proceeding make sure the cluster exists and the uri is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray
from ray.air.config import ScalingConfig

# reset the ray context in case there's already one
ray.shutdown()

# install additional libraries that will be required for model training
# (FIX: typo "additionall")
runtime_env = {
    "pip": [
        "transformers",
        "datasets",
        "evaluate",
        "pyarrow<7.0.0",
        "accelerate",
        "bitsandbytes",
        "loralib",
        "py7zr",
        "tensorboard",
        "peft",
    ],
    "env_vars": {"HF_HOME": "huggingface"},
}

# establish connection to ray cluster
ray.init(address=ray_cluster_uri, runtime_env=runtime_env, _temp_dir="huggingface")
print("Ray cluster is up and running: ", ray.is_initialized())


@ray.remote
def train_fn():
    """Fine-tune FLAN-T5-XL (int8 + LoRA) on the samsum dataset, distributed
    over the Ray cluster with HuggingFaceTrainer.

    Runs entirely on the cluster; all heavy imports happen inside the task so
    they resolve against the runtime_env above.
    """
    import numpy as np
    import ray
    from ray.train.huggingface import HuggingFaceTrainer
    from datasets import load_dataset, concatenate_datasets, load_metric
    # FIX: the original used Seq2SeqTrainer / Seq2SeqTrainingArguments without
    # importing them (only TrainingArguments was imported) -> NameError.
    from transformers import (
        AutoModelForSeq2SeqLM,
        AutoTokenizer,
        DataCollatorForSeq2Seq,
        Seq2SeqTrainer,
        Seq2SeqTrainingArguments,
    )
    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training

    model_name = "ybelkada/flan-t5-xl-sharded-bf16"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = load_dataset("samsum")
    print(f"Train dataset size: {len(dataset['train'])}")
    print(f"Test dataset size: {len(dataset['test'])}")

    #### COMPUTE MAX SEQ LEN ##########
    # Sequences longer than these lengths are truncated; shorter are padded.
    conc_dataset = concatenate_datasets([dataset["train"], dataset["test"]])

    tokenized_inputs = conc_dataset.map(
        lambda x: tokenizer(x["dialogue"], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
    # take 85 percentile of max length for better utilization
    max_source_length = int(np.percentile(input_lengths, 85))
    print(f"Max source length: {max_source_length}")

    # FIX: target lengths must be measured on the *summaries*; the original
    # tokenized x["dialogue"] again, inflating max_target_length.
    tokenized_targets = conc_dataset.map(
        lambda x: tokenizer(x["summary"], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
    # take 90 percentile of max length for better utilization
    max_target_length = int(np.percentile(target_lengths, 90))
    print(f"Max target length: {max_target_length}")

    #### PREPROCESS DATA ##########
    def preprocess_function(sample, padding="max_length"):
        # add prefix to the input for t5
        inputs = ["summarize: " + item for item in sample["dialogue"]]
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
        # Replace pad tokens in labels with -100 so padding is ignored by the loss.
        if padding == "max_length":
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
    print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

    ray_train_ds = ray.data.from_huggingface(tokenized_dataset["train"])
    ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset["test"])

    def compute_metrics(eval_pred):
        metric = load_metric("accuracy")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    def trainer_init_per_worker(train_dataset, eval_dataset, **config):
        # Build the int8 model with a LoRA adaptor on each worker.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q", "v"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.SEQ_2_SEQ_LM,
        )
        # prepare int-8 model for training, then add the LoRA adaptor
        model = prepare_model_for_int8_training(model)
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        # we want to ignore tokenizer pad token in the loss
        label_pad_token_id = -100
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8,
        )

        output_dir = "/tmp/flan/test"
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            auto_find_batch_size=True,
            learning_rate=1e-3,  # higher learning rate
            num_train_epochs=5,
            logging_dir=f"{output_dir}/logs",
            logging_strategy="steps",
            logging_steps=500,
            save_strategy="no",
            report_to="tensorboard",
        )

        # FIX: train on the per-worker shards Ray hands in (train_dataset /
        # eval_dataset), NOT the driver-side tokenized_dataset. Closing over
        # tokenized_dataset made workers memory-map an Arrow cache file that
        # only exists on the head node -- the FileNotFoundError captured in
        # the original run ('cache-....arrow' missing on 10.128.14.50).
        return Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,  # was defined but never wired in
        )

    scaling_config = ScalingConfig(num_workers=2, use_gpu=True)  # num workers is the number of gpus

    # Ray-native HuggingFaceTrainer: same signature as the plain HF Trainer,
    # with built-in support for scaling to multiple GPUs.
    trainer = HuggingFaceTrainer(
        trainer_init_per_worker=trainer_init_per_worker,
        scaling_config=scaling_config,
        datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
    )
    result = trainer.fit()
    return result
# Run the training function we defined above remotely on our Ray cluster,
# blocking until it finishes (or re-raising any remote failure locally).
ray.get(train_fn.remote())

# Once complete, bring our Ray cluster down and clean up.
# Disconnect the client first; the captured run showed cluster.down() raising
# NameError when executed in a fresh kernel -- run these only after the setup
# cells above have defined `cluster` and `auth`.
ray.shutdown()
cluster.down()
auth.logout()
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}