# finetuneflan.yaml
# AppWrapper (MCAD) that provisions a Ray cluster — 1 CPU-only head pod and
# 2 single-GPU worker pods — and exposes the Ray dashboard via an OpenShift Route.
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: finetuneflan
  namespace: default
spec:
  priority: 9
  resources:
    GenericItems:
    - custompodresources:
      # Head pod: CPU only.
      - limits:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 0
        replicas: 1
        requests:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 0
      # Worker pods: one GPU each.
      - limits:
          cpu: 2
          memory: 8G
          nvidia.com/gpu: 1
        replicas: 2
        requests:
          cpu: 1
          memory: 2G
          nvidia.com/gpu: 1
      generictemplate:
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            appwrapper.mcad.ibm.com: finetuneflan
            controller-tools.k8s.io: '1.0'
          name: finetuneflan
          namespace: default
        spec:
          autoscalerOptions:
            idleTimeoutSeconds: 60
            imagePullPolicy: Always
            resources:
              limits:
                cpu: 500m
                memory: 512Mi
              requests:
                cpu: 500m
                memory: 512Mi
            upscalingMode: Default
          enableInTreeAutoscaling: false
          headGroupSpec:
            rayStartParams:
              block: 'true'
              dashboard-host: 0.0.0.0
              num-gpus: '0'
            serviceType: ClusterIP
            template:
              spec:
                containers:
                - env:
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                  imagePullPolicy: Always
                  lifecycle:
                    preStop:
                      exec:
                        command:
                        - /bin/sh
                        - -c
                        - ray stop
                  name: ray-head
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  resources:
                    limits:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 0
                    requests:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 0
          # FIX: must match the Ray version baked into the image above
          # (tag says ray2.1.0; the original declared 1.12.0, which mismatched).
          rayVersion: 2.1.0
          workerGroupSpecs:
          - groupName: small-group-finetuneflan
            maxReplicas: 2
            minReplicas: 2
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            replicas: 2
            template:
              metadata:
                annotations:
                  key: value
                labels:
                  key: value
              spec:
                containers:
                - env:
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                  lifecycle:
                    preStop:
                      exec:
                        command:
                        - /bin/sh
                        - -c
                        - ray stop
                  name: machine-learning
                  resources:
                    limits:
                      cpu: 2
                      memory: 8G
                      nvidia.com/gpu: 1
                    requests:
                      cpu: 1
                      memory: 2G
                      nvidia.com/gpu: 1
                # Wait for the head service DNS entry before starting the worker.
                initContainers:
                - command:
                  - sh
                  - -c
                  - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
                    do echo waiting for myservice; sleep 2; done
                  image: busybox:1.28
                  name: init-myservice
      replicas: 1
    - generictemplate:
        apiVersion: route.openshift.io/v1
        kind: Route
        metadata:
          labels:
            odh-ray-cluster-service: finetuneflan-head-svc
          name: ray-dashboard-finetuneflan
          namespace: default
        spec:
          port:
            targetPort: dashboard
          to:
            kind: Service
            name: finetuneflan-head-svc
      # FIX: key is 'replicas' (the original said 'replica', inconsistent with
      # the first GenericItem and not a recognized field).
      replicas: 1
    Items: []
# Authenticate to the OpenShift cluster and bring up the Ray cluster defined
# by the CodeFlare SDK (writes/uses finetuneflan.yaml as the AppWrapper).
import os

# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

# SECURITY FIX: the original notebook committed a live OpenShift bearer token
# ("sha256~26Kf2-...") and cluster URL in plain text. That token must be
# revoked. Credentials are now read from the environment instead of being
# hard-coded.
auth = TokenAuthentication(
    token=os.environ["OC_TOKEN"],
    server=os.environ["OC_SERVER"],
    skip_tls=False,
)
auth.login()

# Create and configure our cluster object (and appwrapper):
# 2 workers, each with 1-2 CPUs, 2-8G memory, and 1 GPU; no InstaScale.
cluster = Cluster(ClusterConfiguration(
    name='finetuneflan',
    namespace='default',
    min_worker=2,
    max_worker=2,
    min_cpus=1,
    max_cpus=2,
    min_memory=2,
    max_memory=8,
    gpu=1,
    instascale=False,
))

# Bring up the cluster and block until the requested resources are ready.
# NOTE(review): the captured run was interrupted here (KeyboardInterrupt)
# while waiting — confirm the AppWrapper is actually schedulable on the
# target cluster before relying on wait_ready().
cluster.up()
cluster.wait_ready()

# Print a summary table (name, URI, dashboard link, worker specs).
cluster.details()
# Using the SDK, get both the Ray cluster URI and dashboard URI.
ray_dashboard_uri = cluster.cluster_dashboard_uri()
ray_cluster_uri = cluster.cluster_uri()
print(ray_dashboard_uri)
print(ray_cluster_uri)

# before proceeding make sure the cluster exists and the uri is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray
from ray.air.config import ScalingConfig

# reset the ray context in case there's already one
ray.shutdown()

# install additional libraries that will be required for model training
# (FIX: typo "additionall")
runtime_env = {
    "pip": [
        "transformers",
        "datasets",
        "evaluate",
        "pyarrow<7.0.0",
        "accelerate",
        "bitsandbytes",
        "loralib",
        "py7zr",
        "tensorboard",
        "peft",
    ],
    "env_vars": {"HF_HOME": "huggingface"},
}

# establish connection to ray cluster
ray.init(address=ray_cluster_uri, runtime_env=runtime_env, _temp_dir="huggingface")
print("Ray cluster is up and running: ", ray.is_initialized())


@ray.remote
def train_fn():
    """Fine-tune FLAN-T5-XL (int8 + LoRA) on the samsum dataset, distributed
    over the Ray cluster with HuggingFaceTrainer.

    Runs entirely on the cluster; all heavy imports happen inside the task so
    they resolve against the runtime_env above.
    """
    import numpy as np
    import ray
    from ray.train.huggingface import HuggingFaceTrainer
    from datasets import load_dataset, concatenate_datasets, load_metric
    # FIX: the original used Seq2SeqTrainer / Seq2SeqTrainingArguments without
    # importing them (only TrainingArguments was imported) -> NameError.
    from transformers import (
        AutoModelForSeq2SeqLM,
        AutoTokenizer,
        DataCollatorForSeq2Seq,
        Seq2SeqTrainer,
        Seq2SeqTrainingArguments,
    )
    from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training

    model_name = "ybelkada/flan-t5-xl-sharded-bf16"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = load_dataset("samsum")
    print(f"Train dataset size: {len(dataset['train'])}")
    print(f"Test dataset size: {len(dataset['test'])}")

    #### COMPUTE MAX SEQ LEN ##########
    # Sequences longer than these lengths are truncated; shorter are padded.
    conc_dataset = concatenate_datasets([dataset["train"], dataset["test"]])

    tokenized_inputs = conc_dataset.map(
        lambda x: tokenizer(x["dialogue"], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
    # take 85 percentile of max length for better utilization
    max_source_length = int(np.percentile(input_lengths, 85))
    print(f"Max source length: {max_source_length}")

    # FIX: target lengths must be measured on the *summaries*; the original
    # tokenized x["dialogue"] again, inflating max_target_length.
    tokenized_targets = conc_dataset.map(
        lambda x: tokenizer(x["summary"], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
    # take 90 percentile of max length for better utilization
    max_target_length = int(np.percentile(target_lengths, 90))
    print(f"Max target length: {max_target_length}")

    #### PREPROCESS DATA ##########
    def preprocess_function(sample, padding="max_length"):
        # add prefix to the input for t5
        inputs = ["summarize: " + item for item in sample["dialogue"]]
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
        # Replace pad tokens in labels with -100 so padding is ignored by the loss.
        if padding == "max_length":
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
    print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

    ray_train_ds = ray.data.from_huggingface(tokenized_dataset["train"])
    ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset["test"])

    def compute_metrics(eval_pred):
        metric = load_metric("accuracy")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    def trainer_init_per_worker(train_dataset, eval_dataset, **config):
        # Build the int8 model with a LoRA adaptor on each worker.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q", "v"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.SEQ_2_SEQ_LM,
        )
        # prepare int-8 model for training, then add the LoRA adaptor
        model = prepare_model_for_int8_training(model)
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        # we want to ignore tokenizer pad token in the loss
        label_pad_token_id = -100
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8,
        )

        output_dir = "/tmp/flan/test"
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            auto_find_batch_size=True,
            learning_rate=1e-3,  # higher learning rate
            num_train_epochs=5,
            logging_dir=f"{output_dir}/logs",
            logging_strategy="steps",
            logging_steps=500,
            save_strategy="no",
            report_to="tensorboard",
        )

        # FIX: train on the per-worker shards Ray hands in (train_dataset /
        # eval_dataset), NOT the driver-side tokenized_dataset. Closing over
        # tokenized_dataset made workers memory-map an Arrow cache file that
        # only exists on the head node -- the FileNotFoundError captured in
        # the original run ('cache-....arrow' missing on 10.128.14.50).
        return Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,  # was defined but never wired in
        )

    scaling_config = ScalingConfig(num_workers=2, use_gpu=True)  # num workers is the number of gpus

    # Ray-native HuggingFaceTrainer: same signature as the plain HF Trainer,
    # with built-in support for scaling to multiple GPUs.
    trainer = HuggingFaceTrainer(
        trainer_init_per_worker=trainer_init_per_worker,
        scaling_config=scaling_config,
        datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
    )
    result = trainer.fit()
    return result
# Run the training function we defined above remotely on our Ray cluster,
# blocking until it finishes (or re-raising any remote failure locally).
ray.get(train_fn.remote())

# Once complete, bring our Ray cluster down and clean up.
# Disconnect the client first; the captured run showed cluster.down() raising
# NameError when executed in a fresh kernel -- run these only after the setup
# cells above have defined `cluster` and `auth`.
ray.shutdown()
cluster.down()
auth.logout()
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}