From 2ce1e718d6c11462d0dd87ec93ff5ceaa2e7d277 Mon Sep 17 00:00:00 2001
From: Shreyanand <shanand@redhat.com>
Date: Thu, 25 May 2023 14:56:49 +0000
Subject: [PATCH 1/5] Add ray experiments

remove creds
---
 notebooks/ray-experiments/finetuneflan.yaml  | 155 ++++
 notebooks/ray-experiments/ray-flantune.ipynb | 797 +++++++++++++++++++
 2 files changed, 952 insertions(+)
 create mode 100644 notebooks/ray-experiments/finetuneflan.yaml
 create mode 100644 notebooks/ray-experiments/ray-flantune.ipynb

diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml
new file mode 100644
index 0000000..2cee801
--- /dev/null
+++ b/notebooks/ray-experiments/finetuneflan.yaml
@@ -0,0 +1,155 @@
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper  # wraps the RayCluster and the dashboard Route declared under GenericItems
+metadata:
+  name: finetuneflan
+  namespace: default
+spec:
+  priority: 9
+  resources:
+    GenericItems:
+    - custompodresources:  # per-pod resource totals; values mirror the head and worker containers below
+      - limits:  # head pod: 1 replica, no GPU
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 0
+        replicas: 1
+        requests:
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 0
+      - limits:  # worker pods: 2 replicas, 1 GPU each
+          cpu: 2
+          memory: 8G
+          nvidia.com/gpu: 1
+        replicas: 2
+        requests:
+          cpu: 1
+          memory: 2G
+          nvidia.com/gpu: 1
+      generictemplate:  # RayCluster: one CPU-only head plus a fixed group of two single-GPU workers
+        apiVersion: ray.io/v1alpha1
+        kind: RayCluster
+        metadata:
+          labels:
+            appwrapper.mcad.ibm.com: finetuneflan
+            controller-tools.k8s.io: '1.0'
+          name: finetuneflan
+          namespace: default
+        spec:
+          autoscalerOptions:
+            idleTimeoutSeconds: 60
+            imagePullPolicy: Always
+            resources:
+              limits:
+                cpu: 500m
+                memory: 512Mi
+              requests:
+                cpu: 500m
+                memory: 512Mi
+            upscalingMode: Default
+          enableInTreeAutoscaling: false  # in-tree autoscaling is off; the worker group below is fixed-size
+          headGroupSpec:
+            rayStartParams:
+              block: 'true'
+              dashboard-host: 0.0.0.0
+              num-gpus: '0'  # head is started with zero GPUs, matching its resource limits below
+            serviceType: ClusterIP
+            template:
+              spec:
+                containers:
+                - env:
+                  - name: MY_POD_IP
+                    valueFrom:
+                      fieldRef:
+                        fieldPath: status.podIP
+                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+                  imagePullPolicy: Always
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command:
+                        - /bin/sh
+                        - -c
+                        - ray stop
+                  name: ray-head
+                  ports:
+                  - containerPort: 6379
+                    name: gcs
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  resources:
+                    limits:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 0
+                    requests:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 0
+          rayVersion: '2.1.0'  # kept in sync with the ray2.1.0 image tag used by the head and worker containers
+          workerGroupSpecs:
+          - groupName: small-group-finetuneflan
+            maxReplicas: 2  # min == max == replicas, so the group size is fixed at two workers
+            minReplicas: 2
+            rayStartParams:
+              block: 'true'
+              num-gpus: '1'  # one GPU per worker, matching the container limits below
+            replicas: 2
+            template:
+              metadata:
+                annotations:
+                  key: value
+                labels:
+                  key: value
+              spec:
+                containers:
+                - env:
+                  - name: MY_POD_IP
+                    valueFrom:
+                      fieldRef:
+                        fieldPath: status.podIP
+                  image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command:
+                        - /bin/sh
+                        - -c
+                        - ray stop
+                  name: machine-learning
+                  resources:
+                    limits:
+                      cpu: 2
+                      memory: 8G
+                      nvidia.com/gpu: 1
+                    requests:
+                      cpu: 1
+                      memory: 2G
+                      nvidia.com/gpu: 1
+                initContainers:  # holds back worker start-up until the $RAY_IP service name resolves in cluster DNS
+                - command:
+                  - sh
+                  - -c
+                  - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
+                    do echo waiting for myservice; sleep 2; done
+                  image: busybox:1.28
+                  name: init-myservice
+      replicas: 1
+    - generictemplate:  # OpenShift Route that exposes the Ray dashboard of the head service
+        apiVersion: route.openshift.io/v1
+        kind: Route
+        metadata:
+          labels:
+            odh-ray-cluster-service: finetuneflan-head-svc
+          name: ray-dashboard-finetuneflan
+          namespace: default
+        spec:
+          port:
+            targetPort: dashboard  # named port; the head container declares "dashboard" as 8265
+          to:
+            kind: Service
+            name: finetuneflan-head-svc
+      replicas: 1  # same "replicas" key the RayCluster generic item uses
+    Items: []
diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb
new file mode 100644
index 0000000..4275c59
--- /dev/null
+++ b/notebooks/ray-experiments/ray-flantune.ipynb
@@ -0,0 +1,797 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bbc21043",
+   "metadata": {},
+   "source": [
+    "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa",
+   "metadata": {},
+   "source": [
+    "To Do: I tried adding the FLAN fine-tuning code to this interactive notebook but hit some errors. They need to be resolved before we can tell whether the training can run in a distributed manner. The bitsandbytes package does not work with the CUDA and PyTorch versions in use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import pieces from codeflare-sdk\n",
+    "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
+    "from codeflare_sdk.cluster.auth import TokenAuthentication"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "614daa0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create authentication object for oc user permissions\n",
+    "auth = TokenAuthentication(\n",
+    "    token = \"XX\",\n",
+    "    server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n",
+    "    skip_tls=False\n",
+    ")\n",
+    "auth.login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc27f84c",
+   "metadata": {},
+   "source": [
+    "Once again, let's start by running through the same cluster setup as before:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0f4bc870-091f-4e11-9642-cba145710159",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Written to: finetuneflan.yaml\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create and configure our cluster object (and appwrapper)\n",
+    "cluster = Cluster(ClusterConfiguration(\n",
+    "    name='finetuneflan',\n",
+    "    namespace='default',\n",
+    "    min_worker=2,\n",
+    "    max_worker=2,\n",
+    "    min_cpus=1,\n",
+    "    max_cpus=2,\n",
+    "    min_memory=2,\n",
+    "    max_memory=8,\n",
+    "    gpu=1,\n",
+    "    instascale=False,\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waiting for requested resources to be set up...\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m      2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:225\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m    224\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 225\u001b[0m         \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    226\u001b[0m         time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m    227\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "# Bring up the cluster\n",
+    "cluster.up()\n",
+    "cluster.wait_ready()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "df71c1ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">                    </span><span style=\"font-weight: bold; font-style: italic\"> 🚀 CodeFlare Cluster Details 🚀</span><span style=\"font-style: italic\">                     </span>\n",
+       "<span style=\"font-weight: bold\">                                                                         </span>\n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   <span style=\"color: #c0c0c0; text-decoration-color: #c0c0c0; background-color: #008000; font-weight: bold\">Name</span>                                                              │ \n",
+       " │   <span style=\"font-weight: bold; text-decoration: underline\">finetuneflan</span>                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   <span style=\"font-weight: bold\">URI:</span> ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   <a href=\"http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\" target=\"_blank\"><span style=\"color: #000080; text-decoration-color: #000080; text-decoration: underline\">Dashboard🔗</span></a>                                                       │ \n",
+       " │                                                                     │ \n",
+       " │  <span style=\"font-style: italic\">                    Cluster Resources                     </span>         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │ <span style=\"font-weight: bold\"> Min  Max </span> │  │ <span style=\"font-weight: bold\"> Memory      CPU         GPU        </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\"> 2   </span><span style=\"color: #800080; text-decoration-color: #800080\"> 2   </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\"> 2~8        </span><span style=\"color: #800080; text-decoration-color: #800080\"> 1           1          </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[3m                    \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m                     \u001b[0m\n",
+       "\u001b[1m \u001b[0m\u001b[1m                                                                       \u001b[0m\u001b[1m \u001b[0m\n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   \u001b[1;37;42mName\u001b[0m                                                              │ \n",
+       " │   \u001b[1;4mfinetuneflan\u001b[0m                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   \u001b]8;id=510497;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\                                                       │ \n",
+       " │                                                                     │ \n",
+       " │  \u001b[3m                    Cluster Resources                     \u001b[0m         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │  │ \u001b[1m \u001b[0m\u001b[1mMemory    \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU       \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU       \u001b[0m\u001b[1m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m2  \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2  \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m2~8       \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "RayCluster(name='finetuneflan', status=<CodeFlareClusterStatus.STARTING: 2>, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cluster.details()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33663f47",
+   "metadata": {},
+   "source": [
+    "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n",
+    "\n",
+    "Using the SDK, we can get both the Ray cluster URI and dashboard URI:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c1719bca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n",
+      "ray://finetuneflan-head-svc.default.svc:10001\n"
+     ]
+    }
+   ],
+   "source": [
+    "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n",
+    "ray_cluster_uri = cluster.cluster_uri()\n",
+    "print(ray_dashboard_uri)\n",
+    "print(ray_cluster_uri)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2a2aca6a",
+   "metadata": {},
+   "source": [
+    "Now we can connect directly to our Ray cluster via the Ray python client:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "300146dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ray cluster is up and running:  True\n"
+     ]
+    }
+   ],
+   "source": [
+    "#before proceeding make sure the cluster exists and the uri is not empty\n",
+    "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
+    "\n",
+    "import ray\n",
+    "from ray.air.config import ScalingConfig\n",
+    "\n",
+    "# reset the ray context in case there's already one. \n",
+    "ray.shutdown()\n",
+    "# the connection to the ray cluster is established by ray.init() below\n",
+    "\n",
+    "# install additional libraries that will be required for model training\n",
+    "runtime_env = {\"pip\": [\"transformers\",\n",
+    "                       \"datasets\",\n",
+    "                       \"evaluate\",\n",
+    "                       \"pyarrow<7.0.0\",\n",
+    "                       \"accelerate\",\n",
+    "                       \"bitsandbytes\",\n",
+    "                       \"loralib\",\n",
+    "                       \"py7zr\",\n",
+    "                       \"tensorboard\",\n",
+    "                       \"peft\"], \n",
+    "              \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n",
+    "\n",
+    "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n",
+    "\n",
+    "print(\"Ray cluster is up and running: \", ray.is_initialized())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9711030b",
+   "metadata": {},
+   "source": [
+    "Now that we are connected (and have passed in some package requirements), let's try writing some training code that fine-tunes a FLAN-T5 XL model with LoRA via HuggingFace (using the samsum dataset):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "1b36e0d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@ray.remote\n",
+    "def train_fn():\n",
+    "    \"\"\"Fine-tune FLAN-T5 XL with LoRA on the samsum summarization dataset via Ray Train.\n",
+    "\n",
+    "    Runs as a Ray task: tokenizes samsum, derives padding lengths, then fits a HuggingFaceTrainer.\n",
+    "    \"\"\"\n",
+    "    import numpy as np\n",
+    "    from datasets import load_metric\n",
+    "    import ray\n",
+    "    from ray.air.config import ScalingConfig\n",
+    "    from ray.train.huggingface import HuggingFaceTrainer\n",
+    "    \n",
+    "    from datasets import load_dataset, concatenate_datasets\n",
+    "    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "    from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType\n",
+    "\n",
+    "    model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
+    "\n",
+    "    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "    \n",
+    "    dataset = load_dataset(\"samsum\")\n",
+    "\n",
+    "    print(f\"Train dataset size: {len(dataset['train'])}\")\n",
+    "    print(f\"Test dataset size: {len(dataset['test'])}\")\n",
+    "    \n",
+    "    #### COMPUTE MAX SEQ LEN ##########\n",
+    "    # The maximum total input sequence length after tokenization.\n",
+    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
+    "    conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n",
+    "\n",
+    "    \n",
+    "    tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
+    "                                                            truncation=True),\n",
+    "                                        batched=True,\n",
+    "                                        remove_columns=[\"dialogue\", \"summary\"])\n",
+    "    \n",
+    "    input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n",
+    "    # take 85 percentile of max length for better utilization\n",
+    "    max_source_length = int(np.percentile(input_lengths, 85))\n",
+    "    print(f\"Max source length: {max_source_length}\")\n",
+    "\n",
+    "    # The maximum total sequence length for target text after tokenization.\n",
+    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\"\n",
+    "    tokenized_targets = conc_dataset.map(lambda x: tokenizer(x[\"summary\"],\n",
+    "                                                            truncation=True),\n",
+    "                                        batched=True,\n",
+    "                                        remove_columns=[\"dialogue\", \"summary\"])  \n",
+    "    target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n",
+    "    # take 90 percentile of max length for better utilization\n",
+    "    max_target_length = int(np.percentile(target_lengths, 90))\n",
+    "    print(f\"Max target length: {max_target_length}\")\n",
+    "    \n",
+    "    #### PREPROCESS DATA ##########\n",
+    "    \n",
+    "    def preprocess_function(sample,padding=\"max_length\"):\n",
+    "        # add prefix to the input for t5\n",
+    "        inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n",
+    "\n",
+    "        # tokenize inputs\n",
+    "        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n",
+    "\n",
+    "        # Tokenize targets with the `text_target` keyword argument\n",
+    "        labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n",
+    "\n",
+    "        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n",
+    "        # padding in the loss.\n",
+    "        if padding == \"max_length\":\n",
+    "            labels[\"input_ids\"] = [\n",
+    "                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n",
+    "            ]\n",
+    "\n",
+    "        model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
+    "        return model_inputs\n",
+    "\n",
+    "    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n",
+    "    print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n",
+    "\n",
+    "    ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n",
+    "    ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n",
+    "\n",
+    "    def compute_metrics(eval_pred):\n",
+    "        metric = load_metric(\"accuracy\")\n",
+    "        logits, labels = eval_pred\n",
+    "        predictions = np.argmax(logits, axis=-1)\n",
+    "        return metric.compute(predictions=predictions, references=labels)\n",
+    "\n",
+    "    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
+    "        model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
+    "        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
+    "        lora_config = LoraConfig(\n",
+    "            r=16,\n",
+    "            lora_alpha=32,\n",
+    "            target_modules=[\"q\", \"v\"],\n",
+    "            lora_dropout=0.05,\n",
+    "            bias=\"none\",\n",
+    "            task_type=TaskType.SEQ_2_SEQ_LM\n",
+    "        )\n",
+    "        # prepare int-8 model for training\n",
+    "        model = prepare_model_for_int8_training(model)\n",
+    "\n",
+    "        # add LoRA adaptor\n",
+    "        model = get_peft_model(model, lora_config)\n",
+    "        model.print_trainable_parameters()\n",
+    "        \n",
+    "        from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
+    "\n",
+    "        # we want to ignore tokenizer pad token in the loss\n",
+    "        label_pad_token_id = -100\n",
+    "        # Data collator\n",
+    "        data_collator = DataCollatorForSeq2Seq(\n",
+    "            tokenizer,\n",
+    "            model=model,\n",
+    "            label_pad_token_id=label_pad_token_id,\n",
+    "            pad_to_multiple_of=8\n",
+    "        )\n",
+    "        \n",
+    "        output_dir=\"/tmp/flan/test\"\n",
+    "\n",
+    "        # Define training args\n",
+    "        training_args = Seq2SeqTrainingArguments(\n",
+    "            output_dir=output_dir,\n",
+    "            auto_find_batch_size=True,\n",
+    "            learning_rate=1e-3, # higher learning rate\n",
+    "            num_train_epochs=5,\n",
+    "            logging_dir=f\"{output_dir}/logs\",\n",
+    "            logging_strategy=\"steps\",\n",
+    "            logging_steps=500,\n",
+    "            save_strategy=\"no\",\n",
+    "            report_to=\"tensorboard\",\n",
+    "        )\n",
+    "\n",
+    "        trainer = Seq2SeqTrainer(model=model,\n",
+    "                                args=training_args,\n",
+    "                                data_collator=data_collator,\n",
+    "                                train_dataset=train_dataset)  # dataset handed to this worker by HuggingFaceTrainer\n",
+    "        \n",
+    "        return trainer\n",
+    "\n",
+    "    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
+    "\n",
+    "    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
+    "    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
+    "    trainer = HuggingFaceTrainer(\n",
+    "        trainer_init_per_worker=trainer_init_per_worker,\n",
+    "        scaling_config=scaling_config,\n",
+    "        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
+    "    )\n",
+    "    result = trainer.fit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4d8fd65",
+   "metadata": {},
+   "source": [
+    "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "5901d958",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ===================================BUG REPORT===================================\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m python -m bitsandbytes\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m  and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ================================================================================\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m To disable this warning, you can either:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n",
+      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
+      "100%|██████████| 3/3 [00:00<00:00, 680.49it/s]\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Train dataset size: 14732\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Test dataset size: 819\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max source length: 255\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max target length: 297\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-8356b281822134f5.arrow\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-af8f1296892299f1.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ===================================BUG REPORT===================================\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m python -m bitsandbytes\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m  and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ================================================================================\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | RUNNING  |       |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result for HuggingFaceTrainer_be877_00000:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   trial_id: be877_00000\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR    |       |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     |   # failures | error file                                                                                                                  |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 |            1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR    |       |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     |   # failures | error file                                                                                                                  |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 |            1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,170\tERROR serialization.py:371 -- [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Traceback (most recent call last):\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 369, in deserialize_objects\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     obj = self._deserialize_object(data, metadata, object_ref)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 252, in _deserialize_object\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     return self._deserialize_msgpack_data(data, metadata_fields)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 207, in _deserialize_msgpack_data\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     python_objects = self._deserialize_pickle5_data(pickle5_data)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 197, in _deserialize_pickle5_data\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     obj = pickle.loads(in_band)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,172\tERROR worker.py:763 -- Exception raised in creation task: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     self.setup(copy.deepcopy(self.config))\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     return ray.get(self.references[k])\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m traceback: Traceback (most recent call last):\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,177\tERROR trial_runner.py:993 -- Trial HuggingFaceTrainer_be877_00000: Error processing event.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     future_result = ray.get(ready_future)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     return func(*args, **kwargs)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2291, in get\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     raise value\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     self.setup(copy.deepcopy(self.config))\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     return ray.get(self.references[k])\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m traceback: Traceback (most recent call last):\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
+     ]
+    },
+    {
+     "ename": "RayTaskError(TrainingFailedError)",
+     "evalue": "\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n    future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n    self.setup(copy.deepcopy(self.config))\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n    setup_kwargs[k] = parameter_registry.get(prefix + k)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n    return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n    table = _memory_mapped_arrow_table_from_file(path)\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n    memory_mapped_stream = pa.memory_map(filename)\n  File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n  File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n  File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n  File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n    raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn [19], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#call the above cell as a remote ray function\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m    101\u001b[0m     \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m    102\u001b[0m     \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m    103\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m     35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m     36\u001b[0m     \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m     37\u001b[0m \n\u001b[1;32m     38\u001b[0m \u001b[38;5;124;03m    Args:\u001b[39;00m\n\u001b[1;32m     39\u001b[0m \u001b[38;5;124;03m        vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m     40\u001b[0m \u001b[38;5;124;03m        timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m    432\u001b[0m     op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m    433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m     res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    435\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m    436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m    460\u001b[0m         logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m    461\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m    463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m    464\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    465\u001b[0m ):\n\u001b[1;32m    466\u001b[0m     size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n",
+      "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n    future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n    self.setup(copy.deepcopy(self.config))\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n    setup_kwargs[k] = parameter_registry.get(prefix + k)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n    return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n    table = _memory_mapped_arrow_table_from_file(path)\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n    memory_mapped_stream = pa.memory_map(filename)\n  File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n  File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n  File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n  File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n    raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_be877_00000]\n",
+      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tINFO tune.py:777 -- Total run time: 5.50 seconds (5.39 seconds for the tuning loop).\n"
+     ]
+    }
+   ],
+   "source": [
+    "#call the above cell as a remote ray function\n",
+    "ray.get(train_fn.remote())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5af8cd32",
+   "metadata": {},
+   "source": [
+    "Once complete, we can bring our Ray cluster down and clean up:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'cluster' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn [1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cluster\u001b[38;5;241m.\u001b[39mdown()\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'cluster' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "cluster.down()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d41b90e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "auth.logout()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.14",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.14"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From f2b4f6759870080cc9b532b07a44f90ce251919a Mon Sep 17 00:00:00 2001
From: Shreyanand <shanand@redhat.com>
Date: Thu, 13 Jul 2023 17:07:12 +0000
Subject: [PATCH 2/5] Add recent changes

---
 notebooks/ray-experiments/finetuneflan.yaml  | 40 +++++++++++++++++++-
 notebooks/ray-experiments/ray-flantune.ipynb | 39 +++++++++----------
 2 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml
index 2cee801..dafc03c 100644
--- a/notebooks/ray-experiments/finetuneflan.yaml
+++ b/notebooks/ray-experiments/finetuneflan.yaml
@@ -1,6 +1,8 @@
 apiVersion: mcad.ibm.com/v1beta1
 kind: AppWrapper
 metadata:
+  labels:
+    orderedinstance: m5.xlarge_g4dn.xlarge
   name: finetuneflan
   namespace: default
 spec:
@@ -56,12 +58,29 @@ spec:
             serviceType: ClusterIP
             template:
               spec:
+                affinity:
+                  nodeAffinity:
+                    requiredDuringSchedulingIgnoredDuringExecution:
+                      nodeSelectorTerms:
+                      - matchExpressions:
+                        - key: finetuneflan
+                          operator: In
+                          values:
+                          - finetuneflan
                 containers:
                 - env:
                   - name: MY_POD_IP
                     valueFrom:
                       fieldRef:
                         fieldPath: status.podIP
+                  - name: RAY_USE_TLS
+                    value: '0'
+                  - name: RAY_TLS_SERVER_CERT
+                    value: /home/ray/workspace/tls/server.crt
+                  - name: RAY_TLS_SERVER_KEY
+                    value: /home/ray/workspace/tls/server.key
+                  - name: RAY_TLS_CA_CERT
+                    value: /home/ray/workspace/tls/ca.crt
                   image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                   imagePullPolicy: Always
                   lifecycle:
@@ -88,7 +107,8 @@ spec:
                       cpu: 2
                       memory: 8G
                       nvidia.com/gpu: 0
-          rayVersion: 1.12.0
+                imagePullSecrets: []
+          rayVersion: 2.1.0
           workerGroupSpecs:
           - groupName: small-group-finetuneflan
             maxReplicas: 2
@@ -104,12 +124,29 @@ spec:
                 labels:
                   key: value
               spec:
+                affinity:
+                  nodeAffinity:
+                    requiredDuringSchedulingIgnoredDuringExecution:
+                      nodeSelectorTerms:
+                      - matchExpressions:
+                        - key: finetuneflan
+                          operator: In
+                          values:
+                          - finetuneflan
                 containers:
                 - env:
                   - name: MY_POD_IP
                     valueFrom:
                       fieldRef:
                         fieldPath: status.podIP
+                  - name: RAY_USE_TLS
+                    value: '0'
+                  - name: RAY_TLS_SERVER_CERT
+                    value: /home/ray/workspace/tls/server.crt
+                  - name: RAY_TLS_SERVER_KEY
+                    value: /home/ray/workspace/tls/server.key
+                  - name: RAY_TLS_CA_CERT
+                    value: /home/ray/workspace/tls/ca.crt
                   image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                   lifecycle:
                     preStop:
@@ -128,6 +165,7 @@ spec:
                       cpu: 1
                       memory: 2G
                       nvidia.com/gpu: 1
+                imagePullSecrets: []
                 initContainers:
                 - command:
                   - sh
diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb
index 4275c59..bac7a90 100644
--- a/notebooks/ray-experiments/ray-flantune.ipynb
+++ b/notebooks/ray-experiments/ray-flantune.ipynb
@@ -48,7 +48,7 @@
    "source": [
     "# Create authentication object for oc user permissions\n",
     "auth = TokenAuthentication(\n",
-    "    token = \"XX\",\n",
+    "    token = \"xxx\",\n",
     "    server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n",
     "    skip_tls=False\n",
     ")\n",
@@ -89,7 +89,8 @@
     "    min_memory=2,\n",
     "    max_memory=8,\n",
     "    gpu=1,\n",
-    "    instascale=False,\n",
+    "    instascale=True,\n",
+    "    machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
     "))"
    ]
   },
@@ -120,7 +121,13 @@
       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
       "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
       "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m      2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:225\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m    224\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 225\u001b[0m         \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    226\u001b[0m         time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m    227\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:221\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    219\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m    220\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ready:\n\u001b[0;32m--> 221\u001b[0m     status, ready \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprint_to_console\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m    222\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m==\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN:\n\u001b[1;32m    223\u001b[0m         \u001b[38;5;28mprint\u001b[39m(\n\u001b[1;32m    224\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWARNING: Current cluster status is unknown, have you run cluster.up yet?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    225\u001b[0m         )\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:160\u001b[0m, in \u001b[0;36mCluster.status\u001b[0;34m(self, print_to_console)\u001b[0m\n\u001b[1;32m    158\u001b[0m status \u001b[38;5;241m=\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN\n\u001b[1;32m    159\u001b[0m \u001b[38;5;66;03m# check the app wrapper status\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m appwrapper \u001b[38;5;241m=\u001b[39m \u001b[43m_app_wrapper_status\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnamespace\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper:\n\u001b[1;32m    162\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m appwrapper\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m    163\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING,\n\u001b[1;32m    164\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mCOMPLETED,\n\u001b[1;32m    165\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING_HOLD_COMPLETION,\n\u001b[1;32m    166\u001b[0m     ]:\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:330\u001b[0m, in \u001b[0;36m_app_wrapper_status\u001b[0;34m(name, namespace)\u001b[0m\n\u001b[1;32m    328\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    329\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mproject(namespace), oc\u001b[38;5;241m.\u001b[39mtimeout(\u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m):\n\u001b[0;32m--> 330\u001b[0m         cluster \u001b[38;5;241m=\u001b[39m \u001b[43moc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mappwrapper/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mname\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    331\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mOpenShiftPythonException \u001b[38;5;28;01mas\u001b[39;00m osp:  \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m    332\u001b[0m     msg \u001b[38;5;241m=\u001b[39m osp\u001b[38;5;241m.\u001b[39mmsg\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:403\u001b[0m, in \u001b[0;36mSelector.object\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m    394\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mobject\u001b[39m(\u001b[38;5;28mself\u001b[39m, ignore_not_found\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m    395\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    396\u001b[0m \u001b[38;5;124;03m    Returns a single APIObject that represents the selected resource. If multiple\u001b[39;00m\n\u001b[1;32m    397\u001b[0m \u001b[38;5;124;03m    resources are being selected an exception will be thrown (use objects() when\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    401\u001b[0m \u001b[38;5;124;03m    :return: A Model of the selected resource.\u001b[39;00m\n\u001b[1;32m    402\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 403\u001b[0m     objs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobjects\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    404\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m    405\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m ignore_not_found:\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:423\u001b[0m, in \u001b[0;36mSelector.objects\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m    414\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    415\u001b[0m \u001b[38;5;124;03mReturns a python list of APIObject objects that represent the selected resources. An\u001b[39;00m\n\u001b[1;32m    416\u001b[0m \u001b[38;5;124;03mempty is returned if nothing is selected.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    419\u001b[0m \u001b[38;5;124;03m:return: A list of Model objects representing the receiver's selected resources.\u001b[39;00m\n\u001b[1;32m    420\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    421\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapiobject\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m APIObject\n\u001b[0;32m--> 423\u001b[0m obj \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43mignore_not_found\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_not_found\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    425\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    426\u001b[0m     api_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(obj)\u001b[38;5;241m.\u001b[39melements(\u001b[38;5;28mcls\u001b[39m)\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:380\u001b[0m, in \u001b[0;36mSelector.object_json\u001b[0;34m(self, ignore_not_found)\u001b[0m\n\u001b[1;32m    377\u001b[0m     cmd_args\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--ignore-not-found\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    379\u001b[0m r \u001b[38;5;241m=\u001b[39m Result(verb)\n\u001b[0;32m--> 380\u001b[0m r\u001b[38;5;241m.\u001b[39madd_action(\u001b[43moc_action\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mall_namespaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_namespaces\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcmd_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcmd_args\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    381\u001b[0m r\u001b[38;5;241m.\u001b[39mfail_if(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to read object\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    383\u001b[0m \u001b[38;5;66;03m# --ignore-not-found returns an empty string instead of an error if nothing is found\u001b[39;00m\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/action.py:363\u001b[0m, in \u001b[0;36moc_action\u001b[0;34m(context, verb, cmd_args, all_namespaces, no_namespace, namespace, references, stdin_obj, stdin_str, last_attempt, **kwargs)\u001b[0m\n\u001b[1;32m    361\u001b[0m         \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m    362\u001b[0m             \u001b[38;5;28;01mpass\u001b[39;00m  \u001b[38;5;66;03m# ignore\u001b[39;00m\n\u001b[0;32m--> 363\u001b[0m     \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    364\u001b[0m     period \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(\u001b[38;5;241m1\u001b[39m, period \u001b[38;5;241m+\u001b[39m period)  \u001b[38;5;66;03m# Poll fast at first, but slow down to 1/sec over time\u001b[39;00m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;66;03m# See note in paramiko flow on decoding\u001b[39;00m\n",
       "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
      ]
     }
@@ -133,7 +140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "df71c1ed",
    "metadata": {},
    "outputs": [
@@ -169,7 +176,7 @@
        " │                                                                     │ \n",
        " │   \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001                │ \n",
        " │                                                                     │ \n",
-       " │   \u001b]8;id=510497;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\                                                       │ \n",
+       " │   \u001b]8;id=991912;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\                                                       │ \n",
        " │                                                                     │ \n",
        " │  \u001b[3m                    Cluster Resources                     \u001b[0m         │ \n",
        " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
@@ -190,7 +197,7 @@
        "RayCluster(name='finetuneflan', status=<CodeFlareClusterStatus.STARTING: 2>, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -737,22 +744,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'cluster' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn [1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cluster\u001b[38;5;241m.\u001b[39mdown()\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'cluster' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "cluster.down()"
    ]
@@ -770,7 +765,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.9.14",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -784,7 +779,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.14"
+   "version": "3.8.13"
   },
   "vscode": {
    "interpreter": {

From 1bcaf5b84dcbb774c9a62fb28b762135ec9bf5ee Mon Sep 17 00:00:00 2001
From: Shreyanand <shanand@redhat.com>
Date: Thu, 10 Aug 2023 20:29:08 +0000
Subject: [PATCH 3/5] Add initial ray experiments

---
 .../ray-flan-interactive.ipynb                | 3499 +++++++++++++++++
 1 file changed, 3499 insertions(+)
 create mode 100644 notebooks/ray-experiments/ray-flan-interactive.ipynb

diff --git a/notebooks/ray-experiments/ray-flan-interactive.ipynb b/notebooks/ray-experiments/ray-flan-interactive.ipynb
new file mode 100644
index 0000000..858894c
--- /dev/null
+++ b/notebooks/ray-experiments/ray-flan-interactive.ipynb
@@ -0,0 +1,3499 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bbc21043",
+   "metadata": {},
+   "source": [
+    "# Fine-tune the Flan-T5 model using the CodeFlare stack and Ray distributed training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa",
+   "metadata": {},
+   "source": [
+    "This notebook fine-tunes the Flan-T5 model on a summarization dataset. It first uses InstaScale to add the required machines to the OpenShift cluster, then uses the CodeFlare stack to spin up a Ray cluster. Finally, it uses the Ray Train API to distribute the training job across multiple nodes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import pieces from codeflare-sdk\n",
+    "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
+    "from codeflare_sdk.cluster.auth import TokenAuthentication"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a066b71b-4967-4d03-8601-c2afb2d0b507",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2.1.0'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Check ray version: it should match the worker's ray version\n",
+    "import ray\n",
+    "ray.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "614daa0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create authentication object for oc user permissions\n",
+    "auth = TokenAuthentication(\n",
+    "    token = \"xxx\",\n",
+    "    server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n",
+    "    skip_tls=False\n",
+    ")\n",
+    "auth.login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc27f84c",
+   "metadata": {},
+   "source": [
+    "Once again, let's start by running through the same cluster setup as before:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0f4bc870-091f-4e11-9642-cba145710159",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Written to: finetuneflan.yaml\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create and configure our cluster object (and appwrapper)\n",
+    "cluster = Cluster(ClusterConfiguration(\n",
+    "    name='finetuneflan',\n",
+    "    namespace='default',\n",
+    "    min_worker=2,\n",
+    "    max_worker=2,\n",
+    "    min_cpus=1,\n",
+    "    max_cpus=2,\n",
+    "    min_memory=8,\n",
+    "    max_memory=24,\n",
+    "    gpu=1,\n",
+    "    instascale=True,\n",
+    "    machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waiting for requested resources to be set up...\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn [14], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m      2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:229\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    227\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m    228\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 229\u001b[0m         \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    230\u001b[0m         time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m    231\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "# Bring up the cluster\n",
+    "cluster.up()\n",
+    "cluster.wait_ready()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "df71c1ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">                    </span><span style=\"font-weight: bold; font-style: italic\"> 🚀 CodeFlare Cluster Details 🚀</span><span style=\"font-style: italic\">                     </span>\n",
+       "<span style=\"font-weight: bold\">                                                                         </span>\n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   <span style=\"color: #c0c0c0; text-decoration-color: #c0c0c0; background-color: #008000; font-weight: bold\">Name</span>                                                              │ \n",
+       " │   <span style=\"font-weight: bold; text-decoration: underline\">finetuneflan</span>                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   <span style=\"font-weight: bold\">URI:</span> ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   <a href=\"http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\" target=\"_blank\"><span style=\"color: #000080; text-decoration-color: #000080; text-decoration: underline\">Dashboard🔗</span></a>                                                       │ \n",
+       " │                                                                     │ \n",
+       " │  <span style=\"font-style: italic\">                    Cluster Resources                     </span>         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │ <span style=\"font-weight: bold\"> Min  Max </span> │  │ <span style=\"font-weight: bold\"> Memory      CPU         GPU        </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\"> 2   </span><span style=\"color: #800080; text-decoration-color: #800080\"> 2   </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\"> 8~24       </span><span style=\"color: #800080; text-decoration-color: #800080\"> 1           1          </span> │          │ \n",
+       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[3m                    \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m                     \u001b[0m\n",
+       "\u001b[1m \u001b[0m\u001b[1m                                                                       \u001b[0m\u001b[1m \u001b[0m\n",
+       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
+       " │   \u001b[1;37;42mName\u001b[0m                                                              │ \n",
+       " │   \u001b[1;4mfinetuneflan\u001b[0m                                        Inactive ❌   │ \n",
+       " │                                                                     │ \n",
+       " │   \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001                │ \n",
+       " │                                                                     │ \n",
+       " │   \u001b]8;id=384441;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\                                                       │ \n",
+       " │                                                                     │ \n",
+       " │  \u001b[3m                    Cluster Resources                     \u001b[0m         │ \n",
+       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
+       " │   │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │  │ \u001b[1m \u001b[0m\u001b[1mMemory    \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU       \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU       \u001b[0m\u001b[1m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m2  \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2  \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m8~24      \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
+       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
+       " ╰─────────────────────────────────────────────────────────────────────╯ \n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "RayCluster(name='finetuneflan', status=<CodeFlareClusterStatus.STARTING: 2>, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=24, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cluster.details()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33663f47",
+   "metadata": {},
+   "source": [
+    "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n",
+    "\n",
+    "Using the SDK, we can get both the Ray cluster URI and dashboard URI:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c1719bca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n",
+      "ray://finetuneflan-head-svc.default.svc:10001\n"
+     ]
+    }
+   ],
+   "source": [
+    "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n",
+    "ray_cluster_uri = cluster.cluster_uri()\n",
+    "print(ray_dashboard_uri)\n",
+    "print(ray_cluster_uri)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2a2aca6a",
+   "metadata": {},
+   "source": [
+    "Now we can connect directly to our Ray cluster via the Ray python client:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "300146dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ray cluster is up and running:  True\n"
+     ]
+    }
+   ],
+   "source": [
+    "#before proceeding make sure the cluster exists and the uri is not empty\n",
+    "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
+    "\n",
+    "import ray\n",
+    "from ray.air.config import ScalingConfig\n",
+    "\n",
+    "# reset the ray context in case there's already one. \n",
+    "ray.shutdown()\n",
+    "# establish connection to ray cluster\n",
+    "\n",
+    "# install additional libraries that will be required for model training\n",
+    "runtime_env = {\"pip\": [\"transformers\",\n",
+    "                       \"datasets\",\n",
+    "                       \"evaluate\",\n",
+    "                       \"pyarrow<7.0.0\",\n",
+    "                       \"accelerate\",\n",
+    "                       \"loralib\",\n",
+    "                       \"py7zr\",\n",
+    "                       \"tensorboard\",\n",
+    "                       \"peft\"], \n",
+    "              \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n",
+    "\n",
+    "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n",
+    "\n",
+    "print(\"Ray cluster is up and running: \", ray.is_initialized())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9711030b",
+   "metadata": {},
+   "source": [
+    "Now that we are connected (and have passed in some package requirements), let's write some training code that fine-tunes the `google/flan-t5-xl` model with LoRA (via HuggingFace `transformers` and `peft`) on the `samsum` dialogue-summarization dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "1b36e0d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@ray.remote\n",
+    "def train_fn():\n",
+    "    from datasets import load_dataset\n",
+    "    import transformers\n",
+    "    from transformers import AutoTokenizer, TrainingArguments\n",
+    "    from transformers import AutoModelForSequenceClassification\n",
+    "    import numpy as np\n",
+    "    from datasets import load_metric\n",
+    "    import ray\n",
+    "    from ray import tune\n",
+    "    from ray.train.huggingface import HuggingFaceTrainer\n",
+    "    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
+    "    from datasets import load_dataset, concatenate_datasets\n",
+    "    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "    from peft import LoraConfig, get_peft_model, TaskType #, prepare_model_for_int8_training\n",
+    "\n",
+    "    model_name = \"google/flan-t5-xl\"\n",
+    "\n",
+    "    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "    \n",
+    "    dataset = load_dataset(\"samsum\")\n",
+    "\n",
+    "    print(f\"Train dataset size: {len(dataset['train'])}\")\n",
+    "    print(f\"Test dataset size: {len(dataset['test'])}\")\n",
+    "    \n",
+    "    #### COMPUTE MAX SEQ LEN ##########\n",
+    "    # The maximum total input sequence length after tokenization.\n",
+    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
+    "    conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n",
+    "\n",
+    "    \n",
+    "    tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
+    "                                                            truncation=True),\n",
+    "                                        batched=True,\n",
+    "                                        remove_columns=[\"dialogue\", \"summary\"])\n",
+    "    \n",
+    "    input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n",
+    "    # use the 85th percentile of the tokenized input lengths (instead of the maximum) for better utilization\n",
+    "    max_source_length = int(np.percentile(input_lengths, 85))\n",
+    "    print(f\"Max source length: {max_source_length}\")\n",
+    "\n",
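+    "    # The maximum total sequence length for target text after tokenization.\n",
+    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
+    "    # NOTE(review): the map below tokenizes the \"dialogue\" column, not \"summary\", so the target length\n",
+    "    # is derived from the inputs; it presumably should use x[\"summary\"] - confirm.\n",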
+    "    tokenized_targets = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
+    "                                                            truncation=True),\n",
+    "                                        batched=True,\n",
+    "                                        remove_columns=[\"dialogue\", \"summary\"])  \n",
+    "    target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n",
+    "    # use the 90th percentile of the tokenized target lengths (instead of the maximum) for better utilization\n",
+    "    max_target_length = int(np.percentile(target_lengths, 90))\n",
+    "    print(f\"Max target length: {max_target_length}\")\n",
+    "    \n",
+    "    #### PREPROCESS DATA ##########\n",
+    "    \n",
+    "    def preprocess_function(sample,padding=\"max_length\"):\n",
+    "        # add prefix to the input for t5\n",
+    "        inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n",
+    "\n",
+    "        # tokenize inputs\n",
+    "        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n",
+    "\n",
+    "        # Tokenize targets with the `text_target` keyword argument\n",
+    "        labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n",
+    "\n",
+    "        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n",
+    "        # padding in the loss.\n",
+    "        if padding == \"max_length\":\n",
+    "            labels[\"input_ids\"] = [\n",
+    "                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n",
+    "            ]\n",
+    "\n",
+    "        model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
+    "        return model_inputs\n",
+    "\n",
+    "    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n",
+    "    print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n",
+    "\n",
+    "    ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n",
+    "    ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n",
+    "\n",
+    "    def compute_metrics(eval_pred):\n",
+    "        metric = load_metric(\"accuracy\")\n",
+    "        logits, labels = eval_pred\n",
+    "        predictions = np.argmax(logits, axis=-1)\n",
+    "        return metric.compute(predictions=predictions, references=labels)\n",
+    "    \n",
+    "    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
+    "        model_name = \"google/flan-t5-xl\"\n",
+    "        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map=\"auto\")\n",
+    "        lora_config = LoraConfig(\n",
+    "            r=16,\n",
+    "            lora_alpha=32,\n",
+    "            target_modules=[\"q\", \"v\"],\n",
+    "            lora_dropout=0.05,\n",
+    "            bias=\"none\",\n",
+    "            task_type=TaskType.SEQ_2_SEQ_LM\n",
+    "        )\n",
+    "        # prepare int-8 model for training\n",
+    "        #model = prepare_model_for_int8_training(model)\n",
+    "\n",
+    "        # add LoRA adaptor\n",
+    "        model = get_peft_model(model, lora_config)\n",
+    "        model.print_trainable_parameters()\n",
+    "        \n",
+    "        from transformers import DataCollatorForSeq2Seq\n",
+    "\n",
+    "        # we want to ignore tokenizer pad token in the loss\n",
+    "        label_pad_token_id = -100\n",
+    "        # Data collator\n",
+    "        data_collator = DataCollatorForSeq2Seq(\n",
+    "            tokenizer,\n",
+    "            model=model,\n",
+    "            label_pad_token_id=label_pad_token_id,\n",
+    "            pad_to_multiple_of=8\n",
+    "        )\n",
+    "        \n",
+    "        output_dir=\"/tmp/flan/test\"\n",
+    "\n",
+    "        # Define training args\n",
+    "        training_args = Seq2SeqTrainingArguments(\n",
+    "            output_dir=output_dir,\n",
+    "            auto_find_batch_size=True,\n",
+    "            learning_rate=1e-3, # higher learning rate\n",
+    "            num_train_epochs=5,\n",
+    "            logging_dir=f\"{output_dir}/logs\",\n",
+    "            logging_strategy=\"steps\",\n",
+    "            logging_steps=500,\n",
+    "            save_strategy=\"no\",\n",
+    "            report_to=\"tensorboard\",\n",
+    "        )\n",
+    "\n",
+    "        trainer = Seq2SeqTrainer(model=model,\n",
+    "                                args=training_args,\n",
+    "                                data_collator=data_collator,\n",
+    "                                train_dataset=train_dataset,\n",
+    "                                eval_dataset=eval_dataset)\n",
+    "        \n",
+    "        return trainer\n",
+    "\n",
+    "    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
+    "\n",
+    "    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
+    "    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
+    "    trainer = HuggingFaceTrainer(\n",
+    "        trainer_init_per_worker=trainer_init_per_worker,\n",
+    "        scaling_config=scaling_config,\n",
+    "        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
+    "    )\n",
+    "    result = trainer.fit()\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4d8fd65",
+   "metadata": {},
+   "source": [
+    "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5901d958",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)okenizer_config.json: 100%|██████████| 2.54k/2.54k [00:00<00:00, 767kB/s]\n",
+      "Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 99.4MB/s]\n",
+      "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]\n",
+      "Downloading (…)/main/tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 48.7MB/s]\n",
+      "Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 1.34MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Train dataset size: 14732\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Test dataset size: 819\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n",
+      "100%|██████████| 3/3 [00:00<00:00, 837.63it/s]\n",
+      "Map:   0%|          | 0/15551 [00:00<?, ? examples/s]\n",
+      "Map:   6%|▋         | 1000/15551 [00:00<00:03, 4023.66 examples/s]\n",
+      "Map:  13%|█▎        | 2000/15551 [00:00<00:03, 4031.63 examples/s]\n",
+      "Map:  19%|█▉        | 3000/15551 [00:00<00:02, 4215.37 examples/s]\n",
+      "Map:  26%|██▌       | 4000/15551 [00:00<00:02, 4380.19 examples/s]\n",
+      "Map:  32%|███▏      | 5000/15551 [00:01<00:02, 4403.46 examples/s]\n",
+      "Map:  39%|███▊      | 6000/15551 [00:01<00:02, 4544.88 examples/s]\n",
+      "Map:  45%|████▌     | 7000/15551 [00:01<00:01, 4502.46 examples/s]\n",
+      "Map:  51%|█████▏    | 8000/15551 [00:01<00:01, 4506.27 examples/s]\n",
+      "Map:  58%|█████▊    | 9000/15551 [00:02<00:01, 4525.94 examples/s]\n",
+      "Map:  64%|██████▍   | 10000/15551 [00:02<00:01, 4507.28 examples/s]\n",
+      "Map:  71%|███████   | 11000/15551 [00:02<00:01, 4394.20 examples/s]\n",
+      "Map:  77%|███████▋  | 12000/15551 [00:02<00:00, 4482.44 examples/s]\n",
+      "Map:  84%|████████▎ | 13000/15551 [00:02<00:00, 4499.59 examples/s]\n",
+      "Map:  90%|█████████ | 14000/15551 [00:03<00:00, 4488.96 examples/s]\n",
+      "Map:  96%|█████████▋| 15000/15551 [00:03<00:00, 4321.68 examples/s]\n",
+      "                                                                   \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Max source length: 255\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map:   0%|          | 0/15551 [00:00<?, ? examples/s]\n",
+      "Map:   6%|▋         | 1000/15551 [00:00<00:02, 4940.97 examples/s]\n",
+      "Map:  13%|█▎        | 2000/15551 [00:00<00:02, 4525.57 examples/s]\n",
+      "Map:  19%|█▉        | 3000/15551 [00:00<00:03, 4073.89 examples/s]\n",
+      "Map:  26%|██▌       | 4000/15551 [00:01<00:03, 3686.14 examples/s]\n",
+      "Map:  32%|███▏      | 5000/15551 [00:01<00:03, 3482.70 examples/s]\n",
+      "Map:  39%|███▊      | 6000/15551 [00:01<00:02, 3446.74 examples/s]\n",
+      "Map:  45%|████▌     | 7000/15551 [00:01<00:02, 3546.28 examples/s]\n",
+      "Map:  51%|█████▏    | 8000/15551 [00:02<00:02, 3634.73 examples/s]\n",
+      "Map:  58%|█████▊    | 9000/15551 [00:02<00:02, 3271.87 examples/s]\n",
+      "Map:  64%|██████▍   | 10000/15551 [00:02<00:01, 3237.06 examples/s]\n",
+      "Map:  71%|███████   | 11000/15551 [00:03<00:01, 3545.31 examples/s]\n",
+      "Map:  77%|███████▋  | 12000/15551 [00:03<00:00, 3766.11 examples/s]\n",
+      "Map:  84%|████████▎ | 13000/15551 [00:03<00:00, 4014.10 examples/s]\n",
+      "Map:  90%|█████████ | 14000/15551 [00:03<00:00, 4194.66 examples/s]\n",
+      "Map:  96%|█████████▋| 15000/15551 [00:03<00:00, 4252.76 examples/s]\n",
+      "                                                                   \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Max target length: 297\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map:   0%|          | 0/14732 [00:00<?, ? examples/s]\n",
+      "Map:   7%|▋         | 1000/14732 [00:00<00:08, 1593.46 examples/s]\n",
+      "Map:  14%|█▎        | 2000/14732 [00:01<00:08, 1508.77 examples/s]\n",
+      "Map:  20%|██        | 3000/14732 [00:01<00:07, 1528.24 examples/s]\n",
+      "Map:  27%|██▋       | 4000/14732 [00:02<00:06, 1535.12 examples/s]\n",
+      "Map:  34%|███▍      | 5000/14732 [00:03<00:06, 1522.68 examples/s]\n",
+      "Map:  41%|████      | 6000/14732 [00:03<00:05, 1551.01 examples/s]\n",
+      "Map:  48%|████▊     | 7000/14732 [00:04<00:05, 1491.83 examples/s]\n",
+      "Map:  54%|█████▍    | 8000/14732 [00:05<00:04, 1419.68 examples/s]\n",
+      "Map:  61%|██████    | 9000/14732 [00:06<00:03, 1453.36 examples/s]\n",
+      "Map:  68%|██████▊   | 10000/14732 [00:06<00:03, 1433.24 examples/s]\n",
+      "Map:  75%|███████▍  | 11000/14732 [00:07<00:02, 1401.46 examples/s]\n",
+      "Map:  81%|████████▏ | 12000/14732 [00:08<00:01, 1422.42 examples/s]\n",
+      "Map:  88%|████████▊ | 13000/14732 [00:08<00:01, 1439.46 examples/s]\n",
+      "Map:  95%|█████████▌| 14000/14732 [00:09<00:00, 1476.15 examples/s]\n",
+      "Map:   0%|          | 0/819 [00:00<?, ? examples/s]                \n",
+      "Map:   0%|          | 0/818 [00:00<?, ? examples/s]            \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                               \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m To disable this warning, you can either:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:29 (running for 00:00:08.27)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m 2023-07-27 07:58:32,632\tINFO config.py:87 -- Setting up process group for: env:// [rank=0, world_size=2]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:34 (running for 00:00:13.27)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)lve/main/config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 416kB/s]\n",
+      "Downloading (…)lve/main/config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 414kB/s]\n",
+      "Downloading (…)model.bin.index.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]\n",
+      "Downloading (…)model.bin.index.json: 100%|██████████| 50.8k/50.8k [00:00<00:00, 15.2MB/s]\n",
+      "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]\n",
+      "Downloading (…)model.bin.index.json: 100%|██████████| 50.8k/50.8k [00:00<00:00, 14.9MB/s]\n",
+      "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]\n",
+      "Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.45G [00:00<?, ?B/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.45G [00:00<?, ?B/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   0%|          | 31.5M/9.45G [00:00<00:37, 249MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   0%|          | 41.9M/9.45G [00:00<00:23, 404MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   1%|          | 62.9M/9.45G [00:00<00:36, 257MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   1%|          | 94.4M/9.45G [00:00<00:20, 465MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   1%|          | 94.4M/9.45G [00:00<00:34, 268MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   2%|▏         | 147M/9.45G [00:00<00:20, 452MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   2%|▏         | 199M/9.45G [00:00<00:20, 457MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   1%|▏         | 136M/9.45G [00:00<00:29, 316MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   3%|▎         | 252M/9.45G [00:00<00:19, 473MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   2%|▏         | 178M/9.45G [00:00<00:27, 334MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   3%|▎         | 304M/9.45G [00:00<00:20, 456MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   2%|▏         | 220M/9.45G [00:00<00:28, 325MB/s]\u001b[A\n",
+      "Downloading (…)l-00001-of-00002.bin:   3%|▎         | 262M/9.45G [00:00<00:27, 334MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:39 (running for 00:00:18.27)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 4.0/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   4%|▍         | 357M/9.45G [00:00<00:20, 444MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   3%|▎         | 304M/9.45G [00:00<00:26, 344MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   4%|▍         | 409M/9.45G [00:00<00:20, 437MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   5%|▍         | 461M/9.45G [00:01<00:20, 445MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   4%|▎         | 346M/9.45G [00:01<00:28, 323MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   5%|▌         | 514M/9.45G [00:01<00:19, 459MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   4%|▍         | 388M/9.45G [00:01<00:27, 325MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   6%|▌         | 566M/9.45G [00:01<00:19, 450MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   5%|▍         | 430M/9.45G [00:01<00:26, 336MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   7%|▋         | 619M/9.45G [00:01<00:20, 436MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   5%|▍         | 472M/9.45G [00:01<00:25, 352MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   7%|▋         | 671M/9.45G [00:01<00:19, 446MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   5%|▌         | 514M/9.45G [00:01<00:25, 350MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   8%|▊         | 724M/9.45G [00:01<00:18, 460MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   8%|▊         | 776M/9.45G [00:01<00:18, 473MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   6%|▌         | 556M/9.45G [00:01<00:26, 335MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   6%|▋         | 598M/9.45G [00:01<00:27, 323MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   9%|▉         | 828M/9.45G [00:01<00:18, 475MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   7%|▋         | 640M/9.45G [00:01<00:26, 337MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   9%|▉         | 881M/9.45G [00:01<00:17, 482MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   7%|▋         | 682M/9.45G [00:02<00:25, 348MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  10%|▉         | 933M/9.45G [00:02<00:17, 491MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   8%|▊         | 724M/9.45G [00:02<00:24, 355MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  11%|█         | 996M/9.45G [00:02<00:16, 504MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   8%|▊         | 765M/9.45G [00:02<00:24, 358MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  11%|█         | 1.05G/9.45G [00:02<00:16, 506MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   9%|▊         | 807M/9.45G [00:02<00:24, 359MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  12%|█▏        | 1.10G/9.45G [00:02<00:16, 502MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  12%|█▏        | 1.15G/9.45G [00:02<00:16, 490MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   9%|▉         | 849M/9.45G [00:02<00:25, 342MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  13%|█▎        | 1.21G/9.45G [00:02<00:16, 490MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:   9%|▉         | 891M/9.45G [00:02<00:24, 352MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  13%|█▎        | 1.26G/9.45G [00:02<00:16, 488MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  10%|▉         | 933M/9.45G [00:02<00:24, 347MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  14%|█▍        | 1.31G/9.45G [00:02<00:16, 486MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  14%|█▍        | 1.36G/9.45G [00:02<00:16, 486MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  10%|█         | 975M/9.45G [00:02<00:25, 328MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  15%|█▍        | 1.42G/9.45G [00:03<00:16, 485MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  11%|█         | 1.02G/9.45G [00:03<00:25, 330MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  16%|█▌        | 1.47G/9.45G [00:03<00:16, 484MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  11%|█         | 1.06G/9.45G [00:03<00:25, 334MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  16%|█▌        | 1.52G/9.45G [00:03<00:16, 488MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  12%|█▏        | 1.10G/9.45G [00:03<00:25, 324MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  17%|█▋        | 1.57G/9.45G [00:03<00:16, 489MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  17%|█▋        | 1.63G/9.45G [00:03<00:15, 497MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  12%|█▏        | 1.14G/9.45G [00:03<00:25, 329MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  18%|█▊        | 1.68G/9.45G [00:03<00:15, 497MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  13%|█▎        | 1.18G/9.45G [00:03<00:25, 327MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  18%|█▊        | 1.73G/9.45G [00:03<00:15, 494MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  13%|█▎        | 1.23G/9.45G [00:03<00:25, 328MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  19%|█▉        | 1.78G/9.45G [00:03<00:15, 482MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  13%|█▎        | 1.27G/9.45G [00:03<00:25, 321MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  19%|█▉        | 1.84G/9.45G [00:03<00:19, 393MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  14%|█▍        | 1.31G/9.45G [00:03<00:25, 322MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  14%|█▍        | 1.35G/9.45G [00:04<00:25, 319MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  20%|█▉        | 1.89G/9.45G [00:04<00:23, 321MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  15%|█▍        | 1.39G/9.45G [00:04<00:24, 323MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  15%|█▌        | 1.44G/9.45G [00:04<00:24, 325MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  20%|██        | 1.93G/9.45G [00:04<00:25, 297MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  21%|██        | 1.97G/9.45G [00:04<00:25, 299MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  16%|█▌        | 1.48G/9.45G [00:04<00:30, 264MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  21%|██▏       | 2.01G/9.45G [00:04<00:25, 296MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  16%|█▌        | 1.51G/9.45G [00:04<00:28, 274MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  22%|██▏       | 2.04G/9.45G [00:04<00:25, 291MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  16%|█▋        | 1.55G/9.45G [00:04<00:27, 286MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  22%|██▏       | 2.08G/9.45G [00:04<00:25, 288MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  17%|█▋        | 1.58G/9.45G [00:04<00:27, 282MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  22%|██▏       | 2.11G/9.45G [00:04<00:25, 291MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  17%|█▋        | 1.63G/9.45G [00:05<00:25, 304MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.14G/9.45G [00:05<00:26, 281MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  18%|█▊        | 1.67G/9.45G [00:05<00:25, 310MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.17G/9.45G [00:05<00:26, 275MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  18%|█▊        | 1.71G/9.45G [00:05<00:24, 314MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.20G/9.45G [00:05<00:26, 277MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  19%|█▊        | 1.75G/9.45G [00:05<00:24, 310MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  24%|██▎       | 2.23G/9.45G [00:05<00:25, 281MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  19%|█▉        | 1.79G/9.45G [00:05<00:24, 316MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  24%|██▍       | 2.26G/9.45G [00:05<00:25, 283MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  24%|██▍       | 2.30G/9.45G [00:05<00:25, 278MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  19%|█▉        | 1.84G/9.45G [00:05<00:24, 305MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  25%|██▍       | 2.33G/9.45G [00:05<00:25, 278MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  20%|█▉        | 1.88G/9.45G [00:05<00:23, 316MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  25%|██▍       | 2.36G/9.45G [00:05<00:24, 284MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:44 (running for 00:00:23.28)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  20%|██        | 1.92G/9.45G [00:05<00:23, 326MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  25%|██▌       | 2.40G/9.45G [00:06<00:23, 297MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  21%|██        | 1.96G/9.45G [00:06<00:22, 327MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  26%|██▌       | 2.43G/9.45G [00:06<00:24, 285MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  21%|██        | 2.00G/9.45G [00:06<00:22, 326MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  26%|██▌       | 2.46G/9.45G [00:06<00:24, 280MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  22%|██▏       | 2.04G/9.45G [00:06<00:22, 335MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  26%|██▋       | 2.50G/9.45G [00:06<00:24, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  22%|██▏       | 2.09G/9.45G [00:06<00:22, 323MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.53G/9.45G [00:06<00:23, 288MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.13G/9.45G [00:06<00:23, 318MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.56G/9.45G [00:06<00:24, 285MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.59G/9.45G [00:06<00:23, 291MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.17G/9.45G [00:06<00:23, 313MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  28%|██▊       | 2.63G/9.45G [00:06<00:22, 303MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  28%|██▊       | 2.66G/9.45G [00:06<00:22, 304MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  23%|██▎       | 2.21G/9.45G [00:06<00:23, 305MB/s]\u001b[A\n",
+      "Downloading (…)l-00001-of-00002.bin:  24%|██▍       | 2.25G/9.45G [00:07<00:22, 317MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  29%|██▊       | 2.69G/9.45G [00:07<00:22, 299MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  29%|██▉       | 2.73G/9.45G [00:07<00:23, 285MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  24%|██▍       | 2.30G/9.45G [00:07<00:22, 324MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  29%|██▉       | 2.76G/9.45G [00:07<00:23, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  25%|██▍       | 2.34G/9.45G [00:07<00:21, 335MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|██▉       | 2.79G/9.45G [00:07<00:23, 288MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  25%|██▌       | 2.38G/9.45G [00:07<00:20, 340MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|██▉       | 2.82G/9.45G [00:07<00:23, 285MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  26%|██▌       | 2.42G/9.45G [00:07<00:21, 327MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  26%|██▌       | 2.46G/9.45G [00:07<00:20, 337MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|███       | 2.85G/9.45G [00:07<00:23, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.51G/9.45G [00:07<00:20, 343MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  31%|███       | 2.88G/9.45G [00:07<00:23, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  31%|███       | 2.92G/9.45G [00:07<00:23, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.55G/9.45G [00:07<00:21, 328MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  31%|███       | 2.95G/9.45G [00:07<00:22, 292MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  27%|██▋       | 2.59G/9.45G [00:08<00:22, 304MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  32%|███▏      | 2.98G/9.45G [00:08<00:22, 282MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  32%|███▏      | 3.01G/9.45G [00:08<00:22, 290MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  28%|██▊       | 2.62G/9.45G [00:08<00:23, 294MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  32%|███▏      | 3.04G/9.45G [00:08<00:23, 273MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  28%|██▊       | 2.65G/9.45G [00:08<00:23, 284MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  28%|██▊       | 2.68G/9.45G [00:08<00:23, 287MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.07G/9.45G [00:08<00:25, 255MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  29%|██▊       | 2.72G/9.45G [00:08<00:24, 281MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.10G/9.45G [00:08<00:25, 253MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  29%|██▉       | 2.75G/9.45G [00:08<00:24, 276MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.15G/9.45G [00:08<00:23, 274MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|██▉       | 2.79G/9.45G [00:08<00:22, 294MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  34%|███▎      | 3.19G/9.45G [00:08<00:22, 279MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|██▉       | 2.83G/9.45G [00:08<00:21, 304MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  34%|███▍      | 3.23G/9.45G [00:08<00:21, 288MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  30%|███       | 2.86G/9.45G [00:08<00:21, 306MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  35%|███▍      | 3.26G/9.45G [00:09<00:21, 292MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  31%|███       | 2.90G/9.45G [00:09<00:20, 324MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  35%|███▍      | 3.29G/9.45G [00:09<00:20, 294MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  31%|███       | 2.95G/9.45G [00:09<00:19, 342MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  35%|███▌      | 3.32G/9.45G [00:09<00:21, 288MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  32%|███▏      | 2.99G/9.45G [00:09<00:18, 348MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  36%|███▌      | 3.37G/9.45G [00:09<00:20, 300MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  32%|███▏      | 3.03G/9.45G [00:09<00:19, 337MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  36%|███▌      | 3.41G/9.45G [00:09<00:19, 313MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.07G/9.45G [00:09<00:18, 341MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  37%|███▋      | 3.45G/9.45G [00:09<00:19, 316MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.11G/9.45G [00:09<00:18, 342MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  37%|███▋      | 3.49G/9.45G [00:09<00:18, 322MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  33%|███▎      | 3.16G/9.45G [00:09<00:17, 352MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  38%|███▊      | 3.54G/9.45G [00:09<00:16, 360MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  34%|███▍      | 3.20G/9.45G [00:09<00:18, 336MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  38%|███▊      | 3.59G/9.45G [00:10<00:17, 344MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  34%|███▍      | 3.24G/9.45G [00:10<00:18, 338MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  35%|███▍      | 3.28G/9.45G [00:10<00:17, 348MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  38%|███▊      | 3.63G/9.45G [00:10<00:16, 343MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  35%|███▌      | 3.32G/9.45G [00:10<00:17, 348MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  39%|███▉      | 3.67G/9.45G [00:10<00:16, 350MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  39%|███▉      | 3.71G/9.45G [00:10<00:16, 340MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  36%|███▌      | 3.37G/9.45G [00:10<00:17, 354MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  36%|███▌      | 3.41G/9.45G [00:10<00:17, 351MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  37%|███▋      | 3.45G/9.45G [00:10<00:17, 345MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  37%|███▋      | 3.49G/9.45G [00:10<00:17, 349MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  40%|███▉      | 3.75G/9.45G [00:10<00:26, 217MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:49 (running for 00:00:28.28)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  37%|███▋      | 3.53G/9.45G [00:10<00:16, 350MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  40%|████      | 3.79G/9.45G [00:10<00:25, 221MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  38%|███▊      | 3.58G/9.45G [00:11<00:16, 357MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  40%|████      | 3.82G/9.45G [00:11<00:25, 221MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  38%|███▊      | 3.62G/9.45G [00:11<00:16, 355MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████      | 3.85G/9.45G [00:11<00:25, 224MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  39%|███▊      | 3.66G/9.45G [00:11<00:16, 349MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████      | 3.88G/9.45G [00:11<00:22, 243MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  39%|███▉      | 3.70G/9.45G [00:11<00:15, 360MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████▏     | 3.91G/9.45G [00:11<00:22, 249MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  40%|███▉      | 3.74G/9.45G [00:11<00:15, 365MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  42%|████▏     | 3.95G/9.45G [00:11<00:20, 273MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  40%|████      | 3.79G/9.45G [00:11<00:16, 352MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  42%|████▏     | 4.00G/9.45G [00:11<00:18, 292MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.03G/9.45G [00:11<00:19, 285MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████      | 3.83G/9.45G [00:11<00:16, 339MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████      | 3.87G/9.45G [00:11<00:17, 321MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.07G/9.45G [00:11<00:17, 301MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.10G/9.45G [00:11<00:18, 291MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  41%|████▏     | 3.91G/9.45G [00:12<00:19, 286MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▎     | 4.13G/9.45G [00:12<00:21, 250MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  42%|████▏     | 3.94G/9.45G [00:12<00:20, 275MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▍     | 4.16G/9.45G [00:12<00:24, 220MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  42%|████▏     | 3.97G/9.45G [00:12<00:21, 260MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  42%|████▏     | 4.01G/9.45G [00:12<00:22, 242MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▍     | 4.19G/9.45G [00:12<00:25, 208MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.04G/9.45G [00:12<00:22, 240MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▍     | 4.23G/9.45G [00:12<00:27, 193MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.07G/9.45G [00:12<00:24, 222MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▍     | 4.25G/9.45G [00:12<00:27, 189MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▌     | 4.27G/9.45G [00:12<00:27, 185MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  43%|████▎     | 4.10G/9.45G [00:12<00:25, 210MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▌     | 4.29G/9.45G [00:13<00:29, 178MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▎     | 4.13G/9.45G [00:13<00:26, 204MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.31G/9.45G [00:13<00:29, 174MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▍     | 4.15G/9.45G [00:13<00:26, 201MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.33G/9.45G [00:13<00:29, 174MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▍     | 4.17G/9.45G [00:13<00:26, 198MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.35G/9.45G [00:13<00:31, 163MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  44%|████▍     | 4.19G/9.45G [00:13<00:27, 194MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▍     | 4.22G/9.45G [00:13<00:27, 193MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▋     | 4.37G/9.45G [00:13<00:31, 160MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▍     | 4.24G/9.45G [00:13<00:29, 177MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▋     | 4.39G/9.45G [00:13<00:31, 158MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▌     | 4.26G/9.45G [00:13<00:28, 184MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.41G/9.45G [00:13<00:32, 153MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▌     | 4.28G/9.45G [00:13<00:28, 180MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  45%|████▌     | 4.30G/9.45G [00:14<00:29, 174MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.44G/9.45G [00:14<00:42, 117MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.32G/9.45G [00:14<00:29, 172MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.47G/9.45G [00:14<00:32, 153MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.34G/9.45G [00:14<00:30, 166MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.50G/9.45G [00:14<00:29, 167MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.52G/9.45G [00:14<00:30, 162MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.54G/9.45G [00:14<00:31, 158MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▌     | 4.36G/9.45G [00:14<00:45, 112MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.56G/9.45G [00:14<00:31, 154MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  46%|████▋     | 4.39G/9.45G [00:14<00:39, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.58G/9.45G [00:15<00:31, 153MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.44G/9.45G [00:14<00:29, 172MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.47G/9.45G [00:15<00:26, 186MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▊     | 4.60G/9.45G [00:15<00:32, 150MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  47%|████▋     | 4.49G/9.45G [00:15<00:27, 178MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.62G/9.45G [00:15<00:32, 150MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.65G/9.45G [00:15<00:32, 149MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.51G/9.45G [00:15<00:29, 167MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.53G/9.45G [00:15<00:29, 165MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.67G/9.45G [00:15<00:32, 145MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|████▉     | 4.69G/9.45G [00:15<00:33, 144MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.55G/9.45G [00:15<00:30, 162MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  48%|████▊     | 4.57G/9.45G [00:15<00:31, 156MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|████▉     | 4.71G/9.45G [00:15<00:32, 146MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:54 (running for 00:00:33.28)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▊     | 4.59G/9.45G [00:15<00:31, 156MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|█████     | 4.73G/9.45G [00:16<00:32, 145MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.61G/9.45G [00:16<00:31, 153MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|█████     | 4.75G/9.45G [00:16<00:33, 142MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.63G/9.45G [00:16<00:31, 155MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|█████     | 4.77G/9.45G [00:16<00:33, 141MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.66G/9.45G [00:16<00:30, 155MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.79G/9.45G [00:16<00:32, 143MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  49%|████▉     | 4.68G/9.45G [00:16<00:31, 150MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.81G/9.45G [00:16<00:33, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|████▉     | 4.70G/9.45G [00:16<00:32, 145MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.83G/9.45G [00:16<00:33, 138MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|████▉     | 4.72G/9.45G [00:16<00:32, 145MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████▏    | 4.85G/9.45G [00:16<00:33, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|█████     | 4.74G/9.45G [00:16<00:33, 142MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.88G/9.45G [00:17<00:34, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  50%|█████     | 4.76G/9.45G [00:17<00:35, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.90G/9.45G [00:17<00:33, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.78G/9.45G [00:17<00:33, 141MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.92G/9.45G [00:17<00:33, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.80G/9.45G [00:17<00:33, 138MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████     | 4.82G/9.45G [00:17<00:33, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.94G/9.45G [00:17<00:34, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.96G/9.45G [00:17<00:34, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████▏    | 4.84G/9.45G [00:17<00:33, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  51%|█████▏    | 4.87G/9.45G [00:17<00:33, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 4.98G/9.45G [00:17<00:34, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.89G/9.45G [00:18<00:33, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.00G/9.45G [00:18<00:34, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.02G/9.45G [00:18<00:34, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.91G/9.45G [00:18<00:33, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.93G/9.45G [00:18<00:33, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.04G/9.45G [00:18<00:33, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▎    | 5.06G/9.45G [00:18<00:33, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  52%|█████▏    | 4.95G/9.45G [00:18<00:33, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 4.97G/9.45G [00:18<00:33, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.09G/9.45G [00:18<00:33, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.11G/9.45G [00:18<00:33, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 4.99G/9.45G [00:18<00:32, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.01G/9.45G [00:18<00:32, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.13G/9.45G [00:19<00:33, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.03G/9.45G [00:19<00:32, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.15G/9.45G [00:19<00:33, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  53%|█████▎    | 5.05G/9.45G [00:19<00:32, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▍    | 5.17G/9.45G [00:19<00:33, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▎    | 5.08G/9.45G [00:19<00:32, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▍    | 5.19G/9.45G [00:19<00:32, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.10G/9.45G [00:19<00:32, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▌    | 5.21G/9.45G [00:19<00:33, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.12G/9.45G [00:19<00:32, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▌    | 5.23G/9.45G [00:19<00:32, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  54%|█████▍    | 5.14G/9.45G [00:19<00:32, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.25G/9.45G [00:20<00:32, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▍    | 5.16G/9.45G [00:20<00:31, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.27G/9.45G [00:20<00:31, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▍    | 5.18G/9.45G [00:20<00:31, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.30G/9.45G [00:20<00:32, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▌    | 5.20G/9.45G [00:20<00:32, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▋    | 5.32G/9.45G [00:20<00:31, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▌    | 5.22G/9.45G [00:20<00:31, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▋    | 5.34G/9.45G [00:20<00:31, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  55%|█████▌    | 5.24G/9.45G [00:20<00:31, 136MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:58:59 (running for 00:00:38.28)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.36G/9.45G [00:20<00:31, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.26G/9.45G [00:20<00:30, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.38G/9.45G [00:20<00:31, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.28G/9.45G [00:21<00:31, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.40G/9.45G [00:21<00:31, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▌    | 5.31G/9.45G [00:21<00:31, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.42G/9.45G [00:21<00:31, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  56%|█████▋    | 5.33G/9.45G [00:21<00:31, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.44G/9.45G [00:21<00:30, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.35G/9.45G [00:21<00:30, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.46G/9.45G [00:21<00:30, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.37G/9.45G [00:21<00:30, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.48G/9.45G [00:21<00:30, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.39G/9.45G [00:21<00:30, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.51G/9.45G [00:21<00:30, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.41G/9.45G [00:21<00:31, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.53G/9.45G [00:22<00:30, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  57%|█████▋    | 5.43G/9.45G [00:22<00:29, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▊    | 5.55G/9.45G [00:22<00:30, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.45G/9.45G [00:22<00:29, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.57G/9.45G [00:22<00:29, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.47G/9.45G [00:22<00:30, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.59G/9.45G [00:22<00:29, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.49G/9.45G [00:22<00:29, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  58%|█████▊    | 5.52G/9.45G [00:22<00:30, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.61G/9.45G [00:22<00:29, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|█████▉    | 5.63G/9.45G [00:22<00:29, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▊    | 5.54G/9.45G [00:22<00:30, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.56G/9.45G [00:23<00:29, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|█████▉    | 5.65G/9.45G [00:23<00:29, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|██████    | 5.67G/9.45G [00:23<00:29, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.58G/9.45G [00:23<00:29, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|██████    | 5.69G/9.45G [00:23<00:29, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.60G/9.45G [00:23<00:29, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|██████    | 5.71G/9.45G [00:23<00:28, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  59%|█████▉    | 5.62G/9.45G [00:23<00:28, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.74G/9.45G [00:23<00:29, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|█████▉    | 5.64G/9.45G [00:23<00:28, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|█████▉    | 5.66G/9.45G [00:23<00:27, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.76G/9.45G [00:23<00:28, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|██████    | 5.68G/9.45G [00:24<00:27, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.78G/9.45G [00:24<00:28, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  60%|██████    | 5.70G/9.45G [00:24<00:28, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.73G/9.45G [00:24<00:28, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████▏   | 5.80G/9.45G [00:24<00:35, 102MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.75G/9.45G [00:24<00:28, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.84G/9.45G [00:24<00:26, 139MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.86G/9.45G [00:24<00:26, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████    | 5.77G/9.45G [00:24<00:38, 96.5MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.88G/9.45G [00:24<00:26, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████▏   | 5.79G/9.45G [00:25<00:33, 109MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  61%|██████▏   | 5.81G/9.45G [00:25<00:28, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.90G/9.45G [00:25<00:26, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.84G/9.45G [00:25<00:23, 152MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.92G/9.45G [00:25<00:26, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.86G/9.45G [00:25<00:24, 148MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.95G/9.45G [00:25<00:26, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.88G/9.45G [00:25<00:25, 142MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.97G/9.45G [00:25<00:26, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.99G/9.45G [00:25<00:27, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  62%|██████▏   | 5.90G/9.45G [00:25<00:26, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.92G/9.45G [00:25<00:26, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▎   | 6.01G/9.45G [00:25<00:26, 129MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:04 (running for 00:00:43.29)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.95G/9.45G [00:26<00:25, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.03G/9.45G [00:26<00:26, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.97G/9.45G [00:26<00:26, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.05G/9.45G [00:26<00:26, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  63%|██████▎   | 5.99G/9.45G [00:26<00:27, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.07G/9.45G [00:26<00:26, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▎   | 6.01G/9.45G [00:26<00:25, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.09G/9.45G [00:26<00:26, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.03G/9.45G [00:26<00:25, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▍   | 6.11G/9.45G [00:26<00:26, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.05G/9.45G [00:26<00:26, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▍   | 6.13G/9.45G [00:26<00:26, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.07G/9.45G [00:27<00:25, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▌   | 6.16G/9.45G [00:27<00:25, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  64%|██████▍   | 6.09G/9.45G [00:27<00:26, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▌   | 6.18G/9.45G [00:27<00:25, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▍   | 6.11G/9.45G [00:27<00:25, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.20G/9.45G [00:27<00:25, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▍   | 6.13G/9.45G [00:27<00:25, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.22G/9.45G [00:27<00:25, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▌   | 6.16G/9.45G [00:27<00:24, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.24G/9.45G [00:27<00:25, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  65%|██████▌   | 6.18G/9.45G [00:27<00:25, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.26G/9.45G [00:27<00:25, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.20G/9.45G [00:27<00:24, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▋   | 6.28G/9.45G [00:28<00:25, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.22G/9.45G [00:28<00:24, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.30G/9.45G [00:28<00:24, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.24G/9.45G [00:28<00:25, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.32G/9.45G [00:28<00:24, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▌   | 6.26G/9.45G [00:28<00:24, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.34G/9.45G [00:28<00:24, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  66%|██████▋   | 6.28G/9.45G [00:28<00:23, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.30G/9.45G [00:28<00:23, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.36G/9.45G [00:28<00:24, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.39G/9.45G [00:28<00:24, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.32G/9.45G [00:28<00:24, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.41G/9.45G [00:28<00:23, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.34G/9.45G [00:29<00:23, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  67%|██████▋   | 6.36G/9.45G [00:29<00:23, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.43G/9.45G [00:29<00:24, 124MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.45G/9.45G [00:29<00:23, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.39G/9.45G [00:29<00:24, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.41G/9.45G [00:29<00:22, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.47G/9.45G [00:29<00:23, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▊   | 6.49G/9.45G [00:29<00:22, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.43G/9.45G [00:29<00:22, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.51G/9.45G [00:29<00:23, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.45G/9.45G [00:29<00:22, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.53G/9.45G [00:29<00:22, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  68%|██████▊   | 6.47G/9.45G [00:30<00:22, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.55G/9.45G [00:30<00:22, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▊   | 6.49G/9.45G [00:30<00:22, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|██████▉   | 6.57G/9.45G [00:30<00:22, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.51G/9.45G [00:30<00:22, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|██████▉   | 6.60G/9.45G [00:30<00:21, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.53G/9.45G [00:30<00:21, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.62G/9.45G [00:30<00:22, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  69%|██████▉   | 6.55G/9.45G [00:30<00:22, 129MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:09 (running for 00:00:48.29)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.64G/9.45G [00:30<00:21, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|██████▉   | 6.57G/9.45G [00:30<00:22, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.66G/9.45G [00:30<00:22, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|██████▉   | 6.60G/9.45G [00:31<00:21, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.68G/9.45G [00:31<00:21, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.62G/9.45G [00:31<00:21, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.70G/9.45G [00:31<00:21, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.64G/9.45G [00:31<00:21, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  70%|███████   | 6.66G/9.45G [00:31<00:21, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.72G/9.45G [00:31<00:21, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████▏  | 6.74G/9.45G [00:31<00:21, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.68G/9.45G [00:31<00:20, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.70G/9.45G [00:31<00:20, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.76G/9.45G [00:31<00:20, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.78G/9.45G [00:31<00:21, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████   | 6.72G/9.45G [00:31<00:20, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  71%|███████▏  | 6.74G/9.45G [00:32<00:20, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.81G/9.45G [00:32<00:20, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.83G/9.45G [00:32<00:20, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.76G/9.45G [00:32<00:20, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.85G/9.45G [00:32<00:20, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.78G/9.45G [00:32<00:20, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.81G/9.45G [00:32<00:20, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.87G/9.45G [00:32<00:20, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.89G/9.45G [00:32<00:19, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.83G/9.45G [00:32<00:20, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.91G/9.45G [00:32<00:20, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  72%|███████▏  | 6.85G/9.45G [00:32<00:20, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.87G/9.45G [00:33<00:19, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.93G/9.45G [00:33<00:19, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▎  | 6.95G/9.45G [00:33<00:19, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.89G/9.45G [00:33<00:19, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.91G/9.45G [00:33<00:19, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 6.97G/9.45G [00:33<00:19, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 6.99G/9.45G [00:33<00:18, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  73%|███████▎  | 6.93G/9.45G [00:33<00:19, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▎  | 6.95G/9.45G [00:33<00:18, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 7.01G/9.45G [00:33<00:18, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 6.97G/9.45G [00:33<00:18, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 7.04G/9.45G [00:33<00:19, 124MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 6.99G/9.45G [00:34<00:18, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▍  | 7.06G/9.45G [00:34<00:18, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 7.01G/9.45G [00:34<00:18, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▍  | 7.08G/9.45G [00:34<00:18, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▌  | 7.10G/9.45G [00:34<00:18, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  74%|███████▍  | 7.04G/9.45G [00:34<00:18, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▍  | 7.06G/9.45G [00:34<00:18, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▌  | 7.12G/9.45G [00:34<00:18, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.14G/9.45G [00:34<00:17, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▍  | 7.08G/9.45G [00:34<00:17, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.16G/9.45G [00:34<00:17, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▌  | 7.10G/9.45G [00:34<00:21, 110MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.18G/9.45G [00:35<00:17, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  75%|███████▌  | 7.12G/9.45G [00:35<00:20, 114MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.20G/9.45G [00:35<00:17, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.14G/9.45G [00:35<00:19, 121MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▋  | 7.22G/9.45G [00:35<00:17, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.17G/9.45G [00:35<00:15, 151MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▌  | 7.19G/9.45G [00:35<00:15, 148MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.25G/9.45G [00:35<00:17, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.27G/9.45G [00:35<00:16, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  76%|███████▋  | 7.21G/9.45G [00:35<00:15, 144MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.24G/9.45G [00:35<00:15, 141MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.29G/9.45G [00:35<00:16, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.31G/9.45G [00:36<00:16, 128MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:14 (running for 00:00:53.29)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.26G/9.45G [00:36<00:16, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.28G/9.45G [00:36<00:15, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.33G/9.45G [00:36<00:16, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.35G/9.45G [00:36<00:16, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.30G/9.45G [00:36<00:16, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.37G/9.45G [00:36<00:16, 126MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  77%|███████▋  | 7.32G/9.45G [00:36<00:15, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.34G/9.45G [00:36<00:15, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.39G/9.45G [00:36<00:15, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.41G/9.45G [00:36<00:15, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.36G/9.45G [00:36<00:15, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.38G/9.45G [00:36<00:15, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▊  | 7.43G/9.45G [00:37<00:15, 127MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.46G/9.45G [00:37<00:15, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  78%|███████▊  | 7.40G/9.45G [00:37<00:15, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▊  | 7.42G/9.45G [00:37<00:15, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.48G/9.45G [00:37<00:15, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.44G/9.45G [00:37<00:15, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.50G/9.45G [00:37<00:15, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.47G/9.45G [00:37<00:14, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|███████▉  | 7.52G/9.45G [00:37<00:17, 107MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.49G/9.45G [00:37<00:14, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|███████▉  | 7.55G/9.45G [00:37<00:13, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  79%|███████▉  | 7.51G/9.45G [00:37<00:14, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|████████  | 7.57G/9.45G [00:38<00:13, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|███████▉  | 7.53G/9.45G [00:38<00:14, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|████████  | 7.59G/9.45G [00:38<00:14, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|███████▉  | 7.55G/9.45G [00:38<00:14, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.61G/9.45G [00:38<00:13, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|████████  | 7.57G/9.45G [00:38<00:14, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.63G/9.45G [00:38<00:13, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  80%|████████  | 7.59G/9.45G [00:38<00:14, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.65G/9.45G [00:38<00:13, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.61G/9.45G [00:38<00:13, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.68G/9.45G [00:38<00:13, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.63G/9.45G [00:38<00:13, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████▏ | 7.70G/9.45G [00:39<00:13, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.65G/9.45G [00:39<00:13, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.72G/9.45G [00:39<00:13, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████  | 7.68G/9.45G [00:39<00:13, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.74G/9.45G [00:39<00:13, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  81%|████████▏ | 7.70G/9.45G [00:39<00:13, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.72G/9.45G [00:39<00:13, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.76G/9.45G [00:39<00:13, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.78G/9.45G [00:39<00:12, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.74G/9.45G [00:39<00:13, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.80G/9.45G [00:39<00:12, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.76G/9.45G [00:39<00:12, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.82G/9.45G [00:40<00:12, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  82%|████████▏ | 7.78G/9.45G [00:40<00:12, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.84G/9.45G [00:40<00:12, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.80G/9.45G [00:40<00:12, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.86G/9.45G [00:40<00:12, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.82G/9.45G [00:40<00:12, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.89G/9.45G [00:40<00:11, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.84G/9.45G [00:40<00:12, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.86G/9.45G [00:40<00:12, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▎ | 7.91G/9.45G [00:40<00:11, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.93G/9.45G [00:40<00:11, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  83%|████████▎ | 7.89G/9.45G [00:40<00:11, 135MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:19 (running for 00:00:58.29)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▎ | 7.91G/9.45G [00:40<00:11, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.95G/9.45G [00:40<00:11, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.97G/9.45G [00:41<00:11, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.93G/9.45G [00:41<00:11, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 7.99G/9.45G [00:41<00:11, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.95G/9.45G [00:41<00:11, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  84%|████████▍ | 7.97G/9.45G [00:41<00:11, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 8.01G/9.45G [00:41<00:11, 128MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 8.03G/9.45G [00:41<00:10, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 7.99G/9.45G [00:41<00:11, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 8.01G/9.45G [00:41<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▌ | 8.05G/9.45G [00:41<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▌ | 8.07G/9.45G [00:41<00:10, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▍ | 8.03G/9.45G [00:41<00:10, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▌ | 8.05G/9.45G [00:42<00:10, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.10G/9.45G [00:42<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.12G/9.45G [00:42<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  85%|████████▌ | 8.07G/9.45G [00:42<00:10, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.14G/9.45G [00:42<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.10G/9.45G [00:42<00:10, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.12G/9.45G [00:42<00:10, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▋ | 8.16G/9.45G [00:42<00:09, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.18G/9.45G [00:42<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▌ | 8.14G/9.45G [00:42<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  86%|████████▋ | 8.16G/9.45G [00:42<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.20G/9.45G [00:42<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.18G/9.45G [00:43<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.22G/9.45G [00:43<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.20G/9.45G [00:43<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.24G/9.45G [00:43<00:09, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.22G/9.45G [00:43<00:09, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.26G/9.45G [00:43<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.24G/9.45G [00:43<00:09, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.28G/9.45G [00:43<00:08, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  87%|████████▋ | 8.26G/9.45G [00:43<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.30G/9.45G [00:43<00:08, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.28G/9.45G [00:43<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.33G/9.45G [00:43<00:08, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.30G/9.45G [00:43<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.35G/9.45G [00:44<00:08, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.33G/9.45G [00:44<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▊ | 8.37G/9.45G [00:44<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  88%|████████▊ | 8.35G/9.45G [00:44<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.39G/9.45G [00:44<00:08, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▊ | 8.37G/9.45G [00:44<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.41G/9.45G [00:44<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.39G/9.45G [00:44<00:07, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.43G/9.45G [00:44<00:07, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.41G/9.45G [00:44<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.45G/9.45G [00:44<00:07, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.43G/9.45G [00:44<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|████████▉ | 8.47G/9.45G [00:44<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|████████▉ | 8.49G/9.45G [00:45<00:07, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  89%|████████▉ | 8.45G/9.45G [00:45<00:10, 96.7MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|█████████ | 8.51G/9.45G [00:45<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|████████▉ | 8.48G/9.45G [00:45<00:07, 124MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|█████████ | 8.54G/9.45G [00:45<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|█████████ | 8.51G/9.45G [00:45<00:06, 152MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.56G/9.45G [00:45<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  90%|█████████ | 8.54G/9.45G [00:45<00:06, 149MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.58G/9.45G [00:45<00:06, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.56G/9.45G [00:45<00:06, 141MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:24 (running for 00:01:03.30)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.60G/9.45G [00:45<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.58G/9.45G [00:46<00:06, 141MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.62G/9.45G [00:46<00:06, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.60G/9.45G [00:46<00:06, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████▏| 8.64G/9.45G [00:46<00:06, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████ | 8.62G/9.45G [00:46<00:06, 138MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.66G/9.45G [00:46<00:06, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  91%|█████████▏| 8.64G/9.45G [00:46<00:05, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.68G/9.45G [00:46<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.66G/9.45G [00:46<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.70G/9.45G [00:46<00:05, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.68G/9.45G [00:46<00:05, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.72G/9.45G [00:46<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.70G/9.45G [00:46<00:05, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.75G/9.45G [00:47<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  92%|█████████▏| 8.72G/9.45G [00:47<00:05, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.77G/9.45G [00:47<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.75G/9.45G [00:47<00:05, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.79G/9.45G [00:47<00:05, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.77G/9.45G [00:47<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.81G/9.45G [00:47<00:04, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.79G/9.45G [00:47<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.81G/9.45G [00:47<00:04, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.83G/9.45G [00:47<00:05, 117MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.86G/9.45G [00:47<00:04, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  93%|█████████▎| 8.83G/9.45G [00:47<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▎| 8.85G/9.45G [00:48<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.88G/9.45G [00:48<00:04, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.90G/9.45G [00:48<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.87G/9.45G [00:48<00:04, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.89G/9.45G [00:48<00:04, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.92G/9.45G [00:48<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  94%|█████████▍| 8.91G/9.45G [00:48<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▍| 8.94G/9.45G [00:48<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▍| 8.93G/9.45G [00:48<00:03, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▍| 8.97G/9.45G [00:48<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▍| 8.95G/9.45G [00:48<00:03, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▌| 8.99G/9.45G [00:48<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▍| 8.98G/9.45G [00:49<00:03, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▌| 9.01G/9.45G [00:49<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▌| 9.00G/9.45G [00:49<00:03, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.03G/9.45G [00:49<00:03, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  95%|█████████▌| 9.02G/9.45G [00:49<00:03, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.05G/9.45G [00:49<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.04G/9.45G [00:49<00:03, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.07G/9.45G [00:49<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.06G/9.45G [00:49<00:02, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.09G/9.45G [00:49<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▌| 9.08G/9.45G [00:49<00:02, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▋| 9.11G/9.45G [00:49<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  96%|█████████▋| 9.10G/9.45G [00:49<00:02, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.13G/9.45G [00:50<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.12G/9.45G [00:50<00:02, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.15G/9.45G [00:50<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.14G/9.45G [00:50<00:02, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.18G/9.45G [00:50<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.16G/9.45G [00:50<00:02, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.20G/9.45G [00:50<00:01, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.19G/9.45G [00:50<00:01, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.22G/9.45G [00:50<00:01, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  97%|█████████▋| 9.21G/9.45G [00:50<00:01, 135MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:29 (running for 00:01:08.30)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.24G/9.45G [00:50<00:01, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.23G/9.45G [00:50<00:01, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.26G/9.45G [00:51<00:01, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.25G/9.45G [00:51<00:01, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.28G/9.45G [00:51<00:01, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.27G/9.45G [00:51<00:01, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.30G/9.45G [00:51<00:01, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  98%|█████████▊| 9.29G/9.45G [00:51<00:01, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▊| 9.32G/9.45G [00:51<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▊| 9.31G/9.45G [00:51<00:01, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.34G/9.45G [00:51<00:00, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.33G/9.45G [00:51<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.36G/9.45G [00:51<00:00, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.35G/9.45G [00:51<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.38G/9.45G [00:51<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.37G/9.45G [00:52<00:00, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|█████████▉| 9.41G/9.45G [00:52<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin:  99%|█████████▉| 9.40G/9.45G [00:52<00:00, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|█████████▉| 9.43G/9.45G [00:52<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|█████████▉| 9.42G/9.45G [00:52<00:00, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|█████████▉| 9.45G/9.45G [00:52<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|█████████▉| 9.44G/9.45G [00:52<00:00, 134MB/s]\u001b[A\n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|██████████| 9.45G/9.45G [00:52<00:00, 180MB/s]\n",
+      "Downloading shards:  50%|█████     | 1/2 [00:52<00:52, 52.57s/it]\n",
+      "Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]\u001b[A\n",
+      "Downloading (…)l-00001-of-00002.bin: 100%|██████████| 9.45G/9.45G [00:52<00:00, 180MB/s]\n",
+      "Downloading shards:  50%|█████     | 1/2 [00:52<00:52, 52.68s/it]\n",
+      "Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   2%|▏         | 31.5M/1.95G [00:00<00:11, 172MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   3%|▎         | 52.4M/1.95G [00:00<00:11, 163MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   2%|▏         | 31.5M/1.95G [00:00<00:09, 212MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   4%|▍         | 73.4M/1.95G [00:00<00:12, 149MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   3%|▎         | 62.9M/1.95G [00:00<00:11, 160MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   4%|▍         | 83.9M/1.95G [00:00<00:12, 149MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   5%|▍         | 94.4M/1.95G [00:00<00:13, 143MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   6%|▌         | 115M/1.95G [00:00<00:13, 140MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   5%|▌         | 105M/1.95G [00:00<00:12, 144MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   6%|▋         | 126M/1.95G [00:00<00:13, 140MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   7%|▋         | 136M/1.95G [00:00<00:13, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   8%|▊         | 157M/1.95G [00:01<00:13, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   8%|▊         | 147M/1.95G [00:01<00:12, 139MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   9%|▊         | 168M/1.95G [00:01<00:12, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:   9%|▉         | 178M/1.95G [00:01<00:13, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  10%|▉         | 189M/1.95G [00:01<00:12, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  10%|█         | 199M/1.95G [00:01<00:13, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  11%|█         | 210M/1.95G [00:01<00:12, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  11%|█▏        | 220M/1.95G [00:01<00:13, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  12%|█▏        | 231M/1.95G [00:01<00:12, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  12%|█▏        | 241M/1.95G [00:01<00:12, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  13%|█▎        | 252M/1.95G [00:01<00:12, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  13%|█▎        | 262M/1.95G [00:01<00:12, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  14%|█▍        | 273M/1.95G [00:01<00:12, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  15%|█▍        | 283M/1.95G [00:02<00:12, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  15%|█▌        | 294M/1.95G [00:02<00:12, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  16%|█▌        | 304M/1.95G [00:02<00:12, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  16%|█▌        | 315M/1.95G [00:02<00:12, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  17%|█▋        | 325M/1.95G [00:02<00:12, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  17%|█▋        | 336M/1.95G [00:02<00:11, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  18%|█▊        | 346M/1.95G [00:02<00:15, 105MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  19%|█▉        | 377M/1.95G [00:02<00:11, 139MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  20%|██        | 398M/1.95G [00:02<00:11, 139MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  18%|█▊        | 357M/1.95G [00:02<00:18, 88.2MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  20%|██        | 398M/1.95G [00:02<00:11, 135MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  22%|██▏       | 419M/1.95G [00:03<00:11, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  23%|██▎       | 440M/1.95G [00:03<00:11, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  22%|██▏       | 430M/1.95G [00:03<00:09, 154MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:34 (running for 00:01:13.30)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  23%|██▎       | 451M/1.95G [00:03<00:10, 148MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  24%|██▎       | 461M/1.95G [00:03<00:11, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  24%|██▍       | 472M/1.95G [00:03<00:10, 145MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  25%|██▍       | 482M/1.95G [00:03<00:11, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  25%|██▌       | 493M/1.95G [00:03<00:10, 141MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  26%|██▌       | 503M/1.95G [00:03<00:10, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  26%|██▋       | 514M/1.95G [00:03<00:10, 140MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  27%|██▋       | 524M/1.95G [00:03<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  27%|██▋       | 535M/1.95G [00:03<00:10, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  28%|██▊       | 545M/1.95G [00:04<00:10, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  29%|██▊       | 556M/1.95G [00:04<00:10, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  30%|██▉       | 577M/1.95G [00:04<00:10, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  29%|██▉       | 566M/1.95G [00:04<00:11, 125MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  30%|███       | 587M/1.95G [00:04<00:10, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  31%|███       | 598M/1.95G [00:04<00:09, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  32%|███▏      | 619M/1.95G [00:04<00:09, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  31%|███       | 608M/1.95G [00:04<00:10, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  32%|███▏      | 629M/1.95G [00:04<00:10, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  33%|███▎      | 640M/1.95G [00:04<00:09, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  33%|███▎      | 650M/1.95G [00:04<00:10, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  34%|███▍      | 661M/1.95G [00:04<00:09, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  35%|███▍      | 682M/1.95G [00:04<00:09, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  34%|███▍      | 671M/1.95G [00:05<00:09, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  35%|███▌      | 692M/1.95G [00:05<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  36%|███▌      | 703M/1.95G [00:05<00:09, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  37%|███▋      | 724M/1.95G [00:05<00:09, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  37%|███▋      | 713M/1.95G [00:05<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  38%|███▊      | 734M/1.95G [00:05<00:09, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  38%|███▊      | 744M/1.95G [00:05<00:08, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  39%|███▉      | 765M/1.95G [00:05<00:08, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  39%|███▊      | 755M/1.95G [00:05<00:09, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  40%|███▉      | 776M/1.95G [00:05<00:08, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  40%|████      | 786M/1.95G [00:05<00:08, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  41%|████▏     | 807M/1.95G [00:05<00:08, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  41%|████      | 797M/1.95G [00:05<00:08, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  42%|████▏     | 828M/1.95G [00:06<00:08, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  42%|████▏     | 818M/1.95G [00:06<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  44%|████▎     | 849M/1.95G [00:06<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  43%|████▎     | 839M/1.95G [00:06<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  45%|████▍     | 870M/1.95G [00:06<00:08, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  44%|████▍     | 860M/1.95G [00:06<00:08, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  45%|████▌     | 881M/1.95G [00:06<00:08, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  46%|████▌     | 891M/1.95G [00:06<00:08, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  47%|████▋     | 912M/1.95G [00:06<00:07, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  46%|████▋     | 902M/1.95G [00:06<00:08, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  47%|████▋     | 923M/1.95G [00:06<00:07, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  48%|████▊     | 933M/1.95G [00:06<00:07, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  49%|████▉     | 954M/1.95G [00:07<00:07, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  48%|████▊     | 944M/1.95G [00:07<00:07, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  50%|█████     | 975M/1.95G [00:07<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  49%|████▉     | 965M/1.95G [00:07<00:07, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  51%|█████     | 996M/1.95G [00:07<00:07, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  51%|█████     | 986M/1.95G [00:07<00:07, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  52%|█████▏    | 1.02G/1.95G [00:07<00:06, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  52%|█████▏    | 1.01G/1.95G [00:07<00:07, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  53%|█████▎    | 1.04G/1.95G [00:07<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  53%|█████▎    | 1.03G/1.95G [00:07<00:07, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  54%|█████▍    | 1.06G/1.95G [00:07<00:06, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  54%|█████▍    | 1.05G/1.95G [00:07<00:06, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  55%|█████▍    | 1.07G/1.95G [00:08<00:06, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  55%|█████▌    | 1.08G/1.95G [00:07<00:06, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  56%|█████▋    | 1.10G/1.95G [00:08<00:06, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  56%|█████▌    | 1.09G/1.95G [00:08<00:06, 132MB/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:39 (running for 00:01:18.30)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  57%|█████▋    | 1.11G/1.95G [00:08<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  58%|█████▊    | 1.12G/1.95G [00:08<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  59%|█████▊    | 1.14G/1.95G [00:08<00:06, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  58%|█████▊    | 1.13G/1.95G [00:08<00:06, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  60%|█████▉    | 1.16G/1.95G [00:08<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  59%|█████▉    | 1.15G/1.95G [00:08<00:06, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  61%|██████    | 1.18G/1.95G [00:08<00:05, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  60%|██████    | 1.17G/1.95G [00:08<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  62%|██████▏   | 1.21G/1.95G [00:08<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  61%|██████▏   | 1.20G/1.95G [00:09<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  63%|██████▎   | 1.23G/1.95G [00:09<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  62%|██████▏   | 1.22G/1.95G [00:09<00:05, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  64%|██████▍   | 1.25G/1.95G [00:09<00:05, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  63%|██████▎   | 1.24G/1.95G [00:09<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  65%|██████▌   | 1.27G/1.95G [00:09<00:05, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  65%|██████▍   | 1.26G/1.95G [00:09<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  66%|██████▌   | 1.29G/1.95G [00:09<00:05, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  66%|██████▌   | 1.28G/1.95G [00:09<00:05, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  67%|██████▋   | 1.31G/1.95G [00:09<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  67%|██████▋   | 1.30G/1.95G [00:09<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  68%|██████▊   | 1.33G/1.95G [00:09<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  68%|██████▊   | 1.32G/1.95G [00:09<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  69%|██████▉   | 1.35G/1.95G [00:10<00:04, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  69%|██████▉   | 1.34G/1.95G [00:10<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  70%|███████   | 1.37G/1.95G [00:10<00:04, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  70%|██████▉   | 1.36G/1.95G [00:10<00:04, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  72%|███████▏  | 1.39G/1.95G [00:10<00:04, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  71%|███████   | 1.38G/1.95G [00:10<00:04, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  73%|███████▎  | 1.42G/1.95G [00:10<00:03, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  72%|███████▏  | 1.41G/1.95G [00:10<00:04, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  74%|███████▎  | 1.44G/1.95G [00:10<00:03, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  73%|███████▎  | 1.43G/1.95G [00:10<00:04, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  75%|███████▍  | 1.46G/1.95G [00:10<00:03, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  74%|███████▍  | 1.45G/1.95G [00:10<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  76%|███████▌  | 1.48G/1.95G [00:11<00:03, 129MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  75%|███████▌  | 1.47G/1.95G [00:11<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  77%|███████▋  | 1.50G/1.95G [00:11<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  76%|███████▋  | 1.49G/1.95G [00:11<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  78%|███████▊  | 1.52G/1.95G [00:11<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  77%|███████▋  | 1.51G/1.95G [00:11<00:03, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  79%|███████▉  | 1.54G/1.95G [00:11<00:03, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  79%|███████▊  | 1.53G/1.95G [00:11<00:03, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  80%|████████  | 1.56G/1.95G [00:11<00:02, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  80%|███████▉  | 1.55G/1.95G [00:11<00:03, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  81%|████████  | 1.58G/1.95G [00:11<00:02, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  81%|████████  | 1.57G/1.95G [00:11<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  82%|████████▏ | 1.60G/1.95G [00:11<00:02, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  82%|████████▏ | 1.59G/1.95G [00:12<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  83%|████████▎ | 1.63G/1.95G [00:12<00:02, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  83%|████████▎ | 1.61G/1.95G [00:12<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  84%|████████▍ | 1.65G/1.95G [00:12<00:02, 130MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  84%|████████▍ | 1.64G/1.95G [00:12<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  86%|████████▌ | 1.67G/1.95G [00:12<00:02, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  85%|████████▍ | 1.66G/1.95G [00:12<00:02, 131MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  87%|████████▋ | 1.69G/1.95G [00:12<00:01, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  88%|████████▊ | 1.71G/1.95G [00:12<00:01, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  86%|████████▌ | 1.68G/1.95G [00:12<00:02, 98.3MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  88%|████████▊ | 1.72G/1.95G [00:13<00:01, 142MB/s] \u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  89%|████████▉ | 1.74G/1.95G [00:13<00:01, 140MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  89%|████████▊ | 1.73G/1.95G [00:13<00:02, 97.3MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  90%|█████████ | 1.76G/1.95G [00:13<00:01, 132MB/s] \u001b[A\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:44 (running for 00:01:23.31)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  90%|█████████ | 1.76G/1.95G [00:13<00:01, 138MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  91%|█████████▏| 1.78G/1.95G [00:13<00:01, 137MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  92%|█████████▏| 1.79G/1.95G [00:13<00:01, 146MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  93%|█████████▎| 1.81G/1.95G [00:13<00:00, 144MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  93%|█████████▎| 1.80G/1.95G [00:13<00:01, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  94%|█████████▍| 1.84G/1.95G [00:13<00:00, 139MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  94%|█████████▎| 1.82G/1.95G [00:13<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  95%|█████████▌| 1.86G/1.95G [00:13<00:00, 136MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  95%|█████████▍| 1.85G/1.95G [00:13<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  96%|█████████▋| 1.88G/1.95G [00:14<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  96%|█████████▌| 1.87G/1.95G [00:14<00:00, 133MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  97%|█████████▋| 1.90G/1.95G [00:14<00:00, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  97%|█████████▋| 1.89G/1.95G [00:14<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  98%|█████████▊| 1.92G/1.95G [00:14<00:00, 134MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  98%|█████████▊| 1.91G/1.95G [00:14<00:00, 132MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin: 100%|█████████▉| 1.94G/1.95G [00:14<00:00, 135MB/s]\u001b[A\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin:  99%|█████████▉| 1.93G/1.95G [00:14<00:00, 132MB/s]\u001b[A\n",
+      "Downloading (…)l-00002-of-00002.bin: 100%|██████████| 1.95G/1.95G [00:14<00:00, 133MB/s]\n",
+      "Downloading shards: 100%|██████████| 2/2 [01:07<00:00, 33.67s/it]\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=378, ip=10.128.30.22)\u001b[0m \n",
+      "Downloading (…)l-00002-of-00002.bin: 100%|██████████| 1.95G/1.95G [00:14<00:00, 130MB/s]\u001b[A\n",
+      "Downloading (…)l-00002-of-00002.bin: 100%|██████████| 1.95G/1.95G [00:14<00:00, 132MB/s]\n",
+      "Downloading shards: 100%|██████████| 2/2 [01:07<00:00, 33.71s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:49 (running for 00:01:28.31)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]\n",
+      "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:54 (running for 00:01:33.31)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 07:59:59 (running for 00:01:38.31)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:04 (running for 00:01:43.31)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:09 (running for 00:01:48.32)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:14 (running for 00:01:53.32)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:19 (running for 00:01:58.32)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:24 (running for 00:02:03.33)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:29 (running for 00:02:08.33)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:34 (running for 00:02:13.33)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:39 (running for 00:02:18.33)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:44 (running for 00:02:23.33)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:49 (running for 00:02:28.34)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:54 (running for 00:02:33.34)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:00:59 (running for 00:02:38.34)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:04 (running for 00:02:43.34)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:09 (running for 00:02:48.35)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:14 (running for 00:02:53.35)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards:  50%|█████     | 1/2 [01:25<01:25, 85.26s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:19 (running for 00:02:58.35)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:24 (running for 00:03:03.35)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:29 (running for 00:03:08.36)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards: 100%|██████████| 2/2 [01:39<00:00, 49.95s/it]\n",
+      "Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 44.3kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:34 (running for 00:03:13.36)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:39 (running for 00:03:18.36)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.33006444422319176\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m /tmp/ray/session_2023-07-27_07-21-47_353834_9/runtime_resources/pip/04a15979ef108d1f8e906345b347c268df9d6aa1/virtualenv/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "\u001b[2m\u001b[36m(RayTrainWorker pid=277, ip=10.128.32.21)\u001b[0m   warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:44 (running for 00:03:23.36)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:49 (running for 00:03:28.37)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:54 (running for 00:03:33.37)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:01:59 (running for 00:03:38.37)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:02:04 (running for 00:03:43.37)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:02:09 (running for 00:03:48.38)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | RUNNING  | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff537034a8f0299b4acc1e1f4e05000000 Worker ID: 20fd7164328a7217f296071d2bde261324cae0cd44aa1a5887306bbd Node ID: 37f6f748268c35754f1bf5790acbd3c6a9f245776da67ebc9830fc1d Worker IP address: 10.128.30.22 Worker port: 10007 Worker PID: 378 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.\n",
+      "\u001b[2m\u001b[36m(HuggingFaceTrainer pid=345, ip=10.128.30.22)\u001b[0m 2023-07-27 08:02:12,849\tINFO utils.py:57 -- Worker 1 has failed.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result for HuggingFaceTrainer_06f2f_00000:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   date: 2023-07-27_07-58-29\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   experiment_id: 182233fc8cb24d72bfa113cb6a3f25bd\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   hostname: finetuneflan-worker-small-group-finetuneflan-6f6ft\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   node_ip: 10.128.30.22\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   pid: 345\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   timestamp: 1690469909\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   trial_id: 06f2f_00000\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m == Status ==\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Current time: 2023-07-27 08:02:13 (running for 00:03:52.79)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Memory usage on this node: 3.9/15.4 GiB \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Using FIFO scheduling algorithm.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/52.15 GiB heap, 0.0/11.29 GiB objects\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     | status   | loc              |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+----------+------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 | ERROR    | 10.128.30.22:345 |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+----------+------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m Number of errored trials: 1\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | Trial name                     |   # failures | error file                                                                                                                  |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m | HuggingFaceTrainer_06f2f_00000 |            1 | /home/ray/ray_results/HuggingFaceTrainer_2023-07-27_07-58-20/HuggingFaceTrainer_06f2f_00000_0_2023-07-27_07-58-22/error.txt |\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m 2023-07-27 08:02:13,669\tERROR trial_runner.py:993 -- Trial HuggingFaceTrainer_06f2f_00000: Error processing event.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m ray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \tclass_name: RayTrainWorker\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \tactor_id: 537034a8f0299b4acc1e1f4e05000000\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \tpid: 378\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \tnamespace: 79e19797-9a9d-4359-9e7e-135e143c02c0\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \tip: 10.128.30.22\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m The above exception was the direct cause of the following exception:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m \u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 355, in train\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     raise skipped from exception_cause(skipped)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 325, in entrypoint\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     return self._trainable_func(\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 475, in _trainable_func\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     super()._trainable_func(self._merged_config, reporter, checkpoint_dir)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 651, in _trainable_func\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     output = fn()\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 390, in train_func\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     trainer.training_loop()\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 371, in training_loop\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     self._report(training_iterator)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 320, in _report\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     for results in training_iterator:\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 225, in __next__\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     next_results = self._run_with_error_handling(self._fetch_next_result)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 188, in _run_with_error_handling\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     return func()\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 257, in _fetch_next_result\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     results = self._backend_executor.get_next_results()\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 390, in get_next_results\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     results = self.get_with_failure_handling(futures)\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 483, in get_with_failure_handling\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     self._increment_failures()\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 533, in _increment_failures\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m     raise exc.with_traceback(None) from self._last_failure\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m RuntimeError: Training has failed after 1 attempts. You can change the number of max failure attempts by setting the `max_retries` arg in your `Trainer`.\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m 2023-07-27 08:02:13,782\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_06f2f_00000]\n",
+      "\u001b[2m\u001b[36m(train_fn pid=4614)\u001b[0m 2023-07-27 08:02:13,782\tINFO tune.py:777 -- Total run time: 232.95 seconds (232.79 seconds for the tuning loop).\n"
+     ]
+    },
+    {
+     "ename": "RayTaskError(RuntimeError)",
+     "evalue": "\u001b[36mray::train_fn()\u001b[39m (pid=4614, ip=10.128.28.7)\n  File \"/tmp/ipykernel_14249/2624701892.py\", line 150, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 360, in fit\n    raise result.error\nray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\nray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.\n\tclass_name: RayTrainWorker\n\tactor_id: 537034a8f0299b4acc1e1f4e05000000\n\tpid: 378\n\tnamespace: 79e19797-9a9d-4359-9e7e-135e143c02c0\n\tip: 10.128.30.22\nThe actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 355, in train\n    raise skipped from exception_cause(skipped)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 325, in entrypoint\n    return self._trainable_func(\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 475, in _trainable_func\n    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 651, in _trainable_func\n    output = fn()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 390, in 
train_func\n    trainer.training_loop()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 371, in training_loop\n    self._report(training_iterator)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 320, in _report\n    for results in training_iterator:\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 225, in __next__\n    next_results = self._run_with_error_handling(self._fetch_next_result)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 188, in _run_with_error_handling\n    return func()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 257, in _fetch_next_result\n    results = self._backend_executor.get_next_results()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 390, in get_next_results\n    results = self.get_with_failure_handling(futures)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 483, in get_with_failure_handling\n    self._increment_failures()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 533, in _increment_failures\n    raise exc.with_traceback(None) from self._last_failure\nRuntimeError: Training has failed after 1 attempts. You can change the number of max failure attempts by setting the `max_retries` arg in your `Trainer`.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRayTaskError(RuntimeError)\u001b[0m                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn [8], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#call the above cell as a remote ray function\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m r \u001b[38;5;241m=\u001b[39m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m    101\u001b[0m     \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m    102\u001b[0m     \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m    103\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m     35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m     36\u001b[0m     \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m     37\u001b[0m \n\u001b[1;32m     38\u001b[0m \u001b[38;5;124;03m    Args:\u001b[39;00m\n\u001b[1;32m     39\u001b[0m \u001b[38;5;124;03m        vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m     40\u001b[0m \u001b[38;5;124;03m        timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m    432\u001b[0m     op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m    433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m     res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    435\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m    436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n",
+      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m    460\u001b[0m         logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m    461\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m    463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m    464\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    465\u001b[0m ):\n\u001b[1;32m    466\u001b[0m     size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n",
+      "\u001b[0;31mRayTaskError(RuntimeError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=4614, ip=10.128.28.7)\n  File \"/tmp/ipykernel_14249/2624701892.py\", line 150, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 360, in fit\n    raise result.error\nray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\nray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.\n\tclass_name: RayTrainWorker\n\tactor_id: 537034a8f0299b4acc1e1f4e05000000\n\tpid: 378\n\tnamespace: 79e19797-9a9d-4359-9e7e-135e143c02c0\n\tip: 10.128.30.22\nThe actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. 
(3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::_Inner.train()\u001b[39m (pid=345, ip=10.128.30.22, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 355, in train\n    raise skipped from exception_cause(skipped)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 325, in entrypoint\n    return self._trainable_func(\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 475, in _trainable_func\n    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py\", line 651, in _trainable_func\n    output = fn()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 390, in train_func\n    trainer.training_loop()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 371, in training_loop\n    self._report(training_iterator)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/data_parallel_trainer.py\", line 320, in _report\n    for results in training_iterator:\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 225, in __next__\n    next_results = self._run_with_error_handling(self._fetch_next_result)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 188, in _run_with_error_handling\n    return func()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/trainer.py\", line 257, in _fetch_next_result\n    results = self._backend_executor.get_next_results()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 390, in get_next_results\n    
results = self.get_with_failure_handling(futures)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 483, in get_with_failure_handling\n    self._increment_failures()\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/_internal/backend_executor.py\", line 533, in _increment_failures\n    raise exc.with_traceback(None) from self._last_failure\nRuntimeError: Training has failed after 1 attempts. You can change the number of max failure attempts by setting the `max_retries` arg in your `Trainer`."
+     ]
+    }
+   ],
+   "source": [
+    "#call the above cell as a remote ray function\n",
+    "r = ray.get(train_fn.remote())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c69fb6a5-173b-4564-bd20-49fcd6aebd64",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!pip install --upgrade ray peft accelerate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25819219-0317-43e5-bc31-d1fddd1fe897",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from ray.train.huggingface.transformers.transformers_checkpoint import TransformersCheckpoint\n",
+    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "from peft import PeftModel, PeftConfig\n",
+    "\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-large')\n",
+    "tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')\n",
+    "\n",
+    "checkpoint = TransformersCheckpoint.from_checkpoint(r.checkpoint)\n",
+    "\n",
+    "model_output_dir = '../../models/raytune'\n",
+    "checkpoint.to_directory(model_output_dir)\n",
+    "\n",
+    "# Load the Lora model\n",
+    "model = PeftModel.from_pretrained(model, model_output_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5af8cd32",
+   "metadata": {},
+   "source": [
+    "Once complete, we can bring our Ray cluster down and clean up:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f995319e-17a1-4e1c-80bb-5cd1014e719a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To do next:\n",
+    "# - train on ROSA data and add inference code\n",
+    "# - train a higher param model\n",
+    "# - Add bitsandbytes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.down()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d41b90e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "auth.logout()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From af7e2179a4a4a1e8ecfae44d660567f73c00957a Mon Sep 17 00:00:00 2001
From: Shreyanand <shanand@redhat.com>
Date: Thu, 10 Aug 2023 20:30:32 +0000
Subject: [PATCH 4/5] Add initial ray experiments

---
 .../ray-flan-interactive.ipynb                | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/notebooks/ray-experiments/ray-flan-interactive.ipynb b/notebooks/ray-experiments/ray-flan-interactive.ipynb
index 858894c..a63f5ec 100644
--- a/notebooks/ray-experiments/ray-flan-interactive.ipynb
+++ b/notebooks/ray-experiments/ray-flan-interactive.ipynb
@@ -51,6 +51,17 @@
     "ray.__version__"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2fae774b-1cbb-4548-88bd-841ca0d3b0c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get packages for loading the model in this environment\n",
+    "#!pip install --upgrade ray peft accelerate"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -3391,18 +3402,6 @@
     "r = ray.get(train_fn.remote())"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c69fb6a5-173b-4564-bd20-49fcd6aebd64",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "!pip install --upgrade ray peft accelerate"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -3421,6 +3420,7 @@
     "\n",
     "checkpoint = TransformersCheckpoint.from_checkpoint(r.checkpoint)\n",
     "\n",
+    "# Save model in a directory\n",
     "model_output_dir = '../../models/raytune'\n",
     "checkpoint.to_directory(model_output_dir)\n",
     "\n",

From fdbfae4845e472fa39012401d85678b3f289379d Mon Sep 17 00:00:00 2001
From: Shreyanand <shanand@redhat.com>
Date: Thu, 10 Aug 2023 20:33:10 +0000
Subject: [PATCH 5/5] Add initial ray experiments

---
 notebooks/ray-experiments/ray-flantune.ipynb | 792 -------------------
 1 file changed, 792 deletions(-)
 delete mode 100644 notebooks/ray-experiments/ray-flantune.ipynb

diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb
deleted file mode 100644
index bac7a90..0000000
--- a/notebooks/ray-experiments/ray-flantune.ipynb
+++ /dev/null
@@ -1,792 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "bbc21043",
-   "metadata": {},
-   "source": [
-    "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa",
-   "metadata": {},
-   "source": [
-    "To Do: I tried adding the flan code in the interactive notebook but hit some errors. They need to be resolved to see if we can run the training in a distributed manner.  The bitsandbytes package doesn't work because of CUDA and Pytorch version."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import pieces from codeflare-sdk\n",
-    "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
-    "from codeflare_sdk.cluster.auth import TokenAuthentication"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "614daa0c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Create authentication object for oc user permissions\n",
-    "auth = TokenAuthentication(\n",
-    "    token = \"sha256~Z29WoRM5bMsxVgZpJ5uX9XtB-qPZzdOuGo9upSvpc98\",\n",
-    "    server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n",
-    "    skip_tls=False\n",
-    ")\n",
-    "auth.login()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bc27f84c",
-   "metadata": {},
-   "source": [
-    "Once again, let's start by running through the same cluster setup as before:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "0f4bc870-091f-4e11-9642-cba145710159",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Written to: finetuneflan.yaml\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Create and configure our cluster object (and appwrapper)\n",
-    "cluster = Cluster(ClusterConfiguration(\n",
-    "    name='finetuneflan',\n",
-    "    namespace='default',\n",
-    "    min_worker=2,\n",
-    "    max_worker=2,\n",
-    "    min_cpus=1,\n",
-    "    max_cpus=2,\n",
-    "    min_memory=2,\n",
-    "    max_memory=8,\n",
-    "    gpu=1,\n",
-    "    instascale=True,\n",
-    "    machine_types=[\"m5.xlarge\", \"g4dn.xlarge\"]\n",
-    "))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
-   "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Waiting for requested resources to be set up...\n"
-     ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m      2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:221\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    219\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m    220\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ready:\n\u001b[0;32m--> 221\u001b[0m     status, ready \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprint_to_console\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m    222\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m==\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN:\n\u001b[1;32m    223\u001b[0m         \u001b[38;5;28mprint\u001b[39m(\n\u001b[1;32m    224\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWARNING: Current cluster status is unknown, have you run cluster.up yet?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    225\u001b[0m         )\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:160\u001b[0m, in \u001b[0;36mCluster.status\u001b[0;34m(self, print_to_console)\u001b[0m\n\u001b[1;32m    158\u001b[0m status \u001b[38;5;241m=\u001b[39m CodeFlareClusterStatus\u001b[38;5;241m.\u001b[39mUNKNOWN\n\u001b[1;32m    159\u001b[0m \u001b[38;5;66;03m# check the app wrapper status\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m appwrapper \u001b[38;5;241m=\u001b[39m \u001b[43m_app_wrapper_status\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnamespace\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m appwrapper:\n\u001b[1;32m    162\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m appwrapper\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m    163\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING,\n\u001b[1;32m    164\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mCOMPLETED,\n\u001b[1;32m    165\u001b[0m         AppWrapperStatus\u001b[38;5;241m.\u001b[39mRUNNING_HOLD_COMPLETION,\n\u001b[1;32m    166\u001b[0m     ]:\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:330\u001b[0m, in \u001b[0;36m_app_wrapper_status\u001b[0;34m(name, namespace)\u001b[0m\n\u001b[1;32m    328\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    329\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mproject(namespace), oc\u001b[38;5;241m.\u001b[39mtimeout(\u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m60\u001b[39m):\n\u001b[0;32m--> 330\u001b[0m         cluster \u001b[38;5;241m=\u001b[39m \u001b[43moc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mappwrapper/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mname\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    331\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m oc\u001b[38;5;241m.\u001b[39mOpenShiftPythonException \u001b[38;5;28;01mas\u001b[39;00m osp:  \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m    332\u001b[0m     msg \u001b[38;5;241m=\u001b[39m osp\u001b[38;5;241m.\u001b[39mmsg\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:403\u001b[0m, in \u001b[0;36mSelector.object\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m    394\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mobject\u001b[39m(\u001b[38;5;28mself\u001b[39m, ignore_not_found\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m    395\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    396\u001b[0m \u001b[38;5;124;03m    Returns a single APIObject that represents the selected resource. If multiple\u001b[39;00m\n\u001b[1;32m    397\u001b[0m \u001b[38;5;124;03m    resources are being selected an exception will be thrown (use objects() when\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    401\u001b[0m \u001b[38;5;124;03m    :return: A Model of the selected resource.\u001b[39;00m\n\u001b[1;32m    402\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 403\u001b[0m     objs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobjects\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    404\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m    405\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m ignore_not_found:\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:423\u001b[0m, in \u001b[0;36mSelector.objects\u001b[0;34m(self, ignore_not_found, cls)\u001b[0m\n\u001b[1;32m    414\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    415\u001b[0m \u001b[38;5;124;03mReturns a python list of APIObject objects that represent the selected resources. An\u001b[39;00m\n\u001b[1;32m    416\u001b[0m \u001b[38;5;124;03mempty is returned if nothing is selected.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    419\u001b[0m \u001b[38;5;124;03m:return: A list of Model objects representing the receiver's selected resources.\u001b[39;00m\n\u001b[1;32m    420\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    421\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapiobject\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m APIObject\n\u001b[0;32m--> 423\u001b[0m obj \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobject_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43mignore_not_found\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_not_found\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    425\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    426\u001b[0m     api_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(obj)\u001b[38;5;241m.\u001b[39melements(\u001b[38;5;28mcls\u001b[39m)\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/selector.py:380\u001b[0m, in \u001b[0;36mSelector.object_json\u001b[0;34m(self, ignore_not_found)\u001b[0m\n\u001b[1;32m    377\u001b[0m     cmd_args\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--ignore-not-found\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    379\u001b[0m r \u001b[38;5;241m=\u001b[39m Result(verb)\n\u001b[0;32m--> 380\u001b[0m r\u001b[38;5;241m.\u001b[39madd_action(\u001b[43moc_action\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mall_namespaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_namespaces\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcmd_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcmd_args\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    381\u001b[0m r\u001b[38;5;241m.\u001b[39mfail_if(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to read object\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    383\u001b[0m \u001b[38;5;66;03m# --ignore-not-found returns an empty string instead of an error if nothing is found\u001b[39;00m\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/openshift/action.py:363\u001b[0m, in \u001b[0;36moc_action\u001b[0;34m(context, verb, cmd_args, all_namespaces, no_namespace, namespace, references, stdin_obj, stdin_str, last_attempt, **kwargs)\u001b[0m\n\u001b[1;32m    361\u001b[0m         \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m    362\u001b[0m             \u001b[38;5;28;01mpass\u001b[39;00m  \u001b[38;5;66;03m# ignore\u001b[39;00m\n\u001b[0;32m--> 363\u001b[0m     \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    364\u001b[0m     period \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(\u001b[38;5;241m1\u001b[39m, period \u001b[38;5;241m+\u001b[39m period)  \u001b[38;5;66;03m# Poll fast at first, but slow down to 1/sec over time\u001b[39;00m\n\u001b[1;32m    366\u001b[0m \u001b[38;5;66;03m# See note in paramiko flow on decoding\u001b[39;00m\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "# Bring up the cluster\n",
-    "cluster.up()\n",
-    "cluster.wait_ready()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "df71c1ed",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">                    </span><span style=\"font-weight: bold; font-style: italic\"> 🚀 CodeFlare Cluster Details 🚀</span><span style=\"font-style: italic\">                     </span>\n",
-       "<span style=\"font-weight: bold\">                                                                         </span>\n",
-       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
-       " │   <span style=\"color: #c0c0c0; text-decoration-color: #c0c0c0; background-color: #008000; font-weight: bold\">Name</span>                                                              │ \n",
-       " │   <span style=\"font-weight: bold; text-decoration: underline\">finetuneflan</span>                                        Inactive ❌   │ \n",
-       " │                                                                     │ \n",
-       " │   <span style=\"font-weight: bold\">URI:</span> ray://finetuneflan-head-svc.default.svc:10001                │ \n",
-       " │                                                                     │ \n",
-       " │   <a href=\"http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\" target=\"_blank\"><span style=\"color: #000080; text-decoration-color: #000080; text-decoration: underline\">Dashboard🔗</span></a>                                                       │ \n",
-       " │                                                                     │ \n",
-       " │  <span style=\"font-style: italic\">                    Cluster Resources                     </span>         │ \n",
-       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
-       " │   │ <span style=\"font-weight: bold\"> Min  Max </span> │  │ <span style=\"font-weight: bold\"> Memory      CPU         GPU        </span> │          │ \n",
-       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
-       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\"> 2   </span><span style=\"color: #800080; text-decoration-color: #800080\"> 2   </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\"> 2~8        </span><span style=\"color: #800080; text-decoration-color: #800080\"> 1           1          </span> │          │ \n",
-       " │   │ <span style=\"color: #008080; text-decoration-color: #008080\">     </span><span style=\"color: #800080; text-decoration-color: #800080\">     </span> │  │ <span style=\"color: #008080; text-decoration-color: #008080\">            </span><span style=\"color: #800080; text-decoration-color: #800080\">                        </span> │          │ \n",
-       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
-       " ╰─────────────────────────────────────────────────────────────────────╯ \n",
-       "</pre>\n"
-      ],
-      "text/plain": [
-       "\u001b[3m                    \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Details 🚀\u001b[0m\u001b[3m                     \u001b[0m\n",
-       "\u001b[1m \u001b[0m\u001b[1m                                                                       \u001b[0m\u001b[1m \u001b[0m\n",
-       " ╭─────────────────────────────────────────────────────────────────────╮ \n",
-       " │   \u001b[1;37;42mName\u001b[0m                                                              │ \n",
-       " │   \u001b[1;4mfinetuneflan\u001b[0m                                        Inactive ❌   │ \n",
-       " │                                                                     │ \n",
-       " │   \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001                │ \n",
-       " │                                                                     │ \n",
-       " │   \u001b]8;id=991912;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\                                                       │ \n",
-       " │                                                                     │ \n",
-       " │  \u001b[3m                    Cluster Resources                     \u001b[0m         │ \n",
-       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮          │ \n",
-       " │   │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │  │ \u001b[1m \u001b[0m\u001b[1mMemory    \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU       \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU       \u001b[0m\u001b[1m \u001b[0m │          │ \n",
-       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
-       " │   │ \u001b[36m \u001b[0m\u001b[36m2  \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2  \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m2~8       \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1         \u001b[0m\u001b[35m \u001b[0m │          │ \n",
-       " │   │ \u001b[36m \u001b[0m\u001b[36m   \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m   \u001b[0m\u001b[35m \u001b[0m │  │ \u001b[36m \u001b[0m\u001b[36m          \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m          \u001b[0m\u001b[35m \u001b[0m │          │ \n",
-       " │   ╰────────────╯  ╰──────────────────────────────────────╯          │ \n",
-       " ╰─────────────────────────────────────────────────────────────────────╯ \n"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "RayCluster(name='finetuneflan', status=<CodeFlareClusterStatus.STARTING: 2>, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "cluster.details()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "33663f47",
-   "metadata": {},
-   "source": [
-    "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n",
-    "\n",
-    "Using the SDK, we can get both the Ray cluster URI and dashboard URI:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "c1719bca",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n",
-      "ray://finetuneflan-head-svc.default.svc:10001\n"
-     ]
-    }
-   ],
-   "source": [
-    "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n",
-    "ray_cluster_uri = cluster.cluster_uri()\n",
-    "print(ray_dashboard_uri)\n",
-    "print(ray_cluster_uri)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2a2aca6a",
-   "metadata": {},
-   "source": [
-    "Now we can connect directly to our Ray cluster via the Ray python client:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "300146dc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Ray cluster is up and running:  True\n"
-     ]
-    }
-   ],
-   "source": [
-    "#before proceeding make sure the cluster exists and the uri is not empty\n",
-    "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
-    "\n",
-    "import ray\n",
-    "from ray.air.config import ScalingConfig\n",
-    "\n",
-    "# reset the ray context in case there's already one. \n",
-    "ray.shutdown()\n",
-    "# establish connection to ray cluster\n",
-    "\n",
-    "#install additionall libraries that will be required for model training\n",
-    "runtime_env = {\"pip\": [\"transformers\",\n",
-    "                       \"datasets\",\n",
-    "                       \"evaluate\",\n",
-    "                       \"pyarrow<7.0.0\",\n",
-    "                       \"accelerate\",\n",
-    "                       \"bitsandbytes\",\n",
-    "                       \"loralib\",\n",
-    "                       \"py7zr\",\n",
-    "                       \"tensorboard\",\n",
-    "                       \"peft\"], \n",
-    "              \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n",
-    "\n",
-    "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n",
-    "\n",
-    "print(\"Ray cluster is up and running: \", ray.is_initialized())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9711030b",
-   "metadata": {},
-   "source": [
-    "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "1b36e0d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@ray.remote\n",
-    "def train_fn():\n",
-    "    from datasets import load_dataset\n",
-    "    import transformers\n",
-    "    from transformers import AutoTokenizer, TrainingArguments\n",
-    "    from transformers import AutoModelForSequenceClassification\n",
-    "    import numpy as np\n",
-    "    from datasets import load_metric\n",
-    "    import ray\n",
-    "    from ray import tune\n",
-    "    from ray.train.huggingface import HuggingFaceTrainer\n",
-    "    \n",
-    "    from datasets import load_dataset, concatenate_datasets\n",
-    "    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
-    "    from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType\n",
-    "\n",
-    "    model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
-    "\n",
-    "    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
-    "    \n",
-    "    dataset = load_dataset(\"samsum\")\n",
-    "\n",
-    "    print(f\"Train dataset size: {len(dataset['train'])}\")\n",
-    "    print(f\"Test dataset size: {len(dataset['test'])}\")\n",
-    "    \n",
-    "    #### COMPUTE MAX SEQ LEN ##########\n",
-    "    # The maximum total input sequence length after tokenization.\n",
-    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
-    "    conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n",
-    "\n",
-    "    \n",
-    "    tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
-    "                                                            truncation=True),\n",
-    "                                        batched=True,\n",
-    "                                        remove_columns=[\"dialogue\", \"summary\"])\n",
-    "    \n",
-    "    input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n",
-    "    # take 85 percentile of max length for better utilization\n",
-    "    max_source_length = int(np.percentile(input_lengths, 85))\n",
-    "    print(f\"Max source length: {max_source_length}\")\n",
-    "\n",
-    "    # The maximum total sequence length for target text after tokenization.\n",
-    "    # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
-    "    tokenized_targets = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
-    "                                                            truncation=True),\n",
-    "                                        batched=True,\n",
-    "                                        remove_columns=[\"dialogue\", \"summary\"])  \n",
-    "    target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n",
-    "    # take 90 percentile of max length for better utilization\n",
-    "    max_target_length = int(np.percentile(target_lengths, 90))\n",
-    "    print(f\"Max target length: {max_target_length}\")\n",
-    "    \n",
-    "    #### PREPROCESS DATA ##########\n",
-    "    \n",
-    "    def preprocess_function(sample,padding=\"max_length\"):\n",
-    "        # add prefix to the input for t5\n",
-    "        inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n",
-    "\n",
-    "        # tokenize inputs\n",
-    "        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n",
-    "\n",
-    "        # Tokenize targets with the `text_target` keyword argument\n",
-    "        labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n",
-    "\n",
-    "        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n",
-    "        # padding in the loss.\n",
-    "        if padding == \"max_length\":\n",
-    "            labels[\"input_ids\"] = [\n",
-    "                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n",
-    "            ]\n",
-    "\n",
-    "        model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
-    "        return model_inputs\n",
-    "\n",
-    "    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n",
-    "    print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n",
-    "\n",
-    "    ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n",
-    "    ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n",
-    "\n",
-    "    def compute_metrics(eval_pred):\n",
-    "        metric = load_metric(\"accuracy\")\n",
-    "        logits, labels = eval_pred\n",
-    "        predictions = np.argmax(logits, axis=-1)\n",
-    "        return metric.compute(predictions=predictions, references=labels)\n",
-    "\n",
-    "    def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
-    "        model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
-    "        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
-    "        lora_config = LoraConfig(\n",
-    "            r=16,\n",
-    "            lora_alpha=32,\n",
-    "            target_modules=[\"q\", \"v\"],\n",
-    "            lora_dropout=0.05,\n",
-    "            bias=\"none\",\n",
-    "            task_type=TaskType.SEQ_2_SEQ_LM\n",
-    "        )\n",
-    "        # prepare int-8 model for training\n",
-    "        model = prepare_model_for_int8_training(model)\n",
-    "\n",
-    "        # add LoRA adaptor\n",
-    "        model = get_peft_model(model, lora_config)\n",
-    "        model.print_trainable_parameters()\n",
-    "        \n",
-    "        from transformers import DataCollatorForSeq2Seq\n",
-    "\n",
-    "        # we want to ignore tokenizer pad token in the loss\n",
-    "        label_pad_token_id = -100\n",
-    "        # Data collator\n",
-    "        data_collator = DataCollatorForSeq2Seq(\n",
-    "            tokenizer,\n",
-    "            model=model,\n",
-    "            label_pad_token_id=label_pad_token_id,\n",
-    "            pad_to_multiple_of=8\n",
-    "        )\n",
-    "        \n",
-    "        output_dir=\"/tmp/flan/test\"\n",
-    "\n",
-    "        # Define training args\n",
-    "        training_args = Seq2SeqTrainingArguments(\n",
-    "            output_dir=output_dir,\n",
-    "            auto_find_batch_size=True,\n",
-    "            learning_rate=1e-3, # higher learning rate\n",
-    "            num_train_epochs=5,\n",
-    "            logging_dir=f\"{output_dir}/logs\",\n",
-    "            logging_strategy=\"steps\",\n",
-    "            logging_steps=500,\n",
-    "            save_strategy=\"no\",\n",
-    "            report_to=\"tensorboard\",\n",
-    "        )\n",
-    "\n",
-    "        trainer = Seq2SeqTrainer(model=model,\n",
-    "                                args=training_args,\n",
-    "                                data_collator=data_collator,\n",
-    "                                train_dataset=tokenized_dataset[\"train\"])\n",
-    "        \n",
-    "        return trainer\n",
-    "\n",
-    "    scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
-    "\n",
-    "    # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
-    "    # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
-    "    trainer = HuggingFaceTrainer(\n",
-    "        trainer_init_per_worker=trainer_init_per_worker,\n",
-    "        scaling_config=scaling_config,\n",
-    "        datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
-    "    )\n",
-    "    result = trainer.fit()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d4d8fd65",
-   "metadata": {},
-   "source": [
-    "To test our code, we can run the training function we defined above remotely on our Ray cluster:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "5901d958",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ===================================BUG REPORT===================================\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m python -m bitsandbytes\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m  and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ================================================================================\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m To disable this warning, you can either:\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n",
-      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
-      "100%|██████████| 3/3 [00:00<00:00, 680.49it/s]\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Train dataset size: 14732\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Test dataset size: 819\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max source length: 255\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max target length: 297\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-8356b281822134f5.arrow\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-af8f1296892299f1.arrow\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ===================================BUG REPORT===================================\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m python -m bitsandbytes\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m  and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ================================================================================\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | RUNNING  |       |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result for HuggingFaceTrainer_be877_00000:\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   trial_id: be877_00000\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR    |       |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     |   # failures | error file                                                                                                                  |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 |            1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     | status   | loc   |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR    |       |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name                     |   # failures | error file                                                                                                                  |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 |            1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,170\tERROR serialization.py:371 -- [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Traceback (most recent call last):\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 369, in deserialize_objects\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     obj = self._deserialize_object(data, metadata, object_ref)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 252, in _deserialize_object\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     return self._deserialize_msgpack_data(data, metadata_fields)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 207, in _deserialize_msgpack_data\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     python_objects = self._deserialize_pickle5_data(pickle5_data)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 197, in _deserialize_pickle5_data\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     obj = pickle.loads(in_band)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,172\tERROR worker.py:763 -- Exception raised in creation task: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     self.setup(copy.deepcopy(self.config))\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     return ray.get(self.references[k])\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m traceback: Traceback (most recent call last):\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
-      "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,177\tERROR trial_runner.py:993 -- Trial HuggingFaceTrainer_be877_00000: Error processing event.\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     future_result = ray.get(ready_future)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     return func(*args, **kwargs)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2291, in get\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     raise value\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     self.setup(copy.deepcopy(self.config))\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     return ray.get(self.references[k])\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m traceback: Traceback (most recent call last):\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     table = _memory_mapped_arrow_table_from_file(path)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m     memory_mapped_stream = pa.memory_map(filename)\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m   File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
-     ]
-    },
-    {
-     "ename": "RayTaskError(TrainingFailedError)",
-     "evalue": "\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n    future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n    self.setup(copy.deepcopy(self.config))\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n    setup_kwargs[k] = parameter_registry.get(prefix + k)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n    return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n    table = _memory_mapped_arrow_table_from_file(path)\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n    memory_mapped_stream = pa.memory_map(filename)\n  File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n  File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n  File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n  File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n    raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn [19], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#call the above cell as a remote ray function\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m    101\u001b[0m     \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m    102\u001b[0m     \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m    103\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m     35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m     36\u001b[0m     \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m     37\u001b[0m \n\u001b[1;32m     38\u001b[0m \u001b[38;5;124;03m    Args:\u001b[39;00m\n\u001b[1;32m     39\u001b[0m \u001b[38;5;124;03m        vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m     40\u001b[0m \u001b[38;5;124;03m        timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m    432\u001b[0m     op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m    433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m     res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    435\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m    436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n",
-      "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m    460\u001b[0m         logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m    461\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m    463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m    464\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    465\u001b[0m ):\n\u001b[1;32m    466\u001b[0m     size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n",
-      "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n    future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n    self.setup(copy.deepcopy(self.config))\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n    setup_kwargs[k] = parameter_registry.get(prefix + k)\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n    return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. 
Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n    table = _memory_mapped_arrow_table_from_file(path)\n  File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n    memory_mapped_stream = pa.memory_map(filename)\n  File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n  File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n  File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n  File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n  File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n  File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n    raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_be877_00000]\n",
-      "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tINFO tune.py:777 -- Total run time: 5.50 seconds (5.39 seconds for the tuning loop).\n"
-     ]
-    }
-   ],
-   "source": [
-    "#call the above cell as a remote ray function\n",
-    "ray.get(train_fn.remote())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5af8cd32",
-   "metadata": {},
-   "source": [
-    "Once complete, we can bring our Ray cluster down and clean up:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cluster.down()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0d41b90e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "auth.logout()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.13"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}