diff --git a/notebooks/ray-experiments/finetuneflan.yaml b/notebooks/ray-experiments/finetuneflan.yaml
new file mode 100644
index 0000000..2cee801
--- /dev/null
+++ b/notebooks/ray-experiments/finetuneflan.yaml
@@ -0,0 +1,155 @@
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper
+metadata:
+ name: finetuneflan
+ namespace: default
+spec:
+ priority: 9
+ resources:
+ GenericItems:
+ - custompodresources:
+ - limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ replicas: 1
+ requests:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ - limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 1
+ replicas: 2
+ requests:
+ cpu: 1
+ memory: 2G
+ nvidia.com/gpu: 1
+ generictemplate:
+ apiVersion: ray.io/v1alpha1
+ kind: RayCluster
+ metadata:
+ labels:
+ appwrapper.mcad.ibm.com: finetuneflan
+ controller-tools.k8s.io: '1.0'
+ name: finetuneflan
+ namespace: default
+ spec:
+ autoscalerOptions:
+ idleTimeoutSeconds: 60
+ imagePullPolicy: Always
+ resources:
+ limits:
+ cpu: 500m
+ memory: 512Mi
+ requests:
+ cpu: 500m
+ memory: 512Mi
+ upscalingMode: Default
+ enableInTreeAutoscaling: false
+ headGroupSpec:
+ rayStartParams:
+ block: 'true'
+ dashboard-host: 0.0.0.0
+ num-gpus: '0'
+ serviceType: ClusterIP
+ template:
+ spec:
+ containers:
+ - env:
+ - name: MY_POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+ imagePullPolicy: Always
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: ray-head
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ rayVersion: 1.12.0
+ workerGroupSpecs:
+ - groupName: small-group-finetuneflan
+ maxReplicas: 2
+ minReplicas: 2
+ rayStartParams:
+ block: 'true'
+ num-gpus: '1'
+ replicas: 2
+ template:
+ metadata:
+ annotations:
+ key: value
+ labels:
+ key: value
+ spec:
+ containers:
+ - env:
+ - name: MY_POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: machine-learning
+ resources:
+ limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 1
+ requests:
+ cpu: 1
+ memory: 2G
+ nvidia.com/gpu: 1
+ initContainers:
+ - command:
+ - sh
+ - -c
+ - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
+ do echo waiting for myservice; sleep 2; done
+ image: busybox:1.28
+ name: init-myservice
+ replicas: 1
+ - generictemplate:
+ apiVersion: route.openshift.io/v1
+ kind: Route
+ metadata:
+ labels:
+ odh-ray-cluster-service: finetuneflan-head-svc
+ name: ray-dashboard-finetuneflan
+ namespace: default
+ spec:
+ port:
+ targetPort: dashboard
+ to:
+ kind: Service
+ name: finetuneflan-head-svc
+ replicas: 1
+ Items: []
diff --git a/notebooks/ray-experiments/ray-flantune.ipynb b/notebooks/ray-experiments/ray-flantune.ipynb
new file mode 100644
index 0000000..50d5b6a
--- /dev/null
+++ b/notebooks/ray-experiments/ray-flantune.ipynb
@@ -0,0 +1,797 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bbc21043",
+ "metadata": {},
+ "source": [
+ "In this fourth and final notebook, we will go over how to leverage the SDK to directly work interactively with a Ray cluster during development."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "439ab88e-e05e-43b5-b506-960aa9a5afaa",
+ "metadata": {},
+ "source": [
+ "To Do: I tried adding the flan code in the interactive notebook but hit some errors. They need to be resolved to see if we can run the training in a distributed manner. The bitsandbytes package doesn't work because of CUDA and Pytorch version."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import pieces from codeflare-sdk\n",
+ "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
+ "from codeflare_sdk.cluster.auth import TokenAuthentication"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "614daa0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Logged into \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\" as \"shanand@redhat.com\" using the token provided.\\n\\nYou have access to 113 projects, the list has been suppressed. You can list all projects with \\'oc projects\\'\\n\\nUsing project \"opendatahub\".\\n'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create authentication object for oc user permissions\n",
+ "auth = TokenAuthentication(\n",
+ " token = \"sha256~26Kf2-d4ytnUrGO1nI72hm1qKqVTbDDDv_IKKOHeThU\",\n",
+ " server = \"https://api.et-cluster.6mwp.p1.openshiftapps.com:6443\",\n",
+ " skip_tls=False\n",
+ ")\n",
+ "auth.login()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc27f84c",
+ "metadata": {},
+ "source": [
+ "Once again, let's start by running through the same cluster setup as before:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0f4bc870-091f-4e11-9642-cba145710159",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Written to: finetuneflan.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create and configure our cluster object (and appwrapper)\n",
+ "cluster = Cluster(ClusterConfiguration(\n",
+ " name='finetuneflan',\n",
+ " namespace='default',\n",
+ " min_worker=2,\n",
+ " max_worker=2,\n",
+ " min_cpus=1,\n",
+ " max_cpus=2,\n",
+ " min_memory=2,\n",
+ " max_memory=8,\n",
+ " gpu=1,\n",
+ " instascale=False,\n",
+ "))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Waiting for requested resources to be set up...\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn [4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Bring up the cluster\u001b[39;00m\n\u001b[1;32m 2\u001b[0m cluster\u001b[38;5;241m.\u001b[39mup()\n\u001b[0;32m----> 3\u001b[0m cluster\u001b[38;5;241m.\u001b[39mwait_ready()\n",
+ "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/codeflare_sdk/cluster/cluster.py:225\u001b[0m, in \u001b[0;36mCluster.wait_ready\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mand\u001b[39;00m time \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m timeout:\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwait() timed out after waiting \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 225\u001b[0m \u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequested cluster up and running!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# Bring up the cluster\n",
+ "cluster.up()\n",
+ "cluster.wait_ready()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "df71c1ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
๐ CodeFlare Cluster Details ๐ \n",
+ " \n",
+ " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
+ " โ Name โ \n",
+ " โ finetuneflan Inactive โ โ \n",
+ " โ โ \n",
+ " โ URI: ray://finetuneflan-head-svc.default.svc:10001 โ \n",
+ " โ โ \n",
+ " โ Dashboard๐ โ \n",
+ " โ โ \n",
+ " โ Cluster Resources โ \n",
+ " โ โญโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
+ " โ โ Min Max โ โ Memory CPU GPU โ โ \n",
+ " โ โ โ โ โ โ \n",
+ " โ โ 2 2 โ โ 2~8 1 1 โ โ \n",
+ " โ โ โ โ โ โ \n",
+ " โ โฐโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
+ " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[3m \u001b[0m\u001b[1;3m ๐ CodeFlare Cluster Details ๐\u001b[0m\u001b[3m \u001b[0m\n",
+ "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n",
+ " โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ \n",
+ " โ \u001b[1;37;42mName\u001b[0m โ \n",
+ " โ \u001b[1;4mfinetuneflan\u001b[0m Inactive โ โ \n",
+ " โ โ \n",
+ " โ \u001b[1mURI:\u001b[0m ray://finetuneflan-head-svc.default.svc:10001 โ \n",
+ " โ โ \n",
+ " โ \u001b]8;id=510497;http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\u001b\\\u001b[4;34mDashboard๐\u001b[0m\u001b]8;;\u001b\\ โ \n",
+ " โ โ \n",
+ " โ \u001b[3m Cluster Resources \u001b[0m โ \n",
+ " โ โญโ Workers โโโฎ โญโโโโโโโโโ Worker specs(each) โโโโโโโโโโฎ โ \n",
+ " โ โ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ โ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ โ \n",
+ " โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
+ " โ โ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m2~8 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m โ โ \n",
+ " โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ โ \n",
+ " โ โฐโโโโโโโโโโโโโฏ โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ โ \n",
+ " โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ \n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "RayCluster(name='finetuneflan', status=, min_workers=2, max_workers=2, worker_mem_min=2, worker_mem_max=8, worker_cpu=1, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cluster.details()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33663f47",
+ "metadata": {},
+ "source": [
+ "This time we will demonstrate another potential method of use: working with the Ray cluster interactively.\n",
+ "\n",
+ "Using the SDK, we can get both the Ray cluster URI and dashboard URI:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "c1719bca",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "http://ray-dashboard-finetuneflan-default.apps.et-cluster.6mwp.p1.openshiftapps.com\n",
+ "ray://finetuneflan-head-svc.default.svc:10001\n"
+ ]
+ }
+ ],
+ "source": [
+ "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n",
+ "ray_cluster_uri = cluster.cluster_uri()\n",
+ "print(ray_dashboard_uri)\n",
+ "print(ray_cluster_uri)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a2aca6a",
+ "metadata": {},
+ "source": [
+ "Now we can connect directly to our Ray cluster via the Ray python client:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "300146dc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ray cluster is up and running: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "#before proceeding make sure the cluster exists and the uri is not empty\n",
+ "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
+ "\n",
+ "import ray\n",
+ "from ray.air.config import ScalingConfig\n",
+ "\n",
+ "# reset the ray context in case there's already one. \n",
+ "ray.shutdown()\n",
+ "# establish connection to ray cluster\n",
+ "\n",
+ "#install additionall libraries that will be required for model training\n",
+ "runtime_env = {\"pip\": [\"transformers\",\n",
+ " \"datasets\",\n",
+ " \"evaluate\",\n",
+ " \"pyarrow<7.0.0\",\n",
+ " \"accelerate\",\n",
+ " \"bitsandbytes\",\n",
+ " \"loralib\",\n",
+ " \"py7zr\",\n",
+ " \"tensorboard\",\n",
+ " \"peft\"], \n",
+ " \"env_vars\": {\"HF_HOME\":\"huggingface\"}}\n",
+ "\n",
+ "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env, _temp_dir=\"huggingface\")\n",
+ "\n",
+ "print(\"Ray cluster is up and running: \", ray.is_initialized())"
+ ]
+ },
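+ {
+ "cell_type": "markdown",
+ "id": "3f2a9c1e",
+ "metadata": {},
+ "source": [
+ "As noted in the TODO above, bitsandbytes falls back to a CPU-only binary when its compiled CUDA support does not match the PyTorch/CUDA stack in the cluster image. Before kicking off training, a quick sanity check run as a remote task (a sketch, assuming torch ships with the base image and bitsandbytes was installed by the runtime_env above) can confirm what the workers actually see:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d7b5e2f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: report the torch/CUDA/bitsandbytes versions visible on a worker\n",
+ "@ray.remote\n",
+ "def check_env():\n",
+ "    import torch\n",
+ "    info = {\n",
+ "        \"torch\": torch.__version__,\n",
+ "        \"torch_cuda\": torch.version.cuda,\n",
+ "        \"cuda_available\": torch.cuda.is_available(),\n",
+ "    }\n",
+ "    try:\n",
+ "        import bitsandbytes as bnb\n",
+ "        info[\"bitsandbytes\"] = getattr(bnb, \"__version__\", \"unknown\")\n",
+ "    except Exception as e:  # surface import/CUDA-setup failures\n",
+ "        info[\"bitsandbytes\"] = f\"import failed: {e}\"\n",
+ "    return info\n",
+ "\n",
+ "print(ray.get(check_env.remote()))"
+ ]
+ },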
+ {
+ "cell_type": "markdown",
+ "id": "9711030b",
+ "metadata": {},
+ "source": [
+ "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "1b36e0d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@ray.remote\n",
+ "def train_fn():\n",
+ " from datasets import load_dataset\n",
+ " import transformers\n",
+ " from transformers import AutoTokenizer, TrainingArguments\n",
+ " from transformers import AutoModelForSequenceClassification\n",
+ " import numpy as np\n",
+ " from datasets import load_metric\n",
+ " import ray\n",
+ " from ray import tune\n",
+ " from ray.train.huggingface import HuggingFaceTrainer\n",
+ " \n",
+ " from datasets import load_dataset, concatenate_datasets\n",
+ " from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+ " from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType\n",
+ "\n",
+ " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
+ "\n",
+ " #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
+ " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ " \n",
+ " dataset = load_dataset(\"samsum\")\n",
+ "\n",
+ " print(f\"Train dataset size: {len(dataset['train'])}\")\n",
+ " print(f\"Test dataset size: {len(dataset['test'])}\")\n",
+ " \n",
+ " #### COMPUTE MAX SEQ LEN ##########\n",
+ " # The maximum total input sequence length after tokenization.\n",
+ " # Sequences longer than this will be truncated, sequences shorter will be padded.\n",
+ " conc_dataset = concatenate_datasets([dataset[\"train\"], dataset[\"test\"]])\n",
+ "\n",
+ " \n",
+ " tokenized_inputs = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
+ " truncation=True),\n",
+ " batched=True,\n",
+ " remove_columns=[\"dialogue\", \"summary\"])\n",
+ " \n",
+ " input_lengths = [len(x) for x in tokenized_inputs[\"input_ids\"]]\n",
+ " # take 85 percentile of max length for better utilization\n",
+ " max_source_length = int(np.percentile(input_lengths, 85))\n",
+ " print(f\"Max source length: {max_source_length}\")\n",
+ "\n",
+ " # The maximum total sequence length for target text after tokenization.\n",
+ " # Sequences longer than this will be truncated, sequences shorter will be padded.\"\n",
+ " tokenized_targets = conc_dataset.map(lambda x: tokenizer(x[\"dialogue\"],\n",
+ " truncation=True),\n",
+ " batched=True,\n",
+ " remove_columns=[\"dialogue\", \"summary\"]) \n",
+ " target_lengths = [len(x) for x in tokenized_targets[\"input_ids\"]]\n",
+ " # take 90 percentile of max length for better utilization\n",
+ " max_target_length = int(np.percentile(target_lengths, 90))\n",
+ " print(f\"Max target length: {max_target_length}\")\n",
+ " \n",
+ " #### PREPROCESS DATA ##########\n",
+ " \n",
+ " def preprocess_function(sample,padding=\"max_length\"):\n",
+ " # add prefix to the input for t5\n",
+ " inputs = [\"summarize: \" + item for item in sample[\"dialogue\"]]\n",
+ "\n",
+ " # tokenize inputs\n",
+ " model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n",
+ "\n",
+ " # Tokenize targets with the `text_target` keyword argument\n",
+ " labels = tokenizer(text_target=sample[\"summary\"], max_length=max_target_length, padding=padding, truncation=True)\n",
+ "\n",
+ " # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore\n",
+ " # padding in the loss.\n",
+ " if padding == \"max_length\":\n",
+ " labels[\"input_ids\"] = [\n",
+ " [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n",
+ " ]\n",
+ "\n",
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
+ " return model_inputs\n",
+ "\n",
+ " tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n",
+ " print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")\n",
+ "\n",
+ " ray_train_ds = ray.data.from_huggingface(tokenized_dataset['train'])\n",
+ " ray_evaluation_ds = ray.data.from_huggingface(tokenized_dataset['test'])\n",
+ "\n",
+ " def compute_metrics(eval_pred):\n",
+ " metric = load_metric(\"accuracy\")\n",
+ " logits, labels = eval_pred\n",
+ " predictions = np.argmax(logits, axis=-1)\n",
+ " return metric.compute(predictions=predictions, references=labels)\n",
+ "\n",
+ " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n",
+ " model_name = \"ybelkada/flan-t5-xl-sharded-bf16\"\n",
+ " model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\")\n",
+ " lora_config = LoraConfig(\n",
+ " r=16,\n",
+ " lora_alpha=32,\n",
+ " target_modules=[\"q\", \"v\"],\n",
+ " lora_dropout=0.05,\n",
+ " bias=\"none\",\n",
+ " task_type=TaskType.SEQ_2_SEQ_LM\n",
+ " )\n",
+ " # prepare int-8 model for training\n",
+ " model = prepare_model_for_int8_training(model)\n",
+ "\n",
+ " # add LoRA adaptor\n",
+ " model = get_peft_model(model, lora_config)\n",
+ " model.print_trainable_parameters()\n",
+ " \n",
+ " from transformers import DataCollatorForSeq2Seq\n",
+ "\n",
+ " # we want to ignore tokenizer pad token in the loss\n",
+ " label_pad_token_id = -100\n",
+ " # Data collator\n",
+ " data_collator = DataCollatorForSeq2Seq(\n",
+ " tokenizer,\n",
+ " model=model,\n",
+ " label_pad_token_id=label_pad_token_id,\n",
+ " pad_to_multiple_of=8\n",
+ " )\n",
+ " \n",
+ " output_dir=\"/tmp/flan/test\"\n",
+ "\n",
+ " # Define training args\n",
+ " training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=output_dir,\n",
+ " auto_find_batch_size=True,\n",
+ " learning_rate=1e-3, # higher learning rate\n",
+ " num_train_epochs=5,\n",
+ " logging_dir=f\"{output_dir}/logs\",\n",
+ " logging_strategy=\"steps\",\n",
+ " logging_steps=500,\n",
+ " save_strategy=\"no\",\n",
+ " report_to=\"tensorboard\",\n",
+ " )\n",
+ "\n",
+ " trainer = Seq2SeqTrainer(model=model,\n",
+ " args=training_args,\n",
+ " data_collator=data_collator,\n",
+ " train_dataset=tokenized_dataset[\"train\"])\n",
+ " \n",
+ " return trainer\n",
+ "\n",
+ " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n",
+ "\n",
+ " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n",
+ " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n",
+ " trainer = HuggingFaceTrainer(\n",
+ " trainer_init_per_worker=trainer_init_per_worker,\n",
+ " scaling_config=scaling_config,\n",
+ " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n",
+ " )\n",
+ " result = trainer.fit()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4d8fd65",
+ "metadata": {},
+ "source": [
+ "Once we want to test our code out, we can run the training function we defined above remotely on our Ray cluster:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "5901d958",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ===================================BUG REPORT===================================\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m python -m bitsandbytes\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ================================================================================\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m To disable this warning, you can either:\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Found cached dataset samsum (/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)\n",
+ " 0%| | 0/3 [00:00, ?it/s]\n",
+ "100%|โโโโโโโโโโ| 3/3 [00:00<00:00, 680.49it/s]\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Train dataset size: 14732\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Test dataset size: 819\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max source length: 255\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-0d5be1d47aabc667.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Max target length: 297\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-8356b281822134f5.arrow\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Loading cached processed dataset at /home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-af8f1296892299f1.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ===================================BUG REPORT===================================\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Welcome to bitsandbytes. For bug reports, please run\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m python -m bitsandbytes\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ================================================================================\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m bin /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m CUDA SETUP: Loading binary /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m /tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 RUNNING)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name | status | loc |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | RUNNING | |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result for HuggingFaceTrainer_be877_00000:\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m trial_id: be877_00000\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name | status | loc |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR | |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name | # failures | error file |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | 1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m == Status ==\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Current time: 2023-05-24 14:34:26 (running for 00:00:05.39)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Memory usage on this node: 6.0/30.9 GiB \n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Using FIFO scheduling algorithm.\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.54 GiB objects\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of trials: 1/1 (1 ERROR)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name | status | loc |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+----------+-------|\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | ERROR | |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+----------+-------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m Number of errored trials: 1\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | Trial name | # failures | error file |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m |--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------|\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m | HuggingFaceTrainer_be877_00000 | 1 | /home/ray/ray_results/HuggingFaceTrainer_2023-05-24_14-34-20/HuggingFaceTrainer_be877_00000_0_2023-05-24_14-34-20/error.txt |\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m +--------------------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------+\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,170\tERROR serialization.py:371 -- [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m Traceback (most recent call last):\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 369, in deserialize_objects\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m obj = self._deserialize_object(data, metadata, object_ref)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 252, in _deserialize_object\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m return self._deserialize_msgpack_data(data, metadata_fields)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 207, in _deserialize_msgpack_data\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m python_objects = self._deserialize_pickle5_data(pickle5_data)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/serialization.py\", line 197, in _deserialize_pickle5_data\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m obj = pickle.loads(in_band)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m table = _memory_mapped_arrow_table_from_file(path)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m memory_mapped_stream = pa.memory_map(filename)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m 2023-05-24 14:34:26,172\tERROR worker.py:763 -- Exception raised in creation task: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m self.setup(copy.deepcopy(self.config))\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m return ray.get(self.references[k])\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m traceback: Traceback (most recent call last):\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m table = _memory_mapped_arrow_table_from_file(path)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m memory_mapped_stream = pa.memory_map(filename)\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+ "\u001b[2m\u001b[36m(TrainTrainable pid=613, ip=10.128.14.50)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,177\tERROR trial_runner.py:993 -- Trial HuggingFaceTrainer_be877_00000: Error processing event.\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m future_result = ray.get(ready_future)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m return func(*args, **kwargs)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2291, in get\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m raise value\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m self.setup(copy.deepcopy(self.config))\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m setup_kwargs[k] = parameter_registry.get(prefix + k)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m return ray.get(self.references[k])\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m ray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m traceback: Traceback (most recent call last):\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m table = _memory_mapped_arrow_table_from_file(path)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m memory_mapped_stream = pa.memory_map(filename)\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m FileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m \n"
+ ]
+ },
+ {
+ "ename": "RayTaskError(TrainingFailedError)",
+ "evalue": "\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n self.setup(copy.deepcopy(self.config))\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n setup_kwargs[k] = parameter_registry.get(prefix + k)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n table = _memory_mapped_arrow_table_from_file(path)\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n memory_mapped_stream = pa.memory_map(filename)\n File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn [19], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#call the above cell as a remote ray function\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m ray\u001b[38;5;241m.\u001b[39mget(train_fn\u001b[38;5;241m.\u001b[39mremote())\n",
+ "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m client_mode_should_convert(auto_init\u001b[38;5;241m=\u001b[39mauto_init):\n\u001b[1;32m 101\u001b[0m \u001b[38;5;66;03m# Legacy code\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;66;03m# we only convert init function if RAY_CLIENT_MODE=1\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+ "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/api.py:42\u001b[0m, in \u001b[0;36m_ClientAPI.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, vals, \u001b[38;5;241m*\u001b[39m, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124;03m\"\"\"get is the hook stub passed on to replace `ray.get`\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m vals: [Client]ObjectRef or list of these refs to retrieve.\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m timeout: Optional timeout in milliseconds\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:434\u001b[0m, in \u001b[0;36mWorker.get\u001b[0;34m(self, vals, timeout)\u001b[0m\n\u001b[1;32m 432\u001b[0m op_timeout \u001b[38;5;241m=\u001b[39m max_blocking_operation_time\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 434\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get\u001b[49m\u001b[43m(\u001b[49m\u001b[43mto_get\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_timeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GetTimeoutError:\n",
+ "File \u001b[0;32m/opt/app-root/lib64/python3.8/site-packages/ray/util/client/worker.py:462\u001b[0m, in \u001b[0;36mWorker._get\u001b[0;34m(self, ref, timeout)\u001b[0m\n\u001b[1;32m 460\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to deserialize \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(chunk\u001b[38;5;241m.\u001b[39merror))\n\u001b[1;32m 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m>\u001b[39m OBJECT_TRANSFER_WARNING_SIZE \u001b[38;5;129;01mand\u001b[39;00m log_once(\n\u001b[1;32m 464\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_object_transfer_size_warning\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 465\u001b[0m ):\n\u001b[1;32m 466\u001b[0m size_gb \u001b[38;5;241m=\u001b[39m chunk\u001b[38;5;241m.\u001b[39mtotal_size \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m30\u001b[39m\n",
+ "\u001b[0;31mRayTaskError(TrainingFailedError)\u001b[0m: \u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\nray.tune.error.TuneError: Failure # 1 (occurred at 2023-05-24_14-34-26)\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py\", line 1050, in get_next_executor_event\n future_result = ray.get(ready_future)\nray.exceptions.RayActorError: The actor died because of an error raised in its creation task, \u001b[36mray::_Inner.__init__()\u001b[39m (pid=613, ip=10.128.14.50, repr=HuggingFaceTrainer)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py\", line 161, in __init__\n self.setup(copy.deepcopy(self.config))\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py\", line 339, in setup\n setup_kwargs[k] = parameter_registry.get(prefix + k)\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/tune/registry.py\", line 234, in get\n return ray.get(self.references[k])\nray.exceptions.RaySystemError: System error: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\ntraceback: Traceback (most recent call last):\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 1075, in __setstate__\n table = _memory_mapped_arrow_table_from_file(path)\n File \"/tmp/ray/session_2023-05-24_13-35-49_647135_7/runtime_resources/pip/1ea979819413e33c98849c7e365e5e151f8a8356/virtualenv/lib/python3.8/site-packages/datasets/table.py\", line 50, in _memory_mapped_arrow_table_from_file\n memory_mapped_stream = pa.memory_map(filename)\n File \"pyarrow/io.pxi\", line 851, in pyarrow.lib.memory_map\n File \"pyarrow/io.pxi\", line 812, in pyarrow.lib.MemoryMappedFile._open\n File \"pyarrow/error.pxi\", line 143, in pyarrow.lib.pyarrow_internal_check_status\n File \"pyarrow/error.pxi\", line 112, in pyarrow.lib.check_status\nFileNotFoundError: [Errno 2] Failed to open local file '/home/ray/workspace/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b332bfb65957e3fe.arrow'. Detail: [errno 2] No such file or directory\n\n\n\nThe above exception was the direct cause of the following exception:\n\n\u001b[36mray::train_fn()\u001b[39m (pid=2288, ip=10.128.10.87)\n File \"/tmp/ipykernel_1499/1774758453.py\", line 149, in train_fn\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/train/base_trainer.py\", line 362, in fit\n raise TrainingFailedError from e\nray.train.base_trainer.TrainingFailedError"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tERROR tune.py:773 -- Trials did not complete: [HuggingFaceTrainer_be877_00000]\n",
+ "\u001b[2m\u001b[36m(train_fn pid=2288)\u001b[0m 2023-05-24 14:34:26,284\tINFO tune.py:777 -- Total run time: 5.50 seconds (5.39 seconds for the tuning loop).\n"
+ ]
+ }
+ ],
+ "source": [
+ "#call the above cell as a remote ray function\n",
+ "ray.get(train_fn.remote())"
+ ]
+ },
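+ {
+ "cell_type": "markdown",
+ "id": "6c4e8a90",
+ "metadata": {},
+ "source": [
+ "The `FileNotFoundError` above is worth unpacking: Hugging Face `datasets` memory-maps its Arrow cache files on the node where tokenization ran, and a memory-mapped `Dataset` pickles as a reference to that absolute path (see `datasets/table.py:__setstate__` in the traceback). When Ray schedules the trainer actor on a *different* worker node, deserialization tries to memory-map a cache file that only exists on the first node. The remedy is to consume the per-worker dataset shards that Ray passes into `trainer_init_per_worker` rather than closure-capturing `tokenized_dataset`. The cell below (a sketch; the `/tmp/demo_ds` path is hypothetical and `datasets` must be installed in the notebook environment) illustrates the underlying failure mode:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e1f3b27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: a datasets.Dataset backed by on-disk Arrow files pickles by path,\n",
+ "# so unpickling on a machine without that path raises FileNotFoundError\n",
+ "import pickle\n",
+ "from datasets import Dataset\n",
+ "\n",
+ "ds = Dataset.from_dict({\"x\": [1, 2, 3]})\n",
+ "ds.save_to_disk(\"/tmp/demo_ds\")                      # write Arrow files to disk\n",
+ "ds_on_disk = Dataset.load_from_disk(\"/tmp/demo_ds\")  # memory-mapped, not in-memory\n",
+ "payload = pickle.dumps(ds_on_disk)                   # serialized by file reference\n",
+ "# shipping `payload` to a node without /tmp/demo_ds reproduces the error above"
+ ]
+ },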
+ {
+ "cell_type": "markdown",
+ "id": "5af8cd32",
+ "metadata": {},
+ "source": [
+ "Once complete, we can bring our Ray cluster down and clean up:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'cluster' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn [1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cluster\u001b[38;5;241m.\u001b[39mdown()\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'cluster' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "cluster.down()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0d41b90e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "auth.logout()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}