modal-labs · AgenP · Aug 6, 2024
diff --git a/config/llama-3.1.yml b/config/llama-3.1.yml
@@ -0,0 +1,101 @@
+###
+# Model Configuration: Llama-3.1 8B
+###
+
+base_model: meta-llama/Meta-Llama-3.1-8B 
+sequence_len: 4096
+
+# base model weight quantization
+load_in_8bit: true
+
+# attention implementation
+flash_attention: true
+
+# fine-tuned adapter (LoRA) config
+adapter: lora
+lora_model_dir:
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+lora_modules_to_save: # required when adding new tokens to LLaMA/Mistral
+  - embed_tokens
+  - lm_head
+# for details, see https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+
+###
+# Dataset Configuration: sqlqa
+###
+
+datasets:
+  # This will be the path used for the data when it is saved to the Volume in the cloud.
+  - path: data.jsonl
+    ds_type: json
+    type:
+      # Each line of the JSONL file contains question, context, and answer fields.
+      # These are mapped to axolotl's instruction, input, and output fields, respectively.
+      field_instruction: question
+      field_input: context
+      field_output: answer
+      # Format is used by axolotl to generate the prompt.
+      format: |-
+        [INST] Using the schema context below, generate a SQL query that answers the question.
+        {input}
+        {instruction} [/INST]
+
+# dataset formatting config
+tokens: # add new control tokens from the dataset to the model
+  - "[INST]"
+  - " [/INST]"
+  - "[SQL]"
+  - " [/SQL]"
+
+special_tokens:
+  pad_token: <|end_of_text|>
+
+val_set_size: 0.05
+
+###
+# Training Configuration
+###
+
+# random seed for better reproducibility
+seed: 117
+
+# optimizer config
+optimizer: adamw_bnb_8bit
+learning_rate: 0.0001
+lr_scheduler: cosine
+num_epochs: 4
+micro_batch_size: 32
+gradient_accumulation_steps: 1
+warmup_steps: 10
+
+# axolotl saving config
+dataset_prepared_path: last_run_prepared
+output_dir: ./lora-out
+
+# logging and eval config
+logging_steps: 1
+eval_steps: 0.05
+
+# training performance optimization config
+bf16: auto
+tf32: false
+gradient_checkpointing: true
+
+# Optional wandb logging (uncomment to use)
+# wandb_project: llama-3.1-fine-tuning 
+# wandb_watch: all 
+
+###
+# Miscellaneous Configuration
+###
+
+# when true, prevents overwriting the config from the CLI
+strict: false
+
+# "Don't mess with this, it's here for accelerate and torchrun" -- axolotl docs
+local_rank:
+
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+modal
+pyyaml
+# For CI (check_loss.py)
+# pandas
diff --git a/src/inference.py b/src/inference.py
@@ -1,12 +1,60 @@
 import os
 import time
 import yaml
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 
 import modal
 from fastapi.responses import StreamingResponse
 
-from .common import app, vllm_image, Colors, MINUTES, VOLUME_CONFIG
+# from .common import app, vllm_image, Colors, MINUTES, VOLUME_CONFIG
+
+from typing import Union
+
+import modal
+
+APP_NAME = "example-axolotl"
+
+MINUTES = 60  # seconds
+HOURS = 60 * MINUTES
+
+
+ALLOW_WANDB = os.environ.get("ALLOW_WANDB", "false").lower() == "true"
+
+vllm_image = (  # CUDA 12.1 base image with Python 3.10 plus pinned vllm and torch
+    modal.Image.from_registry("nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10")
+    .pip_install("vllm==v0.5.3.post1", "torch==2.3.1")  # exact version pins
+    .entrypoint([])  # replace the image entrypoint with an empty one
+)
+
+app = modal.App(
+    APP_NAME,
+    secrets=[
+        modal.Secret.from_name("huggingface"),
+        modal.Secret.from_dict({"ALLOW_WANDB": os.environ.get("ALLOW_WANDB", "false")}),
+        *([modal.Secret.from_name("wandb")] if ALLOW_WANDB else []),
+    ],
+)
+
+# Volumes for pre-trained models and training runs.
+pretrained_volume = modal.Volume.from_name(
+    "example-pretrained-vol", create_if_missing=True
+)
+runs_volume = modal.Volume.from_name("example-runs-vol", create_if_missing=True)
+VOLUME_CONFIG: dict[Union[str, PurePosixPath], modal.Volume] = {
+    "/pretrained": pretrained_volume,
+    "/runs": runs_volume,
+}
+
+
+class Colors:
+    """ANSI color codes"""
+
+    GREEN = "\033[0;32m"
+    BLUE = "\033[0;34m"
+    GRAY = "\033[0;90m"  # bright black
+    BOLD = "\033[1m"  # style attribute, not a color
+    END = "\033[0m"  # resets all attributes
+
 
 INFERENCE_GPU_CONFIG = os.environ.get("INFERENCE_GPU_CONFIG", "a10g:2")
 if len(INFERENCE_GPU_CONFIG.split(":")) <= 1:

diff --git a/src/train.py b/src/train.py
@@ -1,16 +1,75 @@
 import os
 from datetime import datetime
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 import secrets
 
-from .common import (
-    app,
-    axolotl_image,
-    HOURS,
-    MINUTES,
-    VOLUME_CONFIG,
+# from .common import (
+#     app,
+#     axolotl_image,
+#     HOURS,
+#     MINUTES,
+#     VOLUME_CONFIG,
+# )
+
+from typing import Union
+
+import modal
+
+APP_NAME = "example-axolotl"
+
+MINUTES = 60  # seconds
+HOURS = 60 * MINUTES
+
+
+# sha256 digest of the winglian/axolotl image (tag main-20240805-py3.11-cu121-2.3.1), updated for Llama-3.1 compatibility
+AXOLOTL_REGISTRY_SHA = (
+    "30ecbf47963eb1a6b8f3808b2f11951d6aba61ea6d7065c009841e8d761775cf"
 )
 
+ALLOW_WANDB = os.environ.get("ALLOW_WANDB", "false").lower() == "true"
+
+axolotl_image = (  # axolotl image pinned by digest, plus pinned Python packages
+    modal.Image.from_registry(f"winglian/axolotl@sha256:{AXOLOTL_REGISTRY_SHA}")
+    .pip_install(
+        "huggingface_hub==0.23.2",
+        "hf-transfer==0.1.5",
+        "wandb==0.16.3",
+        "fastapi==0.110.0",
+        "pydantic==2.6.3",
+    )
+    .env(
+        dict(
+            HUGGINGFACE_HUB_CACHE="/pretrained",  # same path the pretrained volume is mounted at
+            HF_HUB_ENABLE_HF_TRANSFER="1",  # relies on the hf-transfer package pinned above
+            TQDM_DISABLE="true",
+            AXOLOTL_NCCL_TIMEOUT="60",  # NOTE(review): unit not visible here — confirm
+        )
+    )
+    .entrypoint([])  # replace the image entrypoint with an empty one
+)
+
+
+app = modal.App(
+    APP_NAME,
+    secrets=[
+        modal.Secret.from_name("huggingface"),
+        modal.Secret.from_dict({"ALLOW_WANDB": os.environ.get("ALLOW_WANDB", "false")}),
+        *([modal.Secret.from_name("wandb")] if ALLOW_WANDB else []),
+    ],
+)
+
+# Volumes for pre-trained models and training runs.
+pretrained_volume = modal.Volume.from_name(
+    "example-pretrained-vol", create_if_missing=True
+)
+
+runs_volume = modal.Volume.from_name("example-runs-vol", create_if_missing=True)
+VOLUME_CONFIG: dict[Union[str, PurePosixPath], modal.Volume] = {
+    "/pretrained": pretrained_volume,
+    "/runs": runs_volume,
+}
+
+
 GPU_CONFIG = os.environ.get("GPU_CONFIG", "a100:2")
 if len(GPU_CONFIG.split(":")) <= 1:
     N_GPUS = int(os.environ.get("N_GPUS", 2))
@@ -23,7 +82,6 @@
     gpu=GPU_CONFIG,
     volumes=VOLUME_CONFIG,
     timeout=24 * HOURS,
-    _allow_background_volume_commits=True,
 )
 def train(run_folder: str, output_dir: str):
     import torch
@@ -48,7 +106,6 @@ def train(run_folder: str, output_dir: str):
     gpu=SINGLE_GPU_CONFIG,
     volumes=VOLUME_CONFIG,
     timeout=24 * HOURS,
-    _allow_background_volume_commits=True,
 )
 def preproc_data(run_folder: str):
     print("Preprocessing data.")