Make this pip installable #82

Open · wants to merge 14 commits into main
8 changes: 4 additions & 4 deletions Dockerfile
@@ -6,7 +6,7 @@ FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS builder

RUN apt-get update && apt-get install -y python3 python3-pip git

RUN pip3 install --upgrade pip

# Some of the requirements expect some python packages in their setup.py, just install them first.
RUN --mount=type=cache,target=/root/.cache/pip pip install --user torch==2.0.0
@@ -61,14 +61,14 @@ RUN cd text-generation-webui-tmp && python download-model.py --text-only decapod
# Get LoRA
RUN cd text-generation-webui-tmp && python download-model.py samwit/alpaca7b-lora && mv loras/samwit_alpaca7b-lora ../alpaca7b_lora

COPY *.py .
COPY src .
Review comment on this line:

I don't think this is quite right. I tried to build the image and run it to test it for you, but the symlinks below were not pointing to anything.

If they were ln -s ../alpaca_lora_4bit/autograd_4bit.py ./autograd_4bit.py (i.e. without the src/ prefix), they would have linked. So I recommend either changing the COPY or changing the symlinks.

Contributor Author:

Whoops, COPY src . didn't do what I thought 🤦

Contributor Author:

Dockerfile updated!

Review comment:

I won't be able to test that for a bit; I broke my machine pretty badly.
COPY text-generation-webui text-generation-webui
COPY monkeypatch text-generation-webui/monkeypatch
COPY src/alpaca_lora_4bit/monkeypatch text-generation-webui/monkeypatch

RUN mv -f text-generation-webui-tmp/* text-generation-webui/

# Symlink for monkeypatch
RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py .
RUN cd text-generation-webui && ln -s ../src/alpaca_lora_4bit/autograd_4bit.py ./autograd_4bit.py && ln -s ../src/alpaca_lora_4bit/matmul_utils_4bit.py . && ln -s ../src/alpaca_lora_4bit/models.py .

# Swap to the 7bn parameter model
RUN sed -i 's/llama-13b-4bit/llama-7b-4bit/g' text-generation-webui/custom_monkey_patch.py && sed -i 's/alpaca13b_lora/alpaca7b_lora/g' text-generation-webui/custom_monkey_patch.py
29 changes: 18 additions & 11 deletions finetune.py
@@ -16,21 +16,28 @@
}
]
"""
import os
import sys
# set src so alpaca_lora_4bit package is available without installing
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

# Early load config to replace attn if needed
from arg_parser import get_config
from alpaca_lora_4bit.arg_parser import get_config
ft_config = get_config()

from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()

if ft_config.flash_attention:
from monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
from alpaca_lora_4bit.monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
elif ft_config.xformers:
from monkeypatch.llama_attn_hijack_xformers import hijack_llama_attention
from alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers import hijack_llama_attention
hijack_llama_attention()

import autograd_4bit
from alpaca_lora_4bit import autograd_4bit
if ft_config.backend.lower() == 'triton':
autograd_4bit.switch_backend_to('triton')
else:
@@ -44,11 +51,11 @@

import torch
import transformers
from autograd_4bit import load_llama_model_4bit_low_ram
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel, set_peft_model_state_dict

# ! Config
import train_data
from alpaca_lora_4bit import train_data

# * Show loaded parameters
if ft_config.local_rank == 0:
@@ -92,8 +99,8 @@
# Scales to half
print('Fitting 4bit scales and zeros to half')
for n, m in model.named_modules():
if '4bit' in str(type(m)):
if m.is_v1_model:
if 'Autograd4bitQuantLinear' in str(type(m)) or 'Linear4bitLt' in str(type(m)):
if hasattr(m, "is_v1_model") and m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()

@@ -120,7 +127,7 @@
# Use gradient checkpointing
if ft_config.gradient_checkpointing:
print('Applying gradient checkpointing ...')
from gradient_checkpointing import apply_gradient_checkpointing
from alpaca_lora_4bit.gradient_checkpointing import apply_gradient_checkpointing
apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)

# Disable Trainer's DataParallel for multigpu
11 changes: 8 additions & 3 deletions inference.py
@@ -1,10 +1,15 @@
import os
import sys
# set src so alpaca_lora_4bit package is available without installing
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()

config_path = './llama-13b-4bit/'
model_path = './llama-13b-4bit.pt'
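For context, a minimal sketch of how the new package-style imports in inference.py are meant to be used (not part of the diff; the paths are the placeholders shown above, and it assumes load_llama_model_4bit_low_ram returns a (model, tokenizer) pair, as it is used elsewhere in this repo):

from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()  # patch PEFT before any model is created

from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram

config_path = './llama-13b-4bit/'   # placeholder paths copied from the diff
model_path = './llama-13b-4bit.pt'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)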
5 changes: 2 additions & 3 deletions requirements.txt
@@ -6,6 +6,5 @@ sentencepiece
safetensors
einops
colorama
git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2
git+https://github.com/huggingface/transformers.git
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
peft @ git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2
transformers @ git+https://github.com/huggingface/transformers.git
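The name @ git+URL form matters because setup.py (below) reads requirements.txt into install_requires, and setuptools accepts only PEP 508 requirement strings, not pip's bare VCS URLs. A quick check, as a sketch rather than part of the diff (it assumes the packaging module is available, which it normally is wherever setuptools is):

from packaging.requirements import Requirement, InvalidRequirement

lines = [
    "peft @ git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2",
    "git+https://github.com/huggingface/transformers.git",  # the old, bare form
]
for line in lines:
    try:
        Requirement(line)              # parses only valid PEP 508 strings
        print("ok:      ", line)
    except InvalidRequirement as err:
        print("rejected:", line, "->", err)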
30 changes: 30 additions & 0 deletions setup.py
@@ -0,0 +1,30 @@
import sys
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

install_requires = []
with open("./requirements.txt", "r") as requirements_file:
reqs = [r.strip() for r in requirements_file.readlines()]
for r in reqs:
install_requires.append(r)

quant_cuda_module = CUDAExtension(
'alpaca_lora_4bit.quant_cuda',
sources=[
'src/alpaca_lora_4bit/quant_cuda/quant_cuda.cpp',
'src/alpaca_lora_4bit/quant_cuda/quant_cuda_kernel.cu'
])

setup(
name='alpaca_lora_4bit',
version='0.1',
description='Alpaca LoRA 4-bit',
package_dir={'alpaca_lora_4bit': 'src/alpaca_lora_4bit'},
packages=['alpaca_lora_4bit', 'alpaca_lora_4bit.monkeypatch', 'alpaca_lora_4bit.quant_cuda'],
install_requires=install_requires,
extras_require={
'triton': 'triton',
},
ext_modules=[quant_cuda_module],
cmdclass={'build_ext': BuildExtension},
)
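After pip install . (or pip install .[triton] for the Triton extra declared above), a quick smoke test of the declared packages and the compiled extension might look like this (a sketch, not part of the diff; the module names come from the setup() call above):

import importlib

for name in ("alpaca_lora_4bit",
             "alpaca_lora_4bit.monkeypatch",
             "alpaca_lora_4bit.quant_cuda"):
    try:
        importlib.import_module(name)
        print("importable:", name)
    except ImportError as err:
        print("missing:   ", name, "->", err)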
File renamed without changes.
12 changes: 12 additions & 0 deletions src/alpaca_lora_4bit/__init__.py
@@ -0,0 +1,12 @@
from . import monkeypatch
from . import amp_wrapper
from . import arg_parser
from . import autograd_4bit
from . import custom_autotune
from . import Finetune4bConfig
from . import gradient_checkpointing
from . import models
from . import train_data
# We don't import these automatically as it is dependent on whether we need cuda or triton
# from . import matmul_utils_4bit
# from . import triton_utils
File renamed without changes.
24 changes: 12 additions & 12 deletions arg_parser.py → src/alpaca_lora_4bit/arg_parser.py
@@ -1,19 +1,19 @@
import os
import argparse
from Finetune4bConfig import Finetune4bConfig
from .Finetune4bConfig import Finetune4bConfig

def parse_commandline():
parser = argparse.ArgumentParser(
prog=__file__.split(os.path.sep)[-1],
description="Produce LoRA in 4bit training",
usage="%(prog)s [config] [training]\n\nAll arguments are optional"
)

parser.add_argument("dataset", nargs="?",
default="./dataset.json",
default="./dataset.json",
help="Path to dataset file. Default: %(default)s"
)

parser_config = parser.add_argument_group("config")
parser_training = parser.add_argument_group("training")

@@ -60,14 +60,14 @@ def parse_commandline():
# Data args
parser_training.add_argument("--txt_row_thd", default=-1, type=int, help="Custom thd for txt rows.")
parser_training.add_argument("--use_eos_token", default=1, type=int, help="Use eos token instead if padding with 0. enable with 1, disable with 0.")

# V2 model support
parser_training.add_argument("--groupsize", type=int, default=-1, help="Groupsize of v2 model")
parser_training.add_argument("--v1", action="store_true", help="Use V1 model")

# Multi GPU Support
parser_training.add_argument("--local_rank", type=int, default=0, help="local rank if using torch.distributed.launch")

# Flash Attention
parser_training.add_argument("--flash_attention", action="store_true", help="enables flash attention, can improve performance and reduce VRAM use")
parser_training.add_argument("--xformers", action="store_true", help="enables xformers memory efficient attention, can improve performance and reduce VRAM use")
@@ -81,20 +81,20 @@ def parse_commandline():
def get_config() -> Finetune4bConfig:
args = parse_commandline()
return Finetune4bConfig(
dataset=args["dataset"],
ds_type=args["ds_type"],
lora_out_dir=args["lora_out_dir"],
lora_apply_dir=args["lora_apply_dir"],
resume_checkpoint=args["resume_checkpoint"],
llama_q4_config_dir=args["llama_q4_config_dir"],
llama_q4_model=args["llama_q4_model"],
mbatch_size=args["mbatch_size"],
batch_size=args["batch_size"],
epochs=args["epochs"],
lr=args["lr"],
cutoff_len=args["cutoff_len"],
lora_r=args["lora_r"],
lora_alpha=args["lora_alpha"],
lora_dropout=args["lora_dropout"],
val_set_size=args["val_set_size"],
gradient_checkpointing=args["grad_chckpt"],
94 changes: 72 additions & 22 deletions autograd_4bit.py → src/alpaca_lora_4bit/autograd_4bit.py
@@ -1,40 +1,67 @@
import matmul_utils_4bit as mm4b
import logging

import torch
import torch.nn as nn
import time
import math
from torch.cuda.amp import custom_bwd, custom_fwd
from colorama import init, Fore, Back, Style
from huggingface_hub.utils._validators import HFValidationError
init(autoreset=True)


class AutogradMatmul4bitCuda(torch.autograd.Function):
gptq_backend_loaded = False
triton_backend_loaded = False


class AutogradMatmul4bitNotImplemented(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output
raise NotImplementedError()

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
raise NotImplementedError()


try:
from . import matmul_utils_4bit as mm4b

class AutogradMatmul4bitCuda(torch.autograd.Function):

@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None


gptq_backend_loaded = True
except ImportError:
print('quant_cuda not found. Please run "pip install alpaca_lora_4bit[cuda]".')


try:
import triton_utils as tu
from . import triton_utils as tu


class AutogradMatmul4bitTriton(torch.autograd.Function):

@@ -46,7 +73,7 @@ def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
ctx.bits, ctx.maxq = bits, maxq
output = output.clone()
return output

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
@@ -57,25 +84,45 @@ def backward(ctx, grad_output):
if ctx.needs_input_grad[0]:
grad_input = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None



triton_backend_loaded = True
except ImportError:
print('Triton not found. Please run "pip install triton".')


AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
def is_triton_backend_available():
return 'AutogradMatmul4bitTriton' in globals()


def is_gptq_backend_available():
return 'AutogradMatmul4bitCuda' in globals()


AutogradMatmul4bit = AutogradMatmul4bitNotImplemented
backend = None
if is_gptq_backend_available():
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
elif is_triton_backend_available():
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
else:
logging.warning("Neither gptq/cuda or triton backends are available.")


def switch_backend_to(to_backend):
global AutogradMatmul4bit
global backend
if to_backend == 'cuda':
if not is_gptq_backend_available():
raise ValueError('quant_cuda not found. Please reinstall with pip install .')
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
print(Style.BRIGHT + Fore.GREEN + 'Using CUDA implementation.')
elif to_backend == 'triton':
# detect if AutogradMatmul4bitTriton is defined
if 'AutogradMatmul4bitTriton' not in globals():
if not is_triton_backend_available():
raise ValueError('Triton not found. Please install triton')
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
@@ -211,7 +258,10 @@ def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=Fa
if half:
model_to_half(model)

tokenizer = LlamaTokenizer.from_pretrained(config_path)
try:
tokenizer = LlamaTokenizer.from_pretrained(config_path)
except HFValidationError as e:
tokenizer = LlamaTokenizer.from_pretrained(model)
tokenizer.truncation_side = 'left'

print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
@@ -248,7 +298,7 @@ def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path

if lora_path is not None:
from peft import PeftModel
from monkeypatch.peft_tuners_lora_monkey_patch import Linear4bitLt
from .models import Linear4bitLt
model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))

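The module-level backend globals and helpers added in this diff are what finetune.py drives via switch_backend_to; a minimal usage sketch (not part of the diff; names are taken from the code above and assume the package is importable):

from alpaca_lora_4bit import autograd_4bit

# backend is 'cuda' when the quant_cuda extension imported, 'triton' as a
# fallback, or None when neither backend could be loaded.
print(autograd_4bit.backend)

# switch_backend_to raises ValueError if the requested backend is unavailable.
if autograd_4bit.is_triton_backend_available():
    autograd_4bit.switch_backend_to('triton')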
File renamed without changes.
File renamed without changes.