Make this pip installable #82

Open · wants to merge 14 commits into main
8 changes: 4 additions & 4 deletions Dockerfile
@@ -6,7 +6,7 @@ FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS builder

RUN apt-get update && apt-get install -y python3 python3-pip git

RUN pip3 install --upgrade pip

# Some of the requirements expect some python packages in their setup.py, just install them first.
RUN --mount=type=cache,target=/root/.cache/pip pip install --user torch==2.0.0
@@ -61,14 +61,14 @@ RUN cd text-generation-webui-tmp && python download-model.py --text-only decapod
# Get LoRA
RUN cd text-generation-webui-tmp && python download-model.py samwit/alpaca7b-lora && mv loras/samwit_alpaca7b-lora ../alpaca7b_lora

COPY *.py .
COPY src .
Review comment on this line:

I don't think this is quite right. I tried to build the image and run it to test it for you, but the symlinks below were not pointing to anything.

If they were ln -s ../alpaca_lora_4bit/autograd_4bit.py ./autograd_4bit.py (i.e. without the src/ prefix), they would have linked. So I recommend either changing the COPY or changing the symlinks.

Contributor Author:

Whoops, COPY src . didn't do what I thought 🤦

Contributor Author:

Dockerfile updated!

Review comment:

I won't be able to test that for a bit; I broke my machine pretty badly.
COPY text-generation-webui text-generation-webui
COPY monkeypatch text-generation-webui/monkeypatch
COPY src/alpaca_lora_4bit/monkeypatch text-generation-webui/monkeypatch

RUN mv -f text-generation-webui-tmp/* text-generation-webui/

# Symlink for monkeypatch
RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py .
RUN cd text-generation-webui && ln -s ../src/alpaca_lora_4bit/autograd_4bit.py ./autograd_4bit.py && ln -s ../src/alpaca_lora_4bit/matmul_utils_4bit.py . && ln -s ../src/alpaca_lora_4bit/models.py .

# Swap to the 7bn parameter model
RUN sed -i 's/llama-13b-4bit/llama-7b-4bit/g' text-generation-webui/custom_monkey_patch.py && sed -i 's/alpaca13b_lora/alpaca7b_lora/g' text-generation-webui/custom_monkey_patch.py
29 changes: 18 additions & 11 deletions finetune.py
@@ -16,21 +16,28 @@
}
]
"""
import os
import sys
# set src so alpaca_lora_4bit package is available without installing
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

# Early load config to replace attn if needed
from arg_parser import get_config
from alpaca_lora_4bit.arg_parser import get_config
ft_config = get_config()

from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()

if ft_config.flash_attention:
from monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
from alpaca_lora_4bit.monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
elif ft_config.xformers:
from monkeypatch.llama_attn_hijack_xformers import hijack_llama_attention
from alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers import hijack_llama_attention
hijack_llama_attention()

import autograd_4bit
from alpaca_lora_4bit import autograd_4bit
if ft_config.backend.lower() == 'triton':
autograd_4bit.switch_backend_to('triton')
else:
@@ -44,11 +51,11 @@

import torch
import transformers
from autograd_4bit import load_llama_model_4bit_low_ram
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel, set_peft_model_state_dict

# ! Config
import train_data
from alpaca_lora_4bit import train_data

# * Show loaded parameters
if ft_config.local_rank == 0:
@@ -92,8 +99,8 @@
# Scales to half
print('Fitting 4bit scales and zeros to half')
for n, m in model.named_modules():
if '4bit' in str(type(m)):
if m.is_v1_model:
if 'Autograd4bitQuantLinear' in str(type(m)) or 'Linear4bitLt' in str(type(m)):
if hasattr(m, "is_v1_model") and m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()

@@ -120,7 +127,7 @@
# Use gradient checkpointing
if ft_config.gradient_checkpointing:
print('Applying gradient checkpointing ...')
from gradient_checkpointing import apply_gradient_checkpointing
from alpaca_lora_4bit.gradient_checkpointing import apply_gradient_checkpointing
apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)

# Disable Trainer's DataParallel for multigpu
11 changes: 8 additions & 3 deletions inference.py
@@ -1,10 +1,15 @@
import os
import sys
# set src so alpaca_lora_4bit package is available without installing
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()

config_path = './llama-13b-4bit/'
model_path = './llama-13b-4bit.pt'
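For context, a minimal sketch of how the new package-style imports in inference.py are meant to be used (not part of the diff; the paths are the placeholders shown above, and it assumes load_llama_model_4bit_low_ram returns a (model, tokenizer) pair, as it is used elsewhere in this repo):

from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
replace_peft_model_with_int4_lora_model()  # patch PEFT before any model is created

from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram

config_path = './llama-13b-4bit/'   # placeholder paths copied from the diff
model_path = './llama-13b-4bit.pt'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)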
5 changes: 2 additions & 3 deletions requirements.txt
@@ -6,6 +6,5 @@ sentencepiece
safetensors
einops
colorama
git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2
git+https://github.com/huggingface/transformers.git
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
peft @ git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2
transformers @ git+https://github.com/huggingface/transformers.git
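The name @ git+URL form matters because setup.py (below) reads requirements.txt into install_requires, and setuptools accepts only PEP 508 requirement strings, not pip's bare VCS URLs. A quick check, as a sketch rather than part of the diff (it assumes the packaging module is available, which it normally is wherever setuptools is):

from packaging.requirements import Requirement, InvalidRequirement

lines = [
    "peft @ git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2",
    "git+https://github.com/huggingface/transformers.git",  # the old, bare form
]
for line in lines:
    try:
        Requirement(line)              # parses only valid PEP 508 strings
        print("ok:      ", line)
    except InvalidRequirement as err:
        print("rejected:", line, "->", err)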
30 changes: 30 additions & 0 deletions setup.py
@@ -0,0 +1,30 @@
import sys
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

install_requires = []
with open("./requirements.txt", "r") as requirements_file:
reqs = [r.strip() for r in requirements_file.readlines()]
for r in reqs:
install_requires.append(r)

quant_cuda_module = CUDAExtension(
'alpaca_lora_4bit.quant_cuda',
sources=[
'src/alpaca_lora_4bit/quant_cuda/quant_cuda.cpp',
'src/alpaca_lora_4bit/quant_cuda/quant_cuda_kernel.cu'
])

setup(
name='alpaca_lora_4bit',
version='0.1',
description='Alpaca LoRA 4-bit',
package_dir={'alpaca_lora_4bit': 'src/alpaca_lora_4bit'},
packages=['alpaca_lora_4bit', 'alpaca_lora_4bit.monkeypatch', 'alpaca_lora_4bit.quant_cuda'],
install_requires=install_requires,
extras_require={
'triton': 'triton',
},
ext_modules=[quant_cuda_module],
cmdclass={'build_ext': BuildExtension},
)
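After pip install . (or pip install .[triton] for the Triton extra declared above), a quick smoke test of the declared packages and the compiled extension might look like this (a sketch, not part of the diff; the module names come from the setup() call above):

import importlib

for name in ("alpaca_lora_4bit",
             "alpaca_lora_4bit.monkeypatch",
             "alpaca_lora_4bit.quant_cuda"):
    try:
        importlib.import_module(name)
        print("importable:", name)
    except ImportError as err:
        print("missing:   ", name, "->", err)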
File renamed without changes.
12 changes: 12 additions & 0 deletions src/alpaca_lora_4bit/__init__.py
@@ -0,0 +1,12 @@
from . import monkeypatch
from . import amp_wrapper
from . import arg_parser
from . import autograd_4bit
from . import custom_autotune
from . import Finetune4bConfig
from . import gradient_checkpointing
from . import models
from . import train_data
# We don't import these automatically as it is dependent on whether we need cuda or triton
# from . import matmul_utils_4bit
# from . import triton_utils
File renamed without changes.
24 changes: 12 additions & 12 deletions arg_parser.py → src/alpaca_lora_4bit/arg_parser.py
@@ -1,19 +1,19 @@
import os
import argparse
from Finetune4bConfig import Finetune4bConfig
from .Finetune4bConfig import Finetune4bConfig

def parse_commandline():
parser = argparse.ArgumentParser(
prog=__file__.split(os.path.sep)[-1],
description="Produce LoRA in 4bit training",
usage="%(prog)s [config] [training]\n\nAll arguments are optional"
)

parser.add_argument("dataset", nargs="?",
default="./dataset.json",
default="./dataset.json",
help="Path to dataset file. Default: %(default)s"
)

parser_config = parser.add_argument_group("config")
parser_training = parser.add_argument_group("training")

@@ -60,14 +60,14 @@ def parse_commandline():
# Data args
parser_training.add_argument("--txt_row_thd", default=-1, type=int, help="Custom thd for txt rows.")
parser_training.add_argument("--use_eos_token", default=1, type=int, help="Use eos token instead if padding with 0. enable with 1, disable with 0.")

# V2 model support
parser_training.add_argument("--groupsize", type=int, default=-1, help="Groupsize of v2 model")
parser_training.add_argument("--v1", action="store_true", help="Use V1 model")

# Multi GPU Support
parser_training.add_argument("--local_rank", type=int, default=0, help="local rank if using torch.distributed.launch")

# Flash Attention
parser_training.add_argument("--flash_attention", action="store_true", help="enables flash attention, can improve performance and reduce VRAM use")
parser_training.add_argument("--xformers", action="store_true", help="enables xformers memory efficient attention, can improve performance and reduce VRAM use")
@@ -81,20 +81,20 @@ def parse_commandline():
def get_config() -> Finetune4bConfig:
args = parse_commandline()
return Finetune4bConfig(
dataset=args["dataset"],
ds_type=args["ds_type"],
lora_out_dir=args["lora_out_dir"],
lora_apply_dir=args["lora_apply_dir"],
resume_checkpoint=args["resume_checkpoint"],
llama_q4_config_dir=args["llama_q4_config_dir"],
llama_q4_model=args["llama_q4_model"],
mbatch_size=args["mbatch_size"],
batch_size=args["batch_size"],
epochs=args["epochs"],
lr=args["lr"],
cutoff_len=args["cutoff_len"],
lora_r=args["lora_r"],
lora_alpha=args["lora_alpha"],
lora_dropout=args["lora_dropout"],
val_set_size=args["val_set_size"],
gradient_checkpointing=args["grad_chckpt"],
94 changes: 72 additions & 22 deletions autograd_4bit.py → src/alpaca_lora_4bit/autograd_4bit.py
@@ -1,40 +1,67 @@
import matmul_utils_4bit as mm4b
import logging

import torch
import torch.nn as nn
import time
import math
from torch.cuda.amp import custom_bwd, custom_fwd
from colorama import init, Fore, Back, Style
from huggingface_hub.utils._validators import HFValidationError
init(autoreset=True)


class AutogradMatmul4bitCuda(torch.autograd.Function):
gptq_backend_loaded = False
triton_backend_loaded = False


class AutogradMatmul4bitNotImplemented(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output
raise NotImplementedError()

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
raise NotImplementedError()


try:
from . import matmul_utils_4bit as mm4b

class AutogradMatmul4bitCuda(torch.autograd.Function):

@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None


gptq_backend_loaded = True
except ImportError:
print('quant_cuda not found. Please run "pip install alpaca_lora_4bit[cuda]".')


try:
import triton_utils as tu
from . import triton_utils as tu


class AutogradMatmul4bitTriton(torch.autograd.Function):

@@ -46,7 +73,7 @@ def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
ctx.bits, ctx.maxq = bits, maxq
output = output.clone()
return output

@staticmethod
@custom_bwd
def backward(ctx, grad_output):
@@ -57,25 +84,45 @@ def backward(ctx, grad_output):
if ctx.needs_input_grad[0]:
grad_input = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None



triton_backend_loaded = True
except ImportError:
print('Triton not found. Please run "pip install triton".')


AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
def is_triton_backend_available():
return 'AutogradMatmul4bitTriton' in globals()


def is_gptq_backend_available():
return 'AutogradMatmul4bitCuda' in globals()


AutogradMatmul4bit = AutogradMatmul4bitNotImplemented
backend = None
if is_gptq_backend_available():
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
elif is_triton_backend_available():
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
else:
logging.warning("Neither gptq/cuda or triton backends are available.")


def switch_backend_to(to_backend):
global AutogradMatmul4bit
global backend
if to_backend == 'cuda':
if not is_gptq_backend_available():
raise ValueError('quant_cuda not found. Please reinstall with pip install .')
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
print(Style.BRIGHT + Fore.GREEN + 'Using CUDA implementation.')
elif to_backend == 'triton':
# detect if AutogradMatmul4bitTriton is defined
if 'AutogradMatmul4bitTriton' not in globals():
if not is_triton_backend_available():
raise ValueError('Triton not found. Please install triton')
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
@@ -211,7 +258,10 @@ def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=Fa
if half:
model_to_half(model)

tokenizer = LlamaTokenizer.from_pretrained(config_path)
try:
tokenizer = LlamaTokenizer.from_pretrained(config_path)
except HFValidationError as e:
tokenizer = LlamaTokenizer.from_pretrained(model)
tokenizer.truncation_side = 'left'

print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
@@ -248,7 +298,7 @@ def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path

if lora_path is not None:
from peft import PeftModel
from monkeypatch.peft_tuners_lora_monkey_patch import Linear4bitLt
from .models import Linear4bitLt
model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))

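The module-level backend globals and helpers added in this diff are what finetune.py drives via switch_backend_to; a minimal usage sketch (not part of the diff; names are taken from the code above and assume the package is importable):

from alpaca_lora_4bit import autograd_4bit

# backend is 'cuda' when the quant_cuda extension imported, 'triton' as a
# fallback, or None when neither backend could be loaded.
print(autograd_4bit.backend)

# switch_backend_to raises ValueError if the requested backend is unavailable.
if autograd_4bit.is_triton_backend_available():
    autograd_4bit.switch_backend_to('triton')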
File renamed without changes.
File renamed without changes.