Use unsloth as a booster? #111

Open

dimentox opened this issue Sep 6, 2024 · 0 comments

dimentox commented Sep 6, 2024

Example of unsloth training:

```python
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import torch

class AdaptiveTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_eval_loss = float('inf')
        self.no_improve_count = 0

    def evaluate(self, *args, **kwargs):
        # Trainer has no `evaluation_step` hook; `evaluate()` is the method that
        # runs evaluation and returns a metrics dict containing "eval_loss".
        metrics = super().evaluate(*args, **kwargs)
        current_eval_loss = metrics["eval_loss"]

        # Adaptive learning rate adjustment
        if current_eval_loss > self.prev_eval_loss:
            new_lr = self.args.learning_rate * 0.9  # Reduce learning rate if loss increased
            print(f"Decreased learning rate to: {new_lr}")
            self.no_improve_count += 1
        else:
            new_lr = self.args.learning_rate * 1.05  # Slightly increase if loss decreased
            print(f"Increased learning rate to: {new_lr}")
            self.no_improve_count = 0

        # Updating args.learning_rate alone does not touch the live optimizer, so
        # push the new value into its param groups as well (a step-based scheduler
        # such as cosine will still rescale it on subsequent steps).
        self.args.learning_rate = new_lr
        if self.optimizer is not None:
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = new_lr

        self.prev_eval_loss = current_eval_loss
        return metrics

    def training_step(self, *args, **kwargs):
        # Periodically report the current gradient norm; note that the Trainer
        # itself already clips gradients to args.max_grad_norm before each optimizer step.
        if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
            current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
            print(f"Gradient norm (pre-clip) at step {self.state.global_step}: {current_grad_norm}")

        return super().training_step(*args, **kwargs)
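
# Note: transformers also ships built-in alternatives to parts of AdaptiveTrainer
# (EarlyStoppingCallback together with load_best_model_at_end / metric_for_best_model,
# and lr_scheduler_type="reduce_lr_on_plateau"); they are not combined with this
# subclass here to avoid adjusting the learning rate from two places at once.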

def print_memory_stats(stage):
    gpu_stats = torch.cuda.get_device_properties(0)
    used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"Stage: {stage}")
    print(f"GPU: {gpu_stats.name}")
    print(f"Max memory: {max_memory} GB")
    print(f"Memory reserved: {used_memory} GB")

max_seq_length = 2048
dtype = None
load_in_4bit = True

print("Loading model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=""
)

print("Loading Laura")
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("Loading dataset")
custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""

EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    sources = examples["source"]
    repositories = examples["repository"]
    files = examples["file"]
    labels = examples["label"]
    contents = examples["content"]
    texts = []
    for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
        text = f"Source: {source}\nRepository: {repository}\nFile: {file}\nLabel: {label}\nContent:\n```\n{content}\n```\n"
        texts.append(text)
    tokenized_texts = tokenizer(texts, truncation=True, padding=True, max_length=max_seq_length)
    tokenized_texts["labels"] = tokenized_texts["input_ids"].copy()  # Add labels for loss calculation
    return tokenized_texts


dataset_path = "autogen_python_dataset.json"
dataset = load_dataset("json", data_files=dataset_path, split="train")
dataset = dataset.map(formatting_prompts_func, batched=True, remove_columns=dataset.column_names)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

data_collator = DataCollatorForSeq2Seq(tokenizer)

print_memory_stats("After loading dataset")  # Check memory after loading dataset

trainer = AdaptiveTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    dataset_text_field="input_ids",
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,  # We will loop over epochs dynamically
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        evaluation_strategy="steps",
        eval_steps=10,
        save_steps=10,
        save_total_limit=2,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",  # Use cosine annealing scheduler
        gradient_checkpointing=True,
        seed=3407,
        output_dir="outputs",
        fp16_full_eval=True,
        per_device_eval_batch_size=2,
        eval_accumulation_steps=4,
        resume_from_checkpoint=True,  # Note: Trainer.train() must be passed resume_from_checkpoint for this to take effect; the field is not read automatically
        max_grad_norm=1.0,  # Gradient clipping to stabilize training
    ),
)

print_memory_stats("After initializing trainer")  # Check memory after initializing trainer

target_loss = 1.0  # Set your target loss here
patience = 3  # Number of epochs to wait for improvement before stopping

while True:
    trainer_stats = trainer.train()
    if trainer.state.global_step >= trainer.args.num_train_epochs * len(train_dataset) / trainer.args.per_device_train_batch_size:
        break
    if trainer.prev_eval_loss <= target_loss:
        print(f"Target loss of {target_loss} achieved.")
        break
    if trainer.no_improve_count >= patience:
        print(f"No improvement for {patience} epochs, stopping training.")
        break

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")

print_memory_stats("After training")  # Check memory after training

model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
```
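
For reference, a minimal sketch (not part of the original script) of reloading the adapter that `save_pretrained("lora_model")` writes above, using unsloth's inference helpers; the prompt below is only a placeholder in the training format:

```python
from unsloth import FastLanguageModel

# Reload the LoRA adapter saved by the training script above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch unsloth into its faster inference mode

# Placeholder prompt following the training template.
prompt = "Source: github\nRepository: example/repo\nFile: main.py\nLabel: example\nContent:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```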