
Commit f9b6d2c

Enable fine tuning on HPU
1 parent 0338b35 commit f9b6d2c

7 files changed: +158 −18 lines

src/instructlab/training/accelerator.py

Lines changed: 16 additions & 3 deletions
@@ -3,7 +3,12 @@
 from typing import Callable, Optional
 
 # Third Party
-from accelerate import Accelerator as TransformersAccel
+from instructlab.training.hpu_utils import is_torch_hpu_available
+
+if is_torch_hpu_available():
+    from optimum.habana.accelerate import GaudiAccelerator as TransformersAccel
+else:
+    from accelerate import Accelerator as TransformersAccel
+
 from torch.utils.data import DataLoader
 from transformers import get_scheduler
 import torch
@@ -124,7 +129,11 @@ def get_fsdp_config(self):
         from functools import partial
 
         # Third Party
-        from accelerate.utils import FullyShardedDataParallelPlugin
+        if is_torch_hpu_available():
+            from optimum.habana.accelerate.utils import GaudiFullyShardedDataParallelPlugin
+        else:
+            from accelerate.utils import FullyShardedDataParallelPlugin
+
         from peft.utils.other import fsdp_auto_wrap_policy
         from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy
         from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
@@ -152,14 +161,18 @@ def get_fsdp_config(self):
         prefetch_policy = (
             BackwardPrefetch.BACKWARD_POST if is_lora else BackwardPrefetch.BACKWARD_PRE
         )
-        fsdp_plugin = FullyShardedDataParallelPlugin(
+        fsdp_plugin = (GaudiFullyShardedDataParallelPlugin if is_torch_hpu_available() else FullyShardedDataParallelPlugin)(
             auto_wrap_policy=wrap_policy,
             limit_all_gathers=True,
             backward_prefetch=prefetch_policy,
             sharding_strategy=ShardingStrategy[self.fsdp_sharding_strategy],
             cpu_offload=CPUOffload(self.fsdp_cpu_offload_params),
         )
 
+        if is_torch_hpu_available():
+            fsdp_plugin.use_orig_params = True
+            fsdp_plugin.sync_module_states = True
+
         # `use_orig_params` must be disabled when using LoRA and FSDP together
         # Source: https://huggingface.co/docs/peft/en/accelerate/fsdp#the-important-parts
         if self.model.lora_config is not None:

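Note: the one-line conditional instantiation in get_fsdp_config above is hard to read in diff form. Below is a minimal sketch of the same selection logic written out explicitly; build_fsdp_plugin and its parameters are hypothetical stand-ins for the method-local names in the diff (wrap_policy, prefetch_policy, and the strategy/offload settings) and are not part of this commit.

from instructlab.training.hpu_utils import is_torch_hpu_available

if is_torch_hpu_available():
    from optimum.habana.accelerate.utils import (
        GaudiFullyShardedDataParallelPlugin as _PluginCls,
    )
else:
    from accelerate.utils import FullyShardedDataParallelPlugin as _PluginCls


def build_fsdp_plugin(wrap_policy, prefetch_policy, sharding_strategy, cpu_offload):
    # Select the plugin class once, then instantiate it: this is what the
    # single-line conditional-call expression in the diff does.
    plugin = _PluginCls(
        auto_wrap_policy=wrap_policy,
        limit_all_gathers=True,
        backward_prefetch=prefetch_policy,
        sharding_strategy=sharding_strategy,
        cpu_offload=cpu_offload,
    )
    # Gaudi-only flags from the diff; the surrounding code may still turn
    # use_orig_params back off when LoRA is active.
    if is_torch_hpu_available():
        plugin.use_orig_params = True
        plugin.sync_module_states = True
    return plugin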
src/instructlab/training/hpu_utils.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+import torch
+from functools import lru_cache
+
+
+@lru_cache(maxsize=None)
+def is_torch_hpu_available() -> bool:
+    try:
+        import habana_frameworks.torch.core  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
+def simple_bucket(length):
+    """
+    This bucket algorithm relies only on the given number instead of slicing
+    the known (min, max) range, for several reasons:
+        1) Due to the use of the first-fit-decreasing (FFD) algorithm, the
+           (min, max) sequence length of each rank will be much smaller than the
+           (min, max) sequence length of the dataset. Bucketing on the
+           (min, max) sequence length of the dataset is not practical.
+        2) The (min, max) sequence length of a given rank is unknown until
+           finishing 1 epoch, since the packing is done on the fly.
+        3) Due to the shuffling, the (min, max) sequence length of a given rank
+           may vary between ranks. Once the (min, max) sequence length of a
+           given rank changes, the bucketing also needs adjustment.
+
+    This bucket algorithm is based on the most significant set bit of the input number.
+    It first checks which bit is the most significant set bit, assuming it is bit "S",
+    and then slices the range [2 ** S, 2 ** (S+1)] into buckets of the same size.
+    By default the range is divided into 16 buckets, so the bucket size will be
+    2 ** (S - 4).
+    For example, 0b10001 will be padded to 0b10010.
+    This approach limits the overhead of bucketing (at most 1/16 of the input
+    number) and also prevents recompilation due to a too-small bucket size.
+    """
+    l = length
+    msb = 0
+    while l > 0:
+        msb += 1
+        l = l // 2
+
+    align = (1 << (msb - 4)) if msb >= 4 else 1
+
+    return (length + align - 1) // align * align
+
+
+def bucket(length):
+    return simple_bucket(length)

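A quick worked example of simple_bucket/bucket as defined above; the values are computed by hand from the code, and the snippet itself is illustrative rather than part of the commit.

from instructlab.training.hpu_utils import bucket

# 17 == 0b10001 has 5 significant bits, so the bucket size is 2 ** (5 - 4) == 2
# and 17 rounds up to 18 (0b10010), matching the docstring example.
assert bucket(17) == 18

# 1000 has 10 significant bits, so the bucket size is 2 ** (10 - 4) == 64
# and 1000 rounds up to 1024.
assert bucket(1000) == 1024

# Lengths below 16 are left untouched (the bucket size is 1).
assert bucket(7) == 7

Because the bucket size is 2 ** (S - 4), the padding overhead is bounded by roughly 1/16 of the sequence length, which is the trade-off the docstring describes.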
src/instructlab/training/main_ds.py

Lines changed: 45 additions & 12 deletions
@@ -33,6 +33,14 @@
     UserWarning,
 )
 
+from instructlab.training.hpu_utils import is_torch_hpu_available
+
+if is_torch_hpu_available():
+    import habana_frameworks.torch.core as htcore
+    import habana_frameworks.torch.distributed.hccl
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+    adapt_transformers_to_gaudi()
+
 # Third Party
 from tqdm import tqdm
 from transformers import AutoConfig
@@ -139,10 +147,19 @@ def train(
             total_length = float(torch.tensor([batch.pop("total_length")]))
             if not args.use_dolomite:
                 for k in batch:
-                    batch[k] = batch[k].to(local_rank)
+                    batch[k] = batch[k].to('hpu' if is_torch_hpu_available() else local_rank)
+
+            hpu_args = {}
+            if is_torch_hpu_available():
+                hpu_args = {
+                    "use_flash_attention": True,
+                    "lazy_mode": False,
+                }
+
             output = model(
                 **batch,
                 use_cache=False,
+                **hpu_args,
             )
             loss = output.loss
             log_loss = loss.detach().item()
@@ -179,8 +196,14 @@ def train(
             elapsed_time = time.time() - start
             overall_throughput = args.samples_per_gpu * world_size / elapsed_time
             current_lr = accelerator.lr_scheduler.get_last_lr()[0]
-            cuda_mem_allocated = torch.cuda.memory_allocated() / (1024**3)
-            cuda_malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]
+
+            if is_torch_hpu_available():
+                mem_allocated = torch.hpu.memory_allocated() / (1024**3)
+                malloc_retries = 0
+            else:
+                mem_allocated = torch.cuda.memory_allocated() / (1024**3)
+                malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]
+
             global_grad_norm = (
                 model.get_global_grad_norm()
                 if hasattr(model, "get_global_grad_norm")
@@ -202,8 +225,8 @@ def train(
                 "rank": torch.distributed.get_rank(),
                 "overall_throughput": overall_throughput,
                 "lr": current_lr,
-                "cuda_mem_allocated": cuda_mem_allocated,
-                "cuda_malloc_retries": cuda_malloc_retries,
+                ("hpu" if is_torch_hpu_available() else "cuda") + "_mem_allocated": mem_allocated,
+                ("hpu" if is_torch_hpu_available() else "cuda") + "_malloc_retries": malloc_retries,
                 "num_loss_counted_tokens": int(num_loss_counted_tokens),
                 "num_tokens_rank0": int(total_length),
                 "batch_size": int(micro_batch_size),
@@ -236,7 +259,10 @@ def train(
             global_step += 1
             if local_rank == 0:
                 inner_pb.update(1)
-            torch.cuda.empty_cache()
+
+            if not is_torch_hpu_available():
+                torch.cuda.empty_cache()
+
         if args.checkpoint_at_epoch:
             base_logger.debug(f"Saving checkpoint at epoch {epoch}")
             save_checkpoint(
@@ -314,17 +340,24 @@ def main(args):
     args.model_type = model_conf.model_type
 
     #### distributed init #####
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    if is_torch_hpu_available():
+        torch.hpu.set_device(int(os.environ["LOCAL_RANK"]))
+    else:
+        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
     args.local_rank = int(os.environ["LOCAL_RANK"])
 
     timeout = _get_collective_timeout()
-    if timeout is not None:
-        torch.distributed.init_process_group(timeout=timeout)
-    else:
-        torch.distributed.init_process_group()
+    backend = "hccl" if is_torch_hpu_available() else None
+    torch.distributed.init_process_group(backend=backend, timeout=timeout)
 
     args.global_rank = torch.distributed.get_rank()
-    tensor = torch.ByteTensor([False]).cuda()
+
+    if is_torch_hpu_available():
+        tensor = torch.ByteTensor([False]).to('hpu')
+    else:
+        tensor = torch.ByteTensor([False]).cuda()
+
     torch.distributed.all_reduce(tensor)
     torch.distributed.barrier()
 

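The train()/main() changes above repeat the same device-selection idea in several places (batch tensors, the HCCL backend, the all-reduce probe tensor). A small standalone sketch of that pattern follows; the helper name get_device is hypothetical and not introduced by this commit.

import torch

from instructlab.training.hpu_utils import is_torch_hpu_available


def get_device(local_rank: int) -> torch.device:
    # On Gaudi the device string is simply "hpu"; on NVIDIA GPUs the tensor is
    # placed on the CUDA device matching the process's local rank, mirroring
    # the `.to('hpu' if is_torch_hpu_available() else local_rank)` calls above.
    if is_torch_hpu_available():
        return torch.device("hpu")
    return torch.device("cuda", local_rank)


# Usage in the spirit of the diff: move every tensor in a batch to that device.
# batch = {k: v.to(get_device(local_rank)) for k, v in batch.items()}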
src/instructlab/training/model.py

Lines changed: 26 additions & 1 deletion
@@ -34,6 +34,8 @@
 import torch
 
 # First Party
+from instructlab.training.hpu_utils import is_torch_hpu_available
+
 from instructlab.training.config import (  # Adjust this import if needed
     DistributedBackend,
     Optimizer,
@@ -78,6 +80,14 @@ def __init__(
 
     def _post_model_init(self):
         """Common initialization steps that should happen after model initialization."""
+
+        if is_torch_hpu_available() and os.getenv("HPU_ENABLE_TORCH_COMPILE", False):
+            torch._dynamo.config.cache_size_limit = 10 * 1000
+            torch._dynamo.config.accumulated_cache_size_limit = 20 * 1000
+            self.model = torch.compile(self.model, backend="hpu_backend", dynamic=False)
+            for layer in self.model.model.layers:
+                layer.compile(backend="hpu_backend", dynamic=False)
+
         self.reconcile_tokenizer()
         if self.lora_config:
             self.model = self.prepare_peft_model(
@@ -264,7 +274,11 @@ def _is_causal_lm_model(self) -> bool:
             bool: True if the model is a causal language model, False otherwise.
         """
         # Third Party
-        return "ForCausalLM" in self.model.__class__.__name__
+        if not is_torch_hpu_available():
+            class_name = self.model.__class__.__name__
+        else:
+            class_name = self.model._orig_mod.__class__.__name__ if self.model.__class__.__name__ == 'OptimizedModule' else self.model.__class__.__name__
+        return "ForCausalLM" in class_name
 
     def reconcile_tokenizer(self):
         if len(self.tokenizer) > self.model.config.vocab_size:
@@ -320,6 +334,17 @@ def reconcile_tokenizer(self):
         ):
             self.model.config.eos_token_id = self.tokenizer.eos_token_id
 
+        if is_torch_hpu_available():
+            model = self.model._orig_mod if self.model.__class__.__name__ == 'OptimizedModule' else self.model
+            class_name = model.__class__.__name__
+
+            replace_no_split_modules = {
+                'GaudiLlamaForCausalLM': ['GaudiLlamaDecoderLayer',]
+            }
+
+            if class_name in replace_no_split_modules:
+                model._no_split_modules = replace_no_split_modules[class_name]
+
         if not self._is_causal_lm_model():
             raise ValueError(
                 f"Model must be a causal language model, got {type(self.model)}"

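When HPU_ENABLE_TORCH_COMPILE is set, torch.compile wraps the model in an OptimizedModule, which is why _is_causal_lm_model and reconcile_tokenizer above look through _orig_mod. A minimal sketch of that unwrapping; the helper name unwrap_compiled is hypothetical and the commit itself does not define it.

import torch


def unwrap_compiled(model: torch.nn.Module) -> torch.nn.Module:
    # torch.compile returns a torch._dynamo OptimizedModule that keeps the
    # original module on `_orig_mod`; anything else is returned unchanged.
    if model.__class__.__name__ == "OptimizedModule":
        return model._orig_mod
    return model


# Usage matching the diff:
#   class_name = unwrap_compiled(self.model).__class__.__name__
#   is_causal = "ForCausalLM" in class_name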
src/instructlab/training/multipack_sampler.py

Lines changed: 8 additions & 1 deletion
@@ -34,6 +34,8 @@
 import torch
 import torch.distributed as dist
 
+from instructlab.training.hpu_utils import is_torch_hpu_available, bucket
+
 
 def find_max_pack_len_with_padding(
     dataset,
@@ -68,9 +70,14 @@ def get_effective_samples_per_minibatch(num_tokens_per_gpu):
 
         The function creates a sampler using the MultipackDistributedBatchSampler class, generates batches using the sampler, and then returns the ratio of the dataset size to the number of batches.
         """
+        lengths = dataset.get_lengths()
+        if is_torch_hpu_available():
+            bucket_v = np.vectorize(bucket)
+            lengths = bucket_v(lengths)
+
         sampler = MultipackDistributedBatchSampler(
             batch_max_length=num_tokens_per_gpu,
-            lengths=dataset.get_lengths(),
+            lengths=lengths,
             num_replicas=torch.distributed.get_world_size(),
             rank=torch.distributed.get_rank(),
             seed=seed,

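Bucketing the per-sample lengths before they reach MultipackDistributedBatchSampler means padded batches take only a small set of shapes, which limits Gaudi graph recompilation. A tiny illustration of the np.vectorize(bucket) step, using made-up sample lengths:

import numpy as np

from instructlab.training.hpu_utils import bucket

lengths = np.array([137, 512, 1000])      # hypothetical per-sample token counts
bucketed = np.vectorize(bucket)(lengths)  # -> array([ 144,  512, 1024])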
src/instructlab/training/token_dataset.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from instructlab.training.multipack_sampler import MultipackDistributedBatchSampler
1414
from instructlab.training.utils import log_rank_0, make_collate_fn
1515

16+
from instructlab.training.hpu_utils import is_torch_hpu_available, bucket
1617

1718
class TokenDataset(Dataset):
1819
def __init__(self, data_path):
@@ -109,6 +110,10 @@ def setup_dataloader(
109110

110111
lengths = dataset.get_lengths()
111112
if sampler == "multipack":
113+
if is_torch_hpu_available():
114+
bucket_v = np.vectorize(bucket)
115+
lengths = bucket_v(lengths)
116+
112117
sampler = MultipackDistributedBatchSampler(
113118
batch_max_length=packing_max_batch_len,
114119
lengths=lengths,

src/instructlab/training/utils.py

Lines changed: 9 additions & 1 deletion
@@ -51,6 +51,7 @@
     TrainingArgs,
 )
 from instructlab.training.model import Model
+from instructlab.training.hpu_utils import is_torch_hpu_available, bucket
 
 logger = logging.getLogger("instructlab.training")
 
@@ -275,6 +276,9 @@ def pad_collate_fn(batch):
     lens = np.array([len(item["input_ids"]) for item in batch])
     max_len = max(lens)
 
+    if is_torch_hpu_available():
+        max_len = bucket(max_len)
+
     input_ids = torch.stack(
         [
             F.pad(
@@ -386,6 +390,7 @@ def reduce_sum_forward(
         output_attentions=output_attentions,
         output_hidden_states=output_hidden_states,
         return_dict=return_dict,
+        **(_deprecated_arguments if is_torch_hpu_available() else {}),
     )
 
     return_dict = isinstance(output, dict)
@@ -794,7 +799,10 @@ def set_random_seed(seed):
     random.seed(seed)
    np.random.seed(seed)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
+    if is_torch_hpu_available():
+        torch.hpu.manual_seed_all(seed)
+    else:
+        torch.cuda.manual_seed_all(seed)
 
 
 def save_checkpoint(

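In pad_collate_fn, bucketing max_len means each batch is right-padded to a bucketed length rather than to the exact longest sample. Below is a simplified, hypothetical stand-in for that padding step, assuming F.pad-based right padding as in the surrounding code; pad_to_bucketed_len is not a function from this commit.

import torch
import torch.nn.functional as F

from instructlab.training.hpu_utils import bucket, is_torch_hpu_available


def pad_to_bucketed_len(input_ids_list, pad_token_id=0):
    # Simplified sketch of the padding done in pad_collate_fn.
    max_len = max(len(ids) for ids in input_ids_list)
    if is_torch_hpu_available():
        # e.g. a longest sample of 137 tokens pads to 144, so batches reuse shapes
        max_len = bucket(max_len)
    return torch.stack(
        [
            F.pad(torch.tensor(ids), (0, max_len - len(ids)), value=pad_token_id)
            for ids in input_ids_list
        ]
    )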