From 833e7ad8c308c30063f461b676b8967f68fd024f Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Sep 2024 09:29:26 +0000 Subject: [PATCH 01/19] add llama model from torchtitan --- src/zeroband/models/__init__.py | 0 src/zeroband/models/llama/__init__.py | 61 ++++ src/zeroband/models/llama/model.py | 460 ++++++++++++++++++++++++++ src/zeroband/models/norms.py | 333 +++++++++++++++++++ tests/test_model.py | 21 ++ 5 files changed, 875 insertions(+) create mode 100644 src/zeroband/models/__init__.py create mode 100644 src/zeroband/models/llama/__init__.py create mode 100644 src/zeroband/models/llama/model.py create mode 100644 src/zeroband/models/norms.py create mode 100644 tests/test_model.py diff --git a/src/zeroband/models/__init__.py b/src/zeroband/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py new file mode 100644 index 00000000..dfc7ae71 --- /dev/null +++ b/src/zeroband/models/llama/__init__.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Llama 2 is licensed under the LLAMA 2 Community License, +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. + +from zeroband.models.llama.model import ModelArgs, Transformer + +__all__ = ["Transformer"] + +llama2_configs = { + "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16), + "150M": ModelArgs(dim=1024, n_layers=12, n_heads=16), # todo(sami): double check this + "271M": ModelArgs(dim=1024, n_layers=16, n_heads=8), + "1B": ModelArgs(dim=2048, n_layers=18, n_heads=16), + "7B": ModelArgs(dim=4096, n_layers=32, n_heads=32), + "13B": ModelArgs(dim=5120, n_layers=40, n_heads=40), + "26B": ModelArgs(dim=5120, n_layers=80, n_heads=40), + "70B": ModelArgs( + dim=8192, + n_layers=80, + n_heads=64, + n_kv_heads=8, + ffn_dim_multiplier=1.3, + multiple_of=4096, + ), +} + +llama3_configs = { + "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16, rope_theta=500000), + "8B": ModelArgs( + dim=4096, + n_layers=32, + n_heads=32, + n_kv_heads=8, + ffn_dim_multiplier=1.3, + multiple_of=1024, + rope_theta=500000, + ), + "70B": ModelArgs( + dim=8192, + n_layers=80, + n_heads=64, + n_kv_heads=8, + ffn_dim_multiplier=1.3, + multiple_of=4096, + rope_theta=500000, + ), + "405B": ModelArgs( + dim=16384, + n_layers=126, + n_heads=128, + n_kv_heads=8, + ffn_dim_multiplier=1.2, + multiple_of=4096, + rope_theta=500000, + ), +} \ No newline at end of file diff --git a/src/zeroband/models/llama/model.py b/src/zeroband/models/llama/model.py new file mode 100644 index 00000000..a08f85f4 --- /dev/null +++ b/src/zeroband/models/llama/model.py @@ -0,0 +1,460 @@ +# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py +# the commit at time of copy paste was commit f2a1551 + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Llama 2 is licensed under the LLAMA 2 Community License, +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. 
+ + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn +from zeroband.models.norms import build_norm + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + + max_batch_size: int = 32 + max_seq_len: int = 2048 + # If `True`, then each transformer block init uses its layer ID, and if + # `False`, each uses the total number of transformer blocks + depth_init: bool = True + norm_type: str = "rmsnorm" + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor: + """ + Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' + and the end index 'end'. The 'theta' parameter scales the frequencies. + The returned tensor contains complex values in complex64 data type. + + Args: + dim (int): Dimension of the frequency tensor. + end (int): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex exponentials. + """ + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) + freqs = torch.outer(t, freqs).float() + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor: + """ + Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. + + The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim), + and the first seqlen elements will be sliced, but dim must match x. + + Args: + freqs_cis (torch.Tensor): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + + Returns: + torch.Tensor: Reshaped frequency tensor. + """ + ndim = x.ndim + assert 0 <= 1 < ndim + seqlen = x.shape[1] + freqs_cis = freqs_cis[0:seqlen] + assert freqs_cis.shape == (seqlen, x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. + xk (torch.Tensor): Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials. 
+ + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. + """ + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + torch.unsqueeze(x, dim=3) + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +class Attention(nn.Module): + """ + Multi-head attention module. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Attributes: + n_kv_heads (int): Number of key and value heads. + n_heads (int): Number of query heads. + n_rep (int): Number of repetitions for local heads. + head_dim (int): Dimension size of each attention head. + wq (Linear): Linear transformation for queries. + wk (Linear): Linear transformation for keys. + wv (Linear): Linear transformation for values. + wo (Linear): Linear transformation for output. + + """ + + def __init__(self, model_args: ModelArgs): + super().__init__() + self.n_heads = model_args.n_heads + self.n_kv_heads = ( + model_args.n_heads + if model_args.n_kv_heads is None + else model_args.n_kv_heads + ) + self.n_rep = self.n_heads // self.n_kv_heads + self.head_dim = model_args.dim // model_args.n_heads + + self.wq = nn.Linear( + model_args.dim, model_args.n_heads * self.head_dim, bias=False + ) + self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear( + model_args.n_heads * self.head_dim, model_args.dim, bias=False + ) + + def init_weights(self, init_std: float): + for linear in (self.wq, self.wk, self.wv): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std) + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + ): + """ + Forward pass of the attention module. + + Args: + x (torch.Tensor): Input tensor. + freqs_cis (torch.Tensor): Precomputed frequency tensor. + + Returns: + torch.Tensor: Output tensor after attention. + + """ + bs, seqlen, _ = x.shape + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + + # Use -1 instead of `n_heads` (or `n_kv_heads`) to infer the actual + # local heads from sizes of xq, xk, and xv as TP may have sharded them + # after the above linear ops. 
+ xq = xq.view(bs, seqlen, -1, self.head_dim) + xk = xk.view(bs, seqlen, -1, self.head_dim) + xv = xv.view(bs, seqlen, -1, self.head_dim) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + # repeat k/v heads if n_kv_heads < n_heads + keys = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + values = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + + # we use casual mask for training + output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True) + output = output.transpose( + 1, 2 + ).contiguous() # (bs, seqlen, n_local_heads, head_dim) + output = output.view(bs, seqlen, -1) + return self.wo(output) + + +class FeedForward(nn.Module): + """ + FeedForward module + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple of this value. + ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None. + + Attributes: + w1 (Linear): Linear transformation for the first layer. + w2 (Linear): Linear transformation for the second layer. + w3 (Linear): Linear transformation for the third layer. + + """ + + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + def init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) + for linear in (self.w2, self.w3): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + +class TransformerBlock(nn.Module): + """ + TransformerBlock Module + + Args: + layer_id (int): Identifier for the layer. + model_args (ModelArgs): Model configuration arguments. + + Attributes: + n_heads (int): Number of attention heads. + dim (int): Dimension size of the model. + head_dim (int): Dimension size of each attention head. + attention (Attention): Attention module. + feed_forward (FeedForward): FeedForward module. + layer_id (int): Identifier for the layer. + attention_norm (RMSNorm): Layer normalization for attention output. + ffn_norm (RMSNorm): Layer normalization for feedforward output. 
+ + """ + + def __init__(self, layer_id: int, model_args: ModelArgs): + super().__init__() + self.n_heads = model_args.n_heads + self.dim = model_args.dim + self.attention = Attention(model_args) + self.feed_forward = FeedForward( + dim=model_args.dim, + hidden_dim=4 * model_args.dim, + multiple_of=model_args.multiple_of, + ffn_dim_multiplier=model_args.ffn_dim_multiplier, + ) + self.layer_id = layer_id + self.num_layers = model_args.n_layers + + self.attention_norm = build_norm( + model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps + ) + self.ffn_norm = build_norm( + model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps + ) + + if model_args.depth_init: + self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5 + else: + self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5 + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + ): + """ + Perform a forward pass through the TransformerBlock. + + Args: + x (torch.Tensor): Input tensor. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + + Returns: + torch.Tensor: Output tensor after applying attention and feedforward layers. + + """ + h = x + self.attention(self.attention_norm(x), freqs_cis) + out = h + self.feed_forward(self.ffn_norm(h)) + return out + + def init_weights(self): + for norm in (self.attention_norm, self.ffn_norm): + norm.reset_parameters() + self.attention.init_weights(self.weight_init_std) + self.feed_forward.init_weights(self.weight_init_std) + + +class Transformer(nn.Module): + """ + Transformer Module + + Args: + model_args (ModelArgs): Model configuration arguments. + + Attributes: + model_args (ModelArgs): Model configuration arguments. + vocab_size (int): Vocabulary size. + n_layers (int): Number of layers in the model. + tok_embeddings (ParallelEmbedding): Token embeddings. + layers (torch.nn.ModuleList): List of Transformer blocks. + norm (RMSNorm): Layer normalization for the model output. + output (ColumnParallelLinear): Linear layer for final output. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + + """ + + def __init__(self, model_args: ModelArgs): + super().__init__() + self.model_args = model_args + self.vocab_size = model_args.vocab_size + self.n_layers = model_args.n_layers + + self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) + + # TODO persistent should be set to false, since this buffer can be recomputed. + # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411, + # compile or pipeline-tracer will not correctly handle non-persistent buffers, + # so we need to fix that. (2) if we initialize pipeline-parallel models from + # a seed checkpoint rather than calling init_weights, we need freqs_cis to be + # initialized by the checkpoint, or we need to add a separate initializer for + # just the non-persistent buffers that is called after loading checkpoints. + self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True) + + self.layers = torch.nn.ModuleDict() + for layer_id in range(model_args.n_layers): + self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args) + + self.norm = build_norm( + model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps + ) + + self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False) + self.init_weights() + + def init_weights(self): + """ + [Note: On ``init_weights`` vs. ``reset_parameters``] + Modules may define ``reset_parameters`` to initialize parameter values. 
+ ``reset_parameters`` is meant to only initialize directly owned + parameters/buffers, not those of their child modules, and it can be + used to give the initial values for these tensors. + Separately, users may want custom initialization for their modules, + different from that in ``reset_parameters``. For this, we define + ``init_weights``. We only call it in the constructor of this + ``Transformer`` root module to avoid reinitializing tensors. + """ + with torch.device(self.freqs_cis.device): + self.freqs_cis = self._precompute_freqs_cis() + if self.tok_embeddings is not None: + nn.init.normal_(self.tok_embeddings.weight) + for layer in self.layers.values(): + if layer is not None: + layer.init_weights() + if self.norm is not None: + self.norm.reset_parameters() + final_out_std = self.model_args.dim**-0.5 + cutoff_factor = 3 + if self.output is not None: + nn.init.trunc_normal_( + self.output.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + + def _precompute_freqs_cis(self) -> torch.Tensor: + return precompute_freqs_cis( + self.model_args.dim // self.model_args.n_heads, + # Need to compute until at least the max token limit for generation + # (use 2x max sequence length to be safe) + self.model_args.max_seq_len * 2, + self.model_args.rope_theta, + ) + + def forward(self, tokens: torch.Tensor): + """ + Perform a forward pass through the Transformer model. + + Args: + tokens (torch.Tensor): Input token indices. + + Returns: + torch.Tensor: Output logits after applying the Transformer model. + + """ + # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, self.freqs_cis) + + h = self.norm(h) if self.norm else h + output = self.output(h).float() if self.output else h + return output + + @classmethod + def from_model_args(cls, model_args: ModelArgs) -> "Transformer": + """ + Initialize a Transformer model from a ModelArgs object. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Returns: + Transformer: Transformer model. + + """ + return cls(model_args) \ No newline at end of file diff --git a/src/zeroband/models/norms.py b/src/zeroband/models/norms.py new file mode 100644 index 00000000..72fb225d --- /dev/null +++ b/src/zeroband/models/norms.py @@ -0,0 +1,333 @@ +# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py +# the commit at time of copy paste was commit f2a1551 + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math + +from functools import partial + +import torch +import torch.nn as nn + +import triton +import triton.language as tl + +from torch.distributed._tensor import Partial, Replicate, Shard +from torch.distributed._tensor.experimental import local_map + + +def build_norm(norm_type: str, dim: int, eps: float = 1e-6): + """ + Builds the specified normalization layer based on the norm_type. + + Args: + norm_type (str): The type of normalization layer to build. + Supported types: layernorm, np_layernorm, rmsnorm, fused_rmsnorm + dim (int): The dimension of the normalization layer. + eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6. 
+ + Returns: + The built normalization layer. + + Raises: + NotImplementedError: If an unknown norm_type is provided. + """ + norm_type = norm_type.lower() # Normalize to lowercase + + if norm_type == "layernorm": + return nn.LayerNorm(dim, eps=eps, bias=False) + elif norm_type == "np_layernorm": + return nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False) + elif norm_type == "rmsnorm": + return RMSNorm(dim, eps=eps) + elif norm_type == "fused_rmsnorm": + return FusedRMSNorm(dim, eps=eps) + else: + raise NotImplementedError(f"Unknown norm_type: '{norm_type}'") + + +class FusedRMSNorm(nn.Module): + """Fused RMS Norm, wraps a fused Triton Kernel""" + + def __init__( + self, + dim: int, + eps: float = 1e-6, + ): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + self.fused_rms_norm_fn = fused_rms_norm_fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """leverages Triton Fused RMS Norm kernel""" + return self.fused_rms_norm_fn( + x, + self.weight, + eps=self.eps, + ) + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) # type: ignore + + +class RMSNorm(nn.Module): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x: torch.Tensor): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) # type: ignore + + +# FusedRMSNorm in Triton + +# Credit +# Tri Dao's Triton LayerNorm: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py +# Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N"], +) +@triton.jit +def _rms_norm_fwd_kernel( + X, + stride_x, + Y, + stride_y, + W, + Rstd, + eps, + M, # num rows + N, # num cols + block_N: tl.constexpr, +): + row = tl.program_id(0) + cols = tl.arange(0, block_N) + + # Load input data and weights + mask = cols < N + x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32) + w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32) + + # Compute mean and variance + xbar = tl.where(cols < N, x, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + + # Store the reciprocal standard deviation + tl.store(Rstd + row, rstd) + + # Normalize and apply linear transformation + x_hat = x * rstd + y = x_hat * w + + # Write output + tl.store(Y + row * stride_y + cols, y, mask=mask) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N"], +) +@triton.jit +def 
_rms_norm_bwd_kernel_sm( + X, + stride_x, + W, + DY, + stride_dy, + DX, + stride_dx, + Rstd, + DW, + eps, + M, # num rows + N, # num cols + rows_per_program, + block_N: tl.constexpr, +): + row_block_id = tl.program_id(0) + row_start = row_block_id * rows_per_program + cols = tl.arange(0, block_N) + mask = cols < N + + # Load weights + w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32) + + # Accumulate gradients for weights + dw = tl.zeros((block_N,), dtype=tl.float32) + + row_end = min(row_start + rows_per_program, M) + for row in range(row_start, row_end): + # Load input, output gradient, and reciprocal standard deviation + x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32) + dy = tl.load(DY + row * stride_dy + cols, mask=mask, other=0.0).to(tl.float32) + rstd = tl.load(Rstd + row) + + # Compute normalized input and gradients + x_hat = x * rstd + wdy = w * dy + dw += dy * x_hat + c1 = tl.sum(x_hat * wdy, axis=0) / N + dx = (wdy - x_hat * c1) * rstd + + # Store input gradient + tl.store(DX + row * stride_dx + cols, dx, mask=mask) + + # Store weight gradients + tl.store(DW + row_block_id * N + cols, dw, mask=mask) + + +class TritonFusedRMSNorm(torch.autograd.Function): + @partial( + local_map, + out_placements=[Shard(1)], + in_placements=(None, [Shard(1)], [Replicate()], None), + ) + @staticmethod + def forward(ctx, x, weight, eps): + x_shape_start = x.shape + + # Flatten input + x = x.view(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if weight.stride(-1) != 1: + weight = weight.contiguous() + + M, N = x.shape + y = torch.empty_like(x) + rstd = torch.empty((M,), dtype=torch.float32, device=x.device) + + max_size = 65536 // x.element_size() + block_N = min(max_size, triton.next_power_of_2(N)) + + if N > block_N: + raise ValueError(f"N {N} must be <= {block_N=}") + + grid = lambda meta: (M,) + _rms_norm_fwd_kernel[grid]( + x, + x.stride(0), + y, + y.stride(0), + weight, + rstd, + eps, + M, + N, + block_N, + ) + + ctx.eps = eps + ctx.save_for_backward(x, weight, rstd) + ctx.x_shape_start = x_shape_start + + y = y.reshape(x_shape_start) + return y + + @partial( + local_map, + out_placements=([Shard(1)], [Partial()], None), + in_placements=(None, [Shard(1)]), + ) + @staticmethod + def backward(ctx, dy): + x, weight, rstd = ctx.saved_tensors + eps = ctx.eps + x_shape_start = ctx.x_shape_start + + # Flatten input and output gradients + dy = dy.view(-1, dy.shape[-1]) + if dy.stride(-1) != 1: + dy = dy.contiguous() + + M, N = dy.shape + dx = torch.empty_like(x) + + sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count + _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device) + + max_size = 65536 // x.element_size() + block_N = min(max_size, triton.next_power_of_2(N)) + rows_per_sm = math.ceil(M / sm_count) + + if N > block_N: + raise ValueError(f"N {N} must be <= {block_N=}") + + grid = lambda meta: (sm_count,) + _rms_norm_bwd_kernel_sm[grid]( + x, + x.stride(0), + weight, + dy, + dy.stride(0), + dx, + dx.stride(0), + rstd, + _dw, + eps, + M, + N, + rows_per_sm, + block_N, + ) + dw = _dw.sum(0).to(weight.dtype) + dx = dx.view(x_shape_start) + return dx, dw, None + + +# expose fusedRMSNorm as a function +def fused_rms_norm_fn( + x, + weight, + eps=1e-6, +): + return TritonFusedRMSNorm.apply( + x, + weight, + eps, + ) \ No newline at end of file diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 00000000..017448ea --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,21 @@ +import 
pytest +import torch +from zeroband.models.llama import Transformer, llama2_configs + + +VOCAB_SIZE = 1024 + +@pytest.fixture +def llama_config(): + config = llama2_configs["debugmodel"] + config.vocab_size = VOCAB_SIZE + return config + +def test_llama(llama_config): + seq_len = 512 + bs = 8 + model = Transformer(llama_config) + input_ = torch.randint(0, llama_config.vocab_size, (bs, seq_len)) + output = model(input_) + assert output.shape == (bs, seq_len, llama_config.vocab_size) + From 410e002694d41d860ccde8191e5aeb734205fdce Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 18:15:21 +0000 Subject: [PATCH 02/19] add training loop --- README.md | 2 +- configs/debug.toml | 10 ++ pyproject.toml | 7 +- src/zeroband/__init__.py | 2 - src/zeroband/data.py | 74 +++++++++++ src/zeroband/train.py | 221 +++++++++++++++++++++++++++++++++ src/zeroband/utils/__init__.py | 21 ++++ src/zeroband/utils/monitor.py | 45 +++++++ uv.lock | 216 ++++++++++++++++++++++++++++++-- 9 files changed, 587 insertions(+), 11 deletions(-) create mode 100644 configs/debug.toml create mode 100644 src/zeroband/data.py create mode 100644 src/zeroband/train.py create mode 100644 src/zeroband/utils/__init__.py create mode 100644 src/zeroband/utils/monitor.py diff --git a/README.md b/README.md index b04cd13d..094d95e4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ install uv ```bash curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync +uv sync --extra all ``` run your code using diff --git a/configs/debug.toml b/configs/debug.toml new file mode 100644 index 00000000..c64d8c05 --- /dev/null +++ b/configs/debug.toml @@ -0,0 +1,10 @@ +name_model = "debugmodel" +project = "debug" + +[train] +micro_bs = 8 + +[optim] +batch_size = 64 +warmup_steps = 10 +total_steps = 5000 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f6b103e3..e157efa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,14 @@ dependencies = [ "setuptools", "transformers>=4.44.2", "datasets>=3.0.0", - "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@v0.2" + "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@48aa6b9", + "einops" ] +[project.optional-dependencies] +all = [ + "wandb", +] [build-system] requires = ["hatchling"] diff --git a/src/zeroband/__init__.py b/src/zeroband/__init__.py index 7bcbc130..e69de29b 100644 --- a/src/zeroband/__init__.py +++ b/src/zeroband/__init__.py @@ -1,2 +0,0 @@ -def hello() -> str: - return "Hello from zeroband!" diff --git a/src/zeroband/data.py b/src/zeroband/data.py new file mode 100644 index 00000000..cf3c522d --- /dev/null +++ b/src/zeroband/data.py @@ -0,0 +1,74 @@ + +from functools import partial +from typing import Any, Generator + +import torch +from torch.utils.data import DataLoader +from torch.utils.data import IterableDataset + + +TEST_VOCAB_SIZE = 1024 + + +class FakeTokenizedDataset(IterableDataset): + """This is a dummy dataset that generates random sequences of length seq_len and vocab_size""" + + def __init__(self, seq_len: int, vocab_size: int): + self.seq_len = seq_len + self.vocab_size = vocab_size + assert vocab_size > 3, "Vocab size must be greater than 3" + + def __iter__(self) -> Generator[dict[str, Any], Any, None]: + while True: + input_ids = torch.randint(3, self.vocab_size, (self.seq_len,)).tolist() + yield {"input_ids": input_ids} + + +def collate_causal_mask(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100) -> callable: + """collate function for causal mask. 
Fill with padding tokens if sequence is shorter than max_seq_length""" + return partial(_collate_fn_causal_mask, max_seq_length=max_seq_length, pad_id=pad_id, ignore_index=ignore_index) + + +def _collate_fn_causal_mask( + samples: list[dict[str, torch.LongTensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100 +) -> dict[str, torch.LongTensor]: + """collate function for causal mask. Fill with padding tokens if sequence is shorter than max_seq_length. + input_ids and labels are both of size max_seq_length. + """ + + assert samples[0].keys() == {"input_ids"} + + batched = {"input_ids": [], "labels": []} + + if max_seq_length > 0: + max_seq_length += 1 # this makes sure that the effective seqlen is correct + + for sample in samples: + input_ids = torch.Tensor(sample["input_ids"]).long() + + if len(input_ids) < max_seq_length: + input_ids = torch.cat([input_ids, torch.full((max_seq_length - len(input_ids),), pad_id)]) + elif len(input_ids) > max_seq_length: + input_ids = input_ids[:max_seq_length] + + batched["input_ids"].append(input_ids[1:]) + batched["labels"].append(input_ids[:-1]) + + return {"input_ids": torch.stack(batched["input_ids"], dim=0), "labels": torch.stack(batched["labels"], dim=0)} + + +def get_dataloader(pad_token_id: int, world_size: int, rank: int, seq_length: int, batch_size: int, num_workers: int) -> DataLoader: + """ + Get a pytorch dataloader to train on + """ + #todo add real dataset and world splitting + train_dataset = FakeTokenizedDataset(seq_length, TEST_VOCAB_SIZE) + data_collator = collate_causal_mask(max_seq_length=seq_length, pad_id=pad_token_id, ignore_index=-100) + + return DataLoader( + train_dataset, + collate_fn=data_collator, + batch_size=batch_size, + num_workers=num_workers, + ) + diff --git a/src/zeroband/train.py b/src/zeroband/train.py new file mode 100644 index 00000000..bdb5b5c0 --- /dev/null +++ b/src/zeroband/train.py @@ -0,0 +1,221 @@ +import os +from contextlib import nullcontext +import datetime +import logging # Added logging import +from typing import Literal + +import torch +from pydantic_config import parse_argv, BaseConfig +from torch.distributed import destroy_process_group, init_process_group +from einops import rearrange +from torch.nn import functional as F + +from transformers import ( + AutoTokenizer, + get_cosine_schedule_with_warmup, +) +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + MixedPrecision, +) +from zeroband.utils import get_sharding_strategy +from zeroband.utils.monitor import WandbMonitor, DummyMonitor +from zeroband.data import TEST_VOCAB_SIZE, get_dataloader +from zeroband.models.llama import llama2_configs, llama3_configs, Transformer + + +local_rank = int(os.getenv("LOCAL_RANK", 0)) + +if local_rank == 0: + log_level = os.getenv("ZERO_BAND_LOG_LEVEL", "INFO") + logging.basicConfig(level=getattr(logging, log_level, logging.INFO)) +else: + logging.basicConfig(level=logging.CRITICAL) # Disable logging for non-zero ranks + +logger = logging.getLogger(__name__) + +# Function to initialize the distributed process group +def ddp_setup(): + init_process_group() + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + + +class DilocoConfig(BaseConfig): + outer_lr: float = 0.7 + inner_steps: int = 10 + + +class DataConfig(BaseConfig): + dataset_name_or_path: str = "allenai/c4" + seq_length: int = 1024 + fake_data: bool = False + num_workers: int = 4 + +class OptimConfig(BaseConfig): + lr: float = 4e-4 + weight_decay: float = 0.1 + adam_betas1: float = 0.9 + adam_betas2: float 
= 0.95 + + warmup_steps: int = 1000 + total_steps: int = 88_000 + batch_size: int = 512 + +class TrainConfig(BaseConfig): + micro_bs: int + torch_compile: bool = True + sharding_strategy: str = "FULL_SHARD" + + +class Config(BaseConfig): + + # main config + name_model: Literal["debugmodel", "150M", "271M", "1B", "7B", "13B", "26B", "70B"] = "150M" + type_model: Literal["llama2","llama3"] = "llama2" + + project: str = "zeroband" + metric_logger_type: Literal["wandb", "dummy"] = "wandb" + + + # sub config + diloco: DilocoConfig | None = None + data: DataConfig = DataConfig() + optim: OptimConfig = OptimConfig() + train: TrainConfig + + + +def get_model(name_model: str, type_model: str, tokenizer: AutoTokenizer) -> Transformer: + """get the transformer model""" + + if type_model == "llama2": + config = llama2_configs[name_model] + elif type_model == "llama3": + config = llama3_configs[name_model] + else: + raise ValueError(f"Model type {type_model} not supported") + + config.vocab_size = tokenizer.vocab_size if name_model != "debugmodel" else TEST_VOCAB_SIZE + return Transformer(config) + +def train(config: Config): + sharding_strategy = get_sharding_strategy(config.train.sharding_strategy) + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + rank = int(os.environ["RANK"]) + + # batch_size is the total batch size for all GPUs + assert config.optim.batch_size % local_world_size == 0 + batch_size = config.optim.batch_size // local_world_size + + assert batch_size % config.train.micro_bs == 0 + gradient_accumulation_steps = batch_size // config.train.micro_bs + + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True) + tokenizer.pad_token = "" # todo(sami): remove padding tokens once we have context stuffing + + logger.debug("tokenizer loaded") + train_dataloader = get_dataloader(tokenizer.pad_token_id, world_size, rank, config.data.seq_length, config.train.micro_bs, config.data.num_workers) + + model = get_model(config.name_model, config.type_model, tokenizer=tokenizer) + model = model.to(local_rank) + logger.debug("model loaded") + + model = FSDP( + model, + sharding_strategy=sharding_strategy, + mixed_precision=MixedPrecision(param_dtype=torch.bfloat16), + use_orig_params=True, + ) + + if config.train.torch_compile: + model = torch.compile(model) + logger.debug("model compiled and fsdped") + + # Setup optimizers + inner_optimizer = torch.optim.AdamW(model.parameters(), lr=config.optim.lr, weight_decay=config.optim.weight_decay, betas=(config.optim.adam_betas1, config.optim.adam_betas2)) + + scheduler = get_cosine_schedule_with_warmup( + inner_optimizer, + num_warmup_steps=config.optim.warmup_steps, + num_training_steps=config.optim.total_steps, + ) + + model.train() + + if rank == 0: + logger_cls = WandbMonitor if config.metric_logger_type == "wandb" else DummyMonitor + metric_logger = logger_cls(project=config.project, config=config.model_dump(), resume=False) + + train_dataloader_iterator = iter(train_dataloader) + + outer_step = 0 + num_inner_steps = config.diloco.inner_steps if config.diloco is not None else 1 + + logger.info("starting training") + while True: + logger.info(f"outer_step step: {outer_step}") + + for inner_step in range(num_inner_steps): + loss_batch = 0 + + for grad_acc_step in range(gradient_accumulation_steps): + is_accumulating = grad_acc_step < gradient_accumulation_steps - 1 + batch = next(train_dataloader_iterator) + input_ids = 
batch["input_ids"].to("cuda") + labels = batch["labels"].to("cuda") + + with model.no_sync() if is_accumulating else nullcontext(): + logits = model(tokens = input_ids).contiguous() + flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab") + flatten_labels = rearrange(labels, "b seq -> (b seq)") + + loss = F.cross_entropy(flatten_logits, flatten_labels, ignore_index=-100) / gradient_accumulation_steps + loss.backward() + loss_batch += loss.detach() + + model.clip_grad_norm_(1.0) # gradient clipping + inner_optimizer.step() + scheduler.step() + inner_optimizer.zero_grad() + + # logging + real_step = outer_step * num_inner_steps + inner_step + 1 # add + 1 because inner_step start at 0 + inner_lr = [group["lr"] for group in inner_optimizer.param_groups][0] + + metrics = { + "Loss": loss_batch.item(), # todo(sami): do local all reduce for the loss + "step": real_step, + "inner_lr": inner_lr, + } + + if rank == 0: + metric_logger.log(metrics) + + logger.info(f"step: {real_step}, loss: {loss_batch.item()}, inner_lr: {inner_lr}") + + outer_step += 1 + + if real_step >= config.optim.total_steps: + # we only allow to break outisde of the inner loop. + # This avoid ending the training in the middle of a the inner loop + # Since ckpt strategy and all reduce is done at the outer loop level. + break + + if rank == 0: + metric_logger.finish() + + +if __name__ == "__main__": + # Allow eager fallback during production so that that the training runs dont die + # However, in development, we want to know that we broke torch compile + torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ + torch.set_float32_matmul_precision("high") + ddp_setup() + + config = Config(**parse_argv()) + logger.debug(f"config: {config.model_dump()}") + + train(config) + destroy_process_group() diff --git a/src/zeroband/utils/__init__.py b/src/zeroband/utils/__init__.py new file mode 100644 index 00000000..d26823e4 --- /dev/null +++ b/src/zeroband/utils/__init__.py @@ -0,0 +1,21 @@ +from torch.distributed.fsdp import ShardingStrategy + + +__all__ = ["get_sharding_strategy"] + + +def get_sharding_strategy(sharding_strategy: str) -> ShardingStrategy: + if sharding_strategy == "FULL_SHARD": + return ShardingStrategy.FULL_SHARD + elif sharding_strategy == "SHARD_GRAD_OP": + return ShardingStrategy.SHARD_GRAD_OP + elif sharding_strategy == "NO_SHARD": + return ShardingStrategy.NO_SHARD + elif sharding_strategy == "HYBRID_SHARD": + return ShardingStrategy.HYBRID_SHARD + elif sharding_strategy == "_HYBRID_SHARD_ZERO2": + return ShardingStrategy._HYBRID_SHARD_ZERO2 + else: + raise ValueError( + f"Invalid sharding_strategy: {sharding_strategy}. Please choose 'FULL_SHARD', 'SHARD_GRAD_OP', 'NO_SHARD', 'HYBRID_SHARD', or '_HYBRID_SHARD_ZERO2'." + ) diff --git a/src/zeroband/utils/monitor.py b/src/zeroband/utils/monitor.py new file mode 100644 index 00000000..64fc9c02 --- /dev/null +++ b/src/zeroband/utils/monitor.py @@ -0,0 +1,45 @@ +import pickle +from typing import Any, Protocol +import importlib + +class Monitor(Protocol): + def __init__(self, project, config): ... + + def log(self, metrics: dict[str, Any]): ... + + def finish(self): ... + + +class WandbMonitor: + def __init__(self, project, config, resume: bool): + if importlib.util.find_spec("wandb") is None: + raise ImportError("wandb is not installed. 
Please install it to use WandbMonitor.") + + import wandb + wandb.init( + project=project, config=config, resume="auto" if resume else None + ) # make wandb reuse the same run id if possible + + def log(self, metrics: dict[str, Any]): + import wandb + wandb.log(metrics) + + def finish(self): + import wandb + wandb.finish() + + +class DummyMonitor: + def __init__(self, project, config, *args, **kwargs): + self.project = project + self.config = config + open(project, "a").close() # Create an empty file at the project path + + self.data = [] + + def log(self, metrics: dict[str, Any]): + self.data.append(metrics) + + def finish(self): + with open(self.project, "wb") as f: + pickle.dump(self.data, f) \ No newline at end of file diff --git a/uv.lock b/uv.lock index 0d981321..cee36787 100644 --- a/uv.lock +++ b/uv.lock @@ -1,10 +1,18 @@ version = 1 requires-python = ">=3.10" resolution-markers = [ - "python_full_version < '3.11'", - "python_full_version == '3.11.*'", - "python_full_version < '3.13'", - "python_full_version >= '3.13'", + "python_full_version < '3.11' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", + "python_full_version < '3.11' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version >= '3.13' and sys_platform == 'linux'", + "python_full_version >= '3.13' and sys_platform != 'linux'", ] [[package]] @@ -204,6 +212,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/76/e6222113b83e3622caa4bb41032d0b1bf785250607392e1b778aca0b8a7d/charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc", size = 48543 }, ] +[[package]] +name = "click" +version = "8.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "platform_system == 'Windows'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", size = 97941 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -256,6 +276,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784", size = 468850 }, ] +[[package]] +name = "docker-pycreds" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/e6/d1f6c00b7221e2d7c4b470132c931325c8b22c51ca62417e300f5ce16009/docker-pycreds-0.4.0.tar.gz", hash = "sha256:6ce3270bcaf404cc4c3e27e4b6c70d3521deae82fb508767870fdbf772d584d4", size = 8754 } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl", hash = "sha256:7266112468627868005106ec19cd0d722702d2b7d5912a28e19b826c3d37af49", size = 8982 }, +] + +[[package]] +name = "einops" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/ca/9f5dcb8bead39959454c3912266bedc4c315839cee0e0ca9f4328f4588c1/einops-0.8.0.tar.gz", hash = "sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85", size = 58861 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/5a/f0b9ad6c0a9017e62d4735daaeb11ba3b6c009d69a26141b258cd37b5588/einops-0.8.0-py3-none-any.whl", hash = "sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f", size = 43223 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -342,6 +383,30 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "gitdb" +version = "4.0.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/0d/bbb5b5ee188dec84647a4664f3e11b06ade2bde568dbd489d9d64adef8ed/gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b", size = 394469 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/5b/8f0c4a5bb9fd491c277c21eff7ccae71b47d43c4446c9d0c6cff2fe8c2c4/gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4", size = 62721 }, +] + +[[package]] +name = "gitpython" +version = "3.1.43" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/a1/106fd9fa2dd989b6fb36e5893961f82992cf676381707253e0bf93eb1662/GitPython-3.1.43.tar.gz", hash = "sha256:35f314a9f878467f5453cc1fee295c3e18e52f1b99f10f6cf5b1682e968a9e7c", size = 214149 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/bd/cc3a402a6439c15c3d4294333e13042b915bbeab54edc457c723931fed3f/GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff", size = 207337 }, +] + [[package]] name = "huggingface-hub" version = "0.24.6" @@ -831,6 +896,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f", size = 204643 }, ] +[[package]] +name = "protobuf" +version = "5.28.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b1/a4/4579a61de526e19005ceeb93e478b61d77aa38c8a85ad958ff16a9906549/protobuf-5.28.2.tar.gz", hash = "sha256:59379674ff119717404f7454647913787034f03fe7049cbef1d74a97bb4593f0", size = 422494 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/30/231764750e0987755b7b8d66771f161e5f002e165d27b72154c776dbabf7/protobuf-5.28.2-cp310-abi3-win32.whl", hash = "sha256:eeea10f3dc0ac7e6b4933d32db20662902b4ab81bf28df12218aa389e9c2102d", size = 419662 }, + { url = "https://files.pythonhosted.org/packages/7d/46/3fdf7462160135aee6a530f1ec66665b5b4132fa2e1002ab971bc6ec2589/protobuf-5.28.2-cp310-abi3-win_amd64.whl", hash = "sha256:2c69461a7fcc8e24be697624c09a839976d82ae75062b11a0972e41fd2cd9132", size = 431479 }, + { url = 
"https://files.pythonhosted.org/packages/37/45/d2a760580f8f2ed2825ba44cb370e0a4011ddef85e728f46ea3dd565a8a5/protobuf-5.28.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8b9403fc70764b08d2f593ce44f1d2920c5077bf7d311fefec999f8c40f78b7", size = 414736 }, + { url = "https://files.pythonhosted.org/packages/e6/23/ed718dc18e6a561445ece1e7a17d2dda0c634ad9cf663102b47f10005d8f/protobuf-5.28.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:35cfcb15f213449af7ff6198d6eb5f739c37d7e4f1c09b5d0641babf2cc0c68f", size = 316518 }, + { url = "https://files.pythonhosted.org/packages/23/08/a1ce0415a115c2b703bfa798f06f0e43ca91dbe29d6180bf86a9287b15e2/protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:5e8a95246d581eef20471b5d5ba010d55f66740942b95ba9b872d918c459452f", size = 316605 }, + { url = "https://files.pythonhosted.org/packages/9b/55/f24e3b801d2e108c48aa2b1b59bb791b5cffba89465cbbf66fc98de89270/protobuf-5.28.2-py3-none-any.whl", hash = "sha256:52235802093bd8a2811abbe8bf0ab9c5f54cca0a751fdd3f6ac2a21438bffece", size = 169566 }, +] + +[[package]] +name = "psutil" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/c7/8c6872f7372eb6a6b2e4708b88419fb46b857f7a2e1892966b851cc79fc9/psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2", size = 508067 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/66/78c9c3020f573c58101dc43a44f6855d01bbbd747e24da2f0c4491200ea3/psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35", size = 249766 }, + { url = "https://files.pythonhosted.org/packages/e1/3f/2403aa9558bea4d3854b0e5e567bc3dd8e9fbc1fc4453c0aa9aafeb75467/psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1", size = 253024 }, + { url = "https://files.pythonhosted.org/packages/0b/37/f8da2fbd29690b3557cca414c1949f92162981920699cd62095a984983bf/psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0", size = 250961 }, + { url = "https://files.pythonhosted.org/packages/35/56/72f86175e81c656a01c4401cd3b1c923f891b31fbcebe98985894176d7c9/psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0", size = 287478 }, + { url = "https://files.pythonhosted.org/packages/19/74/f59e7e0d392bc1070e9a70e2f9190d652487ac115bb16e2eff6b22ad1d24/psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", size = 290455 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/60038e277ff0a9cc8f0c9ea3d0c5eb6ee1d2470ea3f9389d776432888e47/psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132", size = 292046 }, + { url = "https://files.pythonhosted.org/packages/8b/20/2ff69ad9c35c3df1858ac4e094f20bd2374d33c8643cf41da8fd7cdcb78b/psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d", size = 253560 }, + { url = "https://files.pythonhosted.org/packages/73/44/561092313ae925f3acfaace6f9ddc4f6a9c748704317bad9c8c8f8a36a79/psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = 
"sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3", size = 257399 }, + { url = "https://files.pythonhosted.org/packages/7c/06/63872a64c312a24fb9b4af123ee7007a306617da63ff13bcc1432386ead7/psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0", size = 251988 }, +] + [[package]] name = "pyarrow" version = "17.0.0" @@ -880,7 +976,7 @@ wheels = [ [[package]] name = "pydantic-config" version = "0.2.0" -source = { git = "https://github.com/samsja/pydantic_config.git?rev=v0.2#e50503071fcfbfdbfd9442dc45eb853a4033565d" } +source = { git = "https://github.com/samsja/pydantic_config.git?rev=48aa6b9#48aa6b9d2bc6aa0e4d72a919d07e808802e558de" } dependencies = [ { name = "pydantic" }, { name = "rich" }, @@ -1227,6 +1323,67 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/46/5d11dc300feaad285c2f1bd784ff3f689f5e0ab6be49aaf568f3a77019eb/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f", size = 606660 }, ] +[[package]] +name = "sentry-sdk" +version = "2.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/23/6527e56fb17817153c37d702d6b9ed0a2f75ed213fd98a176c1b8894ad20/sentry_sdk-2.14.0.tar.gz", hash = "sha256:1e0e2eaf6dad918c7d1e0edac868a7bf20017b177f242cefe2a6bcd47955961d", size = 282948 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/de/956ce1d71459fa1af0486ca141fc605ac16f7c8855750668ff663e2b436a/sentry_sdk-2.14.0-py2.py3-none-any.whl", hash = "sha256:b8bc3dc51d06590df1291b7519b85c75e2ced4f28d9ea655b6d54033503b5bf4", size = 311425 }, +] + +[[package]] +name = "setproctitle" +version = "1.3.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/e1/b16b16a1aa12174349d15b73fd4b87e641a8ae3fb1163e80938dbbf6ae98/setproctitle-1.3.3.tar.gz", hash = "sha256:c913e151e7ea01567837ff037a23ca8740192880198b7fbb90b16d181607caae", size = 27253 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4f/cc/c51e6371f640a9adbe693ddb89d68596e5a8e4b5e05b4d3c65ec504e2f6d/setproctitle-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:897a73208da48db41e687225f355ce993167079eda1260ba5e13c4e53be7f754", size = 16954 }, + { url = "https://files.pythonhosted.org/packages/c3/7d/d03f319e0f3b3a6e98731a56cd4d81478ed0c12531b822fd2c728b948edb/setproctitle-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8c331e91a14ba4076f88c29c777ad6b58639530ed5b24b5564b5ed2fd7a95452", size = 11304 }, + { url = "https://files.pythonhosted.org/packages/9c/56/6f4a4e80b2810eb7ea9ab355022c780ef80457de368ab5b6b21b795e4f05/setproctitle-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbbd6c7de0771c84b4aa30e70b409565eb1fc13627a723ca6be774ed6b9d9fa3", size = 31249 }, + { url = "https://files.pythonhosted.org/packages/d0/ae/010811bece9a59a8bba131d9e7acea9c2e3c3cbf544bf06d8b10b8c28ff5/setproctitle-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c05ac48ef16ee013b8a326c63e4610e2430dbec037ec5c5b58fcced550382b74", size = 32594 }, + { url = "https://files.pythonhosted.org/packages/87/7b/69bdc791001250dff279a1a81904f3f563caece4fa1607a95b9fd5197d6e/setproctitle-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1342f4fdb37f89d3e3c1c0a59d6ddbedbde838fff5c51178a7982993d238fe4f", size = 29713 }, + { url = "https://files.pythonhosted.org/packages/79/e7/54b36be02aee8ad573be68f6f46fd62838735c2f007b22df50eb5e13a20d/setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc74e84fdfa96821580fb5e9c0b0777c1c4779434ce16d3d62a9c4d8c710df39", size = 30755 }, + { url = "https://files.pythonhosted.org/packages/69/a7/2a77b68c11db87c22350381d6ce022011eb420076790e0e3697153e89458/setproctitle-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9617b676b95adb412bb69645d5b077d664b6882bb0d37bfdafbbb1b999568d85", size = 38562 }, + { url = "https://files.pythonhosted.org/packages/9d/09/bc108723bbfb7c50c22fdf22191f3e32abcb5d6f46610018030b25f601c5/setproctitle-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6a249415f5bb88b5e9e8c4db47f609e0bf0e20a75e8d744ea787f3092ba1f2d0", size = 36991 }, + { url = "https://files.pythonhosted.org/packages/94/ad/4166381d79f6ae8138be9b49f05d193a8deb748debace9896dffad45a753/setproctitle-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:38da436a0aaace9add67b999eb6abe4b84397edf4a78ec28f264e5b4c9d53cd5", size = 39866 }, + { url = "https://files.pythonhosted.org/packages/3d/92/17168f4bb1a695094e93e73a1ef1f7b89953a6d91e8a7699a2c840ba712f/setproctitle-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:da0d57edd4c95bf221b2ebbaa061e65b1788f1544977288bdf95831b6e44e44d", size = 38221 }, + { url = "https://files.pythonhosted.org/packages/0c/1b/753432a877bcdfb099e280795c86ac7dc245d9651b98308f606bb3db610d/setproctitle-1.3.3-cp310-cp310-win32.whl", hash = "sha256:a1fcac43918b836ace25f69b1dca8c9395253ad8152b625064415b1d2f9be4fb", size = 11064 }, + { url = "https://files.pythonhosted.org/packages/29/ff/80a02c5b414c2d3ff49c36c0a571a94aa3b4236f07eee39f72ebdb7314a0/setproctitle-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:200620c3b15388d7f3f97e0ae26599c0c378fdf07ae9ac5a13616e933cbd2086", size = 11815 }, + { url = "https://files.pythonhosted.org/packages/c9/17/7f9d5ddf4cfc4386e74565ccf63b8381396336e4629bb165b52b803ceddb/setproctitle-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:334f7ed39895d692f753a443102dd5fed180c571eb6a48b2a5b7f5b3564908c8", size = 16948 }, + { url = "https://files.pythonhosted.org/packages/ff/5d/77edf4c29c8d6728b49d3f0abb22159bb9c0c4ddebd721c09486b34985c8/setproctitle-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:950f6476d56ff7817a8fed4ab207727fc5260af83481b2a4b125f32844df513a", size = 11305 }, + { url = "https://files.pythonhosted.org/packages/13/f0/263954ca925a278036f100405e7ba82d4341e1e6bdc09f35362a7b40f684/setproctitle-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:195c961f54a09eb2acabbfc90c413955cf16c6e2f8caa2adbf2237d1019c7dd8", size = 31578 }, + { url = "https://files.pythonhosted.org/packages/79/52/503b546da451deb78fde27fec96c39d3f63a7958be60c9a837de89f47a0d/setproctitle-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f05e66746bf9fe6a3397ec246fe481096664a9c97eb3fea6004735a4daf867fd", size = 32910 }, + { url = "https://files.pythonhosted.org/packages/48/72/aeb734419a58a85ca7845c3d0011c322597da4ff601ebbc28f6c1dfd1ae8/setproctitle-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5901a31012a40ec913265b64e48c2a4059278d9f4e6be628441482dd13fb8b5", size = 30086 }, + { url = 
"https://files.pythonhosted.org/packages/fd/df/44b267cb8f073a4ae77e120f0705ab3a07165ad90cecd4881b34c7e1e37b/setproctitle-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64286f8a995f2cd934082b398fc63fca7d5ffe31f0e27e75b3ca6b4efda4e353", size = 31076 }, + { url = "https://files.pythonhosted.org/packages/82/c2/79ad43c914418cb1920e0198ac7326061c05cd4ec75c86ed0ca456b7e957/setproctitle-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:184239903bbc6b813b1a8fc86394dc6ca7d20e2ebe6f69f716bec301e4b0199d", size = 41226 }, + { url = "https://files.pythonhosted.org/packages/81/1b/0498c36a07a73d39a7070f45d96a299006e624efc07fc2e2296286237316/setproctitle-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:664698ae0013f986118064b6676d7dcd28fefd0d7d5a5ae9497cbc10cba48fa5", size = 39723 }, + { url = "https://files.pythonhosted.org/packages/3a/fe/ebbcffd6012b9cf5edb017a9c30cfc2beccf707f5bf495da8cf69b4abe69/setproctitle-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e5119a211c2e98ff18b9908ba62a3bd0e3fabb02a29277a7232a6fb4b2560aa0", size = 42773 }, + { url = "https://files.pythonhosted.org/packages/64/b1/5786c0442435eb18d04299c8ce7d1f86feb5154444ac684963527a76e169/setproctitle-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:417de6b2e214e837827067048f61841f5d7fc27926f2e43954567094051aff18", size = 41089 }, + { url = "https://files.pythonhosted.org/packages/33/fb/14b41e920406a12de0a164ef3b86d62edb4fac63d91d9f86f3b80dae5b38/setproctitle-1.3.3-cp311-cp311-win32.whl", hash = "sha256:6a143b31d758296dc2f440175f6c8e0b5301ced3b0f477b84ca43cdcf7f2f476", size = 11066 }, + { url = "https://files.pythonhosted.org/packages/7e/ba/f6da9ba74e8c2c662e932b27a01025c1bee2846222f6a2e87a69c259772f/setproctitle-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a680d62c399fa4b44899094027ec9a1bdaf6f31c650e44183b50d4c4d0ccc085", size = 11817 }, + { url = "https://files.pythonhosted.org/packages/32/22/9672612b194e4ac5d9fb67922ad9d30232b4b66129b0381ab5efeb6ae88f/setproctitle-1.3.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d4460795a8a7a391e3567b902ec5bdf6c60a47d791c3b1d27080fc203d11c9dc", size = 16917 }, + { url = "https://files.pythonhosted.org/packages/49/e5/562ff00f2f3f4253ff8fa6886e0432b8eae8cde82530ac19843d8ed2c485/setproctitle-1.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:bdfd7254745bb737ca1384dee57e6523651892f0ea2a7344490e9caefcc35e64", size = 11264 }, + { url = "https://files.pythonhosted.org/packages/8f/1f/f97ea7bf71c873590a63d62ba20bf7294439d1c28603e5c63e3616c2131a/setproctitle-1.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:477d3da48e216d7fc04bddab67b0dcde633e19f484a146fd2a34bb0e9dbb4a1e", size = 31907 }, + { url = "https://files.pythonhosted.org/packages/66/fb/2d90806b9a2ed97c140baade3d1d2d41d3b51458300a2d999268be24d21d/setproctitle-1.3.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ab2900d111e93aff5df9fddc64cf51ca4ef2c9f98702ce26524f1acc5a786ae7", size = 33333 }, + { url = "https://files.pythonhosted.org/packages/38/39/e7ce791f5635f3a16bd21d6b79bd9280c4c4aed8ab936b4b21334acf05a7/setproctitle-1.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:088b9efc62d5aa5d6edf6cba1cf0c81f4488b5ce1c0342a8b67ae39d64001120", size = 30573 }, + { url = 
"https://files.pythonhosted.org/packages/20/22/fd76bbde4194d4e31d5b31a02f80c8e7e54a99d3d8ff34f3d656c6655689/setproctitle-1.3.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6d50252377db62d6a0bb82cc898089916457f2db2041e1d03ce7fadd4a07381", size = 31601 }, + { url = "https://files.pythonhosted.org/packages/51/5c/a6257cc68e17abcc4d4a78cc6666aa0d3805af6d942576625c4a468a72f0/setproctitle-1.3.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:87e668f9561fd3a457ba189edfc9e37709261287b52293c115ae3487a24b92f6", size = 40717 }, + { url = "https://files.pythonhosted.org/packages/db/31/4f0faad7ef641be4e8dfcbc40829775f2d6a4ca1ff435a4074047fa3dad1/setproctitle-1.3.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:287490eb90e7a0ddd22e74c89a92cc922389daa95babc833c08cf80c84c4df0a", size = 39384 }, + { url = "https://files.pythonhosted.org/packages/22/17/8763dc4f9ddf36af5f043ceec213b0f9f45f09fd2d5061a89c699aabe8b0/setproctitle-1.3.3-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:4fe1c49486109f72d502f8be569972e27f385fe632bd8895f4730df3c87d5ac8", size = 42350 }, + { url = "https://files.pythonhosted.org/packages/7b/b2/2403cecf2e5c5b4da22f7d9df4b2149bf92d03a3422185e682e81055549c/setproctitle-1.3.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4a6ba2494a6449b1f477bd3e67935c2b7b0274f2f6dcd0f7c6aceae10c6c6ba3", size = 40704 }, + { url = "https://files.pythonhosted.org/packages/5e/c1/11e80061ac06aece2a0ffcaf018cdc088aebb2fc586f68201755518532ad/setproctitle-1.3.3-cp312-cp312-win32.whl", hash = "sha256:2df2b67e4b1d7498632e18c56722851ba4db5d6a0c91aaf0fd395111e51cdcf4", size = 11057 }, + { url = "https://files.pythonhosted.org/packages/90/e8/ece468e93e99d3b2826e9649f6d03e80f071d451e20c742f201f77d1bea1/setproctitle-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f38d48abc121263f3b62943f84cbaede05749047e428409c2c199664feb6abc7", size = 11809 }, + { url = "https://files.pythonhosted.org/packages/24/55/8b369b56007a5a2c7594cdb58cd4a09d7cca65b28483bb5582c6975663f1/setproctitle-1.3.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6b9e62ddb3db4b5205c0321dd69a406d8af9ee1693529d144e86bd43bcb4b6c0", size = 10726 }, + { url = "https://files.pythonhosted.org/packages/35/30/ac99ecae8458ba995f85aa3aa911004679b405922e1487b0fba6fe8f4d37/setproctitle-1.3.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e3b99b338598de0bd6b2643bf8c343cf5ff70db3627af3ca427a5e1a1a90dd9", size = 13368 }, + { url = "https://files.pythonhosted.org/packages/70/1d/3b2249c833c7d52b59ff0602d760df0543dc1e6c272f145b949750edeb01/setproctitle-1.3.3-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ae9a02766dad331deb06855fb7a6ca15daea333b3967e214de12cfae8f0ef5", size = 12969 }, + { url = "https://files.pythonhosted.org/packages/76/78/97f36752438cb5c6409b53eb3b1a334827cede43acab65e4fc4a0014cf9f/setproctitle-1.3.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:200ede6fd11233085ba9b764eb055a2a191fb4ffb950c68675ac53c874c22e20", size = 11848 }, +] + [[package]] name = "setuptools" version = "74.1.2" @@ -1245,6 +1402,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", size = 11053 }, ] +[[package]] +name = "smmap" +version = 
"5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/04/b5bf6d21dc4041000ccba7eb17dd3055feb237e7ffc2c20d3fae3af62baa/smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62", size = 22291 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/a5/10f97f73544edcdef54409f1d839f6049a0d79df68adbc1ceb24d1aaca42/smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da", size = 24282 }, +] + [[package]] name = "sympy" version = "1.13.2" @@ -1397,7 +1563,7 @@ name = "triton" version = "3.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, + { name = "filelock", marker = "python_full_version < '3.13'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/45/27/14cc3101409b9b4b9241d2ba7deaa93535a217a211c86c4cc7151fb12181/triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a", size = 209376304 }, @@ -1446,6 +1612,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/ea/12f774a18b55754c730c8383dad8f10d7b87397d1cb6b2b944c87381bb3b/virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55", size = 6013327 }, ] +[[package]] +name = "wandb" +version = "0.18.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "docker-pycreds" }, + { name = "gitpython" }, + { name = "platformdirs" }, + { name = "protobuf" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "setproctitle" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/e4/ca1da2dde43886e7daf2260e4dcbd4daed9b00599ee12432cadc2dab4ca3/wandb-0.18.1.tar.gz", hash = "sha256:d625e94d53ff4ff961c58a9a17f0a1ea35720d98b9db710a458235924469fc6b", size = 6238045 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/5b/ab5c2e69c9f49fdea2f83c3f8e15d9388a92e9c5639dd3618a2d5d5cd144/wandb-0.18.1-py3-none-any.whl", hash = "sha256:be936a193eeb940ce03d966f013b847562497e76256852d5fb170cdcdf50f185", size = 5125929 }, + { url = "https://files.pythonhosted.org/packages/f5/8d/298e1a8e1c101894b0805e197667d910e3c0ed46ce537d26c5d3ec1081f1/wandb-0.18.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:1f143b814b0fd51b5f1a676ad8b66bd06a5ee4ad22fc46bcbf24048d76c77d35", size = 6636762 }, + { url = "https://files.pythonhosted.org/packages/aa/fc/6832f3546ee43db973748dd0153a1e6c11b1af5cf29bc1187498620f83f3/wandb-0.18.1-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:86b73a9f94f18b07f0e937ae945560244b560b57c16a9dfb8f03e2516d0cc666", size = 6708580 }, + { url = "https://files.pythonhosted.org/packages/dd/66/5c5e76b0c5a0016d9b935e961ce4444ec280af43af7512258490533630d9/wandb-0.18.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc404682ebfb2477b48cb436a331e1bea0262e002d6fb3ccafe71d13657dd4ee", size = 9281298 }, + { url = "https://files.pythonhosted.org/packages/a8/64/6b1549a02151c3b8426e54fc7011733fa284483151a0189c85b309c9ec4e/wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4c97d69242efd604c1a2077c8b56341e236cfaca78c40f59dcef9b95464fdc", size = 9663908 }, + { url = 
"https://files.pythonhosted.org/packages/85/5e/bbf9937120b6a95cf859179eb77eee7bde7f63c365efa23c36ca2331c579/wandb-0.18.1-py3-none-win32.whl", hash = "sha256:33c5a0d74bc28879917b519f24d69b0e81530d72e99aba1c115189a2c9aac9cf", size = 6787975 }, + { url = "https://files.pythonhosted.org/packages/63/a8/397fb9a7d6e78136efd6765744f7a992c3c9a119f13448ded2b4885b88e7/wandb-0.18.1-py3-none-win_amd64.whl", hash = "sha256:559cbd6e9ab752622f7d6dacdc334ede7f1bc34f42df3f48ed32bde55db42c6e", size = 6787977 }, +] + [[package]] name = "xxhash" version = "3.5.0" @@ -1598,6 +1792,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "datasets" }, + { name = "einops" }, { name = "numpy" }, { name = "pydantic-config" }, { name = "setuptools" }, @@ -1605,6 +1800,11 @@ dependencies = [ { name = "transformers" }, ] +[package.optional-dependencies] +all = [ + { name = "wandb" }, +] + [package.dev-dependencies] dev = [ { name = "pre-commit" }, @@ -1615,11 +1815,13 @@ dev = [ [package.metadata] requires-dist = [ { name = "datasets", specifier = ">=3.0.0" }, + { name = "einops" }, { name = "numpy" }, - { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=v0.2" }, + { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=48aa6b9" }, { name = "setuptools" }, { name = "torch", specifier = "==2.4.1" }, { name = "transformers", specifier = ">=4.44.2" }, + { name = "wandb", marker = "extra == 'all'" }, ] [package.metadata.requires-dev] From b4e4760d9431667b25ff30e97248d3eb0881df31 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 19:20:06 +0000 Subject: [PATCH 03/19] add torchrun test --- configs/debug.toml | 2 +- pyproject.toml | 2 +- src/zeroband/models/llama/__init__.py | 2 +- src/zeroband/train.py | 10 +- tests/test_torchrun/test_train | 206 ++++++++++++++++++++++++++ tests/test_torchrun/test_train.py | 45 ++++++ uv.lock | 4 +- 7 files changed, 265 insertions(+), 6 deletions(-) create mode 100644 tests/test_torchrun/test_train create mode 100644 tests/test_torchrun/test_train.py diff --git a/configs/debug.toml b/configs/debug.toml index c64d8c05..2a9bea2e 100644 --- a/configs/debug.toml +++ b/configs/debug.toml @@ -5,6 +5,6 @@ project = "debug" micro_bs = 8 [optim] -batch_size = 64 +batch_size = 16 warmup_steps = 10 total_steps = 5000 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e157efa2..f5b1a711 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dependencies = [ "setuptools", "transformers>=4.44.2", "datasets>=3.0.0", - "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@48aa6b9", + "pydantic_config @ git+https://github.com/samsja/pydantic_config.git@e529c9c", "einops" ] diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py index dfc7ae71..ce3a676f 100644 --- a/src/zeroband/models/llama/__init__.py +++ b/src/zeroband/models/llama/__init__.py @@ -12,7 +12,7 @@ __all__ = ["Transformer"] llama2_configs = { - "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16), + "debugmodel": ModelArgs(dim=256, n_layers=2, n_heads=8), "150M": ModelArgs(dim=1024, n_layers=12, n_heads=16), # todo(sami): double check this "271M": ModelArgs(dim=1024, n_layers=16, n_heads=8), "1B": ModelArgs(dim=2048, n_layers=18, n_heads=16), diff --git a/src/zeroband/train.py b/src/zeroband/train.py index bdb5b5c0..ad6dbbfa 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -24,6 +24,11 @@ from zeroband.models.llama import 
llama2_configs, llama3_configs, Transformer +### TODO + +# use torch.idst.local rank instead of env var +# fix logger + local_rank = int(os.getenv("LOCAL_RANK", 0)) if local_rank == 0: @@ -155,7 +160,10 @@ def train(config: Config): logger.info("starting training") while True: - logger.info(f"outer_step step: {outer_step}") + + if num_inner_steps > 1: + # if we don't use diloco we don't print the outer step logs + logger.info(f"outer_step step: {outer_step}") for inner_step in range(num_inner_steps): loss_batch = 0 diff --git a/tests/test_torchrun/test_train b/tests/test_torchrun/test_train new file mode 100644 index 00000000..3295d5f5 --- /dev/null +++ b/tests/test_torchrun/test_train @@ -0,0 +1,206 @@ +import pickle +import subprocess +import numpy as np +import pytest +import socket +from hivemind.dht.dht import DHT +from open_diloco.ckpt_utils import CKPT_PREFIX + + +def get_random_available_port(): + # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +@pytest.fixture(scope="session") +def random_available_port(): + return get_random_available_port() + + +@pytest.fixture +def config() -> list[str]: + return [ + "--path_model", + "tests/models/llama-2m-fresh", + "--fake_data", + "--no-torch_compile", + "--lr", + "1e-2", + "--per_device_train_batch_size", + "8", + "--total_batch_size", + "16", + "--max_steps", + "50", + "--metric_logger_type", + "dummy", + ] + + +@pytest.mark.parametrize("num_gpu", [2]) +def test_multi_gpu_ckpt(config, random_available_port, num_gpu, tmp_path): + ckpt_path = f"{tmp_path}/ckpt" + log_file_1 = f"{tmp_path}/log1.json" + log_file_2 = f"{tmp_path}/log2.json" + + run_1 = ["--ckpt.path", ckpt_path, "--ckpt.interval", "10", "--project", log_file_1] + + cmd = [ + "torchrun", + f"--nproc_per_node={num_gpu}", + "--rdzv-endpoint", + f"localhost:{random_available_port}", + "open_diloco/train_fsdp.py", + *config, + ] + + result = subprocess.run(cmd + run_1) + + if result.returncode != 0: + pytest.fail(f"Process {result} failed {result.stderr}") + + run_2 = ["--ckpt.path", ckpt_path, "--ckpt.resume", f"{ckpt_path}/{CKPT_PREFIX}_20", "--project", log_file_2] + + results_resume = subprocess.run(cmd + run_2) + + if results_resume.returncode != 0: + pytest.fail(f"Process {result} failed {result.stderr}") + + with open(log_file_1, "rb") as f: + log1 = pickle.load(f) + with open(log_file_2, "rb") as f: + log2 = pickle.load(f) + + log1 = {data["step"]: [data["Loss"], data["lr"]] for data in log1} + log2 = {data["step"]: [data["Loss"], data["lr"]] for data in log2} + + common_step = set(log1.keys()) & set(log2.keys()) + + for step in common_step: + assert np.allclose(log1[step][0], log2[step][0], atol=1e-3), f"Loss at step {step} is different" + assert log1[step][1] == log2[step][1], f"Lr at step {step} is different" + + +@pytest.fixture +def config_hv() -> list[str]: + config = [ + "--path_model", + "tests/models/llama-2m-fresh", + "--fake_data", + "--no-torch_compile", + "--lr", + "1e-2", + "--per_device_train_batch_size", + "8", + "--total_batch_size", + "16", + "--max_steps", + "100", + "--metric_logger_type", + "dummy", + ] + + return config + [ + "--hv.local_steps", + "25", + "--hv.skip_load_from_peers", + "--hv.fail_rank_drop", + "--hv.matchmaking_time", + "5", + ] + + +@pytest.mark.parametrize("num_diloco", [2]) +def test_multi_gpu_hivemind(config_hv, num_diloco, tmp_path): + dht = DHT( + start=True, + 
host_maddrs=[f"/ip4/0.0.0.0/tcp/{get_random_available_port()}"], + ) + + initial_peers = str(dht.get_visible_maddrs()[0]) + + results = [] + + ckpt_path = f"{tmp_path}/ckpt" + + def get_base_cmd(i, initial_peers): + return [ + "torchrun", + f"--nproc_per_node={1}", + "--rdzv-endpoint", + f"localhost:{port}", + "open_diloco/train_fsdp.py", + *config_hv, + "--hv.initial_peers", + initial_peers, + "--hv.world_rank", + str(i), + "--hv.galaxy_size", + str(num_diloco), + ] + + for i in range(num_diloco): + port = get_random_available_port() + + cmd = get_base_cmd(i, initial_peers) + [ + "--ckpt.path", + ckpt_path, + "--ckpt.interval", + "25", + "--project", + f"{tmp_path}/log{i}_part1.json", + ] + + result = subprocess.Popen(cmd) + results.append(result) + + for result in results: + result.wait() + if result.returncode != 0: + pytest.fail(f"Process {result} failed {result.stderr}") + + # resume from ckpt + + dht.shutdown() + + del dht + dht = DHT( + start=True, + host_maddrs=[f"/ip4/0.0.0.0/tcp/{get_random_available_port()}"], + ) + initial_peers = str(dht.get_visible_maddrs()[0]) + + for i in range(num_diloco): + port = get_random_available_port() + + cmd = get_base_cmd(i, initial_peers) + [ + "--ckpt.resume", + f"{ckpt_path}/{CKPT_PREFIX}_50", + "--project", + f"{tmp_path}/log{i}_part2.json", + ] + + result = subprocess.Popen(cmd) + results.append(result) + + for result in results: + result.wait() + if result.returncode != 0: + pytest.fail(f"Process {result} failed {result.stderr}") + + for i in range(num_diloco): + with open(f"{tmp_path}/log{i}_part1.json", "rb") as f: + log1 = pickle.load(f) + with open(f"{tmp_path}/log{i}_part2.json", "rb") as f: + log2 = pickle.load(f) + + log1 = {data["step"]: [data["Loss"], data["lr"]] for data in log1} + log2 = {data["step"]: [data["Loss"], data["lr"]] for data in log2} + + common_step = set(log1.keys()) & set(log2.keys()) + + for step in common_step: + assert np.allclose(log1[step][0], log2[step][0], atol=1e-2), f"Loss at step {step} is different" + assert log1[step][1] == log2[step][1], f"Lr at step {step} is different" diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py new file mode 100644 index 00000000..211c5f2a --- /dev/null +++ b/tests/test_torchrun/test_train.py @@ -0,0 +1,45 @@ +import pickle +import subprocess +import numpy as np +import pytest +import socket + + +def get_random_available_port(): + # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +@pytest.fixture() +def random_available_port(): + return get_random_available_port() + + + +@pytest.fixture() +def config_path() -> str: + # need to be executed in the root dir + return "configs/debug.toml" + + + +@pytest.mark.parametrize("num_gpu", [1, 2]) +def test_multi_gpu_ckpt(config_path, random_available_port, num_gpu): + + cmd = [ + "torchrun", + f"--nproc_per_node={num_gpu}", + "--rdzv-endpoint", + f"localhost:{random_available_port}", + "src/zeroband/train.py", + f"@{config_path}", + "--optim.total_steps", + "10" + ] + + result = subprocess.run(cmd) + + if result.returncode != 0: + pytest.fail(f"Process {result} failed {result.stderr}") \ No newline at end of file diff --git a/uv.lock b/uv.lock index cee36787..f0ec766e 100644 --- a/uv.lock +++ b/uv.lock @@ -976,7 +976,7 @@ wheels = [ [[package]] name = "pydantic-config" version = "0.2.0" -source = { git = 
"https://github.com/samsja/pydantic_config.git?rev=48aa6b9#48aa6b9d2bc6aa0e4d72a919d07e808802e558de" } +source = { git = "https://github.com/samsja/pydantic_config.git?rev=e529c9c#e529c9ca7f3bd5581e2e8bab013faa6d2996810a" } dependencies = [ { name = "pydantic" }, { name = "rich" }, @@ -1817,7 +1817,7 @@ requires-dist = [ { name = "datasets", specifier = ">=3.0.0" }, { name = "einops" }, { name = "numpy" }, - { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=48aa6b9" }, + { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=e529c9c" }, { name = "setuptools" }, { name = "torch", specifier = "==2.4.1" }, { name = "transformers", specifier = ">=4.44.2" }, From fa9fd110c3879cfeeddc1f35d4750583a87ae297 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 19:20:24 +0000 Subject: [PATCH 04/19] add torchrun test --- tests/test_torchrun/test_train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py index 211c5f2a..83644bdd 100644 --- a/tests/test_torchrun/test_train.py +++ b/tests/test_torchrun/test_train.py @@ -1,6 +1,4 @@ -import pickle import subprocess -import numpy as np import pytest import socket From e94d3c0989a898d8912ce8c221ced2160b2daefc Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 19:45:41 +0000 Subject: [PATCH 05/19] add tests config --- tests/test_configs.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/test_configs.py diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 00000000..4427e2a0 --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,19 @@ +""" +Tests all of the config file. usefull to catch mismatch key after a renaming of a arg name +Need to be run from the root folder +""" + +import os +from zeroband.train import Config +import pytest +import tomli + +config_file_names = [file for file in os.listdir("configs") if file.endswith(".toml")] + +@pytest.mark.parametrize("config_file_name", config_file_names) +def test_load_config(config_file_name): + with open(f"configs/{config_file_name}", "rb") as f: + content = tomli.load(f) + config = Config(**content) + assert config is not None + From 2feb040b885447ca12b565cfb6940e568d83f32c Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 19:58:53 +0000 Subject: [PATCH 06/19] use world info --- src/zeroband/train.py | 29 +++++++++++------------------ src/zeroband/utils/world_info.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 src/zeroband/utils/world_info.py diff --git a/src/zeroband/train.py b/src/zeroband/train.py index ad6dbbfa..ff020ff4 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -1,6 +1,5 @@ import os from contextlib import nullcontext -import datetime import logging # Added logging import from typing import Literal @@ -22,16 +21,15 @@ from zeroband.utils.monitor import WandbMonitor, DummyMonitor from zeroband.data import TEST_VOCAB_SIZE, get_dataloader from zeroband.models.llama import llama2_configs, llama3_configs, Transformer +from zeroband.utils.world_info import WorldInfo ### TODO - -# use torch.idst.local rank instead of env var # fix logger -local_rank = int(os.getenv("LOCAL_RANK", 0)) +world_info = WorldInfo() -if local_rank == 0: +if world_info.local_rank == 0: log_level = os.getenv("ZERO_BAND_LOG_LEVEL", "INFO") logging.basicConfig(level=getattr(logging, log_level, logging.INFO)) else: @@ -42,8 
+40,7 @@ # Function to initialize the distributed process group def ddp_setup(): init_process_group() - torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) - + torch.cuda.set_device(world_info.local_rank) class DilocoConfig(BaseConfig): outer_lr: float = 0.7 @@ -105,14 +102,10 @@ def get_model(name_model: str, type_model: str, tokenizer: AutoTokenizer) -> Tra def train(config: Config): sharding_strategy = get_sharding_strategy(config.train.sharding_strategy) - local_rank = int(os.environ["LOCAL_RANK"]) - world_size = int(os.environ["WORLD_SIZE"]) - local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) - rank = int(os.environ["RANK"]) # batch_size is the total batch size for all GPUs - assert config.optim.batch_size % local_world_size == 0 - batch_size = config.optim.batch_size // local_world_size + assert config.optim.batch_size % world_info.local_world_size == 0 + batch_size = config.optim.batch_size // world_info.local_world_size assert batch_size % config.train.micro_bs == 0 gradient_accumulation_steps = batch_size // config.train.micro_bs @@ -121,10 +114,10 @@ def train(config: Config): tokenizer.pad_token = "" # todo(sami): remove padding tokens once we have context stuffing logger.debug("tokenizer loaded") - train_dataloader = get_dataloader(tokenizer.pad_token_id, world_size, rank, config.data.seq_length, config.train.micro_bs, config.data.num_workers) + train_dataloader = get_dataloader(tokenizer.pad_token_id, world_info.world_size, world_info.rank, config.data.seq_length, config.train.micro_bs, config.data.num_workers) model = get_model(config.name_model, config.type_model, tokenizer=tokenizer) - model = model.to(local_rank) + model = model.to(world_info.local_rank) logger.debug("model loaded") model = FSDP( @@ -149,7 +142,7 @@ def train(config: Config): model.train() - if rank == 0: + if world_info.rank == 0: logger_cls = WandbMonitor if config.metric_logger_type == "wandb" else DummyMonitor metric_logger = logger_cls(project=config.project, config=config.model_dump(), resume=False) @@ -198,7 +191,7 @@ def train(config: Config): "inner_lr": inner_lr, } - if rank == 0: + if world_info.rank == 0: metric_logger.log(metrics) logger.info(f"step: {real_step}, loss: {loss_batch.item()}, inner_lr: {inner_lr}") @@ -211,7 +204,7 @@ def train(config: Config): # Since ckpt strategy and all reduce is done at the outer loop level. 
break - if rank == 0: + if world_info.rank == 0: metric_logger.finish() diff --git a/src/zeroband/utils/world_info.py b/src/zeroband/utils/world_info.py new file mode 100644 index 00000000..c3bfe22d --- /dev/null +++ b/src/zeroband/utils/world_info.py @@ -0,0 +1,14 @@ +import os + +class WorldInfo: + """This class parse env var about torch world into class variables.""" + world_size: int + rank: int + local_rank: int + local_world_size: int + + def __init__(self): + self.world_size = int(os.environ["WORLD_SIZE"]) + self.rank = int(os.environ["RANK"]) + self.local_rank = int(os.environ["LOCAL_RANK"]) + self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) \ No newline at end of file From c710fe83f3c849422c3fdc04d57a85dd6395fe98 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 21:40:10 +0000 Subject: [PATCH 07/19] add proper logging --- src/zeroband/models/llama/__init__.py | 16 ++++++++++- src/zeroband/train.py | 38 ++++++-------------------- src/zeroband/utils/logging.py | 39 +++++++++++++++++++++++++++ src/zeroband/utils/world_info.py | 14 +++++++++- 4 files changed, 75 insertions(+), 32 deletions(-) create mode 100644 src/zeroband/utils/logging.py diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py index ce3a676f..6bfdae29 100644 --- a/src/zeroband/models/llama/__init__.py +++ b/src/zeroband/models/llama/__init__.py @@ -58,4 +58,18 @@ multiple_of=4096, rope_theta=500000, ), -} \ No newline at end of file +} + +def get_model(name_model: str, type_model: str, vocab_size: int) -> Transformer: + """get the transformer model""" + + if type_model == "llama2": + config = llama2_configs[name_model] + elif type_model == "llama3": + config = llama3_configs[name_model] + else: + raise ValueError(f"Model type {type_model} not supported") + + config.vocab_size = vocab_size + return Transformer(config) + diff --git a/src/zeroband/train.py b/src/zeroband/train.py index ff020ff4..b5159dc2 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -1,6 +1,5 @@ import os from contextlib import nullcontext -import logging # Added logging import from typing import Literal import torch @@ -20,23 +19,11 @@ from zeroband.utils import get_sharding_strategy from zeroband.utils.monitor import WandbMonitor, DummyMonitor from zeroband.data import TEST_VOCAB_SIZE, get_dataloader -from zeroband.models.llama import llama2_configs, llama3_configs, Transformer -from zeroband.utils.world_info import WorldInfo +from zeroband.models.llama import get_model +from zeroband.utils.world_info import get_world_info +from zeroband.utils.logging import get_logger -### TODO -# fix logger - -world_info = WorldInfo() - -if world_info.local_rank == 0: - log_level = os.getenv("ZERO_BAND_LOG_LEVEL", "INFO") - logging.basicConfig(level=getattr(logging, log_level, logging.INFO)) -else: - logging.basicConfig(level=logging.CRITICAL) # Disable logging for non-zero ranks - -logger = logging.getLogger(__name__) - # Function to initialize the distributed process group def ddp_setup(): init_process_group() @@ -87,19 +74,6 @@ class Config(BaseConfig): -def get_model(name_model: str, type_model: str, tokenizer: AutoTokenizer) -> Transformer: - """get the transformer model""" - - if type_model == "llama2": - config = llama2_configs[name_model] - elif type_model == "llama3": - config = llama3_configs[name_model] - else: - raise ValueError(f"Model type {type_model} not supported") - - config.vocab_size = tokenizer.vocab_size if name_model != "debugmodel" else TEST_VOCAB_SIZE - 
return Transformer(config) - def train(config: Config): sharding_strategy = get_sharding_strategy(config.train.sharding_strategy) @@ -116,7 +90,7 @@ def train(config: Config): logger.debug("tokenizer loaded") train_dataloader = get_dataloader(tokenizer.pad_token_id, world_info.world_size, world_info.rank, config.data.seq_length, config.train.micro_bs, config.data.num_workers) - model = get_model(config.name_model, config.type_model, tokenizer=tokenizer) + model = get_model(config.name_model, config.type_model, vocab_size=tokenizer.vocab_size if config.name_model != "debugmodel" else TEST_VOCAB_SIZE) model = model.to(world_info.local_rank) logger.debug("model loaded") @@ -213,6 +187,10 @@ def train(config: Config): # However, in development, we want to know that we broke torch compile torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ torch.set_float32_matmul_precision("high") + + world_info = get_world_info() + logger = get_logger() + ddp_setup() config = Config(**parse_argv()) diff --git a/src/zeroband/utils/logging.py b/src/zeroband/utils/logging.py new file mode 100644 index 00000000..de6a4ff8 --- /dev/null +++ b/src/zeroband/utils/logging.py @@ -0,0 +1,39 @@ +import logging +import os + +from zeroband.utils.world_info import get_world_info + +logger = None + +class CustomFormatter(logging.Formatter): + def __init__(self, local_rank: int): + super().__init__() + self.local_rank = local_rank + + def format(self, record): + log_format = "{asctime} [{levelname}] [Rank {local_rank}] {message}" + formatter = logging.Formatter(log_format, style='{', datefmt="%H:%M:%S") + record.local_rank = self.local_rank # Add this line to set the local rank in the record + return formatter.format(record) + +def get_logger(): + global logger # Add this line to modify the global logger variable + if logger is not None: + return logger + + world_info = get_world_info() + logger = logging.getLogger(__name__) + + if world_info.local_rank == 0: + log_level = os.getenv("ZERO_BAND_LOG_LEVEL", "INFO") + logging.basicConfig(level=getattr(logging, log_level, logging.INFO)) + else: + logging.basicConfig(level=logging.CRITICAL) # Disable logging for non-zero ranks + + handler = logging.StreamHandler() + handler.setFormatter(CustomFormatter(world_info.local_rank)) + logger.addHandler(handler) + logger.propagate = False # Prevent the log messages from being propagated to the root logger + + return logger + diff --git a/src/zeroband/utils/world_info.py b/src/zeroband/utils/world_info.py index c3bfe22d..c21c74d9 100644 --- a/src/zeroband/utils/world_info.py +++ b/src/zeroband/utils/world_info.py @@ -1,5 +1,7 @@ import os +world_info = None + class WorldInfo: """This class parse env var about torch world into class variables.""" world_size: int @@ -11,4 +13,14 @@ def __init__(self): self.world_size = int(os.environ["WORLD_SIZE"]) self.rank = int(os.environ["RANK"]) self.local_rank = int(os.environ["LOCAL_RANK"]) - self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) \ No newline at end of file + self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + +def get_world_info() -> WorldInfo: + """ + Return a WorldInfo singleton. 
+ """ + global world_info + if world_info is None: + world_info = WorldInfo() + return world_info + From 7108546f8c6cc7534d52e159cfe96144c6adfe94 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 21:53:34 +0000 Subject: [PATCH 08/19] update readme --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 094d95e4..c8d6d172 100644 --- a/README.md +++ b/README.md @@ -16,3 +16,22 @@ run your code using ```bash uv run ... ``` + +## quick check + +To check that everything is working you can do + +```bash +ZERO_BAND_LOG_LEVEL=DEBUG torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug.toml +``` + +## run test + +You need a machine with at least two gpus to run the full test suite. + +Some tests must be run from the root directory. + +```bash +uv run pytest +``` + From 08e02a4eb2df9af899e92d21adcf8e0434a8e34b Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 21:55:21 +0000 Subject: [PATCH 09/19] chore: change docstring --- src/zeroband/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zeroband/train.py b/src/zeroband/train.py index b5159dc2..aa529cfd 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -24,8 +24,10 @@ from zeroband.utils.logging import get_logger -# Function to initialize the distributed process group def ddp_setup(): + """ + Initialize the distributed process group. + """ init_process_group() torch.cuda.set_device(world_info.local_rank) From 0e4c2b2b019ca1a01339e4af75f5e454f67b021e Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 21:58:02 +0000 Subject: [PATCH 10/19] add noqa triton kernel --- src/zeroband/models/norms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zeroband/models/norms.py b/src/zeroband/models/norms.py index 72fb225d..e57e2cf2 100644 --- a/src/zeroband/models/norms.py +++ b/src/zeroband/models/norms.py @@ -248,7 +248,7 @@ def forward(ctx, x, weight, eps): if N > block_N: raise ValueError(f"N {N} must be <= {block_N=}") - grid = lambda meta: (M,) + grid = lambda meta: (M,) # noqa: E731 _rms_norm_fwd_kernel[grid]( x, x.stride(0), @@ -298,7 +298,7 @@ def backward(ctx, dy): if N > block_N: raise ValueError(f"N {N} must be <= {block_N=}") - grid = lambda meta: (sm_count,) + grid = lambda meta: (sm_count,) # noqa: E731 _rms_norm_bwd_kernel_sm[grid]( x, x.stride(0), From a145583f86d6ce3731e3ae63b1593a49e92bbb72 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 22:34:40 +0000 Subject: [PATCH 11/19] add 150m config --- configs/150m.toml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 configs/150m.toml diff --git a/configs/150m.toml b/configs/150m.toml new file mode 100644 index 00000000..f0ff1446 --- /dev/null +++ b/configs/150m.toml @@ -0,0 +1,15 @@ +name_model = "150M" +project = "debug_150m_zero_band" + +[train] +micro_bs = 16 # change this base on the gpu + +# 16 for 3090/4090 (24gb) +# 32 for a40/a100 (48gb/40gb) +# 64 for h100/a100 (80gb) + +[optim] +batch_size = 512 +warmup_steps = 1000 +total_steps = 88_000 +lr = 4e-4 \ No newline at end of file From 27669b3268f026fafdc0378878da221bc0e7cfa5 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 22:35:32 +0000 Subject: [PATCH 12/19] use grad shard op as default strategy --- configs/{150m.toml
=> 150M/3090.toml} | 5 +---- configs/150M/A40.toml | 12 ++++++++++++ configs/150M/H100.toml | 12 ++++++++++++ src/zeroband/train.py | 2 +- 4 files changed, 26 insertions(+), 5 deletions(-) rename configs/{150m.toml => 150M/3090.toml} (68%) create mode 100644 configs/150M/A40.toml create mode 100644 configs/150M/H100.toml diff --git a/configs/150m.toml b/configs/150M/3090.toml similarity index 68% rename from configs/150m.toml rename to configs/150M/3090.toml index f0ff1446..866d6054 100644 --- a/configs/150m.toml +++ b/configs/150M/3090.toml @@ -3,10 +3,7 @@ project = "debug_150m_zero_band" [train] micro_bs = 16 # change this base on the gpu - -# 16 for 3090/4090 (24gb) -# 32 for a40/a100 (48gb/40gb) -# 64 for h100/a100 (80gb) +sharding_strategy = "NO_SHARD" [optim] batch_size = 512 diff --git a/configs/150M/A40.toml b/configs/150M/A40.toml new file mode 100644 index 00000000..e7799417 --- /dev/null +++ b/configs/150M/A40.toml @@ -0,0 +1,12 @@ +name_model = "150M" +project = "debug_150m_zero_band" + +[train] +micro_bs = 32 # change this base on the gpu +sharding_strategy = "NO_SHARD" + +[optim] +batch_size = 512 +warmup_steps = 1000 +total_steps = 88_000 +lr = 4e-4 \ No newline at end of file diff --git a/configs/150M/H100.toml b/configs/150M/H100.toml new file mode 100644 index 00000000..49a65475 --- /dev/null +++ b/configs/150M/H100.toml @@ -0,0 +1,12 @@ +name_model = "150M" +project = "debug_150m_zero_band" + +[train] +micro_bs = 64 # change this base on the gpu +sharding_strategy = "NO_SHARD" + +[optim] +batch_size = 512 +warmup_steps = 1000 +total_steps = 88_000 +lr = 4e-4 \ No newline at end of file diff --git a/src/zeroband/train.py b/src/zeroband/train.py index aa529cfd..df7037a2 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -55,7 +55,7 @@ class OptimConfig(BaseConfig): class TrainConfig(BaseConfig): micro_bs: int torch_compile: bool = True - sharding_strategy: str = "FULL_SHARD" + sharding_strategy: str = "SHARD_GRAD_OP" class Config(BaseConfig): From 93be8f6a7995d06ee7f265c8710516db4007e320 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 23:41:06 +0000 Subject: [PATCH 13/19] apply ruff format --- src/zeroband/data.py | 10 +++--- src/zeroband/models/llama/__init__.py | 6 ++-- src/zeroband/models/llama/model.py | 34 +++++-------------- src/zeroband/models/norms.py | 4 +-- src/zeroband/train.py | 49 ++++++++++++++++++--------- src/zeroband/utils/logging.py | 5 +-- src/zeroband/utils/monitor.py | 8 +++-- src/zeroband/utils/world_info.py | 4 ++- tests/test_configs.py | 2 +- tests/test_model.py | 5 +-- tests/test_torchrun/test_train.py | 9 ++--- 11 files changed, 71 insertions(+), 65 deletions(-) diff --git a/src/zeroband/data.py b/src/zeroband/data.py index cf3c522d..3619bf77 100644 --- a/src/zeroband/data.py +++ b/src/zeroband/data.py @@ -1,4 +1,3 @@ - from functools import partial from typing import Any, Generator @@ -32,7 +31,7 @@ def collate_causal_mask(max_seq_length: int = -1, pad_id: int = 0, ignore_index: def _collate_fn_causal_mask( samples: list[dict[str, torch.LongTensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -100 ) -> dict[str, torch.LongTensor]: - """collate function for causal mask. Fill with padding tokens if sequence is shorter than max_seq_length. + """collate function for causal mask. Fill with padding tokens if sequence is shorter than max_seq_length. input_ids and labels are both of size max_seq_length. 
""" @@ -57,11 +56,13 @@ def _collate_fn_causal_mask( return {"input_ids": torch.stack(batched["input_ids"], dim=0), "labels": torch.stack(batched["labels"], dim=0)} -def get_dataloader(pad_token_id: int, world_size: int, rank: int, seq_length: int, batch_size: int, num_workers: int) -> DataLoader: +def get_dataloader( + pad_token_id: int, world_size: int, rank: int, seq_length: int, batch_size: int, num_workers: int +) -> DataLoader: """ Get a pytorch dataloader to train on """ - #todo add real dataset and world splitting + # todo add real dataset and world splitting train_dataset = FakeTokenizedDataset(seq_length, TEST_VOCAB_SIZE) data_collator = collate_causal_mask(max_seq_length=seq_length, pad_id=pad_token_id, ignore_index=-100) @@ -71,4 +72,3 @@ def get_dataloader(pad_token_id: int, world_size: int, rank: int, seq_length: in batch_size=batch_size, num_workers=num_workers, ) - diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py index 6bfdae29..c0103fcf 100644 --- a/src/zeroband/models/llama/__init__.py +++ b/src/zeroband/models/llama/__init__.py @@ -13,7 +13,7 @@ llama2_configs = { "debugmodel": ModelArgs(dim=256, n_layers=2, n_heads=8), - "150M": ModelArgs(dim=1024, n_layers=12, n_heads=16), # todo(sami): double check this + "150M": ModelArgs(dim=1024, n_layers=12, n_heads=16), # todo(sami): double check this "271M": ModelArgs(dim=1024, n_layers=16, n_heads=8), "1B": ModelArgs(dim=2048, n_layers=18, n_heads=16), "7B": ModelArgs(dim=4096, n_layers=32, n_heads=32), @@ -60,6 +60,7 @@ ), } + def get_model(name_model: str, type_model: str, vocab_size: int) -> Transformer: """get the transformer model""" @@ -69,7 +70,6 @@ def get_model(name_model: str, type_model: str, vocab_size: int) -> Transformer: config = llama3_configs[name_model] else: raise ValueError(f"Model type {type_model} not supported") - + config.vocab_size = vocab_size return Transformer(config) - diff --git a/src/zeroband/models/llama/model.py b/src/zeroband/models/llama/model.py index a08f85f4..7a20eda7 100644 --- a/src/zeroband/models/llama/model.py +++ b/src/zeroband/models/llama/model.py @@ -1,4 +1,4 @@ -# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py +# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py # the commit at time of copy paste was commit f2a1551 # Copyright (c) Meta Platforms, Inc. and affiliates. 
@@ -152,22 +152,14 @@ class Attention(nn.Module): def __init__(self, model_args: ModelArgs): super().__init__() self.n_heads = model_args.n_heads - self.n_kv_heads = ( - model_args.n_heads - if model_args.n_kv_heads is None - else model_args.n_kv_heads - ) + self.n_kv_heads = model_args.n_heads if model_args.n_kv_heads is None else model_args.n_kv_heads self.n_rep = self.n_heads // self.n_kv_heads self.head_dim = model_args.dim // model_args.n_heads - self.wq = nn.Linear( - model_args.dim, model_args.n_heads * self.head_dim, bias=False - ) + self.wq = nn.Linear(model_args.dim, model_args.n_heads * self.head_dim, bias=False) self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wo = nn.Linear( - model_args.n_heads * self.head_dim, model_args.dim, bias=False - ) + self.wo = nn.Linear(model_args.n_heads * self.head_dim, model_args.dim, bias=False) def init_weights(self, init_std: float): for linear in (self.wq, self.wk, self.wv): @@ -212,9 +204,7 @@ def forward( # we use casual mask for training output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True) - output = output.transpose( - 1, 2 - ).contiguous() # (bs, seqlen, n_local_heads, head_dim) + output = output.transpose(1, 2).contiguous() # (bs, seqlen, n_local_heads, head_dim) output = output.view(bs, seqlen, -1) return self.wo(output) @@ -297,12 +287,8 @@ def __init__(self, layer_id: int, model_args: ModelArgs): self.layer_id = layer_id self.num_layers = model_args.n_layers - self.attention_norm = build_norm( - model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps - ) - self.ffn_norm = build_norm( - model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps - ) + self.attention_norm = build_norm(model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps) + self.ffn_norm = build_norm(model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps) if model_args.depth_init: self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5 @@ -376,9 +362,7 @@ def __init__(self, model_args: ModelArgs): for layer_id in range(model_args.n_layers): self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args) - self.norm = build_norm( - model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps - ) + self.norm = build_norm(model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps) self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False) self.init_weights() @@ -457,4 +441,4 @@ def from_model_args(cls, model_args: ModelArgs) -> "Transformer": Transformer: Transformer model. """ - return cls(model_args) \ No newline at end of file + return cls(model_args) diff --git a/src/zeroband/models/norms.py b/src/zeroband/models/norms.py index e57e2cf2..cd5c2f81 100644 --- a/src/zeroband/models/norms.py +++ b/src/zeroband/models/norms.py @@ -1,4 +1,4 @@ -# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py +# this code is copy pasted from the torchtitan repo https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py # the commit at time of copy paste was commit f2a1551 # Copyright (c) Meta Platforms, Inc. and affiliates. 
@@ -330,4 +330,4 @@ def fused_rms_norm_fn( x, weight, eps, - ) \ No newline at end of file + ) diff --git a/src/zeroband/train.py b/src/zeroband/train.py index df7037a2..25af7d56 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -31,6 +31,7 @@ def ddp_setup(): init_process_group() torch.cuda.set_device(world_info.local_rank) + class DilocoConfig(BaseConfig): outer_lr: float = 0.7 inner_steps: int = 10 @@ -42,6 +43,7 @@ class DataConfig(BaseConfig): fake_data: bool = False num_workers: int = 4 + class OptimConfig(BaseConfig): lr: float = 4e-4 weight_decay: float = 0.1 @@ -52,6 +54,7 @@ class OptimConfig(BaseConfig): total_steps: int = 88_000 batch_size: int = 512 + class TrainConfig(BaseConfig): micro_bs: int torch_compile: bool = True @@ -59,21 +62,18 @@ class TrainConfig(BaseConfig): class Config(BaseConfig): - # main config name_model: Literal["debugmodel", "150M", "271M", "1B", "7B", "13B", "26B", "70B"] = "150M" - type_model: Literal["llama2","llama3"] = "llama2" + type_model: Literal["llama2", "llama3"] = "llama2" project: str = "zeroband" metric_logger_type: Literal["wandb", "dummy"] = "wandb" - # sub config diloco: DilocoConfig | None = None data: DataConfig = DataConfig() optim: OptimConfig = OptimConfig() train: TrainConfig - def train(config: Config): @@ -90,9 +90,20 @@ def train(config: Config): tokenizer.pad_token = "" # todo(sami): remove padding tokens once we have context stuffing logger.debug("tokenizer loaded") - train_dataloader = get_dataloader(tokenizer.pad_token_id, world_info.world_size, world_info.rank, config.data.seq_length, config.train.micro_bs, config.data.num_workers) + train_dataloader = get_dataloader( + tokenizer.pad_token_id, + world_info.world_size, + world_info.rank, + config.data.seq_length, + config.train.micro_bs, + config.data.num_workers, + ) - model = get_model(config.name_model, config.type_model, vocab_size=tokenizer.vocab_size if config.name_model != "debugmodel" else TEST_VOCAB_SIZE) + model = get_model( + config.name_model, + config.type_model, + vocab_size=tokenizer.vocab_size if config.name_model != "debugmodel" else TEST_VOCAB_SIZE, + ) model = model.to(world_info.local_rank) logger.debug("model loaded") @@ -108,13 +119,18 @@ def train(config: Config): logger.debug("model compiled and fsdped") # Setup optimizers - inner_optimizer = torch.optim.AdamW(model.parameters(), lr=config.optim.lr, weight_decay=config.optim.weight_decay, betas=(config.optim.adam_betas1, config.optim.adam_betas2)) + inner_optimizer = torch.optim.AdamW( + model.parameters(), + lr=config.optim.lr, + weight_decay=config.optim.weight_decay, + betas=(config.optim.adam_betas1, config.optim.adam_betas2), + ) scheduler = get_cosine_schedule_with_warmup( inner_optimizer, num_warmup_steps=config.optim.warmup_steps, num_training_steps=config.optim.total_steps, - ) + ) model.train() @@ -129,7 +145,6 @@ def train(config: Config): logger.info("starting training") while True: - if num_inner_steps > 1: # if we don't use diloco we don't print the outer step logs logger.info(f"outer_step step: {outer_step}") @@ -144,11 +159,13 @@ def train(config: Config): labels = batch["labels"].to("cuda") with model.no_sync() if is_accumulating else nullcontext(): - logits = model(tokens = input_ids).contiguous() + logits = model(tokens=input_ids).contiguous() flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab") flatten_labels = rearrange(labels, "b seq -> (b seq)") - loss = F.cross_entropy(flatten_logits, flatten_labels, ignore_index=-100) / gradient_accumulation_steps 
+ loss = ( + F.cross_entropy(flatten_logits, flatten_labels, ignore_index=-100) / gradient_accumulation_steps + ) loss.backward() loss_batch += loss.detach() @@ -158,12 +175,12 @@ def train(config: Config): inner_optimizer.zero_grad() # logging - real_step = outer_step * num_inner_steps + inner_step + 1 # add + 1 because inner_step start at 0 + real_step = outer_step * num_inner_steps + inner_step + 1 # add + 1 because inner_step start at 0 inner_lr = [group["lr"] for group in inner_optimizer.param_groups][0] metrics = { - "Loss": loss_batch.item(), # todo(sami): do local all reduce for the loss - "step": real_step, + "Loss": loss_batch.item(), # todo(sami): do local all reduce for the loss + "step": real_step, "inner_lr": inner_lr, } @@ -189,10 +206,10 @@ def train(config: Config): # However, in development, we want to know that we broke torch compile torch._dynamo.config.suppress_errors = "ZERO_BAND_DEV" not in os.environ torch.set_float32_matmul_precision("high") - + world_info = get_world_info() logger = get_logger() - + ddp_setup() config = Config(**parse_argv()) diff --git a/src/zeroband/utils/logging.py b/src/zeroband/utils/logging.py index de6a4ff8..885ae3cd 100644 --- a/src/zeroband/utils/logging.py +++ b/src/zeroband/utils/logging.py @@ -5,6 +5,7 @@ logger = None + class CustomFormatter(logging.Formatter): def __init__(self, local_rank: int): super().__init__() @@ -12,10 +13,11 @@ def __init__(self, local_rank: int): def format(self, record): log_format = "{asctime} [{levelname}] [Rank {local_rank}] {message}" - formatter = logging.Formatter(log_format, style='{', datefmt="%H:%M:%S") + formatter = logging.Formatter(log_format, style="{", datefmt="%H:%M:%S") record.local_rank = self.local_rank # Add this line to set the local rank in the record return formatter.format(record) + def get_logger(): global logger # Add this line to modify the global logger variable if logger is not None: @@ -36,4 +38,3 @@ def get_logger(): logger.propagate = False # Prevent the log messages from being propagated to the root logger return logger - diff --git a/src/zeroband/utils/monitor.py b/src/zeroband/utils/monitor.py index 64fc9c02..532515ef 100644 --- a/src/zeroband/utils/monitor.py +++ b/src/zeroband/utils/monitor.py @@ -2,6 +2,7 @@ from typing import Any, Protocol import importlib + class Monitor(Protocol): def __init__(self, project, config): ... @@ -14,18 +15,21 @@ class WandbMonitor: def __init__(self, project, config, resume: bool): if importlib.util.find_spec("wandb") is None: raise ImportError("wandb is not installed. 
Please install it to use WandbMonitor.") - + import wandb + wandb.init( project=project, config=config, resume="auto" if resume else None ) # make wandb reuse the same run id if possible def log(self, metrics: dict[str, Any]): import wandb + wandb.log(metrics) def finish(self): import wandb + wandb.finish() @@ -42,4 +46,4 @@ def log(self, metrics: dict[str, Any]): def finish(self): with open(self.project, "wb") as f: - pickle.dump(self.data, f) \ No newline at end of file + pickle.dump(self.data, f) diff --git a/src/zeroband/utils/world_info.py b/src/zeroband/utils/world_info.py index c21c74d9..6ab3780f 100644 --- a/src/zeroband/utils/world_info.py +++ b/src/zeroband/utils/world_info.py @@ -2,8 +2,10 @@ world_info = None + class WorldInfo: """This class parse env var about torch world into class variables.""" + world_size: int rank: int local_rank: int @@ -15,6 +17,7 @@ def __init__(self): self.local_rank = int(os.environ["LOCAL_RANK"]) self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + def get_world_info() -> WorldInfo: """ Return a WorldInfo singleton. @@ -23,4 +26,3 @@ def get_world_info() -> WorldInfo: if world_info is None: world_info = WorldInfo() return world_info - diff --git a/tests/test_configs.py b/tests/test_configs.py index 4427e2a0..0873750c 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -10,10 +10,10 @@ config_file_names = [file for file in os.listdir("configs") if file.endswith(".toml")] + @pytest.mark.parametrize("config_file_name", config_file_names) def test_load_config(config_file_name): with open(f"configs/{config_file_name}", "rb") as f: content = tomli.load(f) config = Config(**content) assert config is not None - diff --git a/tests/test_model.py b/tests/test_model.py index 017448ea..8e7ef928 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -5,12 +5,14 @@ VOCAB_SIZE = 1024 + @pytest.fixture def llama_config(): - config = llama2_configs["debugmodel"] + config = llama2_configs["debugmodel"] config.vocab_size = VOCAB_SIZE return config + def test_llama(llama_config): seq_len = 512 bs = 8 @@ -18,4 +20,3 @@ def test_llama(llama_config): input_ = torch.randint(0, llama_config.vocab_size, (bs, seq_len)) output = model(input_) assert output.shape == (bs, seq_len, llama_config.vocab_size) - diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py index 83644bdd..61c7ec74 100644 --- a/tests/test_torchrun/test_train.py +++ b/tests/test_torchrun/test_train.py @@ -15,17 +15,14 @@ def random_available_port(): return get_random_available_port() - @pytest.fixture() def config_path() -> str: # need to be executed in the root dir - return "configs/debug.toml" - + return "configs/debug.toml" @pytest.mark.parametrize("num_gpu", [1, 2]) def test_multi_gpu_ckpt(config_path, random_available_port, num_gpu): - cmd = [ "torchrun", f"--nproc_per_node={num_gpu}", @@ -34,10 +31,10 @@ def test_multi_gpu_ckpt(config_path, random_available_port, num_gpu): "src/zeroband/train.py", f"@{config_path}", "--optim.total_steps", - "10" + "10", ] result = subprocess.run(cmd) if result.returncode != 0: - pytest.fail(f"Process {result} failed {result.stderr}") \ No newline at end of file + pytest.fail(f"Process {result} failed {result.stderr}") From 52674373d714d6ad2f68113c8ff26e6c04677d00 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Fri, 20 Sep 2024 23:55:09 +0000 Subject: [PATCH 14/19] add training data --- src/zeroband/data.py | 27 ++++++++++++++++++++------- src/zeroband/train.py | 15 ++++++++------- 2 files changed, 28 
insertions(+), 14 deletions(-) diff --git a/src/zeroband/data.py b/src/zeroband/data.py index 3619bf77..93b2a391 100644 --- a/src/zeroband/data.py +++ b/src/zeroband/data.py @@ -5,9 +5,13 @@ from torch.utils.data import DataLoader from torch.utils.data import IterableDataset +from datasets import load_dataset +from datasets.distributed import split_dataset_by_node TEST_VOCAB_SIZE = 1024 +# TODO sami: make sure the init of the model is the same on all rank + class FakeTokenizedDataset(IterableDataset): """This is a dummy dataset that generates random sequences of length seq_len and vocab_size""" @@ -57,14 +61,23 @@ def _collate_fn_causal_mask( def get_dataloader( - pad_token_id: int, world_size: int, rank: int, seq_length: int, batch_size: int, num_workers: int + tokenizer, world_size: int, rank: int, seq_length: int, batch_size: int, num_workers: int, fake_data: bool ) -> DataLoader: - """ - Get a pytorch dataloader to train on - """ - # todo add real dataset and world splitting - train_dataset = FakeTokenizedDataset(seq_length, TEST_VOCAB_SIZE) - data_collator = collate_causal_mask(max_seq_length=seq_length, pad_id=pad_token_id, ignore_index=-100) + if fake_data: + train_dataset = FakeTokenizedDataset(seq_length, TEST_VOCAB_SIZE) + else: + ds = load_dataset("allenai/c4", "en", streaming=True) + + def tokenize_function(data): + outputs = tokenizer(data["text"], truncation=True, max_length=seq_length, padding="max_length") + return outputs + + tokenized_datasets = ds.map( + tokenize_function, batched=True, remove_columns=["text", "timestamp", "url", "attention_mask"] + )["train"] + train_dataset = split_dataset_by_node(tokenized_datasets, world_size=world_size, rank=rank) + + data_collator = collate_causal_mask(max_seq_length=seq_length, pad_id=tokenizer.pad_token_id, ignore_index=-100) return DataLoader( train_dataset, diff --git a/src/zeroband/train.py b/src/zeroband/train.py index 25af7d56..b625cb7e 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -38,7 +38,6 @@ class DilocoConfig(BaseConfig): class DataConfig(BaseConfig): - dataset_name_or_path: str = "allenai/c4" seq_length: int = 1024 fake_data: bool = False num_workers: int = 4 @@ -90,13 +89,15 @@ def train(config: Config): tokenizer.pad_token = "" # todo(sami): remove padding tokens once we have context stuffing logger.debug("tokenizer loaded") + train_dataloader = get_dataloader( - tokenizer.pad_token_id, - world_info.world_size, - world_info.rank, - config.data.seq_length, - config.train.micro_bs, - config.data.num_workers, + tokenizer=tokenizer, + world_size=world_info.world_size, + rank=world_info.rank, + seq_length=config.data.seq_length, + batch_size=config.train.micro_bs, + num_workers=config.data.num_workers, + fake_data=config.data.fake_data, ) model = get_model( From 9a5efa713ec60df8370b54048e091bab3b7480d3 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Sat, 21 Sep 2024 00:18:36 +0000 Subject: [PATCH 15/19] add tokens / sec --- configs/debug.toml | 5 ++++- src/zeroband/train.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/configs/debug.toml b/configs/debug.toml index 2a9bea2e..e7d6e30d 100644 --- a/configs/debug.toml +++ b/configs/debug.toml @@ -7,4 +7,7 @@ micro_bs = 8 [optim] batch_size = 16 warmup_steps = 10 -total_steps = 5000 \ No newline at end of file +total_steps = 5000 + +[data] +fake_data = true \ No newline at end of file diff --git a/src/zeroband/train.py b/src/zeroband/train.py index b625cb7e..9efc8367 100644 --- a/src/zeroband/train.py +++ 
b/src/zeroband/train.py @@ -1,5 +1,6 @@ import os from contextlib import nullcontext +import time from typing import Literal import torch @@ -16,6 +17,8 @@ FullyShardedDataParallel as FSDP, MixedPrecision, ) +import torch.distributed as dist + from zeroband.utils import get_sharding_strategy from zeroband.utils.monitor import WandbMonitor, DummyMonitor from zeroband.data import TEST_VOCAB_SIZE, get_dataloader @@ -152,6 +155,7 @@ def train(config: Config): for inner_step in range(num_inner_steps): loss_batch = 0 + beginning_step_time = time.time() for grad_acc_step in range(gradient_accumulation_steps): is_accumulating = grad_acc_step < gradient_accumulation_steps - 1 @@ -179,16 +183,26 @@ def train(config: Config): real_step = outer_step * num_inner_steps + inner_step + 1 # add + 1 because inner_step start at 0 inner_lr = [group["lr"] for group in inner_optimizer.param_groups][0] + dist.all_reduce(loss_batch, op=dist.ReduceOp.AVG) + # syncing loss across all data parallel rank + # todo(sami): when using diloco make sure that the loss is computed only on local world + metrics = { - "Loss": loss_batch.item(), # todo(sami): do local all reduce for the loss + "Loss": loss_batch.item(), "step": real_step, "inner_lr": inner_lr, + "tokens_per_second": config.data.seq_length + * config.optim.batch_size + / (time.time() - beginning_step_time), + "Perplexity": torch.exp(loss_batch).item(), } if world_info.rank == 0: metric_logger.log(metrics) - logger.info(f"step: {real_step}, loss: {loss_batch.item()}, inner_lr: {inner_lr}") + logger.info( + f"step: {real_step}, loss: {loss_batch.item():.4f}, tokens_per_second: {metrics['tokens_per_second']:.2f}" + ) outer_step += 1 From 5d0deb9c2c3852e80957004628af96bfc71628e3 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Sat, 21 Sep 2024 00:19:50 +0000 Subject: [PATCH 16/19] add total samples --- src/zeroband/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zeroband/train.py b/src/zeroband/train.py index 9efc8367..056d89ac 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -195,6 +195,7 @@ def train(config: Config): * config.optim.batch_size / (time.time() - beginning_step_time), "Perplexity": torch.exp(loss_batch).item(), + "total_tokens": real_step * config.optim.batch_size * config.data.seq_length, } if world_info.rank == 0: From e5a19382457e0c5ce30cb23acf6d01008102c750 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Sat, 21 Sep 2024 00:42:19 +0000 Subject: [PATCH 17/19] add mfu --- src/zeroband/models/llama/__init__.py | 4 +-- src/zeroband/train.py | 26 +++++++++++--- src/zeroband/utils/__init__.py | 49 ++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/src/zeroband/models/llama/__init__.py b/src/zeroband/models/llama/__init__.py index c0103fcf..5250fe57 100644 --- a/src/zeroband/models/llama/__init__.py +++ b/src/zeroband/models/llama/__init__.py @@ -61,7 +61,7 @@ } -def get_model(name_model: str, type_model: str, vocab_size: int) -> Transformer: +def get_model(name_model: str, type_model: str, vocab_size: int) -> tuple[Transformer, ModelArgs]: """get the transformer model""" if type_model == "llama2": @@ -72,4 +72,4 @@ def get_model(name_model: str, type_model: str, vocab_size: int) -> Transformer: raise ValueError(f"Model type {type_model} not supported") config.vocab_size = vocab_size - return Transformer(config) + return Transformer(config), config diff --git a/src/zeroband/train.py b/src/zeroband/train.py index 056d89ac..9988e767 100644 --- a/src/zeroband/train.py +++ 
b/src/zeroband/train.py @@ -18,6 +18,7 @@ MixedPrecision, ) import torch.distributed as dist +from zeroband import utils from zeroband.utils import get_sharding_strategy from zeroband.utils.monitor import WandbMonitor, DummyMonitor @@ -103,7 +104,7 @@ def train(config: Config): fake_data=config.data.fake_data, ) - model = get_model( + model, model_config = get_model( config.name_model, config.type_model, vocab_size=tokenizer.vocab_size if config.name_model != "debugmodel" else TEST_VOCAB_SIZE, @@ -111,6 +112,17 @@ def train(config: Config): model = model.to(world_info.local_rank) logger.debug("model loaded") + gpu_peak_flops = utils.get_peak_flops(torch.cuda.get_device_name(torch.device("cuda"))) + logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}") + + num_params = utils.get_num_params(model, exclude_embedding=True) + logger.info(f"Number of parameters: {num_params}") + num_flop_per_token = utils.get_num_flop_per_token( + num_params, + model_config, + config.data.seq_length, + ) + model = FSDP( model, sharding_strategy=sharding_strategy, @@ -187,22 +199,26 @@ def train(config: Config): # syncing loss across all data parallel rank # todo(sami): when using diloco make sure that the loss is computed only on local world + time_taken = time.time() - beginning_step_time + tokens_per_second = config.data.seq_length * config.optim.batch_size / time_taken + + mfu = 100 * num_flop_per_token * tokens_per_second / gpu_peak_flops + metrics = { "Loss": loss_batch.item(), "step": real_step, "inner_lr": inner_lr, - "tokens_per_second": config.data.seq_length - * config.optim.batch_size - / (time.time() - beginning_step_time), + "tokens_per_second": tokens_per_second, "Perplexity": torch.exp(loss_batch).item(), "total_tokens": real_step * config.optim.batch_size * config.data.seq_length, + "mfu": mfu, } if world_info.rank == 0: metric_logger.log(metrics) logger.info( - f"step: {real_step}, loss: {loss_batch.item():.4f}, tokens_per_second: {metrics['tokens_per_second']:.2f}" + f"step: {real_step}, loss: {loss_batch.item():.4f}, tokens_per_second: {metrics['tokens_per_second']:.2f}, mfu: {mfu:.2f}" ) outer_step += 1 diff --git a/src/zeroband/utils/__init__.py b/src/zeroband/utils/__init__.py index d26823e4..a9b8fad2 100644 --- a/src/zeroband/utils/__init__.py +++ b/src/zeroband/utils/__init__.py @@ -1,7 +1,8 @@ +import torch from torch.distributed.fsdp import ShardingStrategy -__all__ = ["get_sharding_strategy"] +__all__ = ["get_sharding_strategy", "get_peak_flops", "get_num_flop_per_token", "get_num_params"] def get_sharding_strategy(sharding_strategy: str) -> ShardingStrategy: @@ -19,3 +20,49 @@ def get_sharding_strategy(sharding_strategy: str) -> ShardingStrategy: raise ValueError( f"Invalid sharding_strategy: {sharding_strategy}. Please choose 'FULL_SHARD', 'SHARD_GRAD_OP', 'NO_SHARD', 'HYBRID_SHARD', or '_HYBRID_SHARD_ZERO2'." ) + + +### code above inspired and copied from https://github.com/pytorch/torchtitan/blob/4b3f2e41a084bf79a8540068ed525539d1244edd/torchtitan/utils.py#L119 + + +# hardcoded BF16 type peak flops for NVIDIA A100 and H100 GPU +def get_peak_flops(device_name: str) -> int: + if "A100" in device_name: + # data from https://www.nvidia.com/en-us/data-center/a100/ + return 312e12 + elif "H100" in device_name: + # data from https://www.nvidia.com/en-us/data-center/h100/ + # NOTE: Specifications are one-half lower without sparsity. 
+ if "NVL" in device_name: + return 835e12 + elif "PCIe" in device_name: + return 756e12 + else: # for H100 SXM and other variants + return 989e12 + else: # for other GPU types, assume A100 + return 312e12 + + +def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int: + l, h, q, t = ( # noqa: E741 + model_config.n_layers, + model_config.n_heads, + model_config.dim // model_config.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + flop_per_token = 6 * num_params + 12 * l * h * q * t + + return flop_per_token + + +def get_num_params(model: torch.nn.Module, exclude_embedding: bool = False) -> int: + num_params = sum(p.numel() for p in model.parameters()) + if exclude_embedding: + num_params -= model.tok_embeddings.weight.numel() + return num_params From 01d13a6aeb246205c8201a8b31cfe29aea49d03a Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Sat, 21 Sep 2024 00:53:41 +0000 Subject: [PATCH 18/19] add 1B --- configs/1B/H100.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 configs/1B/H100.toml diff --git a/configs/1B/H100.toml b/configs/1B/H100.toml new file mode 100644 index 00000000..cf8111b1 --- /dev/null +++ b/configs/1B/H100.toml @@ -0,0 +1,12 @@ +name_model = "150M" +project = "debug_150m_zero_band" + +[train] +micro_bs = 16 +sharding_strategy = "SHARD_GRAD_OP" + +[optim] +batch_size = 512 +warmup_steps = 1000 +total_steps = 88_000 +lr = 4e-4 \ No newline at end of file From 40f8fd908a0c2e7a1ac48c1b13eb8b7d1aae9114 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Sat, 21 Sep 2024 00:57:00 +0000 Subject: [PATCH 19/19] add 7b --- configs/1B/H100.toml | 4 ++-- configs/7B/H100.toml | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 configs/7B/H100.toml diff --git a/configs/1B/H100.toml b/configs/1B/H100.toml index cf8111b1..1430dcea 100644 --- a/configs/1B/H100.toml +++ b/configs/1B/H100.toml @@ -1,5 +1,5 @@ -name_model = "150M" -project = "debug_150m_zero_band" +name_model = "1B" +project = "debug_1B_zero_band" [train] micro_bs = 16 diff --git a/configs/7B/H100.toml b/configs/7B/H100.toml new file mode 100644 index 00000000..c1272c34 --- /dev/null +++ b/configs/7B/H100.toml @@ -0,0 +1,12 @@ +name_model = "7B" +project = "debug_7B_zero_band" + +[train] +micro_bs = 6 +sharding_strategy = "SHARD_GRAD_OP" + +[optim] +batch_size = 3840 +warmup_steps = 1000 +total_steps = 88_000 +lr = 6e-4 \ No newline at end of file