Merge remote-tracking branch 'origin/main' into head64
irexyc committed Nov 5, 2024
2 parents 47b0774 + 364a142 commit 80e1775
Showing 32 changed files with 560 additions and 79 deletions.
4 changes: 2 additions & 2 deletions docs/en/get_started/ascend/get_started.md
@@ -23,8 +23,8 @@ The Docker version is supposed to be no less than `18.03`. And `Ascend Docker Ru
#### Ascend Drivers, Firmware and CANN

The target machine needs to install the Huawei driver and firmware version 23.0.3, refer to
[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html)
The target machine needs to install the Huawei driver and firmware version not lower than 23.0.3, refer to
[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html)
and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha).

And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run`, `Ascend-cann-nnal_*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code
4 changes: 4 additions & 0 deletions docs/en/llm/pipeline.md
@@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids)
ppl = pipe.get_ppl(input_ids)
```

```{note}
get_ppl returns the cross entropy loss without applying the exponential operation afterwards
```
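A minimal sketch of how this note applies in practice, continuing from the snippet above; it assumes `get_ppl` returns one cross-entropy value per input sequence, which may differ across lmdeploy versions:

```python
import math

# get_ppl yields the average cross-entropy loss per sequence (no exp applied),
# so the conventional perplexity is recovered by exponentiating each value.
losses = pipe.get_ppl(input_ids)
perplexities = [math.exp(loss) for loss in losses]
print(perplexities)
```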

- **Below is an example for pytorch backend. Please install triton first.**

```shell
2 changes: 1 addition & 1 deletion docs/en/supported_models/supported_models.md
@@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
4 changes: 2 additions & 2 deletions docs/zh_cn/get_started/ascend/get_started.md
@@ -22,8 +22,8 @@ Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hias
#### Drivers,Firmware 和 CANN

目标机器需安装华为驱动程序和固件版本 23.0.3,请参考
[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html)
目标机器需安装华为驱动程序和固件版本至少为 23.0.3,请参考
[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html)
[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)

另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。
4 changes: 4 additions & 0 deletions docs/zh_cn/llm/pipeline.md
@@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids)
ppl = pipe.get_ppl(input_ids)
```

```{note}
get_ppl 返回的是 cross entropy loss,没有在之后加 exp 操作
```

- **使用 pytorch 后端**

需要先安装 triton
2 changes: 1 addition & 1 deletion docs/zh_cn/supported_models/supported_models.md
@@ -51,7 +51,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att
| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - |
| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - |
| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - |
| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - |
| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes |
17 changes: 10 additions & 7 deletions lmdeploy/pytorch/backends/default/rotary_embedding.py
@@ -232,9 +232,12 @@ def __init__(self,
self.register_buffer('inv_freq', inv_freq, persistent=False)

# get mscale
self.mscale = float(
yarn_get_mscale(self.scaling_factor, self.mscale) /
yarn_get_mscale(self.scaling_factor, self.mscale_all_dim))
if yarn_params.attention_factor is not None:
self.mscale = yarn_params.attention_factor
else:
self.mscale = float(
yarn_get_mscale(self.scaling_factor, self.mscale) /
yarn_get_mscale(self.scaling_factor, self.mscale_all_dim))
if self.mscale == 1.0:
self.mscale = None

@@ -334,10 +337,10 @@ def build(
return LlamaDynamicNTKScalingRotaryEmbedding(
dim, base, scaling_factor, max_position_embeddings)
elif emb_type == RopeType.Llama3:
return Llama3RotaryEmbeddingImpl(dim, base, scaling_factor,
llama3_params.low_freq_factor,
llama3_params.high_freq_factor,
max_position_embeddings)
return Llama3RotaryEmbeddingImpl(
dim, base, scaling_factor, llama3_params.low_freq_factor,
llama3_params.high_freq_factor,
llama3_params.original_max_position_embeddings)
elif emb_type == RopeType.Yarn:
return YarnRotaryEmbeddingImpl(dim,
base,
32 changes: 32 additions & 0 deletions lmdeploy/pytorch/backends/dlinfer/linear.py
@@ -0,0 +1,32 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import torch

from lmdeploy.pytorch.kernels.dlinfer import linear

from ..linear import LinearBuilder, LinearImpl


class DlinferLinearImpl(LinearImpl):
"""Dlinfer linear implementation api."""

def forward(self,
x,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
all_reduce: bool = False):
"""forward."""
return linear(x, weight, bias, all_reduce)


class DlinferLinearBuilder(LinearBuilder):
"""Dlinfer linear implementation builder."""

@staticmethod
def build(in_features: int,
out_features: int,
bias: bool = True,
dtype: torch.dtype = None):
"""build."""
return DlinferLinearImpl()
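A rough usage sketch for the new builder and impl; it assumes a dlinfer-capable device with the `dlinfer` package installed, and the shapes and dtype below are illustrative only:

```python
import torch

# build() ignores the feature sizes here and simply returns the impl;
# the weight is passed explicitly at call time.
impl = DlinferLinearBuilder.build(in_features=4096, out_features=4096, bias=False)
x = torch.randn(2, 16, 4096, dtype=torch.float16)
weight = torch.randn(4096, 4096, dtype=torch.float16)
out = impl.forward(x, weight, bias=None, all_reduce=False)  # -> [2, 16, 4096]
```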
3 changes: 3 additions & 0 deletions lmdeploy/pytorch/backends/dlinfer/op_backend.py
@@ -40,6 +40,9 @@ def get_layer_impl_builder(cls, layer_type: OpType):
elif layer_type == OpType.FusedMoE:
from .moe import DlinferFusedMoEBuilder
return DlinferFusedMoEBuilder
elif layer_type == OpType.Linear:
from .linear import DlinferLinearBuilder
return DlinferLinearBuilder
elif layer_type == OpType.LinearW4A16:
from .awq_modules import AwqLinearW4A16Builder
return AwqLinearW4A16Builder
153 changes: 124 additions & 29 deletions lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py
@@ -1,14 +1,44 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math

import torch
from torch import nn

from ..default.rotary_embedding import (Llama3RotaryEmbeddingImpl,
LlamaDynamicNTKScalingRotaryEmbedding)
from ..default.rotary_embedding import LlamaDynamicNTKScalingRotaryEmbedding
from ..rotary_embedding import (Llama3Parameters, LongRoPEScalingParameters,
RopeType, RotaryEmbeddingBuilder,
RotaryEmbeddingImpl, YarnParameters)


def _rotary_embedding_fwd(position_ids: torch.Tensor,
inv_freq: torch.Tensor,
scaling_factor: float,
mscale: float = None,
dtype: torch.dtype = None):
"""rotary embedding forward."""
if dtype is None:
dtype = torch.float16

if scaling_factor != 1.0:
position_ids = position_ids.float() / scaling_factor
else:
position_ids = position_ids.float()

inv_freq_expanded = inv_freq.view(1, -1, 1)
position_ids_expanded = position_ids.unsqueeze(1)

tmp = torch.bmm(inv_freq_expanded, position_ids_expanded)
freqs = tmp.transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()

if mscale is not None:
cos = cos * mscale
sin = sin * mscale
return cos.to(dtype=dtype), sin.to(dtype=dtype)


class DlinferRotaryEmbeddingImpl(RotaryEmbeddingImpl, nn.Module):
"""base rotary embedding."""

@@ -28,34 +58,100 @@ def forward(self, x, position_ids):
def forward(self, x, position_ids):
"""forward."""
# x: [bs, num_attention_heads, seq_len, head_size]
dtype = x.dtype
if self.inv_freq.device != x.device:
self.inv_freq = self.inv_freq.to(x.device)
return _rotary_embedding_fwd(position_ids,
self.inv_freq,
scaling_factor=self.scaling_factor,
dtype=dtype)

if self.scaling_factor != 1.0:
position_ids = position_ids.float() / self.scaling_factor
else:
position_ids = position_ids.float()

inv_freq_expanded = self.inv_freq.view(1, -1, 1)
position_ids_expanded = position_ids.unsqueeze(1)

# # Force float32 since bfloat16 loses precision on long contexts
# See https://github.com/huggingface/transformers/pull/29285
device_type = x.device.type
device_type = device_type if isinstance(
device_type, str) and device_type != 'mps' else 'cpu'
inv_freq_expanded = inv_freq_expanded
position_ids_expanded = position_ids_expanded
tmp = torch.bmm(inv_freq_expanded, position_ids_expanded)
freqs = tmp.transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

class DlinferLlamaDynamicNTKScalingRotaryEmbedding(
LlamaDynamicNTKScalingRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling.
Credits to the Reddit users /u/bloc97 and /u/emozilla
"""

def __init__(self,
dim: int,
base: int = 10000,
scaling_factor: float = 1.0,
max_position_embeddings: int = 2048):
super().__init__(dim, base, scaling_factor, max_position_embeddings)
self.dim_scale_ratio = self.dim / (self.dim - 2)
self.pos_freq_scaling = torch.arange(
0, self.dim, 2, dtype=torch.int64).float().cuda() / self.dim
self.scale_offset = self.scaling_factor - 1
self.pos_scale_factor = self.scaling_factor / \
self.max_position_embeddings

def _ntk_inv_freq(self, seq_len: torch.Tensor):
"""Calculate inverse frequency with NTK scaling."""
base = self.base * ((self.pos_scale_factor * seq_len) -
self.scale_offset)**self.dim_scale_ratio
inv_freq = 1.0 / (base**self.pos_freq_scaling)
return inv_freq

def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
"""forward."""
dtype = x.dtype
seq_len = torch.max(position_ids) + 1
ntk_inv_freq = self._ntk_inv_freq(seq_len)
if self.inv_freq.device != x.device:
self.inv_freq = self.inv_freq.to(x.device)
inv_freq = torch.where(seq_len > self.max_position_embeddings,
ntk_inv_freq, self.inv_freq)

cos, sin = _rotary_embedding_fwd(position_ids,
inv_freq,
scaling_factor=1.0,
dtype=dtype)
return cos, sin


class DlinferLlama3RotaryEmbeddingImpl(DlinferRotaryEmbeddingImpl):
"""llama3 rotary embedding implementation."""

def __init__(
self,
dim: int,
base: int = 10000,
scaling_factor: float = 1.0,
low_freq_factor: float = 1.0,
high_freq_factor: float = 4.0,
original_max_position_embeddings: int = 8194,
):
super().__init__(dim, base, scaling_factor)
old_context_len = original_max_position_embeddings
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

inv_freq = self.inv_freq
factor = self.scaling_factor

wavelen = 2 * math.pi / inv_freq
# wavelen < high_freq_wavelen: do nothing
# wavelen > low_freq_wavelen: divide by factor
inv_freq_llama = torch.where(wavelen > low_freq_wavelen,
inv_freq / factor, inv_freq)
# otherwise: interpolate between the two, using a smooth factor
smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
high_freq_factor - low_freq_factor)
smoothed_inv_freq = (
1 - smooth_factor
) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen >
low_freq_wavelen)
inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq,
inv_freq_llama)
self.scaling_factor = 1.0
self.register_buffer('inv_freq', inv_freq_llama)
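In equation form, the frequency adjustment implemented in this constructor, with s = `scaling_factor`, L = `old_context_len`, beta_low = `low_freq_factor`, beta_high = `high_freq_factor`, and f_i the original inverse frequencies, is (a sketch of the standard Llama3 RoPE scaling):

```latex
\lambda_i = \frac{2\pi}{f_i}, \qquad
\gamma_i = \frac{L/\lambda_i - \beta_{\mathrm{low}}}{\beta_{\mathrm{high}} - \beta_{\mathrm{low}}}, \qquad
f_i' =
\begin{cases}
  f_i & \lambda_i < L/\beta_{\mathrm{high}} \\
  f_i / s & \lambda_i > L/\beta_{\mathrm{low}} \\
  (1-\gamma_i)\, f_i / s + \gamma_i\, f_i & \text{otherwise}
\end{cases}
```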


class DlinferRotaryEmbeddingBuilder(RotaryEmbeddingBuilder):
"""rotary embedding builder."""
"""rotary embedding dlinfer builder."""

@staticmethod
def build(
@@ -72,13 +168,12 @@ def build(
if emb_type in (RopeType.Default, RopeType.LinearScaling):
return DlinferRotaryEmbeddingImpl(dim, base, scaling_factor)
elif emb_type == RopeType.DynamicNTKScaling:
return LlamaDynamicNTKScalingRotaryEmbedding(
return DlinferLlamaDynamicNTKScalingRotaryEmbedding(
dim, base, scaling_factor, max_position_embeddings)
elif emb_type == RopeType.Llama3:
return Llama3RotaryEmbeddingImpl(dim, base, scaling_factor,
llama3_params.low_freq_factor,
llama3_params.high_freq_factor,
max_position_embeddings)
return DlinferLlama3RotaryEmbeddingImpl(
dim, base, scaling_factor, llama3_params.low_freq_factor,
llama3_params.high_freq_factor, max_position_embeddings)
else:
raise NotImplementedError(
f'Unsupported embedding type: {emb_type}')
2 changes: 2 additions & 0 deletions lmdeploy/pytorch/backends/rotary_embedding.py
@@ -22,6 +22,7 @@ class YarnParameters:
beta_slow: float = 1
mscale: int = 1
mscale_all_dim: int = 0
attention_factor: int = None


@dataclass
@@ -39,6 +40,7 @@ class Llama3Parameters:
"""llama3 rope parameters."""
low_freq_factor: float = 1.0
high_freq_factor: float = 4.0
original_max_position_embeddings: int = 8192


class RotaryEmbeddingImpl(ABC):
15 changes: 11 additions & 4 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -500,15 +500,22 @@ def _start_tp_process(proc_id: int,
def _check_context_alive(mp_context: mp.ProcessContext):
"""check context alive."""
procs: List[mp.Process] = mp_context.processes
failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive())
if len(failed_ranks) == 0:
failed_procs = list(idx for idx, p in enumerate(procs) if not p.is_alive())
if len(failed_procs) == 0:
return
for p in procs:

log_procs = []
for idx, p in enumerate(procs):
if p.is_alive():
p.terminate()
else:
exitcode = p.exitcode
if exitcode > 0:
# terminated exitcode < 0
log_procs.append((idx, exitcode))
p.close()
logger.error(f'TP process {failed_ranks} failed.')
for idx, exitcode in log_procs:
logger.error(f'TP process {idx} failed with exitcode {exitcode}.')
# TODO: not safe exit.
os._exit(1)

2 changes: 2 additions & 0 deletions lmdeploy/pytorch/kernels/dlinfer/__init__.py
@@ -4,6 +4,7 @@
from .awq_kernels import awq_linear
from .fill_kv_cache import fill_kv_cache
from .fused_moe import fused_moe
from .linear import linear
from .moe_gating_topk_softmax import moe_gating_topk_softmax
from .pagedattention import paged_attention_fwd
from .rms_norm import rms_norm
@@ -15,6 +16,7 @@
'fill_kv_cache',
'fused_moe',
'paged_attention_fwd',
'linear',
'moe_gating_topk_softmax',
'multinomial_sampling',
]
12 changes: 12 additions & 0 deletions lmdeploy/pytorch/kernels/dlinfer/linear.py
@@ -0,0 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import dlinfer.ops as ext_ops
from torch import Tensor


def linear(x: Tensor,
weight: Tensor,
bias: Optional[Tensor] = None,
all_reduce: bool = False):
return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce)