diff --git a/vllm/envs.py b/vllm/envs.py index afae779fc80b2..ee4711dbec842 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,6 @@ VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS: int = 1 VLLM_MOE_PADDING: bool = False VLLM_FP8_PADDING: bool = True - VLLM_NO_TUNED_GEMM: bool = False def get_default_cache_root(): @@ -487,10 +486,6 @@ def get_default_config_root(): # Pad the weight for moe kernel or not "VLLM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_FP8_PADDING", "1"))), - - # Not supported by mllama3.2 - "VLLM_NO_TUNED_GEMM": - lambda: bool(int(os.getenv("VLLM_NO_TUNED_GEMM", "0"))), } # end-env-vars-definition diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 01626236cc5a9..fce41b92d2fac 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -2,10 +2,8 @@ from typing import Dict, List, Optional, Tuple import torch -import torch.nn.functional as F from torch.nn.parameter import Parameter, UninitializedParameter -import vllm.envs as envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -133,10 +131,7 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if envs.VLLM_NO_TUNED_GEMM: - return F.linear(x, layer.weight, bias) - else: - return tgemm.mm(x, layer.weight, bias) + return tgemm.mm(x, layer.weight, bias) class LinearBase(torch.nn.Module): diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 7ea1d8d93ea2b..fc58c8c6d379e 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -84,7 +84,7 @@ def mm(self, inp, weights, bias=None): # uses this for linear units. However, sampler # will use torch.matmul with 2 dimensions only if inp.dim() == 3: - inp_view = inp.view(-1, inp.size(-1)) + inp_view = inp.reshape(-1, inp.size(-1)) batched = True else: inp_view = inp