From 9c2367cefc4b497d67d76761854cb19fb7c2d4f7 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Fri, 26 Jan 2024 23:40:51 -0500 Subject: [PATCH 01/34] [ROCm] Fixup arch checks for ROCM The ROCM stack with PyTorch supports a wide set of gfx architectures. This can be displayed by printing PYTORCH_ROCM_ARCH env. In the absence of PYTORCH_ROCM_ARCH pytorch uses theoutput from rocm_agent_enumerator to choose what to compile for. vllm supports a subset of these, (gfx908, gfx90a,...) Due to a need to potentially support multiple architectures at once (ex. docker image) it's important to make sure vllm is compiled with them all unless specified otherwise. We now gather either the PYTORCH_ROCM_ARCH env or rocm_agent_enumerator output and cross reference with ROCM_SUPPORTED_ARCHS from vllm to generate a list of arches to build for. --- Dockerfile.rocm | 3 -- setup.py | 83 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 88172fb73b937..3c76305303037 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -10,9 +10,6 @@ RUN echo "Base image is $BASE_IMAGE" # BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" # BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" -# this does not always work for all rocm versions -RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \ - echo "LLVM_GFX_ARCH is $LLVM_GFX_ARCH" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" diff --git a/setup.py b/setup.py index 88fa495205659..25b460fdc6cc4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -63,21 +63,6 @@ def _is_cuda() -> bool: NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." - raise RuntimeError(error_message) from e - - return None - def get_hipcc_rocm_version(): # Run the hipcc --version command @@ -138,6 +123,49 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: return nvcc_cuda_version +def get_pytorch_rocm_arch() -> Set[str]: + """Get the cross section of Pytorch,and vllm supported gfx arches + + ROCM can get the supported gfx architectures in one of two ways + Either through the PYTORCH_ROCM_ARCH env var, or output from + rocm_agent_enumerator. + + In either case we can generate a list of supported arch's and + cross reference with VLLM's own ROCM_SUPPORTED_ARCHs. 
+ """ + env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None) + + # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator + if env_arch_list is None: + command = "rocm_agent_enumerator" + env_arch_list = subprocess.check_output([command]).decode('utf-8')\ + .strip().replace("\n", ";") + arch_source_str = "rocm_agent_enumerator" + else: + arch_source_str = "PYTORCH_ROCM_ARCH env variable" + + # List are separated by ; or space. + pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";")) + + # Filter out the invalid architectures and print a warning. + arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS) + + # If none of the specified architectures are valid, raise an error. + if not arch_list: + raise RuntimeError( + f"None of the ROCM architectures in {arch_source_str} " + f"({env_arch_list}) is supported. " + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") + invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS + if invalid_arch_list: + warnings.warn( + f"Unsupported ROCM architectures ({invalid_arch_list}) are " + f"excluded from the {arch_source_str} output " + f"({env_arch_list}). Supported ROCM architectures are: " + f"{ROCM_SUPPORTED_ARCHS}.", + stacklevel=2) + return arch_list + def get_torch_arch_list() -> Set[str]: # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the @@ -162,22 +190,27 @@ def get_torch_arch_list() -> Set[str]: # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " + "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") + f"Supported CUDA architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " + f"Unsupported CUDA architectures ({invalid_arch_list}) are " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " + f"({env_arch_list}). Supported CUDA architectures are: " f"{valid_archs}.", stacklevel=2) return arch_list -# First, check the TORCH_CUDA_ARCH_LIST environment variable. -compute_capabilities = get_torch_arch_list() +if _is_hip(): + rocm_arches = get_pytorch_rocm_arch() + NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] +else: + # First, check the TORCH_CUDA_ARCH_LIST environment variable. + compute_capabilities = get_torch_arch_list() + if _is_cuda() and not compute_capabilities: # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # GPUs on the current machine. 
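As a rough illustration of the arch-selection logic introduced above (a sketch, not part of the patch; the PYTORCH_ROCM_ARCH value below is hypothetical): the requested arch string is split on ';' or spaces, intersected with ROCM_SUPPORTED_ARCHS, and each surviving arch becomes an --offload-arch flag added to NVCC_FLAGS:

    import os

    # mirrors the set defined in setup.py above
    ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"}

    os.environ["PYTORCH_ROCM_ARCH"] = "gfx90a gfx942;gfx803"  # hypothetical input
    requested = set(os.environ["PYTORCH_ROCM_ARCH"].replace(" ", ";").split(";"))
    arch_list = requested.intersection(ROCM_SUPPORTED_ARCHS)  # gfx803 is dropped with a warning
    print(["--offload-arch=" + arch for arch in sorted(arch_list)])
    # ['--offload-arch=gfx90a', '--offload-arch=gfx942']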
@@ -283,12 +316,6 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_hip(): - amd_arch = get_amdgpu_offload_arch() - if amd_arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}") elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() From a9d752c7be6686000ace68fe2bc49e3845fc7e8e Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Sat, 27 Jan 2024 12:44:19 -0500 Subject: [PATCH 02/34] yapf cleanup --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 25b460fdc6cc4..15b9a78f6ca27 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,9 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = { + "gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100" +} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -63,7 +65,6 @@ def _is_cuda() -> bool: NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -166,6 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: stacklevel=2) return arch_list + def get_torch_arch_list() -> Set[str]: # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the From 20b5f1066dc1424929cef39a7a7b5abcc753bd68 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sat, 10 Feb 2024 17:42:42 +0000 Subject: [PATCH 03/34] Initial port of gradlib gemm tuner --- run.sh | 32 ++++++++ vllm/model_executor/layers/linear.py | 11 ++- vllm/model_executor/layers/tuned_gemm.py | 93 ++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 2 deletions(-) create mode 100755 run.sh create mode 100644 vllm/model_executor/layers/tuned_gemm.py diff --git a/run.sh b/run.sh new file mode 100755 index 0000000000000..7b9336a0a076a --- /dev/null +++ b/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash +BASE_DIR=/trees/ +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$BASE_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +#MODEL=/data/Llama-2-13B-Chat-fp16 +#MODEL=/data/llama-2-13b-chat-hf +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` + +GEN_LEN="8" +TP=8 +INPUT_LEN=2048 +ITER=1 +cd $VLLM_DIR + + echo "tuned_gemm_csv: ./tuned_tp$TP.csv" > $VLLM_DIR/tuned_perf_tp$TP.yaml + tuned_file=$VLLM_DIR/tuned_tp$TP.csv +export VLLM_PERF_YAML=./tuned_perf_tp$TP.yaml + +for tp in $TP; +do + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + +python benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER + done +done +done diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 5e1d63a6a62eb..63ea7b856c1ee 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,8 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm.model_executor.layers.tuned_gemm import tgemm + logger = init_logger(__name__) @@ -66,10 +68,14 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: + #print(f">>> 
HELOOOOOOOOOOOOOO apply_weights {x.shape}, {weight.shape}, {bias}") + if bias: return F.linear(x, weight) + bias return F.linear(x, weight) - return F.linear(x, weight, bias) + #tgemm.mm(x,weight) + #return F.linear(x, weight, bias) + return tgemm.mm(x,weight) class ReplicatedLinear(torch.nn.Module): @@ -123,6 +129,7 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if not self.skip_bias_add else None output = self.linear_method.apply_weights(self.linear_weights, x, bias) + #print(f">>> output is {output}") output_bias = self.bias if self.skip_bias_add else None return output, output_bias @@ -548,7 +555,7 @@ def forward(self, input_): output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel - + #print(f">>> ROWPARALLEL {output_.shape}") if not self.skip_bias_add: output = output_ + self.bias if self.bias is not None else output_ output_bias = None diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py new file mode 100644 index 0000000000000..556a972bc3e34 --- /dev/null +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -0,0 +1,93 @@ +import torch +import torch.nn.functional as F +from rocsolidxgemm import rocb_create_extension,rocb_mm +from hipbsolidxgemm import hipb_create_extension,hipb_mm +import os +import yaml +import pandas as pd +#from vllm import custom_ops + + +class TunedGemm: + def __init__(self): + #rocb_create_extension() + #hipb_create_extension() + self.extensions_created = False + self.bestsols = {} + self.load_best_sols() + self.create_ds() + def load_best_sols(self): + perfbits = {} + perf_file = os.environ.get('VLLM_PERF_YAML') + if perf_file is not None: + with open(perf_file, 'r') as file: + perfbits = yaml.safe_load(file) + + tune_file = perfbits.get('tuned_gemm_csv',None) + if tune_file is not None: + self.bestsols = pd.read_csv(tune_file,index_col=[0]) + def apply_custom(self,ds): + M,N,K = ds['M'],ds['N'],ds['K'] + #apply custom matvec (only for f16 dtype) + return ds + if N==1: + ds1 = ds.copy() + ds1['libtype'] = 'custom' + if K==8192 and (M==1280 or M==7168): + ds1['solidx'] = 8 + return ds1 + elif K==3584 and M==8192: + ds1['solidx'] = 8 + return ds1 + elif K<=8192 and K%8==0 and M%4==0: + ds1['solidx'] = 1 + return ds1 + return ds + def create_ds(self): + df = self.bestsols + solds = {} + for i in range(len(df)): + ds = self.apply_custom(df.iloc[i]) + key = (ds['M'],ds['N'],ds['K']) + if ds['libtype']=='hipblaslt': soltype = 1 + elif ds['libtype']=='rocblas': soltype = 2 + elif ds['libtype']=='custom': soltype = 3 + solds[key] = (soltype,int(ds['solidx'])) + self.solids = solds + #print('>>>',solds) + def query_sol(self,m,n,k): + return self.solids.get((m,n,k),(0,0)) + def mm(self,inp,weights): + inp_view=inp.view(-1,inp.size(-1)) + #print(f'>>>inp_view {inp_view.shape}') + if self.extensions_created == False: + rocb_create_extension() + hipb_create_extension() + self.extensions_created = True + soltype,solidx = self.query_sol(m=weights.shape[0],n=inp_view.shape[0],k=inp_view.shape[1]) + if soltype==1: + #print(">>> found hipblas") + out = hipb_mm(inp_view,weights.t(),solidx) + elif soltype==3: + ##only matvec is supported currently + out = torch.empty(inp.shape[0],weights.shape[0],dtype=torch.float16,device='cuda') + #print('>>>Matvec',inp.shape,weights.shape,soltype,solidx) + if solidx<=1: + custom_ops.LLMM1(weights,inp,out,4) + elif solidx==2: + custom_ops.LLMM1(weights,inp,out,2) + elif solidx==8: + custom_ops.LLMM1(weights,inp,out,8) + elif 
solidx==20: + custom_ops.LLZZ(weights,inp,out,0) + elif solidx==21: + custom_ops.LLZZ(weights,inp,out,1) + elif soltype==2: + #print(">>> found rocblas") + out = rocb_mm(inp_view,weights.t(),solidx) + else: + #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + out = F.linear(inp,weights) + return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + +tgemm = TunedGemm() \ No newline at end of file From 6f281079a78e6cd2ed3a3cedd2fa1ec1d6ee2a5a Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sun, 11 Feb 2024 18:10:01 +0000 Subject: [PATCH 04/34] Enable torchrun vs Ray This is a bit of a hack. But Ray seems to have some serious perf degradation when running multi gpu latency benchmarks. Allow distributed to be used when Ray is disabled, and make sure we connect via env ranking instead of tcp/port based. --- vllm/config.py | 2 +- vllm/engine/llm_engine.py | 4 +-- vllm/engine/ray_utils.py | 20 +++++++++++--- .../parallel_utils/communication_op.py | 26 +++++++++++-------- vllm/worker/worker.py | 10 ++++--- 5 files changed, 41 insertions(+), 21 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 8acd15a3b7d9a..11952d9471d8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -342,7 +342,7 @@ def __init__( self.world_size = pipeline_parallel_size * tensor_parallel_size if self.world_size > 1: - self.worker_use_ray = True + self.worker_use_ray = False self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0dedc232292dd..7c6808e32f3fa 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,8 +129,8 @@ def _init_workers(self): # before CUDA_VISIBLE_DEVICES is set in the Worker from vllm.worker.worker import Worker - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") + # assert self.parallel_config.world_size == 1, ( + # "Ray is required if parallel_config.world_size > 1.") self.workers: List[Worker] = [] distributed_init_method = get_distributed_init_method( diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 1cb5fcda344f1..2abf571c5fc61 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,3 +1,4 @@ +import socket from typing import Optional, List, Tuple, TYPE_CHECKING from vllm.config import ParallelConfig @@ -50,6 +51,10 @@ def set_cuda_visible_devices(self, device_ids) -> None: if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup +def get_open_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def initialize_cluster( parallel_config: ParallelConfig, @@ -82,11 +87,18 @@ def initialize_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True) + # if not parallel_config.worker_use_ray: + # assert parallel_config.world_size == 1, ( + # "Ray is required if parallel_config.world_size > 1.") + # return None if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None - + # Initialize cluster locally. + port = get_open_port() + # We need to setup the distributed init method to make sure + # the distributed megatron code (e.g., get world size) works correctly. 
+ distributed_init_method = f"tcp://localhost:{port}" + return distributed_init_method, None + # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index fff6920be72b0..720b52cfc6904 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -63,6 +63,7 @@ def tensor_model_parallel_gather(input_: torch.Tensor, all the ranks. """ world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ @@ -72,19 +73,22 @@ def tensor_model_parallel_gather(input_: torch.Tensor, # Convert negative dim to positive. dim += input_.dim() # Allocate output tensor. - if get_tensor_model_parallel_rank() == dst: - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - else: - gather_list = None + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + # if get_tensor_model_parallel_rank() == dst: + # gather_list = [torch.empty_like(input_) for _ in range(world_size)] + # else: + # gather_list = None # Gather. - torch.distributed.gather(input_, - gather_list, - dst=dst, + + #print(f'>>> world size {world_size}, {gather_list}, {dst} {get_tensor_model_parallel_group()}') + torch.distributed.all_gather(gather_list, input_, group=get_tensor_model_parallel_group()) - if get_tensor_model_parallel_rank() == dst: - output_tensor = torch.cat(gather_list, dim=dim) - else: - output_tensor = None + output_tensor = torch.cat(gather_list, dim=dim) + # if get_tensor_model_parallel_rank() == dst: + # output_tensor = torch.cat(gather_list, dim=dim) + # else: + # output_tensor = None + #print(f'>>> output_tensor {output_tensor}, {dst}, {dim}') return output_tensor diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 845283586e147..aafd7306acf5d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -72,7 +72,10 @@ def init_model(self) -> None: # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") + self.rank = self.rank if self.rank is not None else int( + os.getenv("RANK", "-1")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -240,8 +243,9 @@ def _init_distributed_environment( torch.distributed.init_process_group( backend="nccl", world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, + #rank=rank, + #init_method=distributed_init_method, + init_method="env://", ) # A small all_reduce for warmup. 
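A minimal sketch of the env-based rendezvous this patch switches to (illustration only, not part of the diff): when launched under torchrun, RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are already exported, so each worker can join the NCCL process group with init_method="env://" and then run the small warmup all-reduce, mirroring the worker.py change above:

    # launched via: torchrun --standalone --nnodes=1 --nproc-per-node=<tp> example.py
    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    torch.cuda.set_device(torch.device(f"cuda:{local_rank}"))
    dist.init_process_group(backend="nccl", init_method="env://")

    # small all-reduce warmup, as the worker does after initialization
    x = torch.ones(1, device="cuda")
    dist.all_reduce(x)
    print(f"rank {dist.get_rank()}/{dist.get_world_size()}: {x.item()}")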
From 184806e673c9e473f5559df5f636e4086ee343b0 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sun, 11 Feb 2024 18:38:56 +0000 Subject: [PATCH 05/34] Add custom matvec kernels and sampler matmul call tuned_gemm --- csrc/custom/custom.cpp | 74 +++++ csrc/custom/custom_kernels.cu | 367 +++++++++++++++++++++++ csrc/custom/fused_kernels.cu | 192 ++++++++++++ setup.py | 6 + vllm/model_executor/layers/sampler.py | 5 +- vllm/model_executor/layers/tuned_gemm.py | 20 +- 6 files changed, 657 insertions(+), 7 deletions(-) create mode 100644 csrc/custom/custom.cpp create mode 100644 csrc/custom/custom_kernels.cu create mode 100644 csrc/custom/fused_kernels.cu diff --git a/csrc/custom/custom.cpp b/csrc/custom/custom.cpp new file mode 100644 index 0000000000000..aeff9cc5e6ae7 --- /dev/null +++ b/csrc/custom/custom.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +namespace py = pybind11; + +// declare templates for front (cpp) and back (cuda) sides of function: +//template + +void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block); +void LLMM_Silu(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block) { + int M = in_a.size(0); + int K = in_a.size(1); + LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block); +} + +void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream,const int rows_per_block); + +//template +void LLMM1(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block=4) { + int M = in_a.size(0); + int K = in_a.size(1); + //if (N != in_b.numel()) + // throw std::invalid_argument("Size mismatch A.numel(): " + std::to_string(in_a.numel()) + // + ", B.numel(): " + std::to_string(in_b.numel())); + + //out_c.resize_({N}); + + // call the kernel function... 
+ LLGemm1(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block); +} + +void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx); + +void LLZZ(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int solidx=0) { + int M = in_a.size(0); + int K = in_a.size(1); + + LLGemmZZ(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),solidx); +} +// instantiate the CPP template for T=float: +//template void AddGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c); + + +void MMGPUKernel(float *in_a, float *in_b, float *out_c, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns, + cudaStream_t stream); + + +void MMCustomGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c) { + auto matA_sizes { in_a.sizes() }; + auto matB_sizes { in_b.sizes() }; + auto matO_sizes { out_c.sizes() }; + MMGPUKernel(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), + matA_sizes[0], matA_sizes[1], + matB_sizes[0], matB_sizes[1], + matO_sizes[0], matO_sizes[1], + at::cuda::getCurrentCUDAStream()); +} + +// declare the extension module with the AddGPU function: +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ + m.doc() = "pybind11 example plugin"; + m.def("LLMM1", &LLMM1); + m.def("LLMM_Silu", &LLMM_Silu); + m.def("LLZZ", &LLZZ); +//m.def("MMCustomGPU", &MMCustomGPU); +} diff --git a/csrc/custom/custom_kernels.cu b/csrc/custom/custom_kernels.cu new file mode 100644 index 0000000000000..b5ab0dbe8317c --- /dev/null +++ b/csrc/custom/custom_kernels.cu @@ -0,0 +1,367 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + //auto dat0 = *(addr_alias); + //auto dat1 = *(addr_alias+1); + //auto dat2 = *(addr_alias+2); + //auto dat3 = *(addr_alias+3); + return make_float4(dat0,dat1,dat2,dat3); +} + +//TBlock fetches entire rows of A, and entire col of B (K dimension); assume N=1 for time being +//grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm1_kernel(float4 *af4, __half2 *bf4, __half2 *c) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK * blockDim.x; + //int row_addr_1 = row_addr + CUDA_NUM_THREADS; + //int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + //int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid/16; + const int qthreadid = threadid%16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + //float4 colB_elem4; + __half2 colB_elem4x,colB_elem4y,colB_elem4z,colB_elem4w; + float4 sum4; //[NUM_A_ROWS_PER_BLOCK]; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + __half2 oval; + + //rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + //rowA_elem4_1 = af4[row_addr_1 + threadid]; + //rowA_elem4_2 = af4[row_addr_2 + threadid]; + //rowA_elem4_3 = af4[row_addr_3 + threadid]; + #pragma unroll + for (int i=0; i(&colB_elem4); + //auto Bf2x = *Bh2ptr; + //auto 
Bf2y = *(Bh2ptr+1); + //auto Bf2z = *(Bh2ptr+2); + //auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2 *>(&rowA_elem4); + __half2 *ah2lptr; + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + //#pragma unroll + //for (int i=0; i8) { + // #pragma unroll + // for (int j=0; j<8; j++) { + // acc[2*threadid] += red_smem[2*threadid][j]; + // acc[2*threadid+1] += red_smem[2*threadid+1][j]; + // } + // } + // #pragma unroll + // for (int j=0; j +void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block=4) { + float4 *af4 = reinterpret_cast(in_a); + auto *bf4 = reinterpret_cast<__half2*>(in_b); + auto *c = reinterpret_cast<__half2*>(out_c); + //constexpr int A_ROWS_PER_BLOCK = 8; + const int NUM_THREADS = K*2/16; + int NUM_BLOCKS = M/rows_per_block; + if (rows_per_block==2) { + LLGemm1_kernel<2><<>>(af4, bf4, c); + } + else if (rows_per_block==4) { + LLGemm1_kernel<4><<>>(af4, bf4, c); + } + else if (rows_per_block==8) { + LLGemm1_kernel<8><<>>(af4, bf4, c); + } + else if (rows_per_block==16) { + LLGemm1_kernel<16><<>>(af4, bf4, c); + } + else { + NUM_BLOCKS = M/4; + LLGemm1_kernel<4><<>>(af4, bf4, c); + } + + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + +// instantiate the kernel template for T=float: +//template void AddGPUKernel(float *in_a, float *in_b, float *out_c, const int M, const int K, cudaStream_t stream); + +const unsigned int TILE_WIDTH = 32; + +// Compute C = A * B +__global__ void matrixMultiplyShared(float *A, float *B, float *C, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) { + __shared__ float sA[TILE_WIDTH][TILE_WIDTH]; // Tile size of 32x32 + __shared__ float sB[TILE_WIDTH][TILE_WIDTH]; + + int Row = blockDim.y * blockIdx.y + threadIdx.y; + int Col = blockDim.x * blockIdx.x + threadIdx.x; + float Cvalue = 0.0; + sA[threadIdx.y][threadIdx.x] = 0.0; + sB[threadIdx.y][threadIdx.x] = 0.0; + + for (int ph = 0; ph < (((numAColumns - 1) / TILE_WIDTH) + 1); ph++) { + if ((Row < numARows) && (threadIdx.x + (ph * TILE_WIDTH)) < numAColumns) { + sA[threadIdx.y][threadIdx.x] = A[(Row * numAColumns) + threadIdx.x + (ph * TILE_WIDTH)]; + } else { + sA[threadIdx.y][threadIdx.x] = 0.0; + } + if (Col < numBColumns && (threadIdx.y + ph * TILE_WIDTH) < numBRows) { + sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + ph * TILE_WIDTH) * numBColumns + Col]; + } else { + sB[threadIdx.y][threadIdx.x] = 0.0; + } + __syncthreads(); + for (int j = 0; j < TILE_WIDTH; ++j) { + Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x]; + } + } + if (Row < numCRows && Col < numCColumns) { + C[Row * numCColumns + Col] = Cvalue; + } +} + + +void MMGPUKernel(float *in_a, float *in_b, float *out_c, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns, + cudaStream_t stream) { + + // Initialize the grid and block dimensions + dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1); + dim3 dimGrid((numCColumns / TILE_WIDTH) + 1, (numCRows / TILE_WIDTH) + 1, 1); + //@@ Launch the GPU Kernel here + matrixMultiplyShared <<>> + (in_a, in_b, out_c, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + + + +template +__global__ +__launch_bounds__(512) +void 
HGEMV_WFPerRow(int m, int n, const _Float16 *A, int lda, const _Float16 *x, _Float16 *y) +{ + int num_row_per_block = CTA / nThreads_per_row; + int row_id = (blockIdx.x*num_row_per_block+threadIdx.y)*MT0; + int inc = (gridDim.x * num_row_per_block)*MT0; + + while (row_id < m) { + float2 sum2[MT0]; + +#pragma unroll + for (int i = 0; i < MT0; ++i) + { + sum2[i] = {0.0,0.0}; + } + + for (int j = threadIdx.x; j < n; j += (nThreads_per_row*MT1)){ + bool is_active = j < n; + if (is_active) { + float2 x2[MT1>>1]; +#pragma unroll + for(int offset = 0; offset < MT1; offset += 2) + { + x2[offset>>1] = {x[j+nThreads_per_row*offset], x[j+nThreads_per_row*(offset+1)]}; + } + float2 a2[MT0][MT1>>1]; +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = 0; offset < MT1; offset += 2) + { + a2[i][offset>>1] = {A[(row_id+i)*n+j+nThreads_per_row*offset], A[(row_id+i)*n+j+nThreads_per_row*(offset+1)]}; + } + } + +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = 0; offset < (MT1>>1); offset++) + { + sum2[i] += a2[i][offset]*x2[offset]; + } + } + + } + } + float sum[MT0]; +#pragma unroll + for (int i = 0; i < MT0; i++) + { + sum[i] = sum2[i].x+sum2[i].y; + } + +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = nThreads_per_row >> 1; offset >= 1; offset = offset >> 1) { + sum[i] += __shfl_down(sum[i], offset, nThreads_per_row); + } + } + if (threadIdx.x == 0) + { +#pragma unroll + for (int i = 0; i < MT0; i++) + { + y[row_id+i] = sum[i]; + } + } + row_id += inc; + } +} + +void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx=0) { + //m -> M, n-> K + dim3 grid(1024); + dim3 block(64, 8); + if (solidx==0) { + HGEMV_WFPerRow<64, 512, 4, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else if (solidx==1) { + HGEMV_WFPerRow<64, 512, 2, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else if (solidx==2) { + HGEMV_WFPerRow<64, 512, 1, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else { + HGEMV_WFPerRow<64, 512, 4, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} diff --git a/csrc/custom/fused_kernels.cu b/csrc/custom/fused_kernels.cu new file mode 100644 index 0000000000000..5a4a11f914eb9 --- /dev/null +++ b/csrc/custom/fused_kernels.cu @@ -0,0 +1,192 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T silu(const T& x) { + // x * sigmoid(x) + return (T) (((float) x) / (1.0f + expf((float) -x))); +} + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + //auto dat0 = *(addr_alias); + //auto dat1 = *(addr_alias+1); + //auto dat2 = *(addr_alias+2); + //auto dat3 = *(addr_alias+3); + return make_float4(dat0,dat1,dat2,dat3); +} + +//TBlock fetches entire rows of A, and entire col of B (K dimension); 
assume N=1 for time being +//grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm_Silu_kernel(float4 *af4, __half2 *bf4, _Float16 *c, const int d) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK/2 * blockDim.x; + const int row_addr_d = row_addr + d * blockDim.x; + //int row_addr_1 = row_addr + CUDA_NUM_THREADS; + //int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + //int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid/16; + const int qthreadid = threadid%16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + //float4 colB_elem4; + __half2 colB_elem4x,colB_elem4y,colB_elem4z,colB_elem4w; + float4 sum4; //[NUM_A_ROWS_PER_BLOCK]; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + __half2 oval; + + //rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + //rowA_elem4_1 = af4[row_addr_1 + threadid]; + //rowA_elem4_2 = af4[row_addr_2 + threadid]; + //rowA_elem4_3 = af4[row_addr_3 + threadid]; + #pragma unroll + for (int i=0; i(&colB_elem4); + //auto Bf2x = *Bh2ptr; + //auto Bf2y = *(Bh2ptr+1); + //auto Bf2z = *(Bh2ptr+2); + //auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2 *>(&rowA_elem4); + __half2 *ah2lptr; + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + //#pragma unroll + //for (int i=0; i +void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block=4) { + float4 *af4 = reinterpret_cast(in_a); + auto *bf4 = reinterpret_cast<__half2*>(in_b); + auto *c = reinterpret_cast<_Float16*>(out_c); + const int d = M/2; + const int NUM_THREADS = K*2/16; + int NUM_BLOCKS = M/rows_per_block; + if (rows_per_block==2) { + LLGemm_Silu_kernel<2><<>>(af4, bf4, c, d); + } + else if (rows_per_block==4) { + LLGemm_Silu_kernel<4><<>>(af4, bf4, c, d); + } + else if (rows_per_block==8) { + LLGemm_Silu_kernel<8><<>>(af4, bf4, c, d); + } + else if (rows_per_block==16) { + LLGemm_Silu_kernel<16><<>>(af4, bf4, c, d); + } + else { + NUM_BLOCKS = M/4; + LLGemm_Silu_kernel<4><<>>(af4, bf4, c, d); + } + + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + diff --git a/setup.py b/setup.py index 15b9a78f6ca27..15715225490af 100644 --- a/setup.py +++ b/setup.py @@ -348,6 +348,12 @@ def get_torch_arch_list() -> Set[str]: ) ext_modules.append(vllm_extension) +custom_extension = CUDAExtension( + name="vllm.custom_ops", + sources=["csrc/custom/custom.cpp", "csrc/custom/custom_kernels.cu", "csrc/custom/fused_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, +) +ext_modules.append(custom_extension) def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index bc86a916b5bbf..d7e56850f076a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -11,6 +11,7 @@ from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.model_executor.layers.tuned_gemm import tgemm class Sampler(nn.Module): """Samples the next tokens from the model's outputs. 
@@ -38,7 +39,8 @@ def __init__(self, def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: # Get the logits for the next tokens. - logits = torch.matmul(hidden_states, embedding.t()) + #logits = torch.matmul(hidden_states, embedding.t()) + logits = tgemm.mm(hidden_states, embedding) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) @@ -59,7 +61,6 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - # Only perform sampling in the driver worker. # Note: `_get_logits` is still distributed across TP workers because # the `embedding` weight is distributed across TP workers. diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 556a972bc3e34..84994d7f9daeb 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -5,7 +5,7 @@ import os import yaml import pandas as pd -#from vllm import custom_ops +from vllm import custom_ops class TunedGemm: @@ -29,7 +29,7 @@ def load_best_sols(self): def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom matvec (only for f16 dtype) - return ds + #return ds if N==1: ds1 = ds.copy() ds1['libtype'] = 'custom' @@ -58,7 +58,14 @@ def create_ds(self): def query_sol(self,m,n,k): return self.solids.get((m,n,k),(0,0)) def mm(self,inp,weights): - inp_view=inp.view(-1,inp.size(-1)) + # F.Linear can take a 3 dimensional input. vllm uses this for linear units. + # However, sampler will use torch.matmul with 2 dimensions only + if inp.dim() == 3: + inp_view=inp.view(-1,inp.size(-1)) + batched = True + else: + inp_view = inp + batched = False #print(f'>>>inp_view {inp_view.shape}') if self.extensions_created == False: rocb_create_extension() @@ -86,8 +93,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) out = F.linear(inp,weights) - return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + if batched: + return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + else: + return out tgemm = TunedGemm() \ No newline at end of file From af9e9d17575b8160c8bf352697ea9efd936bb45c Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 16:44:33 +0000 Subject: [PATCH 06/34] Add silu gemm fusion when batch and seq_len = 1 --- vllm/model_executor/layers/linear.py | 2 -- vllm/model_executor/layers/tuned_gemm.py | 5 ++--- vllm/model_executor/models/llama.py | 13 +++++++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 63ea7b856c1ee..e88ec167b5e3e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -68,8 +68,6 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - #print(f">>> HELOOOOOOOOOOOOOO apply_weights {x.shape}, {weight.shape}, {bias}") - if bias: return F.linear(x, weight) + bias return F.linear(x, weight) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 84994d7f9daeb..0cc7b866dd219 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ 
-29,7 +29,6 @@ def load_best_sols(self): def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom matvec (only for f16 dtype) - #return ds if N==1: ds1 = ds.copy() ds1['libtype'] = 'custom' @@ -93,11 +92,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) out = F.linear(inp,weights) if batched: return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) else: return out -tgemm = TunedGemm() \ No newline at end of file +tgemm = TunedGemm() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e5a1abebf1420..52752e64e5d48 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -46,6 +46,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig +from vllm import custom_ops KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -74,8 +75,16 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) + #print(f'>>>Shape of x in mlp {x.shape} {self.gate_up_proj.weight.shape}') + if x.shape[0] == 1 and x.shape[1] == 1: + + out = torch.empty(x.shape[0],self.gate_up_proj.weight.shape[0]//2,dtype=x.dtype,device=x.device) + custom_ops.LLMM_Silu(self.gate_up_proj.weight,x.view(-1,x.size(-1)),out,8) + x = out.view(x.shape[0], x.shape[1], out.shape[1]) + else: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + #print(f'>>> x.shape {x.shape}') x, _ = self.down_proj(x) return x From 5f8eac35085a16d15b903009a5a381920eaa03a3 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 17:52:42 +0000 Subject: [PATCH 07/34] Add tunable flags to VLLM --- run_70b.sh | 109 +++++++++++++++++++++++ run_70b_fast.sh | 69 ++++++++++++++ vllm/model_executor/layers/tuned_gemm.py | 20 +++-- 3 files changed, 190 insertions(+), 8 deletions(-) create mode 100644 run_70b.sh create mode 100644 run_70b_fast.sh diff --git a/run_70b.sh b/run_70b.sh new file mode 100644 index 0000000000000..46e342826b2a7 --- /dev/null +++ b/run_70b.sh @@ -0,0 +1,109 @@ +#!/bin/bash +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm-private +GRAD_DIR=$BASE_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` +#MODEL=/data/llama-2-13b-chat-hf +GEMM_TUNER=1 +#TP="1 2 4 8" +TP=8 +#Flag to use Triton Flash Attention vs CK +export VLLM_USE_TRITON=1 + +#Gemm tuner flags +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. 
+DELETE_TUNED_CSV=1 +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 +#Enable full decoder graph mode +HIP_GRAPH=--use-cuda-graph +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--profile" +#ROCPROF_PROFILE="rocprof --hip-trace" +GEN_LEN="1 32" +#INPUT_LEN="512 1024 2048 3072" +INPUT_LEN="512 1024 2048 3072 4096 6144 8192 16384" +ITER=10 +# pring usage of the parameters +usage() { + echo "Usage: $0 [--tp ] [--model ]" + exit 1 +} +# parse parameters +while [[ "$#" -gt 0 ]]; do + case $1 in + --tp) TP="$2"; shift ;; + --model) MODEL="$2"; shift ;; + --notune) GEMM_TUNER=0; shift ;; + *) usage ;; # Any other argument will show usage information. + esac + shift # Move to next argument +done +for tp in $TP; +do + if (( $GEMM_TUNER )); + then + echo "tuned_gemm_csv: ./tuned_tp$tp.csv" > $VLLM_DIR/tuned_perf_tp$tp.yaml + tuned_file=$VLLM_DIR/tuned_tp$tp.csv + if [[ $DELETE_TUNED_CSV == 1 || ! -f $VLLM_DIR/tuned_tp$tp.csv ]]; + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf.yaml + if [[ $DELETE_TUNED_CSV == 1 ]]; + then + rm -rf $tuned_file + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --output $tuned_file --tp $tp + fi + export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --output $VLLM_TUNE_FILE --tp $tp + + + echo "================================= TUNED GEMMS $tuned_file ===============================================" + cat $tuned_file + + fi + + cd $VLLM_DIR + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + if [[ -v RPD_PROFILE ]] ; + then + rm /workspace/trace.rpd + python -m rocpd.schema --create /workspace/trace.rpd + fi + echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER $HIP_GRAPH $RPD_PROFILE + if [[ -v ROCPROF_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + mv $VLLM_DIR/results.json $TRACE_FILE + fi + if [[ -v RPD_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE + fi + done + done +done \ No newline at end of file diff --git a/run_70b_fast.sh b/run_70b_fast.sh new file mode 100644 index 0000000000000..585e0ebdd000c --- /dev/null +++ b/run_70b_fast.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -e +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm-private +GRAD_DIR=/trees/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` + +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR/"tuned.csv" + +#Flag to use Triton Flash Attention vs CK +export VLLM_USE_TRITON=1 + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. 
+#DELETE_TUNED_CSV=1 + +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 + +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 + +#Enable full decoder graph mode +HIP_GRAPH=--use-cuda-graph + +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ + +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--profile" +#ROCPROF_PROFILE="rocprof --hip-trace" + +#TP="1 2 4 8" +TP=8 +GEN_LEN="1,32" +INPUT_LEN="512 1024 2048 3072" +#INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" +BATCH_SIZE="1" +ITER=10 + +rm -f $VLLM_UNTUNE_FILE +for tp in $TP; +do + cd $VLLM_DIR + export VLLM_TUNE_GEMM=1 + echo "================================= WARMING UP $MODEL ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ + --tensor-parallel-size $tp --num-iters 1 --warmup-only + + if [ -f $VLLM_UNTUNE_FILE ]; then + echo "=============================== Tuning ======================================" + python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE + echo "File does not exist." + fi + echo "================================= TUNED GEMMS $tuned_file ===============================================" + cat $VLLM_TUNE_FILE + + export VLLM_TUNE_GEMM=0 + echo "================================= RUNNING $MODEL ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ + --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv $HIP_GRAPH +done \ No newline at end of file diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 0cc7b866dd219..6d0a3ed3f2a60 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -2,6 +2,7 @@ import torch.nn.functional as F from rocsolidxgemm import rocb_create_extension,rocb_mm from hipbsolidxgemm import hipb_create_extension,hipb_mm +from pathlib import Path import os import yaml import pandas as pd @@ -16,16 +17,19 @@ def __init__(self): self.bestsols = {} self.load_best_sols() self.create_ds() + self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) + self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") + self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") + + if (self.save_gemm == 1): + self.tuned_df = pd.DataFrame(columns=['M','N','K']) + else: + self.tuned_df = None + def load_best_sols(self): - perfbits = {} - perf_file = os.environ.get('VLLM_PERF_YAML') - if perf_file is not None: - with open(perf_file, 'r') as file: - perfbits = yaml.safe_load(file) + if self.tune_path is not None and Path(self.tune_path).is_file(): + self.bestsols = pd.read_csv(self.tune_path) - tune_file = perfbits.get('tuned_gemm_csv',None) - if tune_file is not None: - self.bestsols = pd.read_csv(tune_file,index_col=[0]) def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom matvec (only for f16 dtype) From 22766b48ecb18c037dc49004717d66a9027fcee2 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 23:51:08 +0000 Subject: [PATCH 08/34] Allow benchmark_latency to take a list of input/output/batches for faster 
execution Also add reporting functionality for easy display --- benchmarks/benchmark_latency.py | 118 ++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d75d690cc66d4..f9b49ebfaa132 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -3,17 +3,23 @@ import time from pathlib import Path from typing import Optional - +import pandas as pd import numpy as np import torch from tqdm import tqdm from vllm import LLM, SamplingParams +from torch.profiler import profile, record_function, ProfilerActivity +def list_of_ints(arg): + return list(map(int, arg.split(','))) def main(args: argparse.Namespace): print(args) + print(f'>>>Loading LLM') + if args.report: + results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. llm = LLM( @@ -26,57 +32,62 @@ def main(args: argparse.Namespace): enforce_eager=args.enforce_eager, ) - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size + for batch_size in args.batch_size: + for output_len in args.output_len: + for input_len in args.input_len: + print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}') + sampling_params = SamplingParams( + n=args.n, + temperature=0.0 if args.use_beam_search else 1.0, + top_p=1.0, + use_beam_search=args.use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = [[0] * input_len] * batch_size + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - else: - start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency + print("Warming up...") + run_to_completion(profile_dir=None) - print("Warming up...") - run_to_completion(profile_dir=None) + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." 
+ ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=args.profile_result_dir) + return - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) - return + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + print(f'Avg latency: {np.mean(latencies)} seconds') - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - print(f'Avg latency: {np.mean(latencies)} seconds') if __name__ == '__main__': @@ -90,9 +101,9 @@ def run_to_completion(profile_dir: Optional[str] = None): choices=['awq', 'gptq', 'squeezellm', None], default=None) parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--input-len', type=list_of_ints, default=32) + parser.add_argument('--output-len', type=list_of_ints, default=128) + parser.add_argument('--batch-size', type=list_of_ints, default=8) parser.add_argument('--n', type=int, default=1, @@ -127,5 +138,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument('--warmup-only', action='store_true', + help='only run warmup, useful for tuning') + parser.add_argument('--report', action='store_true', + help='turn on dataframe reporting') + parser.add_argument('--report-file', type=str, default=None) args = parser.parse_args() main(args) From 87b4c1bd94a03a9d706356ea4f9d39a4514074f7 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:12:26 +0000 Subject: [PATCH 09/34] Add dynamic tuning feature to vllm --- gradlib/csrc/grad_funcs.cu | 413 +++++++++++++++ gradlib/csrc/hipbsolgemm.cu | 610 +++++++++++++++++++++++ gradlib/csrc/rocsolgemm.cu | 563 +++++++++++++++++++++ gradlib/gemm_runner.py | 62 +++ gradlib/gemm_tuner.py | 92 ++++ gradlib/gradlib/GemmTuner.py | 208 ++++++++ gradlib/mm_test.py | 234 +++++++++ gradlib/setup.py | 136 +++++ run_70b.sh | 39 +- run_70b_fast.sh | 16 +- vllm/model_executor/layers/tuned_gemm.py | 7 +- 11 files changed, 2342 insertions(+), 38 deletions(-) create mode 100644 gradlib/csrc/grad_funcs.cu create mode 100644 gradlib/csrc/hipbsolgemm.cu create mode 100644 gradlib/csrc/rocsolgemm.cu create mode 100644 gradlib/gemm_runner.py create mode 100644 gradlib/gemm_tuner.py create mode 100644 gradlib/gradlib/GemmTuner.py create mode 100644 gradlib/mm_test.py create mode 100644 gradlib/setup.py mode change 100644 => 100755 run_70b.sh mode change 100644 => 100755 run_70b_fast.sh diff --git a/gradlib/csrc/grad_funcs.cu b/gradlib/csrc/grad_funcs.cu new file mode 100644 index 0000000000000..f6498fb2a3ba7 --- /dev/null +++ b/gradlib/csrc/grad_funcs.cu @@ -0,0 +1,413 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip 
float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +//#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? 
+ /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + uint64_t workspace_size = 32*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipblasDatatype_t dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = true; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * hipBLASLt GEMM call +*/ +hipblasStatus_t hipblasLtMatmul_wrapper( + hipblasLtHandle_t handle, + hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, + const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipblasDatatype_t dtype, + hipStream_t &stream) +{ + // TODO: flag is not supported for hipblasLt yet + int flag { 0 }; + if (dtype == HIPBLAS_R_16F) { + // use fp16 alt impl for MI200 + // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices + flag = rocblas_gemm_flags_fp16_alt_impl; + } + + nvtxRangePushA("hipBLASLt variables creation"); + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + nvtxRangePop(); + + // if heuristic does not exist in the map, do search and push into the map + auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; + if (heuristic_map.count(gemm_key) <= 0) { + nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! 
request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } + nvtxRangePop(); + } + + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, + &heuristic_map[gemm_key].algo, + d_workspace, workspace_size, + stream); + + nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + nvtxRangePop(); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor hipBLASLtMm_( + const torch::Tensor& mat1, + const torch::Tensor& mat2) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + 
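+    // hipBLASLt, like BLAS, expects column-major operands, while these torch
+    // tensors are row-major. The row-major product C = mat1 * mat2 is therefore
+    // computed as the column-major product C^T = mat2^T * mat1^T, which is why
+    // transpose_result is set and the operand sizes/strides are swapped below.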
+ bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + int flag { 0 }; + hipblasDatatype_t hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIPBLAS_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIPBLAS_R_16B; + } else if (abcType == at::kFloat) { + hipblasType = HIPBLAS_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? 
HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream)); + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void create_extension() +{ + CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void destroy_extension() +{ + CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("create_extension", &create_extension, "create_extension"); + m.def("destroy_extension", &destroy_extension, "destroy_extension"); + m.def("mm", &hipBLASLtMm_, "mm"); +} diff --git a/gradlib/csrc/hipbsolgemm.cu b/gradlib/csrc/hipbsolgemm.cu new file mode 100644 index 0000000000000..bf15fb1297667 --- /dev/null +++ b/gradlib/csrc/hipbsolgemm.cu @@ -0,0 +1,610 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +//#include + + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? + /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + size_t workspace_size = 2*128*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipDataType dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = false; + + //std::vector heuristicResult; +} + +//find all hipblaslt solutions for given gemm problem +std::vector hipblasLtMatmul_findallsols_wrapper( + hipblasLtHandle_t handle, + hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, + const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipDataType dtype, + hipStream_t &stream) +{ + int flag { 0 }; + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, HIP_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + + //std::vector heuristicResult(10); + //CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + // handle, matmul, matA, matB, matC, matC, + // preference, 10, heuristicResult.data(), &returnedAlgoCount)); + std::vector heuristicResult; + CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAllAlgos(handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + op_A, + op_B, + dtype, + dtype, + dtype, + dtype, + HIPBLAS_COMPUTE_32F, + heuristicResult)); + + std::vector algoIndex; + int returned_algo_count = 
heuristicResult.size(); + //for (int i = 0; i < returnedAlgoCount; i++) { + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristicResult[i].algo; + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, + alpha, + matA, + matB, + beta, + matC, + matC, + algo, + ret_workspace_size + ); + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size heuristicResult(1); + if (solution_index<0) { + //nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + std::cout << "Warning! HipbSolId Gemm Fallback Path used for solution index <0" << std::endl; + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + //std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + //heuristic_map[gemm_key] = heuristicResult[0]; +/* + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } +*/ + //nvtxRangePop(); + } else { + std::vector algoIndex(1); + algoIndex[0]=solution_index; + //std::vector tmpAlgo; + CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAlgosFromIndex(handle, algoIndex, heuristicResult)); + } + + //size_t ret_workspace_size = 0; + + //auto status1 = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, + // alpha, + // matA, + // matB, + // beta, + // matC, + // matC, + // heuristicResult[0].algo, + // 
ret_workspace_size + //); + //if (status1 == HIPBLAS_STATUS_SUCCESS) { + // std::cout << "Workspace size" << ret_workspace_size << std::endl; + + //} else { + // std::cout << "Algo not supported!!!" << std::endl; + + //} + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, + &heuristicResult[0].algo, + d_workspace, workspace_size, + stream); + + //nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + //nvtxRangePop(); + + return status; +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor HipbSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2, + const int solution_index + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 
0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + hipDataType hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIP_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIP_R_16BF; + } else if (abcType == at::kFloat) { + hipblasType = HIP_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_sol_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream,solution_index)); + + return result; +} + +//find all hipblas solutions and return them to python land +std::vector HipbFindAllSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 
0 : 1); + hipDataType hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIP_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIP_R_16BF; + } else if (abcType == at::kFloat) { + hipblasType = HIP_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + return hipblasLtMatmul_findallsols_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream); + +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void hipb_create_extension() +{ + //CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + //CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + //CHECK_HIP_ERROR(hipEventCreate(&start)); + //CHECK_HIP_ERROR(hipEventCreate(&stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void hipb_destroy_extension() +{ + //CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + //CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + //CHECK_HIP_ERROR(hipEventDestroy(start)); + //CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("hipb_create_extension", &hipb_create_extension, "create_extension"); + m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension"); + m.def("hipb_mm", &HipbSolIdxBlas, "mm"); + m.def("hipb_findallsols", &HipbFindAllSolIdxBlas, "hipblas_find_all_sols"); +} diff --git a/gradlib/csrc/rocsolgemm.cu b/gradlib/csrc/rocsolgemm.cu new file mode 100644 index 0000000000000..d691fcac416a6 --- /dev/null +++ b/gradlib/csrc/rocsolgemm.cu @@ -0,0 +1,563 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#define ROCBLAS_NO_DEPRECATED_WARNINGS +#define ROCBLAS_BETA_FEATURES_API + +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +//#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +#include + + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL 
(PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + rocblas_handle r_handle; + + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? + /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + uint64_t workspace_size = 32*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipblasDatatype_t dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = true; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * hipBLASLt GEMM call +*/ +/* +hipblasStatus_t hipblasLtMatmul_wrapper( + hipblasLtHandle_t handle, + hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, + const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipblasDatatype_t dtype, + hipStream_t &stream) +{ + // TODO: flag is not supported for hipblasLt yet + int flag { 0 }; + if (dtype == HIPBLAS_R_16F) { + // use fp16 alt impl for MI200 + // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices + flag = rocblas_gemm_flags_fp16_alt_impl; + } + + nvtxRangePushA("hipBLASLt variables creation"); + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, 
ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + nvtxRangePop(); + + // if heuristic does not exist in the map, do search and push into the map + auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; + if (heuristic_map.count(gemm_key) <= 0) { + nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } + nvtxRangePop(); + } + + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, + &heuristic_map[gemm_key].algo, + d_workspace, workspace_size, + stream); + + nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + 
CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + nvtxRangePop(); + + return status; +} +*/ +///////////////////////////////////////////////////////////////////////////////////////////////////////// +std::vector RocFindAllSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + rocblas_set_stream(r_handle, current_stream); + uint32_t flags { 0 }; + rocblas_datatype abcRtype; + if (abcType == at::kHalf) { + abcRtype = rocblas_datatype_f16_r; + } else if (abcType == at::kBFloat16) { + abcRtype = rocblas_datatype_bf16_r; + } else if (abcType == at::kFloat) { + abcRtype = rocblas_datatype_f32_r; + } else { + assert(false && "Wrong datatype!"); + } + + #define GEMM_EX_ARGS \ + r_handle, transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, \ + m, n, k, &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, &zero, ptrC, \ + abcRtype, result_ld, ptrC, abcRtype, result_ld, rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index + + rocblas_int sizeSolve; + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, NULL, &sizeSolve); + + // Fill array with list of solutions that match type + // Note: some of these may be invalid + std::vector solutionsSolve(sizeSolve); + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, solutionsSolve.data(), &sizeSolve); + + std::vector validSolutions; + for(auto sol : solutionsSolve) { + auto status = rocblas_gemm_ex(r_handle, + transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, + transpose_mat2 ? rocblas_operation_transpose : rocblas_operation_none, + m, n, k, + &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, + &zero, ptrC, abcRtype, result_ld, + ptrC, abcRtype, result_ld, + rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, sol, rocblas_gemm_flags_none); + if (status == rocblas_status_success) { + validSolutions.push_back(sol); + } + } + + return validSolutions; +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor RocSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2, + const int32_t solution_index=0 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? 
"true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + /* + int flag { 0 }; + hipblasDatatype_t hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIPBLAS_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIPBLAS_R_16B; + } else if (abcType == at::kFloat) { + hipblasType = HIPBLAS_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + */ + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + /* + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream)); + */ + rocblas_set_stream(r_handle, current_stream); + uint32_t flags { 0 }; + //int32_t solution_index {0}; + rocblas_datatype abcRtype; + if (abcType == at::kHalf) { + abcRtype = rocblas_datatype_f16_r; + } else if (abcType == at::kBFloat16) { + abcRtype = rocblas_datatype_bf16_r; + } else if (abcType == at::kFloat) { + abcRtype = rocblas_datatype_f32_r; + } else { + assert(false && "Wrong datatype!"); + } + + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex(r_handle, + transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, + transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, + m, n, k, + &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, + &zero, ptrC, abcRtype, result_ld, + ptrC, abcRtype, result_ld, + rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, solution_index, flags); + //); + + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void rocb_create_extension() +{ + /* + CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); */ + rocblas_create_handle(&r_handle); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void rocb_destroy_extension() +{ + /* + CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); */ + rocblas_destroy_handle(r_handle); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("rocb_create_extension", &rocb_create_extension, "create_extension"); + m.def("rocb_destroy_extension", &rocb_destroy_extension, "destroy_extension"); + m.def("rocb_mm", &RocSolIdxBlas, "mm"); + m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); +} diff --git a/gradlib/gemm_runner.py b/gradlib/gemm_runner.py new file mode 100644 index 0000000000000..34a246771a820 --- /dev/null +++ b/gradlib/gemm_runner.py @@ -0,0 +1,62 @@ +import torch +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import timeit + +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +class TunedGemm: + def __init__(self,tuned_csv_file): + self.bestsols = pd.read_csv(tuned_csv_file,index_col=[0]) + self.create_ds() + def create_ds(self): + df = self.bestsols + solds = {} + for i in range(len(df)): + ds = df.iloc[i] + key = (ds['M'],ds['N'],ds['K']) + if ds['libtype']=='hipblaslt': soltype = 1 + elif ds['libtype']=='rocblas': soltype = 2 + solds[key] = (soltype,int(ds['solidx'])) + #print(solds) + self.solids = solds + def query_sol(self,m,n,k): + return self.solids.get((m,n,k),(0,0)) + def mm(self,inp,weights): + soltype,solidx = self.query_sol(m=weights.shape[0],n=inp.shape[0],k=inp.shape[1]) + if soltype==1: + out = hipbsolidxgemm.hipb_mm(inp,weights.t(),solidx) + elif soltype==2: + out = rocsolidxgemm.rocb_mm(inp,weights.t(),solidx) + else: + out = F.linear(inp,weights) + return out + def run_all_tuned_sols(self): + for i in range(len(self.bestsols)): + ds = self.bestsols.iloc[i] + print('>>> Running tuned solution') + print(ds) + inp = torch.randn((ds['N'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') + weights 
= torch.randn((ds['M'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') + self.mm(inp,weights) + +def get_dtype(dtype_csv): + if dtype_csv=='torch.float16': + dtype = torch.float16 + elif dtype_csv=='torch.bfloat16': + dtype = torch.bfloat16 + elif dtype_csv=='torch.float32': + dtype = torch.float32 + return dtype + +if __name__ == '__main__': + tgemm = TunedGemm(sys.argv[1]) #csv file with tuned sols goes in argv[1] + print(tgemm.bestsols) + tgemm.run_all_tuned_sols() + + diff --git a/gradlib/gemm_tuner.py b/gradlib/gemm_tuner.py new file mode 100644 index 0000000000000..b6c69379cf6c6 --- /dev/null +++ b/gradlib/gemm_tuner.py @@ -0,0 +1,92 @@ +import torch +import os +import argparse +from gradlib.GemmTuner import GemmTuner +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import json +import random +from pathlib import Path +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +''' +{'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 5120, 'initializer_range': 0.02, +'intermediate_size': 13824, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 40, 'num_hidden_layers': 40, 'num_key_value_heads': 40, +'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float16', 'transformers_version': '4.33.0.dev0', 'use_cache': True, 'vocab_size': 32000} +''' +def generate_mk_sets(model_dir, tp=1): + f = open(f'{model_dir}/config.json') + data = json.load(f) + hidden_size = data['hidden_size'] + intermediate_size = data['intermediate_size'] + total_num_heads = data['num_attention_heads'] + total_num_kv_heads = data['num_key_value_heads'] + head_dim = hidden_size // total_num_heads + return [((total_num_heads + (2*total_num_kv_heads)) * head_dim // tp, hidden_size), (hidden_size, hidden_size // tp), (intermediate_size *2 // tp, hidden_size), (hidden_size, intermediate_size // tp) ], hidden_size + +def get_dtype(dtype_str): + dtype = torch.float16 + if dtype_str == 'f32': + dtype = torch.float32 + elif dtype_str == 'bf16': + dtype = torch.bfloat16 + elif dtype_str == 'f16': + dtype = torch.float16 + else: + print('>>> Warning! 
Invalid dtype', dtype_str, 'using default dtype f16') + return dtype + + +def list_of_ints(arg): + return list(map(int, arg.split(','))) + +def load_input_gemms(input_file): + if Path(input_file).is_file(): + return + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, default=os.getenv('GTUNE_MODEL', ""), help="Enter the location of your model directory") + parser.add_argument("--tuned_file", type=str, default=os.getenv('GTUNE_TUNED', "tuned.csv"), help="output file for tuned gemm solutions") + parser.add_argument("--input_file", type=str, default=os.getenv('GTUNE_INPUT', None), help="list of gemms to tune for, mutually exclusive with model_dir") + parser.add_argument("--tp", type=int, default=os.getenv('GTUNE_TP', 1), help="Tensor parallelism to be used.") + parser.add_argument("--dtype", type=str, default='f16', help="dtype f32 f16 bf16") + parser.add_argument("--rocblas-decode", action="store_true", default=False, help="forces rocblas solution on decode N=1") + parser.add_argument("--batch_size", type=int, default=os.getenv('GTUNE_BATCH_SIZE', 1), help="Batch size to tune for") + parser.add_argument("--nsets", type=list_of_ints, default=[1, 512, 1024, 2048, 3072, 4096, 8192, 16384], help="N sizes to tune for: 1,128,2048") + args = parser.parse_args() + + dtype = get_dtype(args.dtype) + + gtuner = GemmTuner(dtype, args.tuned_file, args.rocblas_decode) + nsets = [i * args.batch_size for i in args.nsets] + if args.input_file: + print(f">>> Loading {args.input_file}") + if not Path(args.input_file).is_file(): + print(f">>> ERROR: {args.input_file} does not exist. Exiting") + exit(1) + shapes = pd.read_csv(args.input_file) + for i in range(len(shapes)): + ds = shapes.iloc[i] + gtuner.add_gemm(ds['M'],ds['N'],ds['K']) + else: + if not args.model_dir: + print(">>> Warning! NO MODEL SPECIFIED. 
Tuning for LL2 13B TP1") + #LL2 13B sizes + mksets = [(15360, 5120), (5120, 5120), (27648, 5120), (5120, 13824)] + gtuner.add_gemm(m=32000, n=1, k=5120) # logits gemm + else: + mksets, hidden_size = generate_mk_sets(args.model_dir, args.tp) + gtuner.add_gemm(m=32000//args.tp, n=1 * args.batch_size, k=hidden_size) #TODO: Handle cases where vocab_size is not divisible by tp + + for n in sorted(nsets): + for m, k in mksets: + gtuner.add_gemm(m, n, k) + + gtuner.find_best_sols() diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py new file mode 100644 index 0000000000000..273042cb12a05 --- /dev/null +++ b/gradlib/gradlib/GemmTuner.py @@ -0,0 +1,208 @@ +import torch +import os +import argparse +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import json +import random +from pathlib import Path +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +rtol = 1e-5 +atol = 1 +dtype = torch.float16 + +class Gemm: + def __init__(self,m,n,k,dtype,rocblas_decode=False): + self.m=m + self.k=k + self.n=n + self.dtype=dtype + self.nb = 37 + self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') + self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') + #weights2 is used in measurement/warm iters to ensure HBM fetch for weight tensors + self.weights2 = torch.randn((self.nb, self.m, self.k), dtype=self.dtype, device='cuda') + self.blob = torch.ones(128*1024*1024, dtype=torch.float32, device='cuda') + self.topn = 20 #number of top solutions from each source + self.hipb_sols=[] + self.rtol = 1e-5 + self.atol = 1 + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.hipb_prefer_ratio = 0.995 #prefer hipblaslt unless rocblas time is less than this ratio of hipblaslt time + self.rocblas_decode=rocblas_decode + + + def find_hipblas_sols(self): + sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) + print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols), flush=True) + #print(sols) + self.hipb_sols = sols + + + def check_gemm_ref(self,libtype,solidx): + ref = F.linear(self.inp,self.weights) + if libtype == 'hipblaslt': + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + elif libtype == 'rocblas': + c = rocsolidxgemm.rocb_mm(self.inp,self.weights.t(),solidx) + if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): + #print('>>>',libtype,'Solidx',solidx,'passed reference test') + return True + else: + print('>>>',libtype,'Solidx',solidx,'FAILED reference test', flush=True) + print(ref, flush=True) + print(c, flush=True) + return False + def hipb_time_sol(self,solidx,cold_iters=2,warm_iters=10): + #print('>>>hipbtime',solidx) + for i in range(cold_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + self.start.record() + for i in range(warm_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights2 [random.randint(0,self.nb-1)].t(),solidx) + self.end.record() + torch.cuda.synchronize() + gtime = self.start.elapsed_time(self.end)/warm_iters + #print('>>> Solidx GTime',solidx,gtime,'ms') + return gtime + def hipb_time_all_sols(self,fast_mode=0,top_sols=0): + coldi=20; warmi=20 + if fast_mode: coldi=2; warmi=2 + solutions = self.hipb_sols + if top_sols: solutions = self.hipb_top_sols + gtimes = {} + for solidx in solutions: + gtimes[solidx] = self.hipb_time_sol(solidx, cold_iters=coldi, warm_iters=warmi) + self.hipb_gtimedf = 
pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems')
+        self.hipb_gtimedf.to_csv('/tmp/hipb_gtimedf.csv')
+        print('>>> HipBlasLt top solutions, Fast Mode',fast_mode)
+        print(self.hipb_gtimedf.head(self.topn))
+    def rocb_time_sol(self, solidx, cold_iters=2, warm_iters=10):
+        for i in range(cold_iters):
+            c = rocsolidxgemm.rocb_mm(self.inp, self.weights.t(), solidx)
+        self.start.record()
+        for i in range(warm_iters):
+            c = rocsolidxgemm.rocb_mm(self.inp, self.weights2[random.randint(0, self.nb-1)].t(), solidx)
+        self.end.record()
+        torch.cuda.synchronize()
+        gtime = self.start.elapsed_time(self.end)/warm_iters
+        #print('>>> RocSolidx GTime',solidx,gtime,'ms')
+        return gtime
+    def find_rocblas_sols(self):
+        sols = rocsolidxgemm.rocb_findallsols(self.inp,self.weights.t())
+        print('M N K',self.m,self.n,self.k,'>>> Total rocb solutions',len(sols), flush=True)
+        #print(sols)
+        self.rocb_sols = sols
+    def rocb_time_all_sols(self,fast_mode=0,top_sols=0):
+        coldi=20; warmi=20
+        if fast_mode: coldi=2; warmi=2
+        solutions = self.rocb_sols
+        if top_sols: solutions = self.rocb_top_sols
+        gtimes = {}
+        for solidx in solutions:
+            gtimes[solidx] = self.rocb_time_sol(solidx,coldi,warmi)
+        self.rocb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems')
+        self.rocb_gtimedf.to_csv('/tmp/rocb_gtimedf.csv')
+        print('>>> Rocblas top solutions, Fast Mode',fast_mode, flush=True)
+        print(self.rocb_gtimedf.head(self.topn), flush=True)
+    def warmup(self,warmi=500):
+        for i in range(warmi):
+            self.blob = self.blob + 0.00001
+    def functional_check_topn_fastest(self):
+        rocb_topn = []
+        for solidx in self.rocb_gtimedf.index[:self.topn]:
+            if self.check_gemm_ref(libtype='rocblas',solidx=solidx):
+                rocb_topn.append(solidx)
+        self.rocb_top_sols = rocb_topn
+        hipb_topn = []
+        for solidx in self.hipb_gtimedf.index[:self.topn]:
+            if self.check_gemm_ref(libtype='hipblaslt',solidx=solidx):
+                hipb_topn.append(solidx)
+        self.hipb_top_sols = hipb_topn
+
+    def find_fastest_solution(self):
+        self.find_rocblas_sols()
+        if not (self.rocblas_decode and self.n == 1):
+            self.find_hipblas_sols()
+        self.warmup()
+        self.rocb_time_all_sols(fast_mode=1)
+        self.warmup()
+        self.hipb_time_all_sols(fast_mode=1)
+        self.functional_check_topn_fastest()
+        self.warmup()
+        self.rocb_time_all_sols(fast_mode=0,top_sols=1)
+        self.warmup()
+        self.hipb_time_all_sols(fast_mode=0,top_sols=1)
+        if len(self.rocb_gtimedf)>0 and len(self.hipb_gtimedf)>0:
+            best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0]
+            best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0]
+            if best_rocb_time < best_hipb_time*self.hipb_prefer_ratio:
+                self.best_libtype = 'rocblas'
+                self.best_solidx = self.rocb_gtimedf.index[0]
+                self.best_soltime = best_rocb_time
+            else:
+                self.best_libtype = 'hipblaslt'
+                self.best_solidx = self.hipb_gtimedf.index[0]
+                self.best_soltime = best_hipb_time
+        elif len(self.hipb_gtimedf)>0:
+            print('>>> Only hipblas solutions found!',flush=True)
+            best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0]
+            self.best_libtype = 'hipblaslt'
+            self.best_solidx = self.hipb_gtimedf.index[0]
+            self.best_soltime = best_hipb_time
+        elif len(self.rocb_gtimedf)>0:
+            print('>>> Only rocblas solutions found!',flush=True)
+            best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0]
+            self.best_libtype = 'rocblas'
+            self.best_solidx = self.rocb_gtimedf.index[0]
+            self.best_soltime = best_rocb_time
+        else:
+            print('>>> No rocblas or hipblas solutions found!',flush=True)
+            self.best_libtype = 'rocblas'
+            self.best_solidx = 0
+            self.best_soltime = 0
+        print('>>> Fastest Solution is',self.best_libtype,self.best_solidx,self.best_soltime,flush=True)
+
+
+class GemmTuner:
+    def __init__(self, dtype, tuned_file=None, rocblas_decode=False):
+        self.gemm_problems = pd.DataFrame(columns=['M','N','K'])
+        self.dtype = dtype
+        
self.rocblas_decode = rocblas_decode + self.tuned_file = tuned_file + if Path(tuned_file).is_file(): + self.gdf = pd.read_csv(tuned_file) + else: + self.gdf = None + + def add_gemm(self,m,n,k): + if ( self.gdf is None or (self.gdf[(self.gdf['M'] == m) & (self.gdf['N'] == n) & (self.gdf['K'] == k)].empty)): + entry = {'M':[m], 'N':[n], 'K':[k]} + df = pd.DataFrame(entry) + self.gemm_problems = pd.concat([self.gemm_problems, df],ignore_index=True) + else: + print(f">>>Info: Found Duplicate shape(M:{m}, N:{n}, K:{k}), skipping") + + def find_best_sols(self): + df = self.gemm_problems + soldf = pd.DataFrame() + for i in range(len(df)): + ds = df.iloc[i] + gemmobj = Gemm(ds['M'],ds['N'],ds['K'],dtype=self.dtype, rocblas_decode=self.rocblas_decode) + gemmobj.find_fastest_solution() + soldf.loc[i,'libtype'] = gemmobj.best_libtype + soldf.loc[i,'solidx'] = gemmobj.best_solidx + soldf.loc[i,'soltimems'] = gemmobj.best_soltime + soldf['dtype'] = self.dtype + finaldf = pd.concat([self.gemm_problems, soldf],axis=1) + finaldf = pd.concat([finaldf, self.gdf]) + finaldf.to_csv(self.tuned_file, index=False) + print(finaldf) diff --git a/gradlib/mm_test.py b/gradlib/mm_test.py new file mode 100644 index 0000000000000..1b21b9ca105ff --- /dev/null +++ b/gradlib/mm_test.py @@ -0,0 +1,234 @@ +import torch +#import gradlib +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +#gradlib.create_extension() +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +#m = 128; n = 192 ;k = 256 +#m = 7168; k = 4096*2; n = 256 +#m = int(1024*1.25); k = int(1024*8); n = 1 +#m = 1; k = int(1024*8); n = int(1024*7) +#m=22016; k=4096 ; n=1 +#m=int(27648/1);k=5120;n=8 +#m=5120;k=13824;n=1 +m=3*5120;k=5120;n=1 + + +rtol = 1e-5 +atol = 1 +dtype = torch.float16 + +class Gemm: + def __init__(self,m,n,k,dtype=torch.float16): + self.m=m + self.k=k + self.n=n + self.dtype=dtype + self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') + self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') + self.hipb_sols=[] + self.rtol = 1e-5 + self.atol = 1 + self.cold_iters = 2 + self.warm_iters = 10 + def find_hipblas_sols(self): + sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) + print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols)) + #print(sols) + self.hipb_sols = sols + def hipb_check_gemm_ref(self,user_solidxs=None): + ref = F.linear(self.inp,self.weights) + if user_solidxs is not None: + solidxs = user_solidxs + else: + solidxs = self.hipb_sols + if len(solidxs)>0: + for solidx in solidxs: + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): + print('>>> Hipb solidx',solidx,'passed reference test') + else: + print('>>> Hipb solidx',solidx,'FAILED reference test') + print(ref) + print(c) + def hipb_time_sol(self,solidx): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(self.cold_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + start.record() + for i in range(self.warm_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + end.record() + torch.cuda.synchronize() + gtime = start.elapsed_time(end)/self.warm_iters + #print('>>> Solidx GTime',solidx,gtime,'ms') + return gtime + def hipb_time_all_sols(self): + gtimes = {} + for solidx in self.hipb_sols: + gtimes[solidx] = 
self.hipb_time_sol(solidx) + self.gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') + self.gtimedf.to_csv('/tmp/gtimedf.csv') + print(self.gtimedf.head(10)) + + + +gemmobj = Gemm(m=3*5120,n=1,k=5120) +gemmobj.find_hipblas_sols() +#gemmobj.hipb_check_gemm_ref() +#gemmobj.hipb_check_gemm_ref(user_solidxs=[131,8190]) +#gemmobj.hipb_time_sol(gemmobj.hipb_sols[0]) +gemmobj.hipb_time_all_sols() +gemmobj.hipb_check_gemm_ref(user_solidxs=gemmobj.gtimedf.head(5).index.values) + +sys.exit() +def splitk_linear(inp,w,splitk=2): + wsp = torch.chunk(w,splitk,dim=1) + isp = torch.chunk(inp,splitk,dim=1) + print('>>>',isp[0].shape,wsp[1].shape) + cnew = [] + for i in range(splitk): + cnew.append(F.linear(isp[i],wsp[i])) + #cnew1 = F.linear(isp[1],wsp[1]) + c = cnew[0] + for i in range(1,splitk): + c.add_(cnew[i]) + #c = torch.add(cnew0,cnew1) + + return c + +def splitm_linear(inp,w,splitm=2,splits=None,splitk=1): + outputp=[] + #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) + if splits is not None: + wsp = torch.split(w,splits) + else: + wsp = torch.chunk(w,splitm) + #cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) + #csp = torch.chunk(cout,splitm,dim=1) + + for i,_ in enumerate(wsp): + #print('>>>wspi',wsp[i].shape) + if splitk==1: + outputp.append(F.linear(inp, wsp[i])) + #cout[:,i*wsp[i].shape[0]:(i+1)*wsp[i].shape[0]] = F.linear(inp, wsp[i]) + #csp[i].copy_(F.linear(inp, wsp[i])) + else: + outputp.append(splitk_linear(inp,wsp[i],splitk)) + c = torch.cat((outputp),dim=1) + #print('>>>',c.shape,cout.shape) + return c + +def splitn_linear(inp,w,splitn=2,splits=None): + outputp=[] + if splits is not None: + isp = torch.split(inp,splits) + else: + isp = torch.chunk(inp,splitn) + cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) + for i,_ in enumerate(isp): + outputp.append(F.linear(isp[i], w)) + #cout[i*isp[i].shape[0]:(i+1)*isp[i].shape[0],:] = F.linear(isp[i], w) + c = torch.cat((outputp),dim=0) + #print('>>>',c.shape,cout.shape) + return c + +nncount = 0 +for _ in range(10): + #a = torch.randn((m, k), dtype=dtype, device='cuda') + #b = torch.randn((k, n), dtype=dtype, device='cuda') + inp = torch.randn((n, k), dtype=dtype, device='cuda') + weights = torch.randn((m, k), dtype=dtype, device='cuda') + #c = gradlib.mm(inp, weights.t()) + c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) + c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) + c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) + c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) + + splitm=2 + #padm=2 + outsp=[] + #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) + #wsp = torch.chunk(weights,splitm) + #wsp = torch.split(weights,(3*1024,4*1024)) + #c = torch.empty((n,m),dtype=dtype,device='cuda') + #outtup = [] + #for i,_ in enumerate(wsp): + # print('>>>wspi',wsp[i].shape) + # outsp.append(F.linear(inp, wsp[i])) + # #outtup.append(splitk_linear(inp, wsp[i])) + #outsp = [torch.add(a,b) for a,b in outtup] + #c = torch.cat((outsp),dim=1) + #c = c[:,:-padm] + #c = splitm_linear(inp,weights,splitm=4,splits=None,splitk=1) + #c = splitn_linear(inp,weights,splitn=2,splits=None) + + #wsp = torch.chunk(weights,2,dim=1) + #isp = torch.chunk(inp,2,dim=1) + #print('>>>',isp[0].shape,wsp[1].shape) + #cnew0 = F.linear(isp[0],wsp[0]) + #cnew1 = F.linear(isp[1],wsp[1]) + #c = torch.add(cnew0,cnew1) + #c = splitk_linear(inp, weights, splitk=4) + + #torch.cuda.synchronize() + ref = F.linear(inp,weights) + #ref = torch.matmul(a,b) + if 
torch.allclose(c, ref, atol=atol, rtol=rtol): + nncount += 1 + else: + print(ref) + print(c) +''' +tncount = 0 +for _ in range(10): + a = torch.randn((m, k), dtype=dtype, device='cuda') + b = torch.randn((n, k), dtype=dtype, device='cuda') + c = gradlib.mm(a, b.t()) + #torch.cuda.synchronize() + ref = torch.matmul(a, b.t()) + if torch.allclose(c, ref, atol=atol, rtol=rtol): + tncount += 1 + else: + print(ref) + print(c) + #torch.save(c-ref, '/tmp/difference.pt') + #np.savetxt('my_file.txt', (c-ref).cpu().numpy()) + dfs = ref - c + nz = torch.nonzero(dfs,as_tuple=True) + print(nz) + print(dfs[nz]) + print(ref[nz]) + print(c[nz]) +''' +''' +ntcount = 0 +for _ in range(10): + a = torch.randn((k, m), dtype=dtype, device='cuda') + b = torch.randn((k, n), dtype=dtype, device='cuda') + c = gradlib.mm(a.t(), b) + #torch.cuda.synchronize() + if torch.allclose(c, torch.matmul(a.t(), b), atol=atol, rtol=rtol): + ntcount += 1 + +ttcount = 0 +for _ in range(10): + a = torch.randn((k, m), dtype=dtype, device='cuda') + b = torch.randn((n, k), dtype=dtype, device='cuda') + c = gradlib.mm(a.t(), b.t()) + torch.cuda.synchronize() + if torch.allclose(c, torch.matmul(a.t(), b.t()), atol=atol, rtol=rtol): + ttcount += 1 +''' +print(f"GEMM (m, n, k) = {n}, {m}, {k}") +print(f"NN GEMMs: pass {nncount}/10, tol={rtol}") +#print(f"TN GEMMs: pass {tncount}/10, tol={rtol}") +#print(f"NT GEMMs: pass {ntcount}/10, tol={rtol}") +#print(f"TT GEMMs: pass {ttcount}/10, tol={rtol}") diff --git a/gradlib/setup.py b/gradlib/setup.py new file mode 100644 index 0000000000000..1ca83dbe79f6c --- /dev/null +++ b/gradlib/setup.py @@ -0,0 +1,136 @@ +import torch +import setuptools +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.hipify import hipify_python +import os +import subprocess +import re + +this_dir = os.path.dirname(os.path.abspath(__file__)) +#gpus = subprocess.check_output("/opt/rocm/bin/rocminfo").decode('UTF-8').split('\n') +#gpus = list(set([re.search('(gfx94.)', g).group(0) for g in gpus if 'gfx94' in g])) +gpus = ['gfx90a','gfx940','gfx941','gfx942'] +#gpus = ['gfx90a','gfx940'] +extra_args = ["--offload-arch=" + g for g in gpus] + + +#sets_rocm_pytorch = False +maj_ver, min_ver, *_ = torch.__version__.split('.') +if int(maj_ver) > 1 or (int(maj_ver) == 1 and int(min_ver) >= 5): + from torch.utils.cpp_extension import ROCM_HOME + is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False + +ext_modules = [] + +generator_flag = [] +torch_dir = torch.__path__[0] +if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')): + generator_flag = ['-DOLD_GENERATOR'] + +print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) + +version_ge_1_1 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] +version_ge_1_3 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] +version_ge_1_5 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] +version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + +include_dirs=[os.path.join(this_dir, 'csrc')] + +#if is_rocm_pytorch: +# import shutil +# with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: +# 
hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", +# show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) + +if not is_rocm_pytorch: + ext_modules.append( + CUDAExtension( + name='gradlib', + sources=['grad_funcs.cu'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] + } + ) + ) +elif is_rocm_pytorch: + #if torch.__version__ <= '1.8': + hipify_ver = [int(x) for x in torch.utils.hipify.__version__.split(".")] if hasattr(torch.utils.hipify, "__version__") else [0,0,0] + if hipify_ver < [1,0,0]: + import shutil + with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: + hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", + show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) + + ext_modules.append( + CUDAExtension( + name='gradlib', + sources=['./csrc/hip/grad_funcs.hip'], + extra_compile_args={ + 'cxx': ['-O3',] + version_dependent_macros, + 'nvcc':['-O3'] + extra_args + } + ) + ) + else: + #ext_modules.append( + # CUDAExtension( + # name='gradlib', + # sources=['./csrc/grad_funcs.cu'], + # include_dirs=include_dirs, + # # add additional libraries argument for hipblaslt + # libraries=['hipblaslt'], + # extra_compile_args={ + # 'cxx': ['-O3',], + # 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + # "-ftemplate-depth=1024"] + extra_args + # } + # ) + # ) + ext_modules.append( + CUDAExtension( + name='rocsolidxgemm', + sources=['./csrc/rocsolgemm.cu'], + include_dirs=include_dirs, + # add additional libraries argument for hipblaslt + libraries=['rocblas'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + "-ftemplate-depth=1024"] + extra_args + } + ) + ) + ext_modules.append( + CUDAExtension( + name='hipbsolidxgemm', + sources=['./csrc/hipbsolgemm.cu'], + include_dirs=include_dirs, + # add additional libraries argument for hipblaslt + libraries=['hipblaslt'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + "-ftemplate-depth=1024"] + extra_args + } + ) + ) + +setup( + name='gradlib', + packages=['gradlib'], + ext_modules=ext_modules, + cmdclass={ + 'build_ext': BuildExtension +}) + +# python setup.py build && cp build/lib*/gradlib* ../ diff --git a/run_70b.sh b/run_70b.sh old mode 100644 new mode 100755 index 46e342826b2a7..ed004b56c17d3 --- a/run_70b.sh +++ b/run_70b.sh @@ -1,6 +1,6 @@ #!/bin/bash -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm-private +BASE_DIR=/trees +VLLM_DIR=$BASE_DIR/vllm GRAD_DIR=$BASE_DIR/gradlib RPD_DIR=/workspace/rocmProfileData MODEL=/data/llama2-70b-chat @@ -10,7 +10,7 @@ GEMM_TUNER=1 #TP="1 2 4 8" TP=8 #Flag to use Triton Flash Attention vs CK -export VLLM_USE_TRITON=1 +#export VLLM_USE_TRITON=1 #Gemm tuner flags export VLLM_TUNE_GEMM=0 @@ -21,22 +21,21 @@ export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" #export VLLM_USE_TORCH_MULTINOMIAL=1 #Delete tuned gemms before running. 
-DELETE_TUNED_CSV=1 +#DELETE_TUNED_CSV=1 #Flag to disable MSCCL #export RCCL_MSCCL_ENABLE=0 #HIPGraph performance flags export HIP_FORCE_DEV_KERNARG=1 export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 #Enable full decoder graph mode -HIP_GRAPH=--use-cuda-graph #Use top of tree build of RCCL export LD_LIBRARY_PATH=/workspace/rccl/build/ #Enable either flag to create a profile trace (rocprof, or rocpd) #RPD_PROFILE="--profile" #ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1 32" -#INPUT_LEN="512 1024 2048 3072" -INPUT_LEN="512 1024 2048 3072 4096 6144 8192 16384" +GEN_LEN="1,32,128" +INPUT_LEN="512,1024,2048,3072" + ITER=10 # pring usage of the parameters usage() { @@ -57,26 +56,16 @@ for tp in $TP; do if (( $GEMM_TUNER )); then - echo "tuned_gemm_csv: ./tuned_tp$tp.csv" > $VLLM_DIR/tuned_perf_tp$tp.yaml - tuned_file=$VLLM_DIR/tuned_tp$tp.csv - if [[ $DELETE_TUNED_CSV == 1 || ! -f $VLLM_DIR/tuned_tp$tp.csv ]]; - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf.yaml + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml + if [[ $DELETE_TUNED_CSV == 1 ]]; then - rm -rf $tuned_file - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --output $tuned_file --tp $tp + rm -rf $VLLM_TUNE_FILE fi - export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml echo "INFO: Generating Tuned Gemm configs" cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --output $VLLM_TUNE_FILE --tp $tp - - - echo "================================= TUNED GEMMS $tuned_file ===============================================" - cat $tuned_file - + python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp fi cd $VLLM_DIR @@ -91,7 +80,7 @@ do fi echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $HIP_GRAPH $RPD_PROFILE + --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report if [[ -v ROCPROF_PROFILE ]] ; then TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json @@ -106,4 +95,4 @@ do fi done done -done \ No newline at end of file +done diff --git a/run_70b_fast.sh b/run_70b_fast.sh old mode 100644 new mode 100755 index 585e0ebdd000c..0ed20e59ca3ff --- a/run_70b_fast.sh +++ b/run_70b_fast.sh @@ -1,8 +1,8 @@ #!/bin/bash set -e -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm-private -GRAD_DIR=/trees/gradlib +BASE_DIR=/trees +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$BASE_DIR/gradlib RPD_DIR=/workspace/rocmProfileData MODEL=/data/llama2-70b-chat MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` @@ -27,8 +27,6 @@ export VLLM_USE_TRITON=1 export HIP_FORCE_DEV_KERNARG=1 export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -HIP_GRAPH=--use-cuda-graph #Use top of tree build of RCCL export LD_LIBRARY_PATH=/workspace/rccl/build/ @@ -39,8 +37,8 @@ export LD_LIBRARY_PATH=/workspace/rccl/build/ #TP="1 2 4 8" TP=8 -GEN_LEN="1,32" -INPUT_LEN="512 1024 2048 3072" +GEN_LEN="32" +INPUT_LEN="512,1024,2048,3072" #INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" BATCH_SIZE="1" ITER=10 @@ -59,11 +57,9 @@ do python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE echo "File does not exist." 
fi - echo "================================= TUNED GEMMS $tuned_file ===============================================" - cat $VLLM_TUNE_FILE export VLLM_TUNE_GEMM=0 echo "================================= RUNNING $MODEL ===============================================" $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv $HIP_GRAPH + --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv done \ No newline at end of file diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 6d0a3ed3f2a60..1dc0dcc9a4670 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -14,12 +14,13 @@ def __init__(self): #rocb_create_extension() #hipb_create_extension() self.extensions_created = False - self.bestsols = {} - self.load_best_sols() - self.create_ds() self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") + self.bestsols = {} + self.load_best_sols() + self.create_ds() + if (self.save_gemm == 1): self.tuned_df = pd.DataFrame(columns=['M','N','K']) From 694ae1d78e93728a0c492c13e5d3a51ce6729c1d Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:20:44 +0000 Subject: [PATCH 10/34] Add rpd tracer controls to benchmark_latency.py --- Dockerfile.rocm | 129 +++++++++++++------------------- benchmarks/benchmark_latency.py | 26 ++++++- 2 files changed, 75 insertions(+), 80 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3c76305303037..08783e0a1e0bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,85 +1,56 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - - +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 +ENV WORKSPACE_DIR=/workspace +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR +# Limit arch's so composable kernel doesn't take days to finish +ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942 ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - -ARG FA_BRANCH="3d2b6f5" -RUN echo "FA_BRANCH is $FA_BRANCH" - -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y - -# Install some basic utilities -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - nvidia-cuda-toolkit \ - tmux \ - && rm -rf /var/lib/apt/lists/* - -### Mount Point ### -# When launching the container, mount the code directory to /app -ARG APP_MOUNT=/app -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV 
CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -# Install ROCm flash-attention -RUN mkdir libs \ - && cd libs \ - && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ +RUN apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev +RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ - && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd .. - -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi - -RUN cd /app \ - && cd vllm \ - && pip install -U -r requirements-rocm.txt \ + && python setup.py install +RUN git clone -b develop https://github.com/ROCmSoftwarePlatform/hipBLASLt \ + && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ + && cd hipBLASLt \ + && ./install.sh -idc --architecture 'gfx90a;gfx942' \ + && cd ../ && rm -rf hipBLASLt +RUN sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status +RUN sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status + +RUN git clone https://streamhsa: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@github.com/rocm/rocBLAS-internal.git \ + && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ + && cd rocBLAS-internal \ + && git fetch origin 4f353a8035da38c8b8873823c09a499db777b231 \ + && git checkout 4f353a8035da38c8b8873823c09a499db777b231 \ + && ./install.sh -idc -a 'gfx90a;gfx942' \ + && cd ../ && rm -rf rocBLAS-internal + +RUN pip uninstall -y triton +RUN git clone https://github.com/ROCmSoftwarePlatform/triton.git \ + && cd triton/python && pip3 install -e . +ENV MAX_JOBS=32 +RUN cd ${WORKSPACE_DIR} \ + && git clone -b exp_bandaid https://github.com/ROCmSoftwarePlatform/rccl \ + && cd rccl && mkdir build && cd build \ + && CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && make -j + +RUN pip install xformers==0.0.23 --no-deps +ADD ./ $WORKSPACE_DIR/vllm + +RUN cd vllm \ + && pip install -r requirements-rocm.txt \ + && pip install typing-extensions==4.8.0 \ && bash patch_xformers.rocm.sh \ - && python3 setup.py install \ - && cd .. 
+ && cd gradlib && python setup.py develop && cd ../ \ + && python setup.py build && python setup.py develop; exit 0 + +RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all] +RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ + && cd rocmProfileData && make; make install -CMD ["/bin/bash"] +COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 +COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer +COPY docker/run_13b.sh $WORKSPACE_DIR/ +COPY docker/run_70b.sh $WORKSPACE_DIR/ diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f9b49ebfaa132..e4d70851e46ef 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -71,6 +71,11 @@ def run_to_completion(profile_dir: Optional[str] = None): print("Warming up...") run_to_completion(profile_dir=None) + + if (args.warmup_only): + + print(">>> Warmup only specified, exiting") + continue if args.profile: profile_dir = args.profile_result_dir @@ -81,12 +86,31 @@ def run_to_completion(profile_dir: Optional[str] = None): print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=args.profile_result_dir) return + if args.rpd: + from rpdTracerControl import rpdTracerControl + rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True) + profile_rpd = rpdTracerControl() + profile_rpd.start() + print(f"RPD Profiling'...") + run_to_completion(profile_dir=None) + profile_rpd.stop() + return # Benchmark. latencies = [] for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) - print(f'Avg latency: {np.mean(latencies)} seconds') + + if torch.distributed.get_rank() == 0: + #results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) + latency=np.mean(latencies) + print(f'Avg latency: {latency} seconds') + if args.report: + entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]} + results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True) + if torch.distributed.get_rank() == 0 and args.report: + print(results_df) + results_df.to_csv(args.report_file, index=False) From 0e73aed90ef2d2c8bf7de0cd7e4aeb90a98ef3b9 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:23:50 +0000 Subject: [PATCH 11/34] Fix Dockerfile errors --- Dockerfile.rocm | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 08783e0a1e0bd..36fac166ae44b 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -18,14 +18,6 @@ RUN git clone -b develop https://github.com/ROCmSoftwarePlatform/hipBLASLt \ RUN sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status RUN sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status -RUN git clone https://streamhsa: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@github.com/rocm/rocBLAS-internal.git \ - && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ - && cd rocBLAS-internal \ - && git fetch origin 4f353a8035da38c8b8873823c09a499db777b231 \ - && git checkout 4f353a8035da38c8b8873823c09a499db777b231 \ - && ./install.sh -idc -a 'gfx90a;gfx942' \ - && cd ../ && rm -rf rocBLAS-internal - RUN pip uninstall -y triton RUN git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python && pip3 install 
-e . @@ -52,5 +44,4 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer -COPY docker/run_13b.sh $WORKSPACE_DIR/ -COPY docker/run_70b.sh $WORKSPACE_DIR/ +ENV WORKSPACE_DIR=/workspace/vllm From 90df0c93877438ba427721a73bf2af1338e0da68 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Fri, 16 Feb 2024 04:48:03 +0000 Subject: [PATCH 12/34] Add llama2 run script --- run_llama2.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100755 run_llama2.sh diff --git a/run_llama2.sh b/run_llama2.sh new file mode 100755 index 0000000000000..1444ca7d222a1 --- /dev/null +++ b/run_llama2.sh @@ -0,0 +1,98 @@ +#!/bin/bash +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$VLLM_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` +#MODEL=/data/llama-2-13b-chat-hf +GEMM_TUNER=1 +#TP="1 2 4 8" +TP=8 +#Flag to use Triton Flash Attention vs CK +#export VLLM_USE_TRITON=1 + +#Gemm tuner flags +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. +#DELETE_TUNED_CSV=1 +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 +#Enable full decoder graph mode +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--rpd" +#ROCPROF_PROFILE="rocprof --hip-trace" +GEN_LEN="1,32,128" +INPUT_LEN="512,1024,2048,3072" + +ITER=10 +# pring usage of the parameters +usage() { + echo "Usage: $0 [--tp ] [--model ]" + exit 1 +} +# parse parameters +while [[ "$#" -gt 0 ]]; do + case $1 in + --tp) TP="$2"; shift ;; + --model) MODEL="$2"; shift ;; + --notune) GEMM_TUNER=0; shift ;; + *) usage ;; # Any other argument will show usage information. 
+ esac + shift # Move to next argument +done +for tp in $TP; +do + if (( $GEMM_TUNER )); + then + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml + + if [[ $DELETE_TUNED_CSV == 1 ]]; + then + rm -rf $VLLM_TUNE_FILE + fi + #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp + fi + + cd $VLLM_DIR + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + if [[ -v RPD_PROFILE ]] ; + then + rm /workspace/trace.rpd + python -m rocpd.schema --create /workspace/trace.rpd + fi + echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report + if [[ -v ROCPROF_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + mv $VLLM_DIR/results.json $TRACE_FILE + fi + if [[ -v RPD_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE + fi + done + done +done From ab672805783edd129fbafa7fc4854874e2bcef64 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Fri, 16 Feb 2024 21:31:25 +0000 Subject: [PATCH 13/34] Increase Partition and Num threads for attention blocks --- csrc/attention/attention_kernels.cu | 7 ++++--- vllm/model_executor/layers/attention.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 9dcacfbe47d48..728ed64eab3d6 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -37,6 +37,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + namespace vllm { // Utility function for attention softmax. @@ -572,7 +573,7 @@ __global__ void paged_attention_v2_reduce_kernel( template< typename T, int BLOCK_SIZE, - int NUM_THREADS = 128> + int NUM_THREADS = 1024> void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, @@ -731,8 +732,8 @@ void paged_attention_v1( template< typename T, int BLOCK_SIZE, - int NUM_THREADS = 128, - int PARTITION_SIZE = 512> + int NUM_THREADS = 1024, + int PARTITION_SIZE = 1024> void paged_attention_v2_launcher( torch::Tensor& out, torch::Tensor& exp_sums, diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 8b5c6ab30d7b7..f05b49d494958 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -16,8 +16,11 @@ _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. -_PARTITION_SIZE = 512 +if is_hip: + _PARTITION_SIZE = 1024 +else: + _PARTITION_SIZE = 512 class PagedAttention(nn.Module): """MHA/MQA/GQA layer with PagedAttention. 
From 1d53722476833033b4e580260944322fa119d52f Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 22 Feb 2024 02:15:22 +0000 Subject: [PATCH 14/34] Fix WORKDIR --- Dockerfile.rocm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 36fac166ae44b..873574c409a8f 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -42,6 +42,4 @@ RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ && cd rocmProfileData && make; make install -COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 -COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer -ENV WORKSPACE_DIR=/workspace/vllm +WORKDIR /workspace/vllm From 5148aa59c111d60d5d2b8bdb4c59c7676221cb12 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 22 Feb 2024 02:41:09 +0000 Subject: [PATCH 15/34] Add accuracy flag to benchmark_latency.py --- benchmarks/benchmark_latency.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e4d70851e46ef..be1dd223818fc 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -46,6 +46,8 @@ def main(args: argparse.Namespace): ) print(sampling_params) dummy_prompt_token_ids = [[0] * input_len] * batch_size + dummy_prompts = [] + dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose') def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: @@ -60,13 +62,25 @@ def run_to_completion(profile_dir: Optional[str] = None): sampling_params=sampling_params, use_tqdm=False) print(p.key_averages()) + elif args.accuracy: + start_time = time.perf_counter() + rsp = llm.generate( + #prompt_token_ids=dummy_prompt_token_ids, + prompts=dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + print('>>Rsp', rsp[0].outputs) + return latency else: start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, + rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() latency = end_time - start_time + print('>>Rsp', rsp[0].outputs) return latency print("Warming up...") @@ -92,7 +106,8 @@ def run_to_completion(profile_dir: Optional[str] = None): profile_rpd = rpdTracerControl() profile_rpd.start() print(f"RPD Profiling'...") - run_to_completion(profile_dir=None) + with torch.autograd.profiler.emit_nvtx(): + run_to_completion(profile_dir=None) profile_rpd.stop() return @@ -140,6 +155,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( '--dtype', type=str, @@ -152,6 +168,9 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--enforce-eager', action='store_true', help='enforce eager mode and disable CUDA graph') + parser.add_argument('--accuracy', + action='store_true', + help='Run an Actual query through vllm') parser.add_argument( '--profile', action='store_true', @@ -162,6 +181,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. 
Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( + '--rpd', + action='store_true', + help='profile the generation process of a single batch using the rpd tracer') parser.add_argument('--warmup-only', action='store_true', help='only run warmup, useful for tuning') parser.add_argument('--report', action='store_true', From 534dcff5ab7bbb1f7e9971438bbd4e9cd88a04fc Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Mon, 26 Feb 2024 18:07:20 +0000 Subject: [PATCH 16/34] Don't broadcast when using torchrun --- vllm/model_executor/layers/tuned_gemm.py | 6 +++++- vllm/worker/model_runner.py | 3 ++- vllm/worker/worker.py | 15 ++++++++++----- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 1dc0dcc9a4670..bebab27ebfd86 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -97,7 +97,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + + if (self.save_gemm == 1): + print('>>>Tgemm Default',inp_view.shape, inp.shape,weights.shape,soltype,solidx) + self.tuned_df = pd.concat([self.tuned_df, pd.DataFrame({'M':[weights.shape[0]], 'N':[inp.shape[0]*inp.shape[1]], 'K':[weights.shape[1]]})]).drop_duplicates() + self.tuned_df.to_csv(self.untune_path, index=False) out = F.linear(inp,weights) if batched: return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 985115613e044..f7e40bae30990 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -25,6 +25,7 @@ # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. # NOTE: _get_graph_batch_size needs to be updated if this list is changed. _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] +#_BATCH_SIZES_TO_CAPTURE = [1] class ModelRunner: @@ -477,7 +478,7 @@ def prepare_input_tensors( "lora_requests": lora_requests, "lora_mapping": lora_mapping, } - broadcast_tensor_dict(metadata_dict, src=0) + #broadcast_tensor_dict(metadata_dict, src=0) else: metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict["input_tokens"] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index aafd7306acf5d..0121486dce246 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -46,6 +46,9 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.is_driver_worker = is_driver_worker + local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.local_rank = local_rank + if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -53,7 +56,7 @@ def __init__( parallel_config, scheduler_config, lora_config=self.lora_config, - is_driver_worker=is_driver_worker) + is_driver_worker=self.is_driver_worker) # Uninitialized cache engine. Will be initialized by # self.init_cache_engine(). 
self.cache_config = None @@ -74,8 +77,10 @@ def init_model(self) -> None: os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) self.rank = self.rank if self.rank is not None else int( os.getenv("RANK", "-1")) - local_rank = int(os.getenv("LOCAL_RANK", "0")) - self.device = torch.device(f"cuda:{local_rank}") + + self.device = torch.device(f"cuda:{self.local_rank}") + + torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -182,7 +187,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, ) -> Optional[SamplerOutput]: - if self.is_driver_worker: + if self.is_driver_worker and self.rank == 0: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) assert blocks_to_swap_in is not None @@ -194,7 +199,7 @@ def execute_model( "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, } - broadcast_tensor_dict(data, src=0) + #broadcast_tensor_dict(data, src=0) else: data = broadcast_tensor_dict(src=0) num_seq_groups = data["num_seq_groups"] From e56913312f3e43ba0110c5caa391d4fb35fa2069 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Fri, 15 Mar 2024 13:15:54 -0400 Subject: [PATCH 17/34] Adding new rocm triton flash attention kernel Co-authored-by: Vinayak Gokhale --- Dockerfile.rocm | 14 + .../layers/attention/attention.py | 2 +- .../layers/attention/backends/flash_attn.py | 40 +- .../attention/ops/flash_attention_triton.py | 541 ++++++++++++++++++ 4 files changed, 586 insertions(+), 11 deletions(-) create mode 100644 vllm/model_executor/layers/attention/ops/flash_attention_triton.py diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a45265d79a6ac..a7640f6841ad9 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -26,6 +26,9 @@ ARG BUILD_FA="1" # whether to build cupy on rocm ARG BUILD_CUPY="1" +# whether to build triton on rocm +ARG BUILD_TRITON="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -95,6 +98,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \ && cd ..; \ fi +# build triton +RUN if [ "$BUILD_TRITON" = "1"]; then \ + mkdir -p libs \ + && cd libs \ + && pip uninstall -y triton \ + && git clone https://github.com/ROCmSoftwarePlatform/triton.git + && cd triton/python \ + && pip3 install -e . \ + && cd ../..; \ + fi + COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 830e82e10f7ad..39b66fb1fb2db 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -30,7 +30,7 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and + if (torch.cuda.get_device_capability()[0] >= 8 and torch.get_default_dtype() in (torch.float16, torch.bfloat16)): # Ampere or later NVIDIA GPUs. # NOTE(woosuk): FlashAttention does not support FP32. diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 512f4e49c7eb2..c7543e2d54d12 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -2,12 +2,21 @@ from typing import List, Optional # NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. 
-from flash_attn import flash_attn_func +from vllm.utils import is_hip +try: + from flash_attn import flash_attn_func +except ImportError: + if is_hip(): + pass + else: + raise + import torch from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( PagedAttentionImpl) +from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention class FlashAttentionBackend: @@ -86,15 +95,26 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - output = flash_attn_func( - query, - key, - value, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - ) + if is_hip(): + output, _ = attention( + query, + key, + value, + None, + input_metadata, + True, + self.scale, + ) + else: + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) else: # prefix-enabled attention output = PagedAttentionImpl.forward_prefix( diff --git a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py new file mode 100644 index 0000000000000..37c15e0e6fa36 --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py @@ -0,0 +1,541 @@ +#!/usr/bin/env python +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao (https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
+ +Not currently supported: + +1) Non power of two head dims + +""" + +import torch +import triton +import triton.language as tl + +torch_dtype:tl.constexpr = torch.float16 + +TORCH_HAS_FP8E5 = hasattr(torch, 'float8_e5m2fnuz') +if TORCH_HAS_FP8E5: + torch_dtype:tl.constexpr = torch.float8_e5m2fnuz + +@triton.jit +def cdiv_fn(x,y): + return (x + y - 1) // y + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + +@triton.jit +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) + rng_keep = rng_output > dropout_p + return rng_keep + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0,1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor + +@triton.jit +def _attn_fwd_inner( + acc, l_i, m_i, q, + K_block_ptr, V_block_ptr, + start_m, + actual_seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + OFFS_M: tl.constexpr, + OFFS_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr +): + # loop over k, v, and update accumulator + for start_n in range (block_min, block_max, BLOCK_N): + # For padded blocks, we will overrun the tensor size if + # we load all BLOCK_N. For others, the blocks are all within range. + k = load_fn(K_block_ptr, PADDED_HEAD, MASK_STEPS and (n_extra_tokens != 0), "zero") + if PRE_LOAD_V: + v = load_fn(V_block_ptr, MASK_STEPS and (n_extra_tokens != 0), PADDED_HEAD, "zero") + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + # We start from end of seqlen_k so only the first iteration would need + # to be checked for padding if it is not a multiple of block_n + # TODO: This can be optimized to only be true for the padded block. + if MASK_STEPS: + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not is_modulo_mn. + # last step might get wasted but that is okay. check if this masking works For + # that case. 
+ if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None,:] + mask = size_n < boundary_m[:,None] + qk = tl.where(mask, qk, float("-inf")) + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + qk = tl.where(causal_mask, qk, float("-inf")) + # -- compute qk ---- + qk += tl.dot(q, k) + if bias_ptr is not None: + bias = load_fn(bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero") + # While bias is added after multiplying qk with sm_scale, + # our optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += (bias * 1.44269504089) + m_ij = tl.maximum(m_i, tl.max(qk,1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + philox_offset = batch_philox_offset + start_m * BLOCK_M * actual_seqlen_k + start_n - BLOCK_N + keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, actual_seqlen_k) + if RETURN_ENCODED_SOFTMAX: + tl.store(encoded_softmax_block_ptr, tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty)) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store(encoded_softmax_block_ptr, p.to(encoded_softmax_block_ptr.type.element_ty)) + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + if not PRE_LOAD_V: + v = load_fn(V_block_ptr, MASK_STEPS and (n_extra_tokens != 0), PADDED_HEAD, "zero") + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, (0, BLOCK_N)) + return acc, l_i, m_i + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': True}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + # TODO: This config fails with head_size not pow2 with data mismatches. Check why. 
+ # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 16, 'BLOCK_N': 16, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + ], + key=['hq', 'hk', 'IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL'], +) +@triton.jit +def attn_fwd( + Q, K, V, bias, sm_scale, L, Out, + stride_qz, stride_qh, stride_qm, stride_qk, + stride_kz, stride_kh, stride_kn, stride_kk, + stride_vz, stride_vh, stride_vk, stride_vn, + stride_oz, stride_oh, stride_om, stride_on, + stride_bz, stride_bh, stride_bm, stride_bn, + cu_seqlens_q, cu_seqlens_k, + dropout_p, philox_seed, philox_offset_base, encoded_softmax, + hq, hk, + ACTUAL_BLOCK_DMODEL:tl.constexpr, + MAX_SEQLENS_Q:tl.constexpr, MAX_SEQLENS_K:tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, RETURN_ENCODED_SOFTMAX: tl.constexpr +): + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. + if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if (IS_CAUSAL): + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, + BLOCK_N + ) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is part of + # the blocks that are all 0. We exit early. 
+ if n_blocks <= 0: + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) + l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # We store inf to LSE, not -inf because in the bwd pass, we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 for these masked blocks. + l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here too? + return + + is_mqa = hq != hk + off_h_k = off_h_q % hk if is_mqa else off_h_q + need_padding = False + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + need_padding = True + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + need_padding = True + n_extra_tokens = seqlen_k % BLOCK_N + padded_head = (ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL) + + # Compute pointers for all the tensors used in this kernel. + q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1) + ) + v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0) + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = philox_offset_base + off_hz * seqlen_q * seqlen_k + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. In + # this case, we return an invalid pointer so indicate the mask is not valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0) + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. 
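+    # Documentation of the identity relied on below (a sketch, not kernel
+    # logic): exp(x) == 2**(x * log2(e)) and log2(e) ~= 1.44269504089, so
+    # folding sm_scale * log2(e) into q once lets the inner loop use
+    # tl.math.exp2() while still computing softmax(sm_scale * (q @ k)).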
+ qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, padded_head, "zero") + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional block. + # In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its actual + # value because there is no masking. Similarly we do not need padding. + if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, l_i, m_i, q, K_block_ptr, V_block_ptr, + start_m, seqlen_k, + dropout_p, philox_seed, batch_philox_offset, encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, block_max, 0, 0, 0, bias_ptr, + # IS_CAUSAL, .... + False, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, False, ENABLE_DROPOUT, RETURN_ENCODED_SOFTMAX, padded_head + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if (masked_blocks > 0): + if IS_CAUSAL: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) + else: + offs_n_causal = 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks*BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks*BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks*BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, n_full_blocks)) + acc, l_i, m_i = _attn_fwd_inner( + acc, l_i, m_i, q, K_block_ptr, V_block_ptr, + start_m, seqlen_k, + dropout_p, philox_seed, batch_philox_offset, encoded_softmax_block_ptr, + block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, bias_ptr, + IS_CAUSAL, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, True, ENABLE_DROPOUT, RETURN_ENCODED_SOFTMAX, padded_head + ) + # epilogue + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. 
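+    # Illustrative sketch (assumed example values): seqlen_q=160, seqlen_k=128,
+    # BLOCK_M=64 gives causal_start_idx = 32, so in the first M block
+    # (start_m=0) query rows 0..31 see no keys at all; for those rows m_ij
+    # stays -inf, qk - m_ij becomes NaN, and the tl.where() below overwrites
+    # the resulting NaN rows of acc with zeros.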
+ end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full((BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last few rows. + # This is only true for the last M block. For others, overflow_size will be -ve + overflow_size = end_m_idx - seqlen_q + if overflow_size > 0: + boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # This is a > check because mask being 0 blocks the store. + l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + else: + tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. + tl.store(O_block_ptr, acc, boundary_check=(0,1)) + +def check_args(q, k, v, o, max_seqlens): + assert q.dim() == k.dim() and q.dim() == v.dim() + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + # TODO: Fix assert to check head size <=256 once supported + assert head_size <= 128 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 + +class _attention(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): + if o is None: + o = torch.empty_like(q, dtype=v.dtype) + check_args(q, k, v, o, metadata.max_seq_len) + + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) + + # Get closest power of 2 over or equal to 32. + unpadded_head_dims = {32, 64, 128} + if head_size not in unpadded_head_dims: + padded_d_model = None + for i in unpadded_head_dims: + if i > head_size: + padded_d_model = i + break + assert padded_d_model is not None + else: + padded_d_model = head_size + + + grid = lambda META: ( + triton.cdiv(metadata.max_seq_len, META['BLOCK_M']), + nheads_q, + batch + ) + + encoded_softmax = None + + M = torch.empty((batch, nheads_q, metadata.max_seq_len), device=q.device, dtype=torch.float32) + + # Seed the RNG so we get reproducible results for testing. 
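+        # Note (descriptive only): the launch below passes dropout_p=0.0 and
+        # ENABLE_DROPOUT=False, so these philox values are effectively fixed
+        # placeholders; pinning them matches the "reproducible results for
+        # testing" intent stated above.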
+ philox_seed = 0x1BF52 + philox_offset = 0x1D4B42 + + if bias is not None: + bias_strides = (bias.stride(0), bias.stride(1), + bias.stride(2), bias.stride(3)) + else: + bias_strides = (0,0,0,0) + + attn_fwd[grid]( + q, k, v, bias, sm_scale, M, o, + *q_strides, *k_strides, *v_strides, *o_strides, *bias_strides, + None, None, + dropout_p=0.0, + philox_seed=philox_seed, + philox_offset_base=philox_offset, + encoded_softmax=encoded_softmax, + hq=nheads_q, hk=nheads_k, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=metadata.max_seq_len, + MAX_SEQLENS_K=metadata.max_seq_len, + IS_CAUSAL=causal, + VARLEN=False, + BLOCK_DMODEL=padded_d_model, + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False + ) + + ctx.save_for_backward(q, k, v, o, M) + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + +attention = _attention.apply From be708d0d10284571f52074d338e95b859e275dfe Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 18 Mar 2024 20:23:32 +0000 Subject: [PATCH 18/34] Removed gradlib and its tuned gemm in favor of tunable ops --- .gitignore | 1 + gradlib/csrc/grad_funcs.cu | 413 ------------ gradlib/csrc/hipbsolgemm.cu | 610 ------------------ gradlib/csrc/rocsolgemm.cu | 563 ---------------- gradlib/gemm_runner.py | 62 -- gradlib/gemm_tuner.py | 92 --- gradlib/gradlib/GemmTuner.py | 208 ------ gradlib/mm_test.py | 234 ------- gradlib/setup.py | 136 ---- run.sh | 32 - run_70b.sh | 98 --- run_70b_fast.sh | 65 -- run_llama2.sh | 98 --- vllm/config.py | 2 +- vllm/engine/ray_utils.py | 24 +- vllm/model_executor/layers/linear.py | 9 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/layers/tuned_gemm.py | 111 ---- .../parallel_utils/communication_op.py | 26 +- vllm/worker/model_runner.py | 3 +- vllm/worker/worker.py | 31 +- 21 files changed, 24 insertions(+), 2797 deletions(-) delete mode 100644 gradlib/csrc/grad_funcs.cu delete mode 100644 gradlib/csrc/hipbsolgemm.cu delete mode 100644 gradlib/csrc/rocsolgemm.cu delete mode 100644 gradlib/gemm_runner.py delete mode 100644 gradlib/gemm_tuner.py delete mode 100644 gradlib/gradlib/GemmTuner.py delete mode 100644 gradlib/mm_test.py delete mode 100644 gradlib/setup.py delete mode 100755 run.sh delete mode 100755 run_70b.sh delete mode 100755 run_70b_fast.sh delete mode 100755 run_llama2.sh delete mode 100644 vllm/model_executor/layers/tuned_gemm.py diff --git a/.gitignore b/.gitignore index b5195629e5cf3..b1513ef0ddb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,7 @@ _build/ # hip files generated by PyTorch *.hip *_hip* +hip_compat.h # Benchmark dataset *.json diff --git a/gradlib/csrc/grad_funcs.cu b/gradlib/csrc/grad_funcs.cu deleted file mode 100644 index f6498fb2a3ba7..0000000000000 --- a/gradlib/csrc/grad_funcs.cu +++ /dev/null @@ -1,413 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#include -#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -//#include -#include - 
-#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - uint64_t workspace_size = 32*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipblasDatatype_t dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = true; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * hipBLASLt GEMM call -*/ -hipblasStatus_t hipblasLtMatmul_wrapper( - hipblasLtHandle_t handle, - hipblasOperation_t op_A, - hipblasOperation_t op_B, - int m, int n, int k, - const void *alpha, - const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipblasDatatype_t dtype, - hipStream_t &stream) -{ - // TODO: flag is not supported for hipblasLt yet - int flag { 0 }; - if (dtype == HIPBLAS_R_16F) { - // use fp16 alt impl for MI200 - // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices - flag = rocblas_gemm_flags_fp16_alt_impl; - } - - nvtxRangePushA("hipBLASLt variables creation"); - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - 
CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - nvtxRangePop(); - - // if heuristic does not exist in the map, do search and push into the map - auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; - if (heuristic_map.count(gemm_key) <= 0) { - nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = heuristicResult[bestIndex]; - } - nvtxRangePop(); - } - - hipblasStatus_t status = hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristic_map[gemm_key].algo, - d_workspace, workspace_size, - stream); - - nvtxRangePushA("hipBLASLt variables 
deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - nvtxRangePop(); - - return status; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor hipBLASLtMm_( - const torch::Tensor& mat1, - const torch::Tensor& mat2) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 
0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - int flag { 0 }; - hipblasDatatype_t hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIPBLAS_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIPBLAS_R_16B; - } else if (abcType == at::kFloat) { - hipblasType = HIPBLAS_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream)); - - return result; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void create_extension() -{ - CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void destroy_extension() -{ - CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("create_extension", &create_extension, "create_extension"); - m.def("destroy_extension", &destroy_extension, "destroy_extension"); - m.def("mm", &hipBLASLtMm_, "mm"); -} diff --git a/gradlib/csrc/hipbsolgemm.cu b/gradlib/csrc/hipbsolgemm.cu deleted file mode 100644 index bf15fb1297667..0000000000000 --- a/gradlib/csrc/hipbsolgemm.cu +++ /dev/null @@ -1,610 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#include -#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -//#include - - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// 
#define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - size_t workspace_size = 2*128*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipDataType dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = false; - - //std::vector heuristicResult; -} - -//find all hipblaslt solutions for given gemm problem -std::vector hipblasLtMatmul_findallsols_wrapper( - hipblasLtHandle_t handle, - hipblasOperation_t op_A, - hipblasOperation_t op_B, - int m, int n, int k, - const void *alpha, - const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipDataType dtype, - hipStream_t &stream) -{ - int flag { 0 }; - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, HIP_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - - //std::vector heuristicResult(10); - 
//CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - // handle, matmul, matA, matB, matC, matC, - // preference, 10, heuristicResult.data(), &returnedAlgoCount)); - std::vector heuristicResult; - CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAllAlgos(handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, - op_A, - op_B, - dtype, - dtype, - dtype, - dtype, - HIPBLAS_COMPUTE_32F, - heuristicResult)); - - std::vector algoIndex; - int returned_algo_count = heuristicResult.size(); - //for (int i = 0; i < returnedAlgoCount; i++) { - for (int i = 0; i < returned_algo_count; i++) { - auto algo = heuristicResult[i].algo; - size_t ret_workspace_size = 0; - auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, - alpha, - matA, - matB, - beta, - matC, - matC, - algo, - ret_workspace_size - ); - if (status == HIPBLAS_STATUS_SUCCESS) { - if (ret_workspace_size heuristicResult(1); - if (solution_index<0) { - //nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - std::cout << "Warning! HipbSolId Gemm Fallback Path used for solution index <0" << std::endl; - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - //std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - //heuristic_map[gemm_key] = heuristicResult[0]; -/* - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = 
heuristicResult[bestIndex]; - } -*/ - //nvtxRangePop(); - } else { - std::vector algoIndex(1); - algoIndex[0]=solution_index; - //std::vector tmpAlgo; - CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAlgosFromIndex(handle, algoIndex, heuristicResult)); - } - - //size_t ret_workspace_size = 0; - - //auto status1 = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, - // alpha, - // matA, - // matB, - // beta, - // matC, - // matC, - // heuristicResult[0].algo, - // ret_workspace_size - //); - //if (status1 == HIPBLAS_STATUS_SUCCESS) { - // std::cout << "Workspace size" << ret_workspace_size << std::endl; - - //} else { - // std::cout << "Algo not supported!!!" << std::endl; - - //} - hipblasStatus_t status = hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristicResult[0].algo, - d_workspace, workspace_size, - stream); - - //nvtxRangePushA("hipBLASLt variables deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - //nvtxRangePop(); - - return status; -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor HipbSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2, - const int solution_index - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? 
"true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - hipDataType hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIP_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIP_R_16BF; - } else if (abcType == at::kFloat) { - hipblasType = HIP_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_sol_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream,solution_index)); - - return result; -} - -//find all hipblas solutions and return them to python land -std::vector HipbFindAllSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - 
float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - hipDataType hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIP_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIP_R_16BF; - } else if (abcType == at::kFloat) { - hipblasType = HIP_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - return hipblasLtMatmul_findallsols_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream); - -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void hipb_create_extension() -{ - //CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - //CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - //CHECK_HIP_ERROR(hipEventCreate(&start)); - //CHECK_HIP_ERROR(hipEventCreate(&stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void hipb_destroy_extension() -{ - //CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - //CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - //CHECK_HIP_ERROR(hipEventDestroy(start)); - //CHECK_HIP_ERROR(hipEventDestroy(stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("hipb_create_extension", &hipb_create_extension, "create_extension"); - m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension"); - m.def("hipb_mm", &HipbSolIdxBlas, "mm"); - m.def("hipb_findallsols", &HipbFindAllSolIdxBlas, "hipblas_find_all_sols"); -} diff --git a/gradlib/csrc/rocsolgemm.cu b/gradlib/csrc/rocsolgemm.cu deleted file mode 100644 index d691fcac416a6..0000000000000 --- a/gradlib/csrc/rocsolgemm.cu +++ /dev/null @@ -1,563 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#define ROCBLAS_NO_DEPRECATED_WARNINGS -#define ROCBLAS_BETA_FEATURES_API - -#include 
-#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -//#include -#include - -#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -#include - - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - rocblas_handle r_handle; - - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - uint64_t workspace_size = 32*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipblasDatatype_t dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = true; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * hipBLASLt GEMM call -*/ -/* -hipblasStatus_t hipblasLtMatmul_wrapper( - hipblasLtHandle_t handle, - hipblasOperation_t op_A, - hipblasOperation_t op_B, - int m, int n, int k, - const void *alpha, - const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipblasDatatype_t dtype, - hipStream_t &stream) -{ - // TODO: flag is not supported for hipblasLt yet - int flag { 0 }; - if (dtype == HIPBLAS_R_16F) { - // use fp16 alt impl for MI200 - // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices - flag = rocblas_gemm_flags_fp16_alt_impl; - } - - nvtxRangePushA("hipBLASLt variables creation"); - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - 
CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - nvtxRangePop(); - - // if heuristic does not exist in the map, do search and push into the map - auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; - if (heuristic_map.count(gemm_key) <= 0) { - nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = heuristicResult[bestIndex]; - } - nvtxRangePop(); - } - - hipblasStatus_t status = 
hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristic_map[gemm_key].algo, - d_workspace, workspace_size, - stream); - - nvtxRangePushA("hipBLASLt variables deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - nvtxRangePop(); - - return status; -} -*/ -///////////////////////////////////////////////////////////////////////////////////////////////////////// -std::vector RocFindAllSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - rocblas_set_stream(r_handle, current_stream); - uint32_t flags { 0 }; - rocblas_datatype abcRtype; - if (abcType == at::kHalf) { - abcRtype = rocblas_datatype_f16_r; - } else if (abcType == at::kBFloat16) { - abcRtype = rocblas_datatype_bf16_r; - } else if (abcType == at::kFloat) { - abcRtype = rocblas_datatype_f32_r; - } else { - assert(false && "Wrong datatype!"); - } - - #define GEMM_EX_ARGS \ - r_handle, transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, \ - m, n, k, &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, &zero, ptrC, \ - abcRtype, result_ld, ptrC, abcRtype, result_ld, rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index - - rocblas_int sizeSolve; - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, NULL, &sizeSolve); - - // Fill array with list of solutions that match type - // Note: some of these may be invalid - std::vector solutionsSolve(sizeSolve); - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, solutionsSolve.data(), &sizeSolve); - - std::vector validSolutions; - for(auto sol : solutionsSolve) { - auto status = rocblas_gemm_ex(r_handle, - transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, - transpose_mat2 ? rocblas_operation_transpose : rocblas_operation_none, - m, n, k, - &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, - &zero, ptrC, abcRtype, result_ld, - ptrC, abcRtype, result_ld, - rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, sol, rocblas_gemm_flags_none); - if (status == rocblas_status_success) { - validSolutions.push_back(sol); - } - } - - return validSolutions; -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor RocSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2, - const int32_t solution_index=0 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? 
"true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - /* - int flag { 0 }; - hipblasDatatype_t hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIPBLAS_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIPBLAS_R_16B; - } else if (abcType == at::kFloat) { - hipblasType = HIPBLAS_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - */ - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - /* - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream)); - */ - rocblas_set_stream(r_handle, current_stream); - uint32_t flags { 0 }; - //int32_t solution_index {0}; - rocblas_datatype abcRtype; - if (abcType == at::kHalf) { - abcRtype = rocblas_datatype_f16_r; - } else if (abcType == at::kBFloat16) { - abcRtype = rocblas_datatype_bf16_r; - } else if (abcType == at::kFloat) { - abcRtype = rocblas_datatype_f32_r; - } else { - assert(false && "Wrong datatype!"); - } - - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex(r_handle, - transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, - transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, - m, n, k, - &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, - &zero, ptrC, abcRtype, result_ld, - ptrC, abcRtype, result_ld, - rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, solution_index, flags); - //); - - - return result; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void rocb_create_extension() -{ - /* - CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); */ - rocblas_create_handle(&r_handle); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void rocb_destroy_extension() -{ - /* - CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); */ - rocblas_destroy_handle(r_handle); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("rocb_create_extension", &rocb_create_extension, "create_extension"); - m.def("rocb_destroy_extension", &rocb_destroy_extension, "destroy_extension"); - m.def("rocb_mm", &RocSolIdxBlas, "mm"); - m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); -} diff --git a/gradlib/gemm_runner.py b/gradlib/gemm_runner.py deleted file mode 100644 index 34a246771a820..0000000000000 --- a/gradlib/gemm_runner.py +++ /dev/null @@ -1,62 +0,0 @@ -import torch -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import timeit - -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -class TunedGemm: - def __init__(self,tuned_csv_file): - self.bestsols = pd.read_csv(tuned_csv_file,index_col=[0]) - self.create_ds() - def create_ds(self): - df = self.bestsols - solds = {} - for i in range(len(df)): - ds = df.iloc[i] - key = (ds['M'],ds['N'],ds['K']) - if ds['libtype']=='hipblaslt': soltype = 1 - elif ds['libtype']=='rocblas': soltype = 2 - solds[key] = (soltype,int(ds['solidx'])) - #print(solds) - self.solids = solds - def query_sol(self,m,n,k): - return self.solids.get((m,n,k),(0,0)) - def mm(self,inp,weights): - soltype,solidx = self.query_sol(m=weights.shape[0],n=inp.shape[0],k=inp.shape[1]) - if soltype==1: - out = hipbsolidxgemm.hipb_mm(inp,weights.t(),solidx) - elif soltype==2: - out = rocsolidxgemm.rocb_mm(inp,weights.t(),solidx) - else: - out = F.linear(inp,weights) - return out - def run_all_tuned_sols(self): - for i in range(len(self.bestsols)): - ds = self.bestsols.iloc[i] - print('>>> Running tuned solution') - print(ds) - inp = torch.randn((ds['N'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') - 
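# Shape convention used throughout the tuner: activations are (N, K) and weights
# are (M, K), so F.linear(inp, weights) yields an (N, M) output; the (M, N, K)
# keys in the tuned CSV follow the same convention.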
weights = torch.randn((ds['M'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') - self.mm(inp,weights) - -def get_dtype(dtype_csv): - if dtype_csv=='torch.float16': - dtype = torch.float16 - elif dtype_csv=='torch.bfloat16': - dtype = torch.bfloat16 - elif dtype_csv=='torch.float32': - dtype = torch.float32 - return dtype - -if __name__ == '__main__': - tgemm = TunedGemm(sys.argv[1]) #csv file with tuned sols goes in argv[1] - print(tgemm.bestsols) - tgemm.run_all_tuned_sols() - - diff --git a/gradlib/gemm_tuner.py b/gradlib/gemm_tuner.py deleted file mode 100644 index b6c69379cf6c6..0000000000000 --- a/gradlib/gemm_tuner.py +++ /dev/null @@ -1,92 +0,0 @@ -import torch -import os -import argparse -from gradlib.GemmTuner import GemmTuner -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import json -import random -from pathlib import Path -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -''' -{'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 5120, 'initializer_range': 0.02, -'intermediate_size': 13824, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 40, 'num_hidden_layers': 40, 'num_key_value_heads': 40, -'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float16', 'transformers_version': '4.33.0.dev0', 'use_cache': True, 'vocab_size': 32000} -''' -def generate_mk_sets(model_dir, tp=1): - f = open(f'{model_dir}/config.json') - data = json.load(f) - hidden_size = data['hidden_size'] - intermediate_size = data['intermediate_size'] - total_num_heads = data['num_attention_heads'] - total_num_kv_heads = data['num_key_value_heads'] - head_dim = hidden_size // total_num_heads - return [((total_num_heads + (2*total_num_kv_heads)) * head_dim // tp, hidden_size), (hidden_size, hidden_size // tp), (intermediate_size *2 // tp, hidden_size), (hidden_size, intermediate_size // tp) ], hidden_size - -def get_dtype(dtype_str): - dtype = torch.float16 - if dtype_str == 'f32': - dtype = torch.float32 - elif dtype_str == 'bf16': - dtype = torch.bfloat16 - elif dtype_str == 'f16': - dtype = torch.float16 - else: - print('>>> Warning! 
Invalid dtype', dtype_str, 'using default dtype f16') - return dtype - - -def list_of_ints(arg): - return list(map(int, arg.split(','))) - -def load_input_gemms(input_file): - if Path(input_file).is_file(): - return - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--model_dir", type=str, default=os.getenv('GTUNE_MODEL', ""), help="Enter the location of your model directory") - parser.add_argument("--tuned_file", type=str, default=os.getenv('GTUNE_TUNED', "tuned.csv"), help="output file for tuned gemm solutions") - parser.add_argument("--input_file", type=str, default=os.getenv('GTUNE_INPUT', None), help="list of gemms to tune for, mutually exclusive with model_dir") - parser.add_argument("--tp", type=int, default=os.getenv('GTUNE_TP', 1), help="Tensor parallelism to be used.") - parser.add_argument("--dtype", type=str, default='f16', help="dtype f32 f16 bf16") - parser.add_argument("--rocblas-decode", action="store_true", default=False, help="forces rocblas solution on decode N=1") - parser.add_argument("--batch_size", type=int, default=os.getenv('GTUNE_BATCH_SIZE', 1), help="Batch size to tune for") - parser.add_argument("--nsets", type=list_of_ints, default=[1, 512, 1024, 2048, 3072, 4096, 8192, 16384], help="N sizes to tune for: 1,128,2048") - args = parser.parse_args() - - dtype = get_dtype(args.dtype) - - gtuner = GemmTuner(dtype, args.tuned_file, args.rocblas_decode) - nsets = [i * args.batch_size for i in args.nsets] - if args.input_file: - print(f">>> Loading {args.input_file}") - if not Path(args.input_file).is_file(): - print(f">>> ERROR: {args.input_file} does not exist. Exiting") - exit(1) - shapes = pd.read_csv(args.input_file) - for i in range(len(shapes)): - ds = shapes.iloc[i] - gtuner.add_gemm(ds['M'],ds['N'],ds['K']) - else: - if not args.model_dir: - print(">>> Warning! NO MODEL SPECIFIED. 
Tuning for LL2 13B TP1") - #LL2 13B sizes - mksets = [(15360, 5120), (5120, 5120), (27648, 5120), (5120, 13824)] - gtuner.add_gemm(m=32000, n=1, k=5120) # logits gemm - else: - mksets, hidden_size = generate_mk_sets(args.model_dir, args.tp) - gtuner.add_gemm(m=32000//args.tp, n=1 * args.batch_size, k=hidden_size) #TODO: Handle cases where vocab_size is not divisible by tp - - for n in sorted(nsets): - for m, k in mksets: - gtuner.add_gemm(m, n, k) - - gtuner.find_best_sols() diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py deleted file mode 100644 index 273042cb12a05..0000000000000 --- a/gradlib/gradlib/GemmTuner.py +++ /dev/null @@ -1,208 +0,0 @@ -import torch -import os -import argparse -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import json -import random -from pathlib import Path -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -rtol = 1e-5 -atol = 1 -dtype = torch.float16 - -class Gemm: - def __init__(self,m,n,k,dtype,rocblas_decode=False): - self.m=m - self.k=k - self.n=n - self.dtype=dtype - self.nb = 37 - self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') - self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') - #weights2 is used in measurement/warm iters to ensure HBM fetch for weight tensors - self.weights2 = torch.randn((self.nb, self.m, self.k), dtype=self.dtype, device='cuda') - self.blob = torch.ones(128*1024*1024, dtype=torch.float32, device='cuda') - self.topn = 20 #number of top solutions from each source - self.hipb_sols=[] - self.rtol = 1e-5 - self.atol = 1 - self.start = torch.cuda.Event(enable_timing=True) - self.end = torch.cuda.Event(enable_timing=True) - self.hipb_prefer_ratio = 0.995 #prefer hipblaslt unless rocblas time is less than this ratio of hipblaslt time - self.rocblas_decode=rocblas_decode - - - def find_hipblas_sols(self): - sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols), flush=True) - #print(sols) - self.hipb_sols = sols - - - def check_gemm_ref(self,libtype,solidx): - ref = F.linear(self.inp,self.weights) - if libtype == 'hipblaslt': - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - elif libtype == 'rocblas': - c = rocsolidxgemm.rocb_mm(self.inp,self.weights.t(),solidx) - if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): - #print('>>>',libtype,'Solidx',solidx,'passed reference test') - return True - else: - print('>>>',libtype,'Solidx',solidx,'FAILED reference test', flush=True) - print(ref, flush=True) - print(c, flush=True) - return False - def hipb_time_sol(self,solidx,cold_iters=2,warm_iters=10): - #print('>>>hipbtime',solidx) - for i in range(cold_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - self.start.record() - for i in range(warm_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights2 [random.randint(0,self.nb-1)].t(),solidx) - self.end.record() - torch.cuda.synchronize() - gtime = self.start.elapsed_time(self.end)/warm_iters - #print('>>> Solidx GTime',solidx,gtime,'ms') - return gtime - def hipb_time_all_sols(self,fast_mode=0,top_sols=0): - coldi=20; warmi=20 - if fast_mode: coldi=2; warmi=2 - solutions = self.hipb_sols - if top_sols: solutions = self.hipb_top_sols - gtimes = {} - for solidx in solutions: - gtimes[solidx] = self.hipb_time_sol(solidx, cold_iters=coldi, warm_iters=warmi) - self.hipb_gtimedf = 
pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.hipb_gtimedf.to_csv('/tmp/hipb_gtimedf.csv') - print('>>> HipBlasLt top solutions, Fast Mode',fast_mode) - print(self.hipb_gtimedf.head(self.topn)) - def rocb_time_sol(self, solidx, cold_iters=2, warm_iters=10): - for i in range(cold_iters): - c = rocsolidxgemm.rocb_mm(self.inp, self.weights.t(), solidx) - self.start.record() - for i in range(warm_iters): - c = rocsolidxgemm.rocb_mm(self.inp, self.weights2[random.randint(0, self.nb-1)].t(), solidx) - self.end.record() - torch.cuda.synchronize() - gtime = self.start.elapsed_time(self.end)/warm_iters - #print('>>> RocSolidx GTime',solidx,gtime,'ms') - return gtime - def find_rocblas_sols(self): - sols = rocsolidxgemm.rocb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total rocb solutions',len(sols), flush=True) - #print(sols) - self.rocb_sols = sols - def rocb_time_all_sols(self,fast_mode=0,top_sols=0): - coldi=20; warmi=20 - if fast_mode: coldi=2; warmi=2 - solutions = self.rocb_sols - if top_sols: solutions = self.rocb_top_sols - gtimes = {} - for solidx in solutions: - gtimes[solidx] = self.rocb_time_sol(solidx,coldi,warmi) - self.rocb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.rocb_gtimedf.to_csv('/tmp/rocb_gtimedf.csv') - print('>>> Rocblas top solutions, Fast Mode',fast_mode, flush=True) - print(self.rocb_gtimedf.head(self.topn), flush=True) - def warmup(self,warmi=500): - for i in range(warmi): - self.blob = self.blob + 0.00001 - def functional_check_topn_fastest(self): - rocb_topn = [] - for solidx in self.rocb_gtimedf.index[:self.topn]: - if self.check_gemm_ref(libtype='rocblas',solidx=solidx): - rocb_topn.append(solidx) - self.rocb_top_sols = rocb_topn - hipb_topn = [] - for solidx in self.hipb_gtimedf.index[:self.topn]: - if self.check_gemm_ref(libtype='hipblaslt',solidx=solidx): - hipb_topn.append(solidx) - self.hipb_top_sols = hipb_topn - - def find_fastest_solution(self): - self.find_rocblas_sols() - if not (self.rocblas_decode and self.n == 1): - self.find_hipblas_sols() - self.warmup() - self.rocb_time_all_sols(fast_mode=1) - self.warmup() - self.hipb_time_all_sols(fast_mode=1) - self.functional_check_topn_fastest() - self.warmup() - self.rocb_time_all_sols(fast_mode=0,top_sols=1) - self.warmup() - self.hipb_time_all_sols(fast_mode=0,top_sols=1) - if len(self.rocb_gtimedf)>0 and len(self.hipb_gtimedf)>0: - best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] - best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] - if best_rocb_time0: - print('>>> Only hipblas solutions found!',flush=True) - best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] - self.best_libtype = 'hipblaslt' - self.best_solidx = self.hipb_gtimedf.index[0] - self.best_soltime = best_hipb_time - elif len(self.rocb_gtimedf)>0: - print('>>> Only rocblas solutions found!',flush=True) - best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] - self.best_libtype = 'rocblas' - self.best_solidx = self.rocb_gtimedf.index[0] - self.best_soltime = best_rocb_time - else: - print('>>> No rocblas or hipblas solutions found!',flush=True) - self.best_libtype = 'rocblas' - self.best_solidx = 0 - self.best_soltime = 0 - print('>>> Fastest Solution is',self.best_libtype,self.best_solidx,self.best_soltime,flush=True) - - -class GemmTuner: - def __init__(self, dtype, tuned_file=None, rocblas_decode=False): - self.gemm_problems = pd.DataFrame(columns=['M','N','K']) - self.dtype = dtype - 
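# rocblas_decode forces the rocBLAS path for decode-shaped GEMMs (N == 1): when it
# is set, hipBLASLt solutions are not searched for those shapes (see
# find_fastest_solution above and the --rocblas-decode flag in gemm_tuner.py).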
self.rocblas_decode = rocblas_decode - self.tuned_file = tuned_file - if Path(tuned_file).is_file(): - self.gdf = pd.read_csv(tuned_file) - else: - self.gdf = None - - def add_gemm(self,m,n,k): - if ( self.gdf is None or (self.gdf[(self.gdf['M'] == m) & (self.gdf['N'] == n) & (self.gdf['K'] == k)].empty)): - entry = {'M':[m], 'N':[n], 'K':[k]} - df = pd.DataFrame(entry) - self.gemm_problems = pd.concat([self.gemm_problems, df],ignore_index=True) - else: - print(f">>>Info: Found Duplicate shape(M:{m}, N:{n}, K:{k}), skipping") - - def find_best_sols(self): - df = self.gemm_problems - soldf = pd.DataFrame() - for i in range(len(df)): - ds = df.iloc[i] - gemmobj = Gemm(ds['M'],ds['N'],ds['K'],dtype=self.dtype, rocblas_decode=self.rocblas_decode) - gemmobj.find_fastest_solution() - soldf.loc[i,'libtype'] = gemmobj.best_libtype - soldf.loc[i,'solidx'] = gemmobj.best_solidx - soldf.loc[i,'soltimems'] = gemmobj.best_soltime - soldf['dtype'] = self.dtype - finaldf = pd.concat([self.gemm_problems, soldf],axis=1) - finaldf = pd.concat([finaldf, self.gdf]) - finaldf.to_csv(self.tuned_file, index=False) - print(finaldf) diff --git a/gradlib/mm_test.py b/gradlib/mm_test.py deleted file mode 100644 index 1b21b9ca105ff..0000000000000 --- a/gradlib/mm_test.py +++ /dev/null @@ -1,234 +0,0 @@ -import torch -#import gradlib -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -#gradlib.create_extension() -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -#m = 128; n = 192 ;k = 256 -#m = 7168; k = 4096*2; n = 256 -#m = int(1024*1.25); k = int(1024*8); n = 1 -#m = 1; k = int(1024*8); n = int(1024*7) -#m=22016; k=4096 ; n=1 -#m=int(27648/1);k=5120;n=8 -#m=5120;k=13824;n=1 -m=3*5120;k=5120;n=1 - - -rtol = 1e-5 -atol = 1 -dtype = torch.float16 - -class Gemm: - def __init__(self,m,n,k,dtype=torch.float16): - self.m=m - self.k=k - self.n=n - self.dtype=dtype - self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') - self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') - self.hipb_sols=[] - self.rtol = 1e-5 - self.atol = 1 - self.cold_iters = 2 - self.warm_iters = 10 - def find_hipblas_sols(self): - sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols)) - #print(sols) - self.hipb_sols = sols - def hipb_check_gemm_ref(self,user_solidxs=None): - ref = F.linear(self.inp,self.weights) - if user_solidxs is not None: - solidxs = user_solidxs - else: - solidxs = self.hipb_sols - if len(solidxs)>0: - for solidx in solidxs: - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): - print('>>> Hipb solidx',solidx,'passed reference test') - else: - print('>>> Hipb solidx',solidx,'FAILED reference test') - print(ref) - print(c) - def hipb_time_sol(self,solidx): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - for i in range(self.cold_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - start.record() - for i in range(self.warm_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - end.record() - torch.cuda.synchronize() - gtime = start.elapsed_time(end)/self.warm_iters - #print('>>> Solidx GTime',solidx,gtime,'ms') - return gtime - def hipb_time_all_sols(self): - gtimes = {} - for solidx in self.hipb_sols: - gtimes[solidx] = 
self.hipb_time_sol(solidx) - self.gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.gtimedf.to_csv('/tmp/gtimedf.csv') - print(self.gtimedf.head(10)) - - - -gemmobj = Gemm(m=3*5120,n=1,k=5120) -gemmobj.find_hipblas_sols() -#gemmobj.hipb_check_gemm_ref() -#gemmobj.hipb_check_gemm_ref(user_solidxs=[131,8190]) -#gemmobj.hipb_time_sol(gemmobj.hipb_sols[0]) -gemmobj.hipb_time_all_sols() -gemmobj.hipb_check_gemm_ref(user_solidxs=gemmobj.gtimedf.head(5).index.values) - -sys.exit() -def splitk_linear(inp,w,splitk=2): - wsp = torch.chunk(w,splitk,dim=1) - isp = torch.chunk(inp,splitk,dim=1) - print('>>>',isp[0].shape,wsp[1].shape) - cnew = [] - for i in range(splitk): - cnew.append(F.linear(isp[i],wsp[i])) - #cnew1 = F.linear(isp[1],wsp[1]) - c = cnew[0] - for i in range(1,splitk): - c.add_(cnew[i]) - #c = torch.add(cnew0,cnew1) - - return c - -def splitm_linear(inp,w,splitm=2,splits=None,splitk=1): - outputp=[] - #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) - if splits is not None: - wsp = torch.split(w,splits) - else: - wsp = torch.chunk(w,splitm) - #cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) - #csp = torch.chunk(cout,splitm,dim=1) - - for i,_ in enumerate(wsp): - #print('>>>wspi',wsp[i].shape) - if splitk==1: - outputp.append(F.linear(inp, wsp[i])) - #cout[:,i*wsp[i].shape[0]:(i+1)*wsp[i].shape[0]] = F.linear(inp, wsp[i]) - #csp[i].copy_(F.linear(inp, wsp[i])) - else: - outputp.append(splitk_linear(inp,wsp[i],splitk)) - c = torch.cat((outputp),dim=1) - #print('>>>',c.shape,cout.shape) - return c - -def splitn_linear(inp,w,splitn=2,splits=None): - outputp=[] - if splits is not None: - isp = torch.split(inp,splits) - else: - isp = torch.chunk(inp,splitn) - cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) - for i,_ in enumerate(isp): - outputp.append(F.linear(isp[i], w)) - #cout[i*isp[i].shape[0]:(i+1)*isp[i].shape[0],:] = F.linear(isp[i], w) - c = torch.cat((outputp),dim=0) - #print('>>>',c.shape,cout.shape) - return c - -nncount = 0 -for _ in range(10): - #a = torch.randn((m, k), dtype=dtype, device='cuda') - #b = torch.randn((k, n), dtype=dtype, device='cuda') - inp = torch.randn((n, k), dtype=dtype, device='cuda') - weights = torch.randn((m, k), dtype=dtype, device='cuda') - #c = gradlib.mm(inp, weights.t()) - c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) - c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) - c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) - c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) - - splitm=2 - #padm=2 - outsp=[] - #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) - #wsp = torch.chunk(weights,splitm) - #wsp = torch.split(weights,(3*1024,4*1024)) - #c = torch.empty((n,m),dtype=dtype,device='cuda') - #outtup = [] - #for i,_ in enumerate(wsp): - # print('>>>wspi',wsp[i].shape) - # outsp.append(F.linear(inp, wsp[i])) - # #outtup.append(splitk_linear(inp, wsp[i])) - #outsp = [torch.add(a,b) for a,b in outtup] - #c = torch.cat((outsp),dim=1) - #c = c[:,:-padm] - #c = splitm_linear(inp,weights,splitm=4,splits=None,splitk=1) - #c = splitn_linear(inp,weights,splitn=2,splits=None) - - #wsp = torch.chunk(weights,2,dim=1) - #isp = torch.chunk(inp,2,dim=1) - #print('>>>',isp[0].shape,wsp[1].shape) - #cnew0 = F.linear(isp[0],wsp[0]) - #cnew1 = F.linear(isp[1],wsp[1]) - #c = torch.add(cnew0,cnew1) - #c = splitk_linear(inp, weights, splitk=4) - - #torch.cuda.synchronize() - ref = F.linear(inp,weights) - #ref = torch.matmul(a,b) - if 
torch.allclose(c, ref, atol=atol, rtol=rtol): - nncount += 1 - else: - print(ref) - print(c) -''' -tncount = 0 -for _ in range(10): - a = torch.randn((m, k), dtype=dtype, device='cuda') - b = torch.randn((n, k), dtype=dtype, device='cuda') - c = gradlib.mm(a, b.t()) - #torch.cuda.synchronize() - ref = torch.matmul(a, b.t()) - if torch.allclose(c, ref, atol=atol, rtol=rtol): - tncount += 1 - else: - print(ref) - print(c) - #torch.save(c-ref, '/tmp/difference.pt') - #np.savetxt('my_file.txt', (c-ref).cpu().numpy()) - dfs = ref - c - nz = torch.nonzero(dfs,as_tuple=True) - print(nz) - print(dfs[nz]) - print(ref[nz]) - print(c[nz]) -''' -''' -ntcount = 0 -for _ in range(10): - a = torch.randn((k, m), dtype=dtype, device='cuda') - b = torch.randn((k, n), dtype=dtype, device='cuda') - c = gradlib.mm(a.t(), b) - #torch.cuda.synchronize() - if torch.allclose(c, torch.matmul(a.t(), b), atol=atol, rtol=rtol): - ntcount += 1 - -ttcount = 0 -for _ in range(10): - a = torch.randn((k, m), dtype=dtype, device='cuda') - b = torch.randn((n, k), dtype=dtype, device='cuda') - c = gradlib.mm(a.t(), b.t()) - torch.cuda.synchronize() - if torch.allclose(c, torch.matmul(a.t(), b.t()), atol=atol, rtol=rtol): - ttcount += 1 -''' -print(f"GEMM (m, n, k) = {n}, {m}, {k}") -print(f"NN GEMMs: pass {nncount}/10, tol={rtol}") -#print(f"TN GEMMs: pass {tncount}/10, tol={rtol}") -#print(f"NT GEMMs: pass {ntcount}/10, tol={rtol}") -#print(f"TT GEMMs: pass {ttcount}/10, tol={rtol}") diff --git a/gradlib/setup.py b/gradlib/setup.py deleted file mode 100644 index 1ca83dbe79f6c..0000000000000 --- a/gradlib/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -import torch -import setuptools -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension -from torch.utils.hipify import hipify_python -import os -import subprocess -import re - -this_dir = os.path.dirname(os.path.abspath(__file__)) -#gpus = subprocess.check_output("/opt/rocm/bin/rocminfo").decode('UTF-8').split('\n') -#gpus = list(set([re.search('(gfx94.)', g).group(0) for g in gpus if 'gfx94' in g])) -gpus = ['gfx90a','gfx940','gfx941','gfx942'] -#gpus = ['gfx90a','gfx940'] -extra_args = ["--offload-arch=" + g for g in gpus] - - -#sets_rocm_pytorch = False -maj_ver, min_ver, *_ = torch.__version__.split('.') -if int(maj_ver) > 1 or (int(maj_ver) == 1 and int(min_ver) >= 5): - from torch.utils.cpp_extension import ROCM_HOME - is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False - -ext_modules = [] - -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')): - generator_flag = ['-DOLD_GENERATOR'] - -print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) - -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ['-DVERSION_GE_1_1'] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ['-DVERSION_GE_1_3'] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ['-DVERSION_GE_1_5'] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - -include_dirs=[os.path.join(this_dir, 'csrc')] - -#if is_rocm_pytorch: -# import shutil -# with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: -# 
hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", -# show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) - -if not is_rocm_pytorch: - ext_modules.append( - CUDAExtension( - name='gradlib', - sources=['grad_funcs.cu'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] - } - ) - ) -elif is_rocm_pytorch: - #if torch.__version__ <= '1.8': - hipify_ver = [int(x) for x in torch.utils.hipify.__version__.split(".")] if hasattr(torch.utils.hipify, "__version__") else [0,0,0] - if hipify_ver < [1,0,0]: - import shutil - with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: - hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", - show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) - - ext_modules.append( - CUDAExtension( - name='gradlib', - sources=['./csrc/hip/grad_funcs.hip'], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':['-O3'] + extra_args - } - ) - ) - else: - #ext_modules.append( - # CUDAExtension( - # name='gradlib', - # sources=['./csrc/grad_funcs.cu'], - # include_dirs=include_dirs, - # # add additional libraries argument for hipblaslt - # libraries=['hipblaslt'], - # extra_compile_args={ - # 'cxx': ['-O3',], - # 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - # "-ftemplate-depth=1024"] + extra_args - # } - # ) - # ) - ext_modules.append( - CUDAExtension( - name='rocsolidxgemm', - sources=['./csrc/rocsolgemm.cu'], - include_dirs=include_dirs, - # add additional libraries argument for hipblaslt - libraries=['rocblas'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - "-ftemplate-depth=1024"] + extra_args - } - ) - ) - ext_modules.append( - CUDAExtension( - name='hipbsolidxgemm', - sources=['./csrc/hipbsolgemm.cu'], - include_dirs=include_dirs, - # add additional libraries argument for hipblaslt - libraries=['hipblaslt'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - "-ftemplate-depth=1024"] + extra_args - } - ) - ) - -setup( - name='gradlib', - packages=['gradlib'], - ext_modules=ext_modules, - cmdclass={ - 'build_ext': BuildExtension -}) - -# python setup.py build && cp build/lib*/gradlib* ../ diff --git a/run.sh b/run.sh deleted file mode 100755 index 7b9336a0a076a..0000000000000 --- a/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -BASE_DIR=/trees/ -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -#MODEL=/data/Llama-2-13B-Chat-fp16 -#MODEL=/data/llama-2-13b-chat-hf -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` - -GEN_LEN="8" -TP=8 -INPUT_LEN=2048 -ITER=1 -cd $VLLM_DIR - - echo "tuned_gemm_csv: ./tuned_tp$TP.csv" > $VLLM_DIR/tuned_perf_tp$TP.yaml - tuned_file=$VLLM_DIR/tuned_tp$TP.csv -export VLLM_PERF_YAML=./tuned_perf_tp$TP.yaml - -for tp in $TP; -do - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - -python benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER - done -done -done 
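Taken together, the gradlib extensions, GemmTuner, and the run scripts implement an offline-tune / runtime-lookup loop: gemm_tuner.py writes rows of (M, N, K, libtype, solidx) to a CSV, and the serving process later reads that file (via VLLM_TUNE_FILE) to pick a solution index per GEMM shape. The sketch below condenses the lookup side only; it is illustrative rather than the shipped code, the tuned rows are made up for the example, and the hipBLASLt/rocBLAS calls are stubbed so the snippet runs without the gradlib extensions.

    import pandas as pd
    import torch
    import torch.nn.functional as F

    # Stand-in for the CSV that gemm_tuner.py writes to VLLM_TUNE_FILE
    # (values here are illustrative only).
    tuned = pd.DataFrame({
        "M": [15360, 5120],
        "N": [1, 512],
        "K": [5120, 5120],
        "libtype": ["hipblaslt", "rocblas"],
        "solidx": [20053, 60995],
    })

    # (M, N, K) -> (libtype, solution index), mirroring TunedGemm.create_ds().
    best_sols = {(int(r.M), int(r.N), int(r.K)): (r.libtype, int(r.solidx))
                 for r in tuned.itertuples()}

    def tuned_mm(inp: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
        """Dispatch a GEMM to a tuned solution index when one is recorded."""
        key = (weights.shape[0], inp.shape[0], inp.shape[1])
        libtype, solidx = best_sols.get(key, (None, 0))
        if libtype == "hipblaslt":
            # Real path: hipbsolidxgemm.hipb_mm(inp, weights.t(), solidx)
            pass
        elif libtype == "rocblas":
            # Real path: rocsolidxgemm.rocb_mm(inp, weights.t(), solidx)
            pass
        # Untuned shapes (and this stubbed sketch) fall back to torch.
        return F.linear(inp, weights)

    out = tuned_mm(torch.randn(1, 5120), torch.randn(15360, 5120))

At runtime, tuned_gemm.py (further below) additionally records any untuned shapes to VLLM_UNTUNE_FILE when VLLM_TUNE_GEMM=1, so they can be fed back into gemm_tuner.py in a later pass.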
diff --git a/run_70b.sh b/run_70b.sh deleted file mode 100755 index ed004b56c17d3..0000000000000 --- a/run_70b.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -BASE_DIR=/trees -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` -#MODEL=/data/llama-2-13b-chat-hf -GEMM_TUNER=1 -#TP="1 2 4 8" -TP=8 -#Flag to use Triton Flash Attention vs CK -#export VLLM_USE_TRITON=1 - -#Gemm tuner flags -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. -#DELETE_TUNED_CSV=1 -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--profile" -#ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1,32,128" -INPUT_LEN="512,1024,2048,3072" - -ITER=10 -# pring usage of the parameters -usage() { - echo "Usage: $0 [--tp ] [--model ]" - exit 1 -} -# parse parameters -while [[ "$#" -gt 0 ]]; do - case $1 in - --tp) TP="$2"; shift ;; - --model) MODEL="$2"; shift ;; - --notune) GEMM_TUNER=0; shift ;; - *) usage ;; # Any other argument will show usage information. - esac - shift # Move to next argument -done -for tp in $TP; -do - if (( $GEMM_TUNER )); - then - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml - - if [[ $DELETE_TUNED_CSV == 1 ]]; - then - rm -rf $VLLM_TUNE_FILE - fi - #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp - fi - - cd $VLLM_DIR - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - if [[ -v RPD_PROFILE ]] ; - then - rm /workspace/trace.rpd - python -m rocpd.schema --create /workspace/trace.rpd - fi - echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report - if [[ -v ROCPROF_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - mv $VLLM_DIR/results.json $TRACE_FILE - fi - if [[ -v RPD_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE - fi - done - done -done diff --git a/run_70b_fast.sh b/run_70b_fast.sh deleted file mode 100755 index 0ed20e59ca3ff..0000000000000 --- a/run_70b_fast.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -e -BASE_DIR=/trees -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` - -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export VLLM_TUNE_FILE=$VLLM_DIR/"tuned.csv" - -#Flag to use 
Triton Flash Attention vs CK -export VLLM_USE_TRITON=1 - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. -#DELETE_TUNED_CSV=1 - -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 - -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 - - -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ - -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--profile" -#ROCPROF_PROFILE="rocprof --hip-trace" - -#TP="1 2 4 8" -TP=8 -GEN_LEN="32" -INPUT_LEN="512,1024,2048,3072" -#INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" -BATCH_SIZE="1" -ITER=10 - -rm -f $VLLM_UNTUNE_FILE -for tp in $TP; -do - cd $VLLM_DIR - export VLLM_TUNE_GEMM=1 - echo "================================= WARMING UP $MODEL ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters 1 --warmup-only - - if [ -f $VLLM_UNTUNE_FILE ]; then - echo "=============================== Tuning ======================================" - python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE - echo "File does not exist." - fi - - export VLLM_TUNE_GEMM=0 - echo "================================= RUNNING $MODEL ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv -done \ No newline at end of file diff --git a/run_llama2.sh b/run_llama2.sh deleted file mode 100755 index 1444ca7d222a1..0000000000000 --- a/run_llama2.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$VLLM_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` -#MODEL=/data/llama-2-13b-chat-hf -GEMM_TUNER=1 -#TP="1 2 4 8" -TP=8 -#Flag to use Triton Flash Attention vs CK -#export VLLM_USE_TRITON=1 - -#Gemm tuner flags -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. -#DELETE_TUNED_CSV=1 -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--rpd" -#ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1,32,128" -INPUT_LEN="512,1024,2048,3072" - -ITER=10 -# pring usage of the parameters -usage() { - echo "Usage: $0 [--tp ] [--model ]" - exit 1 -} -# parse parameters -while [[ "$#" -gt 0 ]]; do - case $1 in - --tp) TP="$2"; shift ;; - --model) MODEL="$2"; shift ;; - --notune) GEMM_TUNER=0; shift ;; - *) usage ;; # Any other argument will show usage information. 
- esac - shift # Move to next argument -done -for tp in $TP; -do - if (( $GEMM_TUNER )); - then - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml - - if [[ $DELETE_TUNED_CSV == 1 ]]; - then - rm -rf $VLLM_TUNE_FILE - fi - #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp - fi - - cd $VLLM_DIR - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - if [[ -v RPD_PROFILE ]] ; - then - rm /workspace/trace.rpd - python -m rocpd.schema --create /workspace/trace.rpd - fi - echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report - if [[ -v ROCPROF_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - mv $VLLM_DIR/results.json $TRACE_FILE - fi - if [[ -v RPD_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE - fi - done - done -done diff --git a/vllm/config.py b/vllm/config.py index 787b7feb737cd..de687395a0001 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -436,7 +436,7 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. if self.world_size > 1 and not is_neuron(): - self.worker_use_ray = False + self.worker_use_ray = True self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 123b02c8d6cc8..742f3dc575190 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,6 +1,5 @@ import pickle -import socket from typing import Optional, List, Tuple from vllm.config import ParallelConfig @@ -67,12 +66,7 @@ def execute_model_compiled_dag_remote(self, ignored): RayWorkerVllm = None -def get_open_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - -def initialize_cluster( +def initialize_ray_cluster( parallel_config: ParallelConfig, ray_address: Optional[str] = None, ): @@ -100,18 +94,10 @@ def initialize_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True) - # if not parallel_config.worker_use_ray: - # assert parallel_config.world_size == 1, ( - # "Ray is required if parallel_config.world_size > 1.") - # return None - if not parallel_config.worker_use_ray: - # Initialize cluster locally. - port = get_open_port() - # We need to setup the distributed init method to make sure - # the distributed megatron code (e.g., get world size) works correctly. - distributed_init_method = f"tcp://localhost:{port}" - return distributed_init_method, None - + if parallel_config.placement_group: + # Placement group is already set. 
+ return + # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 6e2dbefa44a49..40e681df48f86 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,8 +13,6 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger -from vllm.model_executor.layers.tuned_gemm import tgemm - logger = init_logger(__name__) @@ -78,9 +76,7 @@ def apply_weights(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) - #tgemm.mm(x,weight) - #return F.linear(x, weight, bias) - return tgemm.mm(x,weight) + return F.linear(x, weight, bias) class ReplicatedLinear(torch.nn.Module): @@ -132,7 +128,6 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if not self.skip_bias_add else None output = self.linear_method.apply_weights(self.linear_weights, x, bias) - #print(f">>> output is {output}") output_bias = self.bias if self.skip_bias_add else None return output, output_bias @@ -580,7 +575,7 @@ def forward(self, input_): output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel - #print(f">>> ROWPARALLEL {output_.shape}") + if not self.skip_bias_add: output = output_ + self.bias if self.bias is not None else output_ output_bias = None diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index a7adacea7716d..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -43,8 +43,7 @@ def __init__(self, def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: # Get the logits for the next tokens. 
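# Each tensor-parallel rank computes logits only for its shard of the vocabulary
# (embedding here is the rank-local slice of the vocab-parallel embedding weight),
# so the tensor_model_parallel_gather below reassembles full-vocab logits on the
# destination rank.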
- #logits = torch.matmul(hidden_states, embedding.t()) - logits = tgemm.mm(hidden_states, embedding) + logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py deleted file mode 100644 index bebab27ebfd86..0000000000000 --- a/vllm/model_executor/layers/tuned_gemm.py +++ /dev/null @@ -1,111 +0,0 @@ -import torch -import torch.nn.functional as F -from rocsolidxgemm import rocb_create_extension,rocb_mm -from hipbsolidxgemm import hipb_create_extension,hipb_mm -from pathlib import Path -import os -import yaml -import pandas as pd -from vllm import custom_ops - - -class TunedGemm: - def __init__(self): - #rocb_create_extension() - #hipb_create_extension() - self.extensions_created = False - self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) - self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") - self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") - self.bestsols = {} - self.load_best_sols() - self.create_ds() - - - if (self.save_gemm == 1): - self.tuned_df = pd.DataFrame(columns=['M','N','K']) - else: - self.tuned_df = None - - def load_best_sols(self): - if self.tune_path is not None and Path(self.tune_path).is_file(): - self.bestsols = pd.read_csv(self.tune_path) - - def apply_custom(self,ds): - M,N,K = ds['M'],ds['N'],ds['K'] - #apply custom matvec (only for f16 dtype) - if N==1: - ds1 = ds.copy() - ds1['libtype'] = 'custom' - if K==8192 and (M==1280 or M==7168): - ds1['solidx'] = 8 - return ds1 - elif K==3584 and M==8192: - ds1['solidx'] = 8 - return ds1 - elif K<=8192 and K%8==0 and M%4==0: - ds1['solidx'] = 1 - return ds1 - return ds - def create_ds(self): - df = self.bestsols - solds = {} - for i in range(len(df)): - ds = self.apply_custom(df.iloc[i]) - key = (ds['M'],ds['N'],ds['K']) - if ds['libtype']=='hipblaslt': soltype = 1 - elif ds['libtype']=='rocblas': soltype = 2 - elif ds['libtype']=='custom': soltype = 3 - solds[key] = (soltype,int(ds['solidx'])) - self.solids = solds - #print('>>>',solds) - def query_sol(self,m,n,k): - return self.solids.get((m,n,k),(0,0)) - def mm(self,inp,weights): - # F.Linear can take a 3 dimensional input. vllm uses this for linear units. 
- # However, sampler will use torch.matmul with 2 dimensions only - if inp.dim() == 3: - inp_view=inp.view(-1,inp.size(-1)) - batched = True - else: - inp_view = inp - batched = False - #print(f'>>>inp_view {inp_view.shape}') - if self.extensions_created == False: - rocb_create_extension() - hipb_create_extension() - self.extensions_created = True - soltype,solidx = self.query_sol(m=weights.shape[0],n=inp_view.shape[0],k=inp_view.shape[1]) - if soltype==1: - #print(">>> found hipblas") - out = hipb_mm(inp_view,weights.t(),solidx) - elif soltype==3: - ##only matvec is supported currently - out = torch.empty(inp.shape[0],weights.shape[0],dtype=torch.float16,device='cuda') - #print('>>>Matvec',inp.shape,weights.shape,soltype,solidx) - if solidx<=1: - custom_ops.LLMM1(weights,inp,out,4) - elif solidx==2: - custom_ops.LLMM1(weights,inp,out,2) - elif solidx==8: - custom_ops.LLMM1(weights,inp,out,8) - elif solidx==20: - custom_ops.LLZZ(weights,inp,out,0) - elif solidx==21: - custom_ops.LLZZ(weights,inp,out,1) - elif soltype==2: - #print(">>> found rocblas") - out = rocb_mm(inp_view,weights.t(),solidx) - else: - - if (self.save_gemm == 1): - print('>>>Tgemm Default',inp_view.shape, inp.shape,weights.shape,soltype,solidx) - self.tuned_df = pd.concat([self.tuned_df, pd.DataFrame({'M':[weights.shape[0]], 'N':[inp.shape[0]*inp.shape[1]], 'K':[weights.shape[1]]})]).drop_duplicates() - self.tuned_df.to_csv(self.untune_path, index=False) - out = F.linear(inp,weights) - if batched: - return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) - else: - return out - -tgemm = TunedGemm() diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 58bf6c2d97e24..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -79,7 +79,6 @@ def tensor_model_parallel_gather(input_: torch.Tensor, all the ranks. """ world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ @@ -89,22 +88,19 @@ def tensor_model_parallel_gather(input_: torch.Tensor, # Convert negative dim to positive. dim += input_.dim() # Allocate output tensor. - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - # if get_tensor_model_parallel_rank() == dst: - # gather_list = [torch.empty_like(input_) for _ in range(world_size)] - # else: - # gather_list = None + if get_tensor_model_parallel_rank() == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None # Gather. 
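# Only the destination rank allocates gather_list; torch.distributed.gather then
# collects every rank's tensor on dst, which concatenates the shards along `dim`.
# All other ranks pass gather_list=None and return None from this function.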
- - #print(f'>>> world size {world_size}, {gather_list}, {dst} {get_tensor_model_parallel_group()}') - torch.distributed.all_gather(gather_list, input_, + torch.distributed.gather(input_, + gather_list, + dst=dst, group=get_tensor_model_parallel_group()) - output_tensor = torch.cat(gather_list, dim=dim) - # if get_tensor_model_parallel_rank() == dst: - # output_tensor = torch.cat(gather_list, dim=dim) - # else: - # output_tensor = None - #print(f'>>> output_tensor {output_tensor}, {dst}, {dim}') + if get_tensor_model_parallel_rank() == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None return output_tensor diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 593d5aadcd2ab..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -31,7 +31,6 @@ # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. # NOTE: _get_graph_batch_size needs to be updated if this list is changed. _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] -#_BATCH_SIZES_TO_CAPTURE = [1] class ModelRunner: @@ -537,7 +536,7 @@ def prepare_input_tensors( "lora_requests": lora_requests, "lora_mapping": lora_mapping, } - #broadcast_tensor_dict(metadata_dict, src=0) + broadcast_tensor_dict(metadata_dict, src=0) else: metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict["input_tokens"] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e201b484aa070..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -51,9 +51,6 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.is_driver_worker = is_driver_worker - local_rank = int(os.getenv("LOCAL_RANK", "0")) - self.local_rank = local_rank - if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -83,12 +80,7 @@ def init_model(self, cupy_port: Optional[int] = None) -> None: # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.rank = self.rank if self.rank is not None else int( - os.getenv("RANK", "-1")) - self.device = torch.device(f"cuda:{self.local_rank}") - - torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -201,7 +193,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, ) -> Optional[SamplerOutput]: - if self.is_driver_worker and self.rank == 0: + if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) assert blocks_to_swap_in is not None @@ -213,7 +205,7 @@ def execute_model( "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, } - #broadcast_tensor_dict(data, src=0) + broadcast_tensor_dict(data, src=0) else: data = broadcast_tensor_dict(src=0) num_seq_groups = data["num_seq_groups"] @@ -281,25 +273,6 @@ def init_distributed_environment( world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, - #init_method="env://", - ) - - if cupy_utils.is_initialized(): - cupy_world_size = cupy_utils.get_world_size() - if cupy_world_size != parallel_config.world_size: - raise RuntimeError( - "cupy.distributed is already initialized but the cupy world " - "size does not match parallel_config.world_size " - f"({cupy_world_size} vs. 
{parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None): - # NOTE(woosuk): We don't initialize CuPy process group when world size - # is 1. - # TODO(woosuk): Support multi-node connection. - cupy_utils.init_process_group( - world_size=parallel_config.world_size, - rank=rank, - host="localhost", - port=cupy_port, ) if cupy_utils.is_initialized(): From 0e63661b0cdac0c60857f7cc277819c5b88ed2f6 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Tue, 19 Mar 2024 11:49:21 -0400 Subject: [PATCH 19/34] Small fix on dockerfile --- Dockerfile.rocm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a7640f6841ad9..080e5b04d28bc 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -99,11 +99,11 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \ fi # build triton -RUN if [ "$BUILD_TRITON" = "1"]; then \ +RUN if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ && pip uninstall -y triton \ - && git clone https://github.com/ROCmSoftwarePlatform/triton.git + && git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python \ && pip3 install -e . \ && cd ../..; \ From d4cb905dfec5abdbbe9f585ff4ee9da59efedbfe Mon Sep 17 00:00:00 2001 From: jpvillam Date: Tue, 19 Mar 2024 19:41:43 -0400 Subject: [PATCH 20/34] Rebase updates and PR review changes Added Flag for controlling triton vs default flow. More small changes to dockerfile --- Dockerfile.rocm | 2 +- .../layers/attention/attention.py | 47 ++++++++++++------- .../layers/attention/backends/flash_attn.py | 36 +++++++++----- .../attention/ops/flash_attention_triton.py | 33 ++++++------- 4 files changed, 70 insertions(+), 48 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 080e5b04d28bc..e7f52307a6aa2 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -105,7 +105,7 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ && pip uninstall -y triton \ && git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python \ - && pip3 install -e . \ + && pip3 install . \ && cd ../..; \ fi diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 4b63b9eaf59a7..89b5816f7a47a 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip +import os logger = init_logger(__name__) @@ -34,11 +35,12 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if _use_flash_attn(): + if use_triton := _use_flash_attn(): from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, - sliding_window) + sliding_window, + use_triton == 2) else: from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 self.backend = XFormersBackend(num_heads, head_size, scale, @@ -59,26 +61,37 @@ def forward( @lru_cache(maxsize=1) -def _use_flash_attn() -> bool: - try: - import flash_attn # noqa: F401 - except ImportError: - logger.info("flash_attn is not found. Using xformers backend.") - return False - - if is_hip(): - # AMD GPUs. - return False - if torch.cuda.get_device_capability()[0] < 8: +def _use_flash_attn() -> int: + """Returns if and which flash attention to use. 
+ + Returns: + int: 0 for none, 1 for default implementation, 2 for triton implementation. + """ + if not (os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') and is_hip()): + # AMD GPUs can use flash_attn package or triton impl. + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return 0 + + if (not is_hip()) and torch.cuda.get_device_capability()[0] < 8: # Volta and Turing NVIDIA GPUs. logger.info("flash_attn is not supported on Turing or older GPUs. " "Using xformers backend.") - return False + return 0 + + if is_hip() and torch.cuda.get_device_capability()[0] != 9: + # not Instinct series GPUs. + logger.info("flash_atten is not supported on NAVI GPUs. " + "Using xformers backend.") + return 0 + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): logger.info( "flash_attn only supports torch.float16 or torch.bfloat16. " "Using xformers backend.") - return False + return 0 - logger.info("Using flash_attn backend.") - return True + logger.info(f"Using {'Triton' if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else ''} flash_attn backend.") + return 2 if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else 1 diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index c2d7b5acc467e..726b42cad9e3f 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( PagedAttentionImpl) -from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention +from vllm.model_executor.layers.attention.ops.flash_attention_triton import triton_attention class FlashAttentionBackend: @@ -21,6 +21,7 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + use_triton: Optional[bool] = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -30,6 +31,7 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes + self.use_triton = use_triton assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -87,8 +89,8 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - if is_hip(): - output, _ = attention( + if self.use_triton: + output, _ = triton_attention( query, key, value, @@ -98,15 +100,25 @@ def forward( self.scale, ) else: - output = flash_attn_func( - query, - key, - value, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - ) + if is_hip(): + #XXX: window_size and alibi_slopes not supported + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + ) + else: + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) else: # prefix-enabled attention output = PagedAttentionImpl.forward_prefix( diff --git a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py index 
37c15e0e6fa36..80962e4cf9d9a 100644 --- a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py +++ b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py @@ -251,12 +251,12 @@ def attn_fwd( ) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) # We still need to write 0s to the result - tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) - l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + #tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) + #l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m # We store inf to LSE, not -inf because in the bwd pass, we subtract this # from qk which makes it -inf, such that exp(qk - inf) = 0 for these masked blocks. - l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) - tl.store(l_ptrs, l) + #l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + #tl.store(l_ptrs, l) # TODO: Should dropout and return encoded softmax be handled here too? return @@ -417,17 +417,17 @@ def attn_fwd( z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE - l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + #l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m # If seqlen_q not multiple of BLOCK_M, we need to mask out the last few rows. # This is only true for the last M block. For others, overflow_size will be -ve - overflow_size = end_m_idx - seqlen_q - if overflow_size > 0: - boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) - # This is a > check because mask being 0 blocks the store. - l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) - tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) - else: - tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + #overflow_size = end_m_idx - seqlen_q + #if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. + # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + #else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) # write back O o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh @@ -494,8 +494,6 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): encoded_softmax = None - M = torch.empty((batch, nheads_q, metadata.max_seq_len), device=q.device, dtype=torch.float32) - # Seed the RNG so we get reproducible results for testing. 
philox_seed = 0x1BF52 philox_offset = 0x1D4B42 @@ -507,7 +505,7 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): bias_strides = (0,0,0,0) attn_fwd[grid]( - q, k, v, bias, sm_scale, M, o, + q, k, v, bias, sm_scale, None, o, *q_strides, *k_strides, *v_strides, *o_strides, *bias_strides, None, None, dropout_p=0.0, @@ -526,7 +524,6 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): RETURN_ENCODED_SOFTMAX=False ) - ctx.save_for_backward(q, k, v, o, M) ctx.grid = grid ctx.sm_scale = sm_scale ctx.BLOCK_DMODEL = head_size @@ -538,4 +535,4 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): ctx.return_encoded_softmax = False return o, encoded_softmax -attention = _attention.apply +triton_attention = _attention.apply From bc750faa853877bb281ea3c3a15be7b776e18b40 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 20 Mar 2024 23:47:48 +0000 Subject: [PATCH 21/34] Introducing torchrun multi GPU support --- benchmarks/benchmark_latency.py | 6 + benchmarks/benchmark_throughput.py | 11 +- csrc/hip_compat.h | 39 ++++++ vllm/config.py | 4 +- vllm/engine/arg_utils.py | 6 + vllm/engine/llm_engine.py | 3 + vllm/executor/torchrun_gpu_executor.py | 173 +++++++++++++++++++++++++ vllm/worker/worker.py | 19 ++- 8 files changed, 252 insertions(+), 9 deletions(-) create mode 100644 csrc/hip_compat.h create mode 100644 vllm/executor/torchrun_gpu_executor.py diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 2fdc08c5c26df..8ff04fccb0004 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -27,6 +27,7 @@ def main(args: argparse.Namespace): kv_cache_dtype=args.kv_cache_dtype, device=args.device, ray_workers_use_nsight=args.ray_workers_use_nsight, + worker_use_torchrun=args.worker_use_torchrun ) sampling_params = SamplingParams( @@ -151,5 +152,10 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help="If specified, use nsight to profile ray workers", ) + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. Preferable for ROCm' + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 72bdc4b3b4540..67062db0c93ad 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -75,6 +75,7 @@ def run_vllm( device: str, enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, + worker_use_torchrun: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(model=model, @@ -89,7 +90,8 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, - enable_prefix_caching=enable_prefix_caching) + enable_prefix_caching=enable_prefix_caching, + worker_use_torchrun=worker_use_torchrun,) # Add the requests to the engine. 
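A note on the new --worker-use-torchrun path in this patch: the flag only plumbs the choice through; the actual multi-process bootstrap relies on torchrun exporting RANK, LOCAL_RANK, WORLD_SIZE and MASTER_ADDR/MASTER_PORT before each worker starts, which is why the worker.py hunk further down in this patch can initialize the process group from the environment alone. A minimal, illustrative sketch of that assumption, not part of the patch itself:

    import os
    import torch
    import torch.distributed as dist

    # Launched as: torchrun --standalone --nproc_per_node=<tp> this_script.py
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # "env://" reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from the
    # environment torchrun populated, so no explicit rendezvous address has
    # to be passed around.
    dist.init_process_group(backend="nccl", init_method="env://")
    print(f"rank {dist.get_rank()}/{dist.get_world_size()} initialized")
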
for prompt, _, output_len in requests: @@ -213,7 +215,8 @@ def main(args: argparse.Namespace): args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.device, - args.enable_prefix_caching, args.gpu_memory_utilization) + args.enable_prefix_caching, args.gpu_memory_utilization, + args.worker_use_torchrun) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -314,6 +317,10 @@ def main(args: argparse.Namespace): "--enable-prefix-caching", action='store_true', help="enable automatic prefix caching for vLLM backend.") + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. Preferable for ROCm') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/csrc/hip_compat.h b/csrc/hip_compat.h new file mode 100644 index 0000000000000..d9fe30b1e7b5d --- /dev/null +++ b/csrc/hip_compat.h @@ -0,0 +1,39 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once + +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif + diff --git a/vllm/config.py b/vllm/config.py index de687395a0001..bb01300f762a2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -412,6 +412,7 @@ def __init__( pipeline_parallel_size: int, tensor_parallel_size: int, worker_use_ray: bool, + worker_use_torchrun: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, @@ -428,6 +429,7 @@ def __init__( else: self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray + self.worker_use_torchrun = worker_use_torchrun self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce self.ray_workers_use_nsight = ray_workers_use_nsight @@ -435,7 +437,7 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. 
- if self.world_size > 1 and not is_neuron(): + if not self.worker_use_torchrun and self.world_size > 1 and not is_neuron(): self.worker_use_ray = True self._verify_args() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c3dccdd5bb50b..557fb9c2d4061 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -21,6 +21,7 @@ class EngineArgs: seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False + worker_use_torchrun: bool = False pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None @@ -153,6 +154,10 @@ def add_cli_args( action='store_true', help='use Ray for distributed serving, will be ' 'automatically set when using more than 1 GPU') + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. Preferable for ROCm') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -317,6 +322,7 @@ def create_engine_configs( parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, + self.worker_use_torchrun, self.max_parallel_loading_workers, self.disable_custom_all_reduce, self.ray_workers_use_nsight) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4cdad4180aa14..e8272968d190d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -126,6 +126,9 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor executor_class = RayGPUExecutor + elif parallel_config.worker_use_torchrun: + from vllm.executor.torchrun_gpu_executor import TorchrunGPUExecutor + executor_class = TorchrunGPUExecutor else: assert parallel_config.world_size == 1, ( "Ray is required if parallel_config.world_size > 1.") diff --git a/vllm/executor/torchrun_gpu_executor.py b/vllm/executor/torchrun_gpu_executor.py new file mode 100644 index 0000000000000..88823ba5d4920 --- /dev/null +++ b/vllm/executor/torchrun_gpu_executor.py @@ -0,0 +1,173 @@ +import importlib +import os +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.model_executor.parallel_utils.communication_op import broadcast_object_list, broadcast_tensor_dict +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. 
+DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class TorchrunGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.is_driver_worker = self.local_rank == 0 + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size > 1, ( + "TorchrunGPUExecutor only supports multiple GPUs.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=self.local_rank, + rank=self.local_rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=self.is_driver_worker, + ) + self.worker.init_model() + self.worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. 
+ self.worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + if self.is_driver_worker: + broadcast_object_list([output], src=0) + else: + res = [None] + broadcast_object_list(res, src=0) + output = res[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.worker.list_loras() + + def check_health(self) -> None: + # TorchrunGPUExecutor will always be healthy as long as + # it's running. + return + + +class TorchrunGPUExecutorAsync(TorchrunGPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # TorchrunGPUExecutor will always be healthy as long as + # it's running. 
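A note on the execute_model override above: every rank runs its worker, but only the driver's sampled output is treated as canonical, so a single broadcast_object_list call hands that object to the other ranks. A stripped-down sketch of the pattern; the helper name is made up for illustration, only broadcast_object_list itself is the real torch.distributed API:

    import torch.distributed as dist

    def sync_driver_output(local_output, is_driver_worker: bool):
        # All ranks call this after computing their own result; once the
        # collective completes, every rank returns the driver's object.
        if is_driver_worker:
            dist.broadcast_object_list([local_output], src=0)
            return local_output
        received = [None]
        dist.broadcast_object_list(received, src=0)
        return received[0]
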
+ return diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 0dcd4018afa5f..6f446db6c1cda 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -268,12 +268,19 @@ def init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - torch.distributed.init_process_group( - backend="nccl", - world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, - ) + if parallel_config.worker_use_torchrun: + torch.distributed.init_process_group( + backend="nccl", + world_size=parallel_config.world_size, + init_method="env://", + ) + else: + torch.distributed.init_process_group( + backend="nccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) if cupy_utils.is_initialized(): cupy_world_size = cupy_utils.get_world_size() From a83b7eae748d10ee7092b3e62c4fd55dd67d4129 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 23:33:43 +0000 Subject: [PATCH 22/34] Update dockerfile --- Dockerfile.rocm | 154 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 42 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 873574c409a8f..802f124224879 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,45 +1,115 @@ -FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 -ENV WORKSPACE_DIR=/workspace -RUN mkdir -p $WORKSPACE_DIR -WORKDIR $WORKSPACE_DIR -# Limit arch's so composable kernel doesn't take days to finish -ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942 +# default base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + +FROM $BASE_IMAGE + +RUN echo "Base image is $BASE_IMAGE" + +# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" +# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + + ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev -RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention \ +RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" + +ARG FA_BRANCH="3d2b6f5" +RUN echo "FA_BRANCH is $FA_BRANCH" + +# whether to build flash-attention +# if 0, will not build flash attention +# this is useful for gfx target where flash-attention is not supported +# In that case, we need to use the python reference attention implementation in vllm +ARG BUILD_FA="1" + +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + +# whether to build triton on rocm +ARG BUILD_TRITON="1" + +# Install some basic utilities +RUN apt-get update && apt-get install -y \ + sqlite3 libsqlite3-dev libfmt-dev \ + && rm -rf /var/lib/apt/lists/* + +### Mount Point ### +# When launching the container, mount the code directory to /app +ARG APP_MOUNT=/app +VOLUME [ ${APP_MOUNT} ] +WORKDIR ${APP_MOUNT} + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: +ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: + +# Install ROCm flash-attention +RUN if [ "$BUILD_FA" = "1" ]; then \ + mkdir libs \ + && cd libs \ + && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ && export 
GPU_ARCHS=${FA_GFX_ARCHS} \ - && python setup.py install -RUN git clone -b develop https://github.com/ROCmSoftwarePlatform/hipBLASLt \ - && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ - && cd hipBLASLt \ - && ./install.sh -idc --architecture 'gfx90a;gfx942' \ - && cd ../ && rm -rf hipBLASLt -RUN sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status -RUN sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status - -RUN pip uninstall -y triton -RUN git clone https://github.com/ROCmSoftwarePlatform/triton.git \ - && cd triton/python && pip3 install -e . -ENV MAX_JOBS=32 -RUN cd ${WORKSPACE_DIR} \ - && git clone -b exp_bandaid https://github.com/ROCmSoftwarePlatform/rccl \ - && cd rccl && mkdir build && cd build \ - && CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && make -j - -RUN pip install xformers==0.0.23 --no-deps -ADD ./ $WORKSPACE_DIR/vllm - -RUN cd vllm \ - && pip install -r requirements-rocm.txt \ - && pip install typing-extensions==4.8.0 \ - && bash patch_xformers.rocm.sh \ - && cd gradlib && python setup.py develop && cd ../ \ - && python setup.py build && python setup.py develop; exit 0 - -RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 - -RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ - && cd rocmProfileData && make; make install - -WORKDIR /workspace/vllm + && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ + patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ + && python3 setup.py install \ + && cd ..; \ + fi + +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +# Manually removed it so that later steps of numpy upgrade can continue +RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi + +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +# build triton +RUN if [ "$BUILD_TRITON" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && pip uninstall -y triton \ + && git clone https://github.com/ROCm/triton.git \ + && cd triton/python \ + && pip3 install . \ + && cd ../..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + +RUN cd /app \ + && cd vllm \ + && pip install -U -r requirements-rocm.txt \ + && if [ "$BUILD_FA" = "1" ]; then \ + bash patch_xformers.rocm.sh; fi \ + && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \ + && python3 setup.py install \ + && cd .. 
+ +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir ray[all] + +CMD ["/bin/bash"] From e01b8cd9eeca0ef08d3ca23953744bdb7c7a14f2 Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 15:39:24 +0000 Subject: [PATCH 23/34] add use case for custom kernel for matvec operation --- vllm/model_executor/layers/linear.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 40e681df48f86..77e03aba573ad 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm import custom_ops logger = init_logger(__name__) @@ -72,6 +73,20 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] + if x.shape[0] == 1: + m, n, k = weight.shape[0], x.shape[0], x.shape[1] + out = torch.empty(x.shape[0], weight.shape[0], dtype=x.dtype) + if k == 8192 and (m == 1280 or m == 7168): + custom_ops.LLMM1(weight, x, out, 8) + elif k == 3584 and m == 8192: + custom_ops.LLMM1(weight, x, out, 8) + elif k <= 8192 and k % 8 == 0 and m % 4 == 0: + custom_ops.LLMM1(weight, x, out, 4) + else: + out = F.linear(x, weight) + if bias != None: + out = out + bias + return out if self.separate_bias_add: if bias is not None: return F.linear(x, weight) + bias From eb21ad7e154e65e7135988ea1bc9a0ade69ac4dd Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 16:27:34 +0000 Subject: [PATCH 24/34] limit the custom kernel under is_hip --- vllm/model_executor/layers/linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 77e03aba573ad..6398c53bfda06 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm.utils import is_hip from vllm import custom_ops logger = init_logger(__name__) @@ -73,7 +74,7 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] - if x.shape[0] == 1: + if is_hip() and x.shape[0] == 1: m, n, k = weight.shape[0], x.shape[0], x.shape[1] out = torch.empty(x.shape[0], weight.shape[0], dtype=x.dtype) if k == 8192 and (m == 1280 or m == 7168): From 1bf736f8f40e2efd5f407a30c3f0364aa89a3c86 Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 21:00:56 +0000 Subject: [PATCH 25/34] fix custom kernel --- vllm/model_executor/layers/linear.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 6398c53bfda06..edcb448741f7e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -74,17 +74,25 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] - if is_hip() and x.shape[0] == 1: - m, n, k = weight.shape[0], x.shape[0], x.shape[1] - out = torch.empty(x.shape[0], weight.shape[0], dtype=x.dtype) + if is_hip() and x.view(-1, x.size(-1)).shape[0] == 1: + batched = False + if x.dim() == 3: + inp = x.view(-1, x.size(-1)) + batched = True + else: + inp 
= x + m, n, k = weight.shape[0], inp.shape[0], inp.shape[1] + out = torch.empty(inp.shape[0], weight.shape[0], dtype=inp.dtype, device='cuda') if k == 8192 and (m == 1280 or m == 7168): - custom_ops.LLMM1(weight, x, out, 8) + custom_ops.LLMM1(weight, inp, out, 8) elif k == 3584 and m == 8192: - custom_ops.LLMM1(weight, x, out, 8) + custom_ops.LLMM1(weight, inp, out, 8) elif k <= 8192 and k % 8 == 0 and m % 4 == 0: - custom_ops.LLMM1(weight, x, out, 4) + custom_ops.LLMM1(weight, inp, out, 4) else: - out = F.linear(x, weight) + out = F.linear(inp, weight) + if batched: + out = out.view(x.shape[0], x.shape[1], weight.shape[0]) if bias != None: out = out + bias return out From 7ab4a24323de88384c550f13fe83615bddd9b84a Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 23:43:56 +0000 Subject: [PATCH 26/34] Rocm defaults and cleanup --- benchmarks/benchmark_latency.py | 175 ++++++++++------------------ csrc/attention/attention_kernels.cu | 10 +- vllm/model_executor/models/llama.py | 3 - 3 files changed, 70 insertions(+), 118 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 04658b70d3ed4..8ff04fccb0004 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -3,23 +3,17 @@ import time from pathlib import Path from typing import Optional -import pandas as pd + import numpy as np import torch from tqdm import tqdm from vllm import LLM, SamplingParams -from torch.profiler import profile, record_function, ProfilerActivity -def list_of_ints(arg): - return list(map(int, arg.split(','))) def main(args: argparse.Namespace): print(args) - print(f'>>>Loading LLM') - if args.report: - results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
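For readers following the custom-GEMV dispatch in linear.py above: on the single-token decode path the LLMM1 kernel is used as a drop-in for F.linear, and the shape checks only pick the requested block size before falling back. A reference sketch of that contract; the assumption that LLMM1 matches F.linear for the shapes it accepts is inferred from the fallback branch, and the helper names below are illustrative, not vLLM API:

    import torch
    import torch.nn.functional as F

    def skinny_gemm_reference(weight: torch.Tensor, inp: torch.Tensor) -> torch.Tensor:
        # inp: (1, k) flattened decode token, weight: (m, k) -> out: (1, m)
        assert inp.dim() == 2 and inp.shape[0] == 1
        return F.linear(inp, weight)

    def custom_gemv_block_size(m: int, k: int):
        # Mirrors the gate in apply_weights; None means "fall back to F.linear".
        if (k == 8192 and m in (1280, 7168)) or (k == 3584 and m == 8192):
            return 8
        if k <= 8192 and k % 8 == 0 and m % 4 == 0:
            return 4
        return None
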
llm = LLM( @@ -36,101 +30,60 @@ def main(args: argparse.Namespace): worker_use_torchrun=args.worker_use_torchrun ) - for batch_size in args.batch_size: - for output_len in args.output_len: - for input_len in args.input_len: - print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}') - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = [[0] * input_len] * batch_size - dummy_prompts = [] - dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose') - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - elif args.accuracy: - start_time = time.perf_counter() - rsp = llm.generate( - #prompt_token_ids=dummy_prompt_token_ids, - prompts=dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - print('>>Rsp', rsp[0].outputs) - return latency - else: - start_time = time.perf_counter() - rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - print('>>Rsp', rsp[0].outputs) - return latency - - print("Warming up...") - run_to_completion(profile_dir=None) - - if (args.warmup_only): - - print(">>> Warmup only specified, exiting") - continue + sampling_params = SamplingParams( + n=args.n, + temperature=0.0 if args.use_beam_search else 1.0, + top_p=1.0, + use_beam_search=args.use_beam_search, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." 
- ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) - return - if args.rpd: - from rpdTracerControl import rpdTracerControl - rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True) - profile_rpd = rpdTracerControl() - profile_rpd.start() - print(f"RPD Profiling'...") - with torch.autograd.profiler.emit_nvtx(): - run_to_completion(profile_dir=None) - profile_rpd.stop() - return + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) + print("Warming up...") + run_to_completion(profile_dir=None) - if torch.distributed.get_rank() == 0: - #results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) - latency=np.mean(latencies) - print(f'Avg latency: {latency} seconds') - if args.report: - entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]} - results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True) - if torch.distributed.get_rank() == 0 and args.report: - print(results_df) - results_df.to_csv(args.report_file, index=False) + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." + ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + # Benchmark. 
+ latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + print(f'Avg latency: {np.mean(latencies)} seconds') if __name__ == '__main__': @@ -144,9 +97,9 @@ def run_to_completion(profile_dir: Optional[str] = None): choices=['awq', 'gptq', 'squeezellm', None], default=None) parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=list_of_ints, default=32) - parser.add_argument('--output-len', type=list_of_ints, default=128) - parser.add_argument('--batch-size', type=list_of_ints, default=8) + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) parser.add_argument('--n', type=int, default=1, @@ -159,7 +112,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') - parser.add_argument( '--dtype', type=str, @@ -172,9 +124,13 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--enforce-eager', action='store_true', help='enforce eager mode and disable CUDA graph') - parser.add_argument('--accuracy', - action='store_true', - help='Run an Actual query through vllm') + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=['auto', 'fp8_e5m2'], + default='auto', + help= + 'Data type for kv cache storage. If "auto", will use model data type.') parser.add_argument( '--profile', action='store_true', @@ -196,15 +152,6 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help="If specified, use nsight to profile ray workers", ) - parser.add_argument( - '--rpd', - action='store_true', - help='profile the generation process of a single batch using the rpd tracer') - parser.add_argument('--warmup-only', action='store_true', - help='only run warmup, useful for tuning') - parser.add_argument('--report', action='store_true', - help='turn on dataframe reporting') - parser.add_argument('--report-file', type=str, default=None) parser.add_argument('--worker-use-torchrun', action='store_true', help='use torchrun instead of ray when using ' diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 4ad3ece3fab37..1b92bf6791d9f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -32,7 +32,6 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) - namespace vllm { // Utility function for attention softmax. 
@@ -603,7 +602,11 @@ template< typename CACHE_T, int BLOCK_SIZE, bool IS_FP8_E5M2_KV_CACHE, +#ifdef USE_ROCM int NUM_THREADS = 1024> +#else + int NUM_THREADS = 128> +#endif void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, @@ -780,8 +783,13 @@ template< typename CACHE_T, int BLOCK_SIZE, bool IS_FP8_E5M2_KV_CACHE, +#ifdef USE_ROCM int NUM_THREADS = 1024, int PARTITION_SIZE = 1024> +#else + int NUM_THREADS = 128, + int PARTITION_SIZE = 512> +#endif void paged_attention_v2_launcher( torch::Tensor& out, torch::Tensor& exp_sums, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f64929ee5df5e..a99c50c73081c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -75,16 +75,13 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - #print(f'>>>Shape of x in mlp {x.shape} {self.gate_up_proj.weight.shape}') if x.shape[0] == 1 and x.shape[1] == 1: - out = torch.empty(x.shape[0],self.gate_up_proj.weight.shape[0]//2,dtype=x.dtype,device=x.device) custom_ops.LLMM_Silu(self.gate_up_proj.weight,x.view(-1,x.size(-1)),out,8) x = out.view(x.shape[0], x.shape[1], out.shape[1]) else: gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) - #print(f'>>> x.shape {x.shape}') x, _ = self.down_proj(x) return x From fbea66700b241d3c3a2d5b2f16e3933551337cba Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 23:53:10 +0000 Subject: [PATCH 27/34] Remove ignored file --- csrc/hip_compat.h | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 csrc/hip_compat.h diff --git a/csrc/hip_compat.h b/csrc/hip_compat.h deleted file mode 100644 index d9fe30b1e7b5d..0000000000000 --- a/csrc/hip_compat.h +++ /dev/null @@ -1,39 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#ifdef USE_ROCM -#include -#endif - -#ifndef USE_ROCM - #define WARP_SIZE 32 -#else - #define WARP_SIZE warpSize -#endif - -#ifndef USE_ROCM - #define VLLM_LDG(arg) __ldg(arg) -#else - #define VLLM_LDG(arg) *(arg) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) -#else - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) -#else - #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) -#endif - -#ifndef USE_ROCM - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#else - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#endif - From c8fce27cd9e20c532251a5fdcf66913c99859ea8 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 21:16:03 +0000 Subject: [PATCH 28/34] Refactor torchrun executor to reuse single gpu executor code --- vllm/executor/torchrun_gpu_executor.py | 89 ++++---------------------- 1 file changed, 13 insertions(+), 76 deletions(-) diff --git a/vllm/executor/torchrun_gpu_executor.py b/vllm/executor/torchrun_gpu_executor.py index 88823ba5d4920..837e6e9368e77 100644 --- a/vllm/executor/torchrun_gpu_executor.py +++ b/vllm/executor/torchrun_gpu_executor.py @@ -2,6 +2,7 @@ import os from typing import Dict, List, Optional +from vllm.executor.gpu_executor import GPUExecutor from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) @@ -22,7 +23,7 @@ } -class TorchrunGPUExecutor(ExecutorBase): +class TorchrunGPUExecutor(GPUExecutor): def __init__( self, @@ -33,27 +34,15 @@ def __init__( device_config: DeviceConfig, lora_config: Optional[LoRAConfig], ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.is_driver_worker = self.local_rank == 0 + super().__init__(model_config, + cache_config, + parallel_config, + scheduler_config, + device_config, + lora_config) - # Instantiate the worker and load the model to GPU. - self._init_worker() - - # Profile the memory usage and initialize the cache. - self._init_cache() - - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers @@ -65,7 +54,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.worker = Worker( + self.driver_worker = Worker( self.model_config, self.parallel_config, self.scheduler_config, @@ -77,50 +66,15 @@ def _init_worker(self): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=self.is_driver_worker, ) - self.worker.init_model() - self.worker.load_model() - - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine first profiles the existing memory usage. 
- Then, it allocates the remaining memory for KV blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_gpu_blocks, num_cpu_blocks = ( - self.worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - )) - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self.worker.init_cache_engine(cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self.worker.warm_up_model() + self.driver_worker.init_model() + self.driver_worker.load_model() def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - output = self.worker.execute_model( + output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, @@ -134,23 +88,6 @@ def execute_model(self, output = res[0] return output - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self.worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self.worker.remove_lora(lora_id) - - def list_loras(self) -> List[int]: - return self.worker.list_loras() - - def check_health(self) -> None: - # TorchrunGPUExecutor will always be healthy as long as - # it's running. 
- return - - class TorchrunGPUExecutorAsync(TorchrunGPUExecutor, ExecutorAsyncBase): async def execute_model_async( @@ -160,7 +97,7 @@ async def execute_model_async( blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], ) -> SamplerOutput: - output = await make_async(self.worker.execute_model)( + output = await make_async(self.driver_worker.execute_model)( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, From 1fff99a0fa4e2be0af8bf335ad1fb6e3c156edf5 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Fri, 22 Mar 2024 16:48:37 +0000 Subject: [PATCH 29/34] Added interleaving for MQA for triton kernel --- .../layers/attention/backends/flash_attn.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 726b42cad9e3f..c5fba494fdbce 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -44,6 +44,15 @@ def __init__( self.sliding_window = ((self.sliding_window, self.sliding_window) if self.sliding_window is not None else (-1, -1)) + def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" + tokens, n_kv_heads, head_dim = x.shape + return ( + x[:, :, None, :] + .expand(tokens, n_kv_heads, n_rep, head_dim) + .reshape(tokens, n_kv_heads * n_rep, head_dim) + ) + def forward( self, query: torch.Tensor, @@ -85,6 +94,11 @@ def forward( # Prompt run. if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + # Interleave for MQA + key = self.repeat_kv(key, self.num_queries_per_kv) + value = self.repeat_kv(value, self.num_queries_per_kv) + # normal attention query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) From 0a2309aa8865e737078dfa4f0fc526f3ef1d49cd Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 22 Mar 2024 16:50:57 +0000 Subject: [PATCH 30/34] linter --- benchmarks/benchmark_latency.py | 31 ++++++++-------- benchmarks/benchmark_throughput.py | 36 ++++++++++--------- setup.py | 11 ++++-- vllm/config.py | 3 +- vllm/engine/arg_utils.py | 12 +++---- vllm/executor/torchrun_gpu_executor.py | 18 ++++------ .../layers/attention/attention.py | 8 +++-- .../layers/attention/backends/flash_attn.py | 19 +++++----- vllm/model_executor/layers/linear.py | 14 ++++---- vllm/model_executor/models/llama.py | 8 +++-- 10 files changed, 85 insertions(+), 75 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 8ff04fccb0004..aea86c03d15af 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -16,19 +16,17 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
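One more note on the MQA/GQA interleave introduced a few hunks above: repeat_kv is just an expand-and-reshape, so it produces the same tensor as torch.repeat_interleave along the head dimension, which is what the Triton prefill path is fed when query and KV head counts differ. A tiny, self-contained shape check with illustrative sizes:

    import torch

    tokens, n_kv_heads, head_dim, n_rep = 16, 2, 64, 4   # e.g. 8 query heads, 2 KV heads
    x = torch.randn(tokens, n_kv_heads, head_dim)

    y = (x[:, :, None, :]
         .expand(tokens, n_kv_heads, n_rep, head_dim)
         .reshape(tokens, n_kv_heads * n_rep, head_dim))

    assert y.shape == (tokens, n_kv_heads * n_rep, head_dim)
    assert torch.equal(y, torch.repeat_interleave(x, n_rep, dim=1))
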
- llm = LLM( - model=args.model, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - worker_use_torchrun=args.worker_use_torchrun - ) + llm = LLM(model=args.model, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + worker_use_torchrun=args.worker_use_torchrun) sampling_params = SamplingParams( n=args.n, @@ -153,9 +151,8 @@ def run_to_completion(profile_dir: Optional[str] = None): help="If specified, use nsight to profile ray workers", ) parser.add_argument('--worker-use-torchrun', - action='store_true', - help='use torchrun instead of ray when using ' - 'more than 1 GPU. Preferable for ROCm' - ) + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. Preferable for ROCm') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 67062db0c93ad..d9fa821372776 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -78,20 +78,22 @@ def run_vllm( worker_use_torchrun: bool = False, ) -> float: from vllm import LLM, SamplingParams - llm = LLM(model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - device=device, - enable_prefix_caching=enable_prefix_caching, - worker_use_torchrun=worker_use_torchrun,) + llm = LLM( + model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + device=device, + enable_prefix_caching=enable_prefix_caching, + worker_use_torchrun=worker_use_torchrun, + ) # Add the requests to the engine. for prompt, _, output_len in requests: @@ -318,9 +320,9 @@ def main(args: argparse.Namespace): action='store_true', help="enable automatic prefix caching for vLLM backend.") parser.add_argument('--worker-use-torchrun', - action='store_true', - help='use torchrun instead of ray when using ' - 'more than 1 GPU. Preferable for ROCm') + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. 
Preferable for ROCm') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/setup.py b/setup.py index 4accc5294c794..7f5e82f85b35c 100644 --- a/setup.py +++ b/setup.py @@ -373,11 +373,18 @@ def get_torch_arch_list() -> Set[str]: custom_extension = CUDAExtension( name="vllm.custom_ops", - sources=["csrc/custom/custom.cpp", "csrc/custom/custom_kernels.cu", "csrc/custom/fused_kernels.cu"], - extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, + sources=[ + "csrc/custom/custom.cpp", "csrc/custom/custom_kernels.cu", + "csrc/custom/fused_kernels.cu" + ], + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS + }, ) ext_modules.append(custom_extension) + def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) diff --git a/vllm/config.py b/vllm/config.py index bb01300f762a2..1b3129514703a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -437,7 +437,8 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. - if not self.worker_use_torchrun and self.world_size > 1 and not is_neuron(): + if (not self.worker_use_torchrun and self.world_size > 1 + and not is_neuron()): self.worker_use_ray = True self._verify_args() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 557fb9c2d4061..9c9bfe2436cd0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -319,13 +319,11 @@ def create_engine_configs( self.swap_space, self.kv_cache_dtype, model_config.get_sliding_window(), self.enable_prefix_caching) - parallel_config = ParallelConfig(self.pipeline_parallel_size, - self.tensor_parallel_size, - self.worker_use_ray, - self.worker_use_torchrun, - self.max_parallel_loading_workers, - self.disable_custom_all_reduce, - self.ray_workers_use_nsight) + parallel_config = ParallelConfig( + self.pipeline_parallel_size, self.tensor_parallel_size, + self.worker_use_ray, self.worker_use_torchrun, + self.max_parallel_loading_workers, self.disable_custom_all_reduce, + self.ray_workers_use_nsight) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, diff --git a/vllm/executor/torchrun_gpu_executor.py b/vllm/executor/torchrun_gpu_executor.py index 837e6e9368e77..b30d0cb0b9246 100644 --- a/vllm/executor/torchrun_gpu_executor.py +++ b/vllm/executor/torchrun_gpu_executor.py @@ -1,15 +1,13 @@ -import importlib import os from typing import Dict, List, Optional from vllm.executor.gpu_executor import GPUExecutor -from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +from vllm.executor.executor_base import ExecutorAsyncBase from vllm.logger import init_logger -from vllm.model_executor.parallel_utils.communication_op import broadcast_object_list, broadcast_tensor_dict +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_object_list) from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, make_async) @@ -36,13 +34,8 @@ def __init__( ) -> None: self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.is_driver_worker = self.local_rank == 0 - super().__init__(model_config, - cache_config, - parallel_config, - scheduler_config, - device_config, - lora_config) - + 
super().__init__(model_config, cache_config, parallel_config, + scheduler_config, device_config, lora_config) def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers @@ -88,6 +81,7 @@ def execute_model(self, output = res[0] return output + class TorchrunGPUExecutorAsync(TorchrunGPUExecutor, ExecutorAsyncBase): async def execute_model_async( diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 89b5816f7a47a..1038ec54082d7 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -65,7 +65,9 @@ def _use_flash_attn() -> int: """Returns if and which flash attention to use. Returns: - int: 0 for none, 1 for default implementation, 2 for triton implementation. + int: 0 for none, + 1 for default implementation, + 2 for triton implementation. """ if not (os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') and is_hip()): # AMD GPUs can use flash_attn package or triton impl. @@ -93,5 +95,7 @@ def _use_flash_attn() -> int: "Using xformers backend.") return 0 - logger.info(f"Using {'Triton' if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else ''} flash_attn backend.") + logger.info(f"""Using {'Triton' + if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') + else ''} flash_attn backend.""") return 2 if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else 1 diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 726b42cad9e3f..4b23a99c8370e 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -8,7 +8,8 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( PagedAttentionImpl) -from vllm.model_executor.layers.attention.ops.flash_attention_triton import triton_attention +from vllm.model_executor.layers.attention.ops.flash_attention_triton import ( + triton_attention) class FlashAttentionBackend: @@ -91,14 +92,14 @@ def forward( value = value.unflatten(0, (batch_size, seq_len)) if self.use_triton: output, _ = triton_attention( - query, - key, - value, - None, - input_metadata, - True, - self.scale, - ) + query, + key, + value, + None, + input_metadata, + True, + self.scale, + ) else: if is_hip(): #XXX: window_size and alibi_slopes not supported diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index edcb448741f7e..6e66072627391 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -81,11 +81,13 @@ def apply_weights(self, batched = True else: inp = x - m, n, k = weight.shape[0], inp.shape[0], inp.shape[1] - out = torch.empty(inp.shape[0], weight.shape[0], dtype=inp.dtype, device='cuda') - if k == 8192 and (m == 1280 or m == 7168): - custom_ops.LLMM1(weight, inp, out, 8) - elif k == 3584 and m == 8192: + m, k = weight.shape[0], inp.shape[1] + out = torch.empty(inp.shape[0], + weight.shape[0], + dtype=inp.dtype, + device='cuda') + if (k == 8192 and + (m == 1280 or m == 7168)) or (k == 3584 and m == 8192): custom_ops.LLMM1(weight, inp, out, 8) elif k <= 8192 and k % 8 == 0 and m % 4 == 0: custom_ops.LLMM1(weight, inp, out, 4) @@ -93,7 +95,7 @@ def apply_weights(self, out = F.linear(inp, weight) if batched: out = out.view(x.shape[0], x.shape[1], weight.shape[0]) - if bias != None: + if bias is not None: out = out + bias return out if 
self.separate_bias_add: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a99c50c73081c..30bfc11f22fe7 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -76,8 +76,12 @@ def __init__( def forward(self, x): if x.shape[0] == 1 and x.shape[1] == 1: - out = torch.empty(x.shape[0],self.gate_up_proj.weight.shape[0]//2,dtype=x.dtype,device=x.device) - custom_ops.LLMM_Silu(self.gate_up_proj.weight,x.view(-1,x.size(-1)),out,8) + out = torch.empty(x.shape[0], + self.gate_up_proj.weight.shape[0] // 2, + dtype=x.dtype, + device=x.device) + custom_ops.LLMM_Silu(self.gate_up_proj.weight, + x.view(-1, x.size(-1)), out, 8) x = out.view(x.shape[0], x.shape[1], out.shape[1]) else: gate_up, _ = self.gate_up_proj(x) From 44c2cee3ad9053ecd72fc37c47f7d8defa447fbb Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 22 Mar 2024 18:06:16 +0000 Subject: [PATCH 31/34] Making torchrun the default multi GPU executor on ROCm unless overridden by --worker-use-ray --- benchmarks/benchmark_latency.py | 9 +++++---- benchmarks/benchmark_throughput.py | 13 +++++++------ vllm/config.py | 17 ++++++++++++----- vllm/engine/arg_utils.py | 13 ++++--------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index aea86c03d15af..4563d4c45def2 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -26,7 +26,7 @@ def main(args: argparse.Namespace): kv_cache_dtype=args.kv_cache_dtype, device=args.device, ray_workers_use_nsight=args.ray_workers_use_nsight, - worker_use_torchrun=args.worker_use_torchrun) + worker_use_ray=args.worker_use_ray) sampling_params = SamplingParams( n=args.n, @@ -150,9 +150,10 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help="If specified, use nsight to profile ray workers", ) - parser.add_argument('--worker-use-torchrun', + parser.add_argument('--worker-use-ray', action='store_true', - help='use torchrun instead of ray when using ' - 'more than 1 GPU. Preferable for ROCm') + help='use Ray for distributed serving, will be ' + 'automatically set when using more than 1 GPU ' + 'unless on ROCm where the default is torchrun') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d9fa821372776..8cd05c4863a28 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -75,7 +75,7 @@ def run_vllm( device: str, enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, - worker_use_torchrun: bool = False, + worker_use_ray: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -92,7 +92,7 @@ def run_vllm( kv_cache_dtype=kv_cache_dtype, device=device, enable_prefix_caching=enable_prefix_caching, - worker_use_torchrun=worker_use_torchrun, + worker_use_ray=worker_use_ray, ) # Add the requests to the engine. 
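Circling back to the fused decode path in llama.py shown earlier in the series: when the input is a single token, LLMM_Silu stands in for the gate_up projection followed by SiluAndMul. A reference sketch of what that fused call is expected to produce, inferred from the fallback branch rather than from the kernel itself; the function name here is made up for illustration:

    import torch
    import torch.nn.functional as F

    def llmm_silu_reference(gate_up_weight: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        # x: (1, hidden), gate_up_weight: (2 * intermediate, hidden)
        gate_up = F.linear(x, gate_up_weight)                 # fused GEMV half
        d = gate_up.shape[-1] // 2
        return F.silu(gate_up[..., :d]) * gate_up[..., d:]    # SiluAndMul half
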
@@ -218,7 +218,7 @@ def main(args: argparse.Namespace): args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.device, args.enable_prefix_caching, args.gpu_memory_utilization, - args.worker_use_torchrun) + args.worker_use_ray) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -319,10 +319,11 @@ def main(args: argparse.Namespace): "--enable-prefix-caching", action='store_true', help="enable automatic prefix caching for vLLM backend.") - parser.add_argument('--worker-use-torchrun', + parser.add_argument('--worker-use-ray', action='store_true', - help='use torchrun instead of ray when using ' - 'more than 1 GPU. Preferable for ROCm') + help='use Ray for distributed serving, will be ' + 'automatically set when using more than 1 GPU ' + 'unless on ROCm where the default is torchrun') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/config.py b/vllm/config.py index 1b3129514703a..444a979d20305 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -412,7 +412,6 @@ def __init__( pipeline_parallel_size: int, tensor_parallel_size: int, worker_use_ray: bool, - worker_use_torchrun: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, @@ -429,7 +428,7 @@ def __init__( else: self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray - self.worker_use_torchrun = worker_use_torchrun + self.worker_use_torchrun = False self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce self.ray_workers_use_nsight = ray_workers_use_nsight @@ -437,9 +436,17 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. - if (not self.worker_use_torchrun and self.world_size > 1 - and not is_neuron()): - self.worker_use_ray = True + if self.world_size > 1 and not is_neuron(): + if is_hip() and not self.worker_use_ray: + logger.info("Using torchrun for multi-GPU on " + "ROCM platform. Use --worker-use-ray " + "to override") + if not os.environ.get("RANK"): + raise RuntimeError("Needs to be run in torchrun: " + "torchrun --standalone --nproc_per_node= ...") + self.worker_use_torchrun = True + else: + self.worker_use_ray = True self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9c9bfe2436cd0..d9c56e4daaeee 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -21,7 +21,6 @@ class EngineArgs: seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False - worker_use_torchrun: bool = False pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None @@ -153,11 +152,8 @@ def add_cli_args( parser.add_argument('--worker-use-ray', action='store_true', help='use Ray for distributed serving, will be ' - 'automatically set when using more than 1 GPU') - parser.add_argument('--worker-use-torchrun', - action='store_true', - help='use torchrun instead of ray when using ' - 'more than 1 GPU. 
Preferable for ROCm') + 'automatically set when using more than 1 GPU ' + 'unless on ROCm where the default is torchrun') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -321,9 +317,8 @@ def create_engine_configs( self.enable_prefix_caching) parallel_config = ParallelConfig( self.pipeline_parallel_size, self.tensor_parallel_size, - self.worker_use_ray, self.worker_use_torchrun, - self.max_parallel_loading_workers, self.disable_custom_all_reduce, - self.ray_workers_use_nsight) + self.worker_use_ray, self.max_parallel_loading_workers, + self.disable_custom_all_reduce, self.ray_workers_use_nsight) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, From 1256beec4dde87a6bb40aef4e15745df72e58eb6 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Fri, 22 Mar 2024 15:56:35 -0400 Subject: [PATCH 32/34] Make triton the default FA --- vllm/model_executor/layers/attention/attention.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 1038ec54082d7..60f456d451f19 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -69,7 +69,8 @@ def _use_flash_attn() -> int: 1 for default implementation, 2 for triton implementation. """ - if not (os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') and is_hip()): + use_flash_attn_triton = os.environ.get('VLLM_USE_FLASH_ATTN_TRITON', "True").lower() in ("true", "1") + if not ( use_flash_attn_triton and is_hip()): # AMD GPUs can use flash_attn package or triton impl. try: import flash_attn # noqa: F401 @@ -96,6 +97,6 @@ def _use_flash_attn() -> int: return 0 logger.info(f"""Using {'Triton' - if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') + if use_flash_attn_triton else ''} flash_attn backend.""") - return 2 if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else 1 + return 2 if use_flash_attn_triton else 1 From b687795d7789f70782ffb7e242b30a3d372740cd Mon Sep 17 00:00:00 2001 From: jpvillam Date: Fri, 22 Mar 2024 15:59:58 -0400 Subject: [PATCH 33/34] Make workaround only applicable to triton path --- vllm/model_executor/layers/attention/backends/flash_attn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 587fe08476cca..15fcc56922586 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -95,8 +95,8 @@ def forward( # Prompt run. if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - # Interleave for MQA + if self.use_triton and (self.num_kv_heads != self.num_heads): + # Interleave for MQA workaround. key = self.repeat_kv(key, self.num_queries_per_kv) value = self.repeat_kv(value, self.num_queries_per_kv) From 5e3ec52c925a2589bc4e89dcda6ffaf5d6b0fd4f Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 22 Mar 2024 20:59:20 +0000 Subject: [PATCH 34/34] Pin ray version to 2.9.3 --- Dockerfile.rocm | 2 +- requirements-rocm.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 802f124224879..32ab7cf96fc59 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -110,6 +110,6 @@ RUN cd /app \ && cd .. 
RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all] +RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3 CMD ["/bin/bash"] diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 53bd11de7c9de..c7ef913460816 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -2,7 +2,7 @@ ninja # For faster builds. typing-extensions>=4.8.0 starlette psutil -ray >= 2.9 +ray == 2.9.3 sentencepiece # Required for LLaMA tokenizer. numpy tokenizers>=0.15.0
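
A condensed restatement of the executor selection that PATCH 31 adds to ParallelConfig, written as a standalone helper for readability; the function name and the returned strings are illustrative, while the branching and the RANK check mirror the diff:

import os

def pick_multi_gpu_executor(world_size: int, on_rocm: bool, on_neuron: bool,
                            worker_use_ray: bool) -> str:
    """Mirror of the branching added in vllm/config.py (names are illustrative)."""
    if world_size <= 1 or on_neuron:
        return "in-process"          # no distributed executor is selected
    if on_rocm and not worker_use_ray:
        # Same precondition as the patched config: the process must already be
        # running under torchrun, which sets RANK for every worker.
        if not os.environ.get("RANK"):
            raise RuntimeError("Needs to be run in torchrun: "
                               "torchrun --standalone --nproc_per_node= ...")
        return "torchrun"
    return "ray"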
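
And for PATCH 32, a small sketch of the new default for the Triton flash-attention path: an unset VLLM_USE_FLASH_ATTN_TRITON is now treated as "True", so on ROCm the Triton kernel (return value 2) is chosen unless the user opts out. The helper name below is illustrative; the parsing matches the patched attention.py:

import os

def flash_attn_triton_enabled() -> bool:
    # Same parsing as the patched _use_flash_attn(): default "True",
    # accepted truthy spellings are "true" / "1", case-insensitive.
    return os.environ.get('VLLM_USE_FLASH_ATTN_TRITON',
                          "True").lower() in ("true", "1")

assert flash_attn_triton_enabled()               # unset -> Triton impl (returns 2)
os.environ['VLLM_USE_FLASH_ATTN_TRITON'] = "0"
assert not flash_attn_triton_enabled()           # opted out -> flash_attn package (returns 1)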