From ab5a56e3c876e5cee6f7ae1e54d310404eea5037 Mon Sep 17 00:00:00 2001 From: LeiWang Date: Tue, 6 Feb 2024 10:53:57 -0400 Subject: [PATCH 001/286] base tuner --- python/bitblas/base/__init__.py | 30 + python/bitblas/base/analysis.py | 333 ++++++++ python/bitblas/base/common_schedules.py | 157 ++++ python/bitblas/base/config.py | 26 + python/bitblas/base/roller/arch/__init__.py | 18 + python/bitblas/base/roller/arch/arch_base.py | 54 ++ python/bitblas/base/roller/arch/cuda.py | 72 ++ python/bitblas/base/roller/bestfit.py | 79 ++ python/bitblas/base/roller/config.py | 263 ++++++ python/bitblas/base/roller/node.py | 385 +++++++++ python/bitblas/base/roller/policy/__init__.py | 18 + python/bitblas/base/roller/policy/common.py | 69 ++ python/bitblas/base/roller/policy/default.py | 784 ++++++++++++++++++ .../bitblas/base/roller/policy/tensorcore.py | 338 ++++++++ python/bitblas/base/roller/rasterization.py | 98 +++ .../base/roller/shape_inference/__init__.py | 17 + .../base/roller/shape_inference/common.py | 79 ++ .../base/roller/shape_inference/tir.py | 412 +++++++++ python/bitblas/base/schedule_rule.py | 143 ++++ python/bitblas/base/transform.py | 220 +++++ python/bitblas/base/utils.py | 479 +++++++++++ 21 files changed, 4074 insertions(+) create mode 100644 python/bitblas/base/__init__.py create mode 100644 python/bitblas/base/analysis.py create mode 100644 python/bitblas/base/common_schedules.py create mode 100644 python/bitblas/base/config.py create mode 100644 python/bitblas/base/roller/arch/__init__.py create mode 100644 python/bitblas/base/roller/arch/arch_base.py create mode 100644 python/bitblas/base/roller/arch/cuda.py create mode 100644 python/bitblas/base/roller/bestfit.py create mode 100644 python/bitblas/base/roller/config.py create mode 100644 python/bitblas/base/roller/node.py create mode 100644 python/bitblas/base/roller/policy/__init__.py create mode 100644 python/bitblas/base/roller/policy/common.py create mode 100644 python/bitblas/base/roller/policy/default.py create mode 100644 python/bitblas/base/roller/policy/tensorcore.py create mode 100644 python/bitblas/base/roller/rasterization.py create mode 100644 python/bitblas/base/roller/shape_inference/__init__.py create mode 100644 python/bitblas/base/roller/shape_inference/common.py create mode 100644 python/bitblas/base/roller/shape_inference/tir.py create mode 100644 python/bitblas/base/schedule_rule.py create mode 100644 python/bitblas/base/transform.py create mode 100644 python/bitblas/base/utils.py diff --git a/python/bitblas/base/__init__.py b/python/bitblas/base/__init__.py new file mode 100644 index 0000000000..d3d89322fb --- /dev/null +++ b/python/bitblas/base/__init__.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Base infra""" +from .analysis import ( + BlockInfo, + IterInfo, + collect_block_iter_vars_used_in_access_region, + collect_vars_used_in_prim_expr, + detect_dominant_read, + is_broadcast_epilogue, + normalize_prim_func, +) +from .common_schedules import get_block, get_output_blocks, try_inline, try_inline_contiguous_spatial +from .schedule_rule import ScheduleRule +from .transform import ApplyDefaultSchedule, ApplyFastTuning +from .utils import fast_tune, fast_tune_with_dynamic_range diff --git a/python/bitblas/base/analysis.py b/python/bitblas/base/analysis.py new file mode 100644 index 0000000000..8cf539ef07 --- /dev/null +++ b/python/bitblas/base/analysis.py @@ -0,0 +1,333 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Analysis on TIR blocks, loops and functions.""" +from typing import List, Optional, Set, Union, Tuple, Dict +from typing_extensions import Literal +from dataclasses import dataclass +from enum import Enum + +from tvm import ir, tir +from tvm.ir import Range +from tvm.tir.analysis import undefined_vars +from tvm._ffi import get_global_func +from tvm.target.target import Target +from tvm.tir import Schedule, IterVar, Var, PrimExpr +from tvm.tir.schedule import BlockRV + + +def get_reduction_blocks(sch, blocks) -> bool: + # Get the main computation block + def is_reduction(block: BlockRV) -> bool: + block_stmt = sch.get(block) + iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} + return iter_types == {IterVar.CommReduce, IterVar.DataPar} + + def is_spatial(block: BlockRV) -> bool: + block_stmt = sch.get(block) + iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} + return iter_types == {IterVar.DataPar} + + # NOTE: We assume there is only one reduction block in the function + # all blocks are required to be spatial or reduction + if not all([is_reduction(block) or is_spatial(block) for block in blocks]): + return None + + # There is only one reduction block + reduction_blocks = [block for block in blocks if is_reduction(block)] + if len(reduction_blocks) != 1: + return None + + return reduction_blocks + + +class IterInfo: + """Information about a loop/iter var.""" + + kind: Literal["S", "R", "O"] + var: tir.Var + _dom: tir.PrimExpr + loop_rv: tir.schedule.LoopRV + + def __init__( + self, + kind: Literal["S", "R", "O"], + var: tir.Var, + dom: tir.PrimExpr, + loop_rv: tir.schedule.LoopRV, + ): + """Construct an IterInfo object.""" + self.kind = kind + self.var = var + self._dom = dom + self.loop_rv = loop_rv + + @property + def dom(self) -> Union[int, tir.PrimExpr]: + """The iteration domain of the loop.""" + return int(self._dom) if isinstance(self._dom, tir.IntImm) else self._dom + + def __str__(self) -> str: + return f'Iter("{self.kind}", {self.dom})' + + def __repr__(self) -> str: + return 
str(self) + + +class BlockInfo: + """Information about a TIR block.""" + + name: str + iters: List[IterInfo] + block_rv: tir.schedule.BlockRV + _reduction_block: bool + + def __init__( + self, + name: str, + iters: List[IterInfo], + block_rv: tir.schedule.BlockRV, + reduction_block: bool = False, + ): + """Construct a BlockInfo object.""" + self.name = name + self.block_rv = block_rv + self.iters = iters + self._reduction_block = reduction_block + + def dom(self) -> List[Union[int, tir.PrimExpr]]: + """The iteration domain of the block.""" + return [i.dom for i in self.iters] + + def dom_kind(self) -> str: + """The iteration domain kind of the block, for example, SSSS, SSSR.""" + return "".join(i.kind for i in self.iters) + + def is_injective(self) -> bool: + """Whether the block is injective, i.e. all its iteration domains are injective.""" + return all(k == "S" for k in self.dom_kind()) + + def is_elementwise(self, sch: tir.Schedule) -> bool: + """Whether the block is elementwise, i.e. trivial mapping between read/write region""" + + def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool: + return dom.min.same_as(var) and dom.extent == 1 + + if not self.is_injective(): + return False + block = sch.get(self.block_rv) + if len(block.reads) != 1 or len(block.writes) != 1: + return False + r_region = block.reads[0].region + w_region = block.writes[0].region + if len(r_region) != len(w_region): + return False + for var, r_dom, w_dom in zip(block.iter_vars, r_region, w_region): + if not _check_unit_var_range(var, r_dom) or not _check_unit_var_range(var, w_dom): + return False + return True + + def is_reduction(self) -> bool: + """Whether the block is a reduction workload.""" + # TODO(@junrushao): distinguish GEMV and reduction + return self._reduction_block + + def is_gemv(self) -> bool: + """Whether the block is a GEMV workload.""" + raise NotImplementedError + + def is_gemm(self) -> bool: + """Whether the block is a GEMM workload.""" + raise NotImplementedError + + def __str__(self) -> str: + return f'BlockInfo("{self.name}", "{self.dom_kind()}", {self.dom()})' + + def __repr__(self) -> str: + return str(self) + + +_normalize_prim_func = get_global_func("tir.schedule.NormalizePrimFunc") + + +def normalize_prim_func(sch: tir.Schedule) -> Optional[List[BlockInfo]]: + """Normalize the primfunc to normal form""" + try: + result = _normalize_prim_func(sch) + if result is None: + return None + except Exception: # pylint: disable=broad-except + return None + + def _iter_kind(i: tir.IterVar) -> str: + return { + tir.IterVar.DataPar: "S", + tir.IterVar.CommReduce: "R", + }.get(i.iter_type, "O") + + blocks: List[BlockInfo] = [] + for block, loops, iters, is_reduction in zip(*result): + blocks.append( + BlockInfo( + name=sch.get(block).name_hint, + iters=[ + IterInfo( + kind=_iter_kind(iter), # type: ignore + var=iter.var, + dom=iter.dom, + loop_rv=loop, + ) + for loop, iter in zip(loops, iters) + ], + block_rv=block, + reduction_block=is_reduction, + ) + ) + return blocks + + +def find_var_from_func(func, var: str): + for buffer in func.buffer_map.values(): + for i in buffer.shape: + if isinstance(i, tir.Var) and i.name == var: + return i + return None + +def check_func_with_dynamic(func): + for buffer in func.buffer_map.values(): + for i in buffer.shape: + if isinstance(i, tir.Var): + return True + return False + +def _assert_gpu_target(target: Target): + if "gpu" not in target.keys: + raise ValueError(f"Expect a GPU target, but got {target}") + + +def get_max_threads_per_block(target: 
Target) -> int: + _assert_gpu_target(target) + max_threads_per_block = None + for name in ["max_threads_per_block", "max_num_threads"]: + if max_threads_per_block is None: + max_threads_per_block = target.attrs.get(name, None) + if max_threads_per_block is None: + max_threads_per_block = 64 + return int(max_threads_per_block) + + +def get_max_shared_memory_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_shared_memory_per_block = target.attrs.get("max_shared_memory_per_block", None) + if max_shared_memory_per_block is None: + raise ValueError( + f"Cannot find `max_shared_memory_per_block` in {target}, please specify it manually" + ) + return int(max_shared_memory_per_block) + + +def get_root_block(sch: Schedule, func_name: str = "main") -> BlockRV: + try: + block = sch.mod[func_name].body.block + except: + raise ValueError( + f"The function body is expected to be the root block, but got:\n" + f"{sch.mod[func_name].body}" + ) + return sch.get_block(block.name_hint) + + +def collect_block_iter_vars_used_in_access_region( + block: tir.Block, region: List[ir.Range] +) -> Set[tir.Var]: + """Collect the block iter variables used in the access region of a buffer region.""" + tir_vars = set() + for expr in region: + assert expr.extent == 1 + tir_vars |= collect_vars_used_in_prim_expr(expr.min) + tir_vars &= set(iter_var.var for iter_var in block.iter_vars) + return tir_vars + + +def collect_vars_used_in_prim_expr(expr: tir.PrimExpr) -> Set[tir.Var]: + """Collect the variables used in the PrimExpr.""" + tir_vars = set() + + def _collect_tir_var(expr): + if isinstance(expr, tir.Var): + tir_vars.add(expr) + + tir.stmt_functor.post_order_visit(expr, _collect_tir_var) + return tir_vars + + +def detect_dominant_read(block: tir.Block) -> tir.PrimExpr: + """Detect the dominant read indices in the block.""" + dominant_read = None + num_read_iters = -1 + for buffer_region in block.reads: + tir_vars = collect_block_iter_vars_used_in_access_region(block, buffer_region.region) + if num_read_iters < len(tir_vars): + num_read_iters = len(tir_vars) + dominant_read = buffer_region + assert dominant_read is not None + (result,) = dominant_read.buffer.offset_of([e.min for e in dominant_read.region]) + return result + + +def is_broadcast_epilogue( + sch: tir.Schedule, + block: tir.schedule.BlockRV, + epilogue: tir.schedule.BlockRV, +) -> bool: + """Check if the epilogue block is a broadcast pattern""" + write_buffers = {r.buffer for r in sch.get(block).writes} + epilogue_iters = {i.var: i for i in sch.get(epilogue).iter_vars if i.dom != 1} + for buffer_region in sch.get(epilogue).reads: + if buffer_region.buffer not in write_buffers: + continue + tir_vars = collect_block_iter_vars_used_in_access_region( + sch.get(epilogue), buffer_region.region + ) + if len(tir_vars) < len(epilogue_iters): + return True + return False + + +def get_reduction_blocks( + sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] +) -> List[tir.schedule.BlockRV]: + # Get the main computation block + def is_reduction(block: BlockRV) -> bool: + block_stmt = sch.get(block) + iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} + return iter_types == {IterVar.CommReduce, IterVar.DataPar} + + def is_spatial(block: BlockRV) -> bool: + block_stmt = sch.get(block) + iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} + return iter_types == {IterVar.DataPar} + + # NOTE: We assume there is only one reduction block in the function + # all blocks are required to be spatial or reduction + if not 
all([is_reduction(block) or is_spatial(block) for block in blocks]): + return None + + # There is only one reduction block + reduction_blocks = [block for block in blocks if is_reduction(block)] + if len(reduction_blocks) == 0: + return None + return reduction_blocks diff --git a/python/bitblas/base/common_schedules.py b/python/bitblas/base/common_schedules.py new file mode 100644 index 0000000000..77d6bf1a73 --- /dev/null +++ b/python/bitblas/base/common_schedules.py @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Common schedule strategies for TIR.""" +from typing import Callable, List + +from tvm import tir + +from .analysis import BlockInfo + +def get_block( + sch: tir.Schedule, + blocks: List[BlockInfo], + name: str, +): + """Get the target block from a schedule. + + Parameters + ---------- + sch : tir.Schedule + The TIR schedule used to get target block. + name : str + The name of the target block. + + Returns + ------- + target_block : BlockRV + The target block. + """ + + target_block : tir.BlockRV = None + for block_info in blocks: + block = block_info.block_rv + if sch.get(block).name_hint == name: + target_block = block + return target_block + +def get_output_blocks( + sch: tir.Schedule, + blocks: List[BlockInfo], +): + """Get the output blocks of a schedule. + + Parameters + ---------- + sch : tir.Schedule + The TIR schedule used to get output blocks. + blocks : List[BlockInfo] + The blocks to be analyzed. + + Returns + ------- + output_blocks : List[BlockInfo] + The output blocks. + """ + + # collect arguments buffer + func = sch.mod["main"] + args = list(func.buffer_map.values()) + + output_blocks = [] + for block_info in blocks: + block = block_info.block_rv + for write in sch.get(block).writes: + if write.buffer in args: + output_blocks.append(block) + + return output_blocks + + +def try_inline( + sch: tir.Schedule, + blocks: List[BlockInfo], +) -> List[BlockInfo]: + """Try to inline as many blocks as possible, and return the remaining blocks. + + Parameters + ---------- + sch : tir.Schedule + The TIR schedule used to inline blocks. + blocks : List[BlockInfo] + The blocks to be inlined. + + Returns + ------- + remaining : List[BlockInfo] + The remaining blocks that cannot be inlined. 
+ """ + + def _trial(func: Callable): + for i, block in enumerate(blocks): + try: + func(block.block_rv) + except: # pylint: disable=bare-except + continue + return i + return None + + while True: + i = _trial(sch.compute_inline) + if i is None: + i = _trial(sch.reverse_compute_inline) + if i is None: + break + blocks.pop(i) + return blocks + + +def try_inline_contiguous_spatial( + sch: tir.Schedule, + block_infos: List[BlockInfo], +) -> List[BlockInfo]: + """Try to inline contiguous spatial blocks in a schedule + + Parameters + ---------- + sch : tir.Schedule + The TIR schedule used to inline blocks. + block_infos : List[BlockInfo] + The blocks to be try. + + Returns + ------- + remaining : List[BlockInfo] + The remaining blocks that cannot be inlined. + """ + + if block_infos is None: + return None + results = [] + spatial_blocks = [] + block: BlockInfo + for block in block_infos: + if block.is_injective(): + spatial_blocks.append(block) + elif spatial_blocks: + results.extend(try_inline(sch, spatial_blocks)) + results.append(block) + spatial_blocks = [] + else: + results.append(block) + if spatial_blocks: + results.extend(try_inline(sch, spatial_blocks)) + return results diff --git a/python/bitblas/base/config.py b/python/bitblas/base/config.py new file mode 100644 index 0000000000..94f2031f81 --- /dev/null +++ b/python/bitblas/base/config.py @@ -0,0 +1,26 @@ +from dataclasses import dataclass + +class ScheduleConfig: + """Configuration for dlight schedule""" + def __init__(self): + self._config = {} + self.block_factors = [] + self.thread_factors = [] + self.rstep = [] + self.reduce_thread = [] + self.pipeline_stage = 1 + self.vectorize = {} + + def __getattr__(self, name): + return self._config[name] + + def __setattr__(self, name, value): + self._config[name] = value + + def from_roller(self, roller_config): + self.block = roller_config.block + self.thread = roller_config.thread + self.rstep = roller_config.rstep + self.reduce_thread = roller_config.reduce_thread + self.pipeline_stage = roller_config.pipeline_stage + self.vectorize = roller_config.vectorize diff --git a/python/bitblas/base/roller/arch/__init__.py b/python/bitblas/base/roller/arch/__init__.py new file mode 100644 index 0000000000..35ee7bd9a3 --- /dev/null +++ b/python/bitblas/base/roller/arch/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from .arch_base import Arch +from .cuda import * diff --git a/python/bitblas/base/roller/arch/arch_base.py b/python/bitblas/base/roller/arch/arch_base.py new file mode 100644 index 0000000000..6cb77cc97e --- /dev/null +++ b/python/bitblas/base/roller/arch/arch_base.py @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import List + + +class Arch: + """ + Represents the architecture of a computing device, capturing various hardware specifications. + """ + + def __init__(self) -> None: + self.reg_cap: int = 0 # Register capacity: The amount of register memory available + self.smem_cap: int = 0 # Shared memory capacity: The amount of shared memory available + self.compute_max_core: int = 0 # The maximum number of computing cores + self.warp_size: int = ( + 0 # The size of a warp, a group of threads that execute instructions in lockstep + ) + self.sm_partition: int = 0 # The number of streaming multiprocessor partitions + self.transaction_size: List[int] = [ + 0, + 0, + ] # The size of memory transactions, typically in bytes + self.max_smem_usage: int = 0 # The maximum shared memory usage allowed + self.bandwidth: List[int] = [ + 0, + 0, + ] # Bandwidth specifications, possibly including peak and sustained rates + self.platform: str = "unknown" # The platform or manufacturer of the device + self.compute_capability: str = ( + "unknown" # The compute capability, indicating the feature set and performance level + ) + self.l2_cache_size_bytes: int = 0 + # the number of transaction size in bytes + self.transaction_size: List[int] = [0, 0] # in bytes + # bandwidth in MB/s, will be used for recommend basic tile size + self.bandwidth: List[int] = [0, 0] + + def get_avaliable_tensorintrin_shapes(self): + raise NotImplementedError() + \ No newline at end of file diff --git a/python/bitblas/base/roller/arch/cuda.py b/python/bitblas/base/roller/arch/cuda.py new file mode 100644 index 0000000000..d0b3a11fe1 --- /dev/null +++ b/python/bitblas/base/roller/arch/cuda.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
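To make the `Arch` fields concrete before the `CUDA` implementation below, here is a minimal illustrative sketch (not part of the patch) of how a hand-written subclass could be populated. The class name and all numbers are placeholder assumptions; the real `CUDA` class queries most of these values from the device and target at runtime.

    # Hypothetical example only; values are placeholders, not real hardware specs.
    from bitblas.base.roller.arch import Arch  # assumes this patch's package layout


    class ExampleArch(Arch):
        def __init__(self) -> None:
            super().__init__()
            self.platform = "ExampleGPU"
            self.compute_capability = "80"
            self.reg_cap = 65536                # registers available per SM
            self.smem_cap = 48 * 1024           # shared memory per block, in bytes
            self.max_smem_usage = 2 * self.smem_cap
            self.compute_max_core = 108         # number of SMs
            self.warp_size = 32
            self.sm_partition = 4
            self.transaction_size = [32, 128]   # memory transaction sizes, in bytes
            self.bandwidth = [750, 12080]       # rough bandwidth hints used for tiling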
+import tvm +from tvm.target import Target +from .arch_base import Arch +from typing import List, Dict + +def check_sm_version(arch: str) -> int: + sm_version = arch.replace("sm_", "") + return int(sm_version) if sm_version.isdigit() else -1 + +class TensorInstruction(object): + def __init__( + self, + name: str, + intrin_group: Dict, + shape: List[int], + ): + self.name: str = name + self.intrin_group: Dict = intrin_group + # only mantain the shape of M and N + self.shape: List[int] = shape + +class CUDA(Arch): + def __init__(self, target: Target): + self.target = target + self.sm_version = check_sm_version(self.target.arch) + device = tvm.runtime.cuda(0) + if not device.exist: + raise RuntimeError("Cannot find cuda device 0.") + self.device: tvm.runtime.Device = device + self.platform: str = "CUDA" + self.smem_cap = device.max_shared_memory_per_block + self.compute_max_core = device.multi_processor_count + self.warp_size = device.warp_size + self.compute_capability = device.compute_version.replace(".", "") + self.reg_cap: int = 65536 + self.max_smem_usage: int = 2 * self.smem_cap + self.sm_partition: int = 4 + self.l2_cache_size_bytes: int = target.l2_cache_size_bytes + # the number of transaction size in bytes + self.transaction_size: List[int] = [32, 128] # in bytes + # bandwidth in MB/s, will be used for recommend basic tile size + # TODO(lei): find some way to get the real bandwidth + # However, the ratio of bandwidth between different devices can + # be similar. The bandwidth can work for another devices as well. + self.bandwidth: List[int] = [750, 12080] + # the tensor instruction informations + + from tvm.tir.tensor_intrin.cuda import get_wmma_intrin_group, get_mma_intrin_group + + self.available_tensor_instructions = ( + TensorInstruction("mma", get_mma_intrin_group, [16, 16]), + TensorInstruction("wmma", get_wmma_intrin_group, [16, 16]), + ) + + def get_avaliable_tensorintrin_shapes(self): + return [t.shape for t in self.available_tensor_instructions] \ No newline at end of file diff --git a/python/bitblas/base/roller/bestfit.py b/python/bitblas/base/roller/bestfit.py new file mode 100644 index 0000000000..5c260ee378 --- /dev/null +++ b/python/bitblas/base/roller/bestfit.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
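As a usage sketch for the `CUDA` arch above (illustrative, not part of the patch): it assumes a visible CUDA device 0 and a target tag that defines `l2_cache_size_bytes`; the A100 tag is only an example, and the import path follows this patch's package layout.

    import tvm
    from bitblas.base.roller.arch import CUDA  # assumes this patch's package layout

    target = tvm.target.Target("nvidia/nvidia-a100")  # example target tag
    arch = CUDA(target)
    print(arch.compute_capability, arch.warp_size, arch.smem_cap)
    # Both tensor-core instruction groups expose 16x16 output tiles:
    print(arch.get_avaliable_tensorintrin_shapes())  # expected: [[16, 16], [16, 16]]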
+"""Benifit For Dlight Schedule""" +class Block: + def __init__(self, start, end, is_free): + self.start = start + self.end = end + self.is_free = is_free + + def size(self) -> int: + return self.end - self.start + + def merge(self, other): + assert self.is_free == other.is_free + self.start = min(self.start, other.start) + self.end = max(self.end, other.end) + + def __repr__(self) -> str: + return "".format(self.start, self.size()) + + +class BestFit: + def __init__(self, align=32): + self.limit = 0 + self.list = [] + self.align = align + + def malloc(self, size) -> Block: + size = (size + self.align - 1) // self.align * self.align + found = None + for block in self.list: + if block.is_free and block.size() >= size: + if not found or found.size() > block.size(): + found = block + if found: + found.is_free = False + remain = found.size() - size + if remain != 0: + found.end -= remain + self.list.insert( + self.list.index(found) + 1, Block(found.end, found.end + remain, True) + ) + return found + elif len(self.list) > 0 and self.list[-1].is_free: + add = size - self.list[-1].size() + self.list[-1].end += add + self.limit = self.list[-1].end + self.list[-1].is_free = False + return self.list[-1] + else: + block = Block(self.limit, self.limit + size, False) + self.list.append(block) + self.limit += size + return block + + def free(self, block: Block) -> None: + assert not block.is_free + idx = self.list.index(block) + self.list[idx] = Block(block.start, block.end, True) + if idx + 1 < len(self.list) and self.list[idx + 1].is_free: + self.list[idx].merge(self.list[idx + 1]) + self.list.pop(idx + 1) + if idx - 1 >= 0 and self.list[idx - 1].is_free: + self.list[idx].merge(self.list[idx - 1]) + self.list.pop(idx - 1) diff --git a/python/bitblas/base/roller/config.py b/python/bitblas/base/roller/config.py new file mode 100644 index 0000000000..9255980d05 --- /dev/null +++ b/python/bitblas/base/roller/config.py @@ -0,0 +1,263 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Config definition for schedule""" +from typing import Dict, List, Optional, Tuple + +import numpy as np + + +class TensorCoreExtraConfig: + """ + This class is used to store extra information for tensorcore + """ + + def __init__( + self, + AS_shape: Tuple[int], + BS_shape: Tuple[int], + AF_shape: Tuple[int], + BF_shape: Tuple[int], + tc_axis: Tuple[int], + ) -> None: + self.AS_shape: Tuple[int] = AS_shape + self.BS_shape: Tuple[int] = BS_shape + self.AF_shape: Tuple[int] = AF_shape + self.BF_shape: Tuple[int] = BF_shape + self.tc_axis: Tuple[int] = tc_axis + + +class Stride: + """ + Manages stride information for a given axis of a tensor. 
+ """ + + def __init__(self, stride: int = 1, ax: int = -1) -> None: + # which axis to put stride on + self._ax: int = int(ax) + # the stride size of the axis + self._stride: int = int(stride) + + @property + def ax(self) -> int: + return self._ax + + @property + def stride(self) -> int: + return self._stride + + def compute_strides_from_shape(self, shape: List[int]) -> List[int]: + ndim = len(shape) + strides = [1 for _ in shape] + for i in range(ndim - 2, -1, -1): + if i == self.ax: + strides[i] = self.stride + else: + strides[i] = int(strides[i + 1] * shape[i + 1]) + return strides + + def compute_elements_from_shape(self, shape: List[int]) -> int: + original_shape = np.prod(shape) + if not self.is_valid(): + strided_elem = original_shape + else: + assert self.ax < len(shape) + strided_elem = np.prod(shape[0 : self.ax + 1]) * self.stride + assert strided_elem >= original_shape + return int(strided_elem) + + def is_valid(self) -> bool: + return self.ax >= 0 + + def __repr__(self) -> str: + return f"" + + +class TileDict: + """ + Manages tiling information and configurations for computational tasks. + """ + + def __init__(self, output_tile) -> None: + self.output_tile = output_tile + # schedule config + self.tile_map = {} + self.rstep_map = {} + self.cached_tensors_map = {} + self.output_strides_map = {} + self.tensor_strides_map = {} + + # analysis + self.traffic = -1 + self.smem_cost = -1 + self.block_per_SM = -1 + self.num_wave = -1 + self.grid_size = -1 + self.valid = True + + def get_tile(self, func) -> List[int]: + return self.tile_map[func] + + def get_rstep(self, func) -> Dict[str, int]: + return self.rstep_map + + def __hash__(self) -> int: + return hash(tuple(self.output_tile)) + + +class IntrinInfo: + """ + The information of tensorcore intrinsic related infomation + """ + + def __init__( + self, + in_dtype: str, + out_dtype: str, + trans_b: bool, + smooth_a: bool = False, + smooth_b: bool = False, + ) -> None: + self.in_dtype = in_dtype + self.out_dtype = out_dtype + self.trans_a = False + self.trans_b = trans_b + self.smooth_a = smooth_a + self.smooth_b = smooth_b + + def __repr__(self) -> str: + return ( + f"" + ) + + +class Config(object): + """ + Central configuration class for managing various parameters of computational tasks. + """ + + def __init__(self) -> None: + self.arch = None + self.use_tc = None # todo(lei): this should be renamed. 
+ self.compute_capability = None + + # spacial axes tiling info + self.block = [] + self.thread = [] + # special axes for tensorCore + self.warp = [] + self.wmma = [] + self.tc_extra_conf: Optional[TensorCoreExtraConfig] = None + # reduce axes tiling info + self.rstep = [] + self.reduce_thread = [] + self.rasterization_plan = None + self.cached_tensors = [] + self.output_strides = {} + self.schedule_stages = None + + # Experimental + self._raxis_order = [] + self._step = [] + self.vectorize: Dict[str, int] = {} + self.pipeline_stage = 1 + self.use_async = False + self.opt_shapes: Dict[str, int] = {} + self.intrin_info = IntrinInfo("float16", "float16", True) + + def to_dict(self) -> Dict: + dic = {} + dic["block"] = self.block + if self.use_tc: + dic["warp"] = self.warp + dic["wmma"] = self.wmma + else: + dic["thread"] = self.thread + dic["rstep"] = self.rstep + if np.prod(self.reduce_thread) > 1: + dic["reduce_thread"] = self.reduce_thread + if self.use_tc: + dic["use_tc"] = self.use_tc + if self.output_strides: + dic["strides"] = {} + for k, stride in self.output_strides.items(): + if stride.is_valid(): + dic["strides"][k] = stride + if len(dic["strides"]) == 0: + del dic["strides"] + if np.prod(self._step) > 1: + dic["step"] = self._step + if self._raxis_order != []: + dic["raxis_order"] = self._raxis_order + if self.vectorize != {}: + dic["vectorize"] = self.vectorize + return dic + + def from_dict(self, dic: Dict) -> "Config": + self.__init__() + if "use_tc" in dic: + self.use_tc = dic["use_tc"] + self.block = dic["block"] + if self.use_tc: + self.warp = dic["warp"] + self.wmma = dic["wmma"] + else: + self.thread = dic["thread"] + self.rstep = dic["rstep"] + if "reduce_thread" in dic: + self.reduce_thread = dic["reduce_thread"] + else: + self.reduce_thread = [1 for _ in self.rstep] + if "strides" in dic: + self.output_strides = dic["strides"] + if "step" in dic: + self._step = dic["step"] + if "raxis_order" in dic: + self._raxis_order = dic["raxis_order"] + if "vectorize" in dic: + self.vectorize = dic["vectorize"] + return self + + @property + def raxis_order(self) -> List[int]: + if self._raxis_order != []: + return self._raxis_order + return list(range(len(self.rstep))) + + @property + def step(self) -> List[int]: + if self._step != []: + return self._step + return [1 for _ in self.block] + + def __repr__(self) -> str: + return str(self.to_dict()) + + def complete_config(self, node): + if not self.use_tc: + return self + _, _, wmma_k = self.wmma + + tc_axis = node.infer_tensorcore_axis() + + shapes = node.propogate_reduction_inputs(self.block, {x: self.rstep[0] for x in node.raxis}) + AS_shape, BS_shape = shapes.values() + + shapes = node.propogate_reduction_inputs(self.warp, {x: wmma_k for x in node.raxis}) + AF_shape, BF_shape = shapes.values() + + self.tc_extra_conf = TensorCoreExtraConfig(AS_shape, BS_shape, AF_shape, BF_shape, tc_axis) + return self diff --git a/python/bitblas/base/roller/node.py b/python/bitblas/base/roller/node.py new file mode 100644 index 0000000000..04cfe8fb72 --- /dev/null +++ b/python/bitblas/base/roller/node.py @@ -0,0 +1,385 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""PrimFunc Warpper and Block Infomation Analaysis""" + +import tvm +from tvm import tir +from tvm.tir import IterVar, Var, PrimFunc +from typing import Any, Iterable, Dict, List, Tuple +import functools +import numpy as np +from tvm.tir.schedule.schedule import BlockRV +from ..analysis import BlockInfo, get_reduction_blocks +from .. import analysis +from .. import normalize_prim_func +from .shape_inference import get_analyzer_by_tir + + +def pre_order_traverse(block_analyzer, blocks, func): + visited = set() + + def _traverse(block): + if block in visited: + return + visited.add(block) + for dep_blocks in block_analyzer.get_consumer_blocks(block): + _traverse(dep_blocks) + func(block) + + for block in blocks: + _traverse(block) + + +class BlockAnalyzer(object): + def __init__(self, sch) -> None: + self.sch: tir.Schedule = sch + self.block_infos: List[BlockInfo] = normalize_prim_func(self.sch) + + def get_block_name(self, block: BlockRV) -> str: + return self.sch.get(block).name_hint + + def get_block_info(self, block: BlockRV) -> BlockInfo: + for block_info in self.block_infos: + if self.get_block_name(block) == block_info.name: + return block_info + return None + + def get_spatial_axis(self, block: BlockRV) -> List[IterVar]: + block_info = self.get_block_info(block) + axis = [] + for iter in block_info.iters: + if iter.kind == "S": + axis.append(iter) + return axis + + def get_reduce_axis(self, block: BlockRV) -> List[IterVar]: + block_info = self.get_block_info(block) + raxis = [] + for iter in block_info.iters: + if iter.kind == "R": + raxis.append(iter) + return raxis + + def get_input_buffers(self, block: BlockRV) -> List[tir.Buffer]: + buffers = [] + for read in self.sch.get(block).reads: + buffers.append(read.buffer) + return buffers + + def get_output_buffers(self, block: BlockRV) -> List[tir.Buffer]: + buffers = [] + for write in self.sch.get(block).writes: + buffers.append(write.buffer) + return buffers + + def get_buffers(self, block: BlockRV) -> List[tir.Buffer]: + return self.get_input_buffers(block) + self.get_output_buffers(block) + + def get_producer_blocks(self, block: BlockRV) -> List[BlockRV]: + return self.sch.get_producers(block) + + def get_consumer_blocks(self, block: BlockRV) -> List[BlockRV]: + return self.sch.get_consumers(block) + + +class Node(object): + def __init__(self, tags: Dict = {}) -> None: + self._dtypes = [] + self._tag: Dict = {} + for tag in tags: + self.add_tag(tag, tags[tag]) + + def set_tag(self, k: str, v: Any = True) -> None: + self.add_tag(k, v) + + def add_tag(self, k: str, v: Any = True) -> None: + self._tag[k] = v + + def get_tag(self, k: str) -> Any: + if k not in self._tag: + return None + return self._tag[k] + + +class PrimFuncNode(Node): + def __init__(self, prim_func: PrimFunc, tags: Dict = {}) -> None: + super().__init__(tags) + self.prim_func = self._specialize_func(prim_func) + self.sch: tir.Schedule = tir.Schedule(self.prim_func) + self.block_analyzer: BlockAnalyzer = BlockAnalyzer(self.sch) + self.schedule_stages: List[BlockRV] = [] + self.blocks: List[BlockRV] = [] + self.output_blocks: List[BlockRV] = None + 
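+        # Filled in by _analysis_funcinfo(): the (single) reduction block if one
+        # exists, its reduction axes, and the input/output buffer lists derived
+        # from the PrimFunc's buffer_map.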
self.reduction_block: BlockRV = None + self.raxis = [] + self.input_buffers = [] + self.output_buffers = [] + self.buffers = [] + self.args = [] + self._analysis_funcinfo() + self.ana = get_analyzer_by_tir(self.block_analyzer, self.blocks) + + def _specialize_func(self, func: PrimFunc): + # Specialize the function to make it more friendly for analysis. + # set attrs + for k, v in func.attrs.items(): + self.set_tag(k, v) + if self.get_tag("is_speclized"): + return func + opt_shapes = self.get_tag("opt_shapes") + if opt_shapes: + for name, shape in opt_shapes.items(): + var = analysis.find_var_from_func(func, name) + if var is not None: + func = func.specialize({var: shape.astype(var.dtype)}) + return func + + def _analysis_funcinfo(self): + root_block = analysis.get_root_block(self.sch) + blocks = self.sch.get_child_blocks(root_block) + self.blocks = blocks + + self.output_blocks = self.sch.get_output_blocks(root_block) + reduction_blocks = get_reduction_blocks(self.sch, blocks) + if reduction_blocks is None: + self.reduction_block = None + self.schedule_stages.append(*self.output_blocks) + else: + # analysis on the last reduction block + self.reduction_block = reduction_blocks[-1] + # set raxis + reduce_block_info = self.block_analyzer.get_block_info(self.reduction_block) + for iter in reduce_block_info.iters: + if iter.kind == "R": + self.raxis.append(iter) + self.schedule_stages.append(self.reduction_block) + + # collect output buffers + for output_block in self.output_blocks: + for write in self.sch.get(output_block).writes: + if write not in self.output_buffers: + self.output_buffers.append(write.buffer) + + for param in self.prim_func.params: + if param not in self.prim_func.buffer_map: + # in case of dynamic symbolic may in params + continue + buffer = self.prim_func.buffer_map[param] + if buffer not in self.output_buffers: + self.input_buffers.append(buffer) + + self.args = self.input_buffers + self.output_buffers + self.buffers = [buffer for buffer in self.prim_func.buffer_map.values()] + + # set dtype + self.set_dtype(tvm.DataType(self.output_buffers[0].dtype)) + + def get_opt_shape(self, name) -> int: + opt_shapes = self.get_tag("opt_shapes") + if opt_shapes is None: + return None + return opt_shapes[name] + + def extent_warpper(self, value) -> int: + if isinstance(value, tvm.tir.Var): + return self.get_opt_shape(value.name) + elif isinstance(value, tvm.tir.IntImm): + return int(value) + else: + return value + + @functools.lru_cache() + def get_space_dim(self) -> List[int]: + dim_size = [] + if self.reduction_block: + block_info = self.block_analyzer.get_block_info(self.reduction_block) + for iter in block_info.iters: + if iter.kind == "S": + if isinstance(iter.dom.extent, tvm.tir.IntImm): + dim_size.append(int(iter.dom.extent)) + else: + assert isinstance(iter.dom.extent, tvm.tir.Var) + dim_size.append(self.get_opt_shape(iter.dom.extent.name)) + else: + # assume outer stage has the same shape + loops = self.sch.get_loops(self.schedule_stages[0]) + for loop in loops: + dim_size.append(int(self.sch.get(loop).extent)) + return [int(x) for x in dim_size] + + def set_dtype(self, dtype: tvm.DataType, id=0) -> None: + assert isinstance(dtype, tvm.DataType), type(dtype) + if dtype == tvm.DataType("bool"): + dtype = tvm.DataType("int8") + if len(self._dtypes) <= id: + self._dtypes.extend([None for _ in range(id - len(self._dtypes) + 1)]) + elif self._dtypes[id] is not None: + assert self._dtypes[id] == dtype, (self._dtypes, dtype) + self._dtypes[id] = dtype + + def get_dtype(self, id=0) 
-> tvm.DataType: + return self._dtypes[id] + + def get_buffer_dtype(self, buffer: tir.Buffer) -> tvm.DataType: + return tvm.DataType(buffer.dtype) + + def propogate(self, tile, rstep={}, targets=None): + shape = { + self.block_analyzer.get_output_buffers(block)[0].name: [ + tvm.arith.ConstIntBound(0, val - 1) for val in tile + ] + for block in self.schedule_stages + } + return self.ana.infer(shape, rstep, targets) + + def propogate_inputs(self, tile, rstep={}) -> List[List[int]]: + read_idx_offset = len(self.input_buffers) + targets = [t.name for t in self.args[:read_idx_offset]] + shapes, intermediate_bind = self.propogate(tile, rstep, targets) + results = [] + for i, arg in enumerate(self.args[:read_idx_offset]): + if arg.name in intermediate_bind: + results.append(shapes[arg.name]) + continue + # should not exceed original shape + trimmed_shape = [ + self.extent_warpper(i) + for i in list(map(min, zip(shapes[arg.name], self.input_buffers[i].shape))) + ] + results.append(trimmed_shape) + return results + + def propogate_outputs(self, tile, rstep={}) -> List[List[int]]: + read_idx_offset = len(self.input_buffers) + targets = [t.name for t in self.args[read_idx_offset:]] + shapes, _ = self.propogate(tile, rstep, targets) + results = [] + for i, arg in enumerate(self.args[read_idx_offset:]): + # should not exceed original shape + trimmed_shape = list(map(min, zip(shapes[arg.name], self.input_buffers[i].shape))) + results.append(trimmed_shape) + return results + + def propogate_reduction_inputs(self, shape, rstep={}) -> Dict[str, List[int]]: + if self.reduction_block is None: + return {} + targets = [b.name for b in self.block_analyzer.get_input_buffers(self.reduction_block)] + results, _ = self.propogate(shape, rstep, targets) + return results + + def get_reduce_inputs_dtype(self): + if self.reduction_block is None: + return {} + return { + b.name: tvm.DataType(b.dtype) + for b in self.block_analyzer.get_input_buffers(self.reduction_block) + } + + @functools.lru_cache() + def infer_tensorcore_axis(self) -> Tuple[int]: + # axis is fixed for one expression, so only inference and cached + assert self.get_tag("tensorcore_config") + + C_ax_m, C_ax_n = self.get_tag("tensorcore_config") + wmma_m, wmma_n, wmma_k = [16, 16, 16] # just for testing, any number is ok + + def get_cl_shapes(c_ax_m, c_ax_n): + output_buffer_shape = ( + self.block_analyzer.sch.get(self.reduction_block).writes[0].buffer.shape + ) + valid_region = [] + for region in output_buffer_shape: + if region.value == 1: + continue + valid_region.append(region) + + num_nvalid_regions = len(output_buffer_shape) - len(valid_region) + + spatial_dim = self.get_space_dim() + assert len(valid_region) == len( + spatial_dim + ), f" {valid_region} mismatch with {spatial_dim}" + cl_shapes = [1] * len(spatial_dim) + cl_shapes[c_ax_m - num_nvalid_regions] = wmma_m + cl_shapes[c_ax_n - num_nvalid_regions] = wmma_n + self.set_tag("tensorcore_config", [s - num_nvalid_regions for s in [c_ax_m, c_ax_n]]) + return cl_shapes + + CL_shape = get_cl_shapes(C_ax_m, C_ax_n) + shapes = self.propogate_reduction_inputs(CL_shape, {x.var.name: 1 for x in self.raxis}) + A_deps, B_deps = shapes.values() + A_ax_m = A_deps.index(wmma_m) + B_ax_n = B_deps.index(wmma_n) + + CL_shape = [1] * len(self.get_space_dim()) + shapes = self.propogate_reduction_inputs(CL_shape, {x.var.name: wmma_k for x in self.raxis}) + A_deps, B_deps = shapes.values() + A_ax_k = len(A_deps) - 1 - A_deps[::-1].index(wmma_k) + B_ax_k = len(B_deps) - 1 - B_deps[::-1].index(wmma_k) + tc_axis = 
(A_ax_m, A_ax_k, B_ax_k, B_ax_n, C_ax_m, C_ax_n) + return tc_axis + + def footprint(self, shape, rstep, stride_map={}) -> int: + result = 0 + shapes, _ = self.propogate(shape, rstep) + + def is_broadcast_pattern(buffer, output_buffer): + return ( + buffer in self.args + and len(shapes[output_buffer.name]) > len(shapes[buffer.name]) + and np.prod(shapes[output_buffer.name]) > np.prod(shapes[buffer.name]) + ) + + def is_after_reduce_stage(block): + if not self.reduction_block: + return False + reduce_dependent_blocks = getattr(self, "reduce_dependent_blocks", None) + if reduce_dependent_blocks is None: + reduce_dependent_blocks = set() + pre_order_traverse( + self.block_analyzer, + [self.reduction_block], + lambda block: reduce_dependent_blocks.add(block), + ) + self.reduce_dependent_blocks = reduce_dependent_blocks + return block not in reduce_dependent_blocks + + # compute cached stages + cached_tensor = [] + for block in self.blocks: + output_buffer = self.block_analyzer.get_output_buffers(block)[0] + for buffer in self.block_analyzer.get_input_buffers(block): + cache = buffer.name not in cached_tensor and ( + is_broadcast_pattern(buffer, output_buffer) + or self.block_analyzer.get_block_info(block).is_reduction + ) + if not cache: + continue + cached_tensor.append(buffer.name) + if is_after_reduce_stage(block): + continue # cache after reduce op can often reuse buffer in reduce stage + + if buffer.name in stride_map: + num_elem = stride_map[buffer.name].compute_elements_from_shape( + shapes[buffer.name] + ) + else: + num_elem = np.prod(shapes[buffer.name]) + buffer_len = num_elem * int((tvm.DataType(buffer.dtype).bits + 7) // 8) + buffer_len = (buffer_len + 31) // 32 * 32 + result += buffer_len + return result, cached_tensor diff --git a/python/bitblas/base/roller/policy/__init__.py b/python/bitblas/base/roller/policy/__init__.py new file mode 100644 index 0000000000..c63b0e6ddb --- /dev/null +++ b/python/bitblas/base/roller/policy/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from .default import DefaultPolicy +from .tensorcore import TensorCorePolicy diff --git a/python/bitblas/base/roller/policy/common.py b/python/bitblas/base/roller/policy/common.py new file mode 100644 index 0000000000..ca8d9614d7 --- /dev/null +++ b/python/bitblas/base/roller/policy/common.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import List +import numpy as np + + +def get_all_factors(n: int) -> List[int]: + # Calculate the square root of n and round it up to the nearest integer + n0 = int(np.ceil(np.sqrt(n))) + + # Find all divisors of n that are less than n0 + val = np.where(n % np.arange(1, n0) == 0)[0] + 1 + + # If n is a perfect square, add the square root to the list of factors + mid = np.array([], dtype=int) if n0 * n0 != n else [n0] + + # Combine the factors and their corresponding larger pair factors + return [int(x) for x in np.concatenate([val, mid, n // val[::-1]])] + + +def factorize(n: int) -> List[int]: + i = 2 # Start with the smallest prime number + result = [] + + # Iterate through numbers to find factors + while n > 1: + if n % i == 0: # If i is a factor of n + n //= i # Divide n by i and keep the integer part + result.append(i) + else: + i += 1 # Try the next number + return result + + +def coalesced_factor(subtensor: List[int], tensor: List[int]) -> int: + # If the last dimension of the subtensor and tensor differ, or subtensor has only one dimension + if subtensor[-1] != tensor[-1] or len(subtensor) == 1: + return subtensor[-1] + else: + # Recursively calculate the coalesced factor for the remaining dimensions + return subtensor[-1] * coalesced_factor(subtensor[:-1], tensor[:-1]) + + +def coalesced_tensor_shape(subtensor: List[int], tensor: List[int], transaction_size: int) -> int: + # Calculate the total number of elements in the subtensor + bytes = int(np.prod(subtensor)) + + if bytes == 0: + return 0 + + # Calculate the coalesced factor for the subtensor + factor = int(coalesced_factor(subtensor, tensor)) + + # Compute the shape of the coalesced tensor + return transaction_size * bytes / min(transaction_size, factor) diff --git a/python/bitblas/base/roller/policy/default.py b/python/bitblas/base/roller/policy/default.py new file mode 100644 index 0000000000..9d698540bf --- /dev/null +++ b/python/bitblas/base/roller/policy/default.py @@ -0,0 +1,784 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
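The factor helpers above drive most of the tiling decisions in the policies, so a short worked example may help (illustrative, not part of the patch; the import path assumes this patch's package layout, and the numeric results were checked by hand):

    from bitblas.base.roller.policy.common import (
        coalesced_factor, coalesced_tensor_shape, factorize, get_all_factors)

    assert get_all_factors(12) == [1, 2, 3, 4, 6, 12]
    assert factorize(12) == [2, 2, 3]
    # A [16, 8] sub-tile of a [64, 64] buffer is only contiguous along its last
    # 8 elements, so with 32-element transactions it costs 32 * 128 / 8 = 512.
    assert coalesced_factor([16, 8], [64, 64]) == 8
    assert coalesced_tensor_shape([16, 8], [64, 64], 32) == 512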
+"""Policy for cuda core schedule""" +import functools +import math +from queue import PriorityQueue +from typing import Iterable, Dict, List + +import numpy as np +import tvm + + +from ..arch import Arch +from ..bestfit import BestFit +from ..config import Config, Stride, TileDict +from .common import coalesced_factor, coalesced_tensor_shape, factorize, get_all_factors +from ..node import PrimFuncNode +from ..rasterization import * + + +class DefaultPolicy: + """ + Default Policy for fastdlight, a heuristic plan that tries to + minimize memory traffic and maximize parallelism.for Dlight Schedule. + """ + + def __init__(self, func: tvm.tir.PrimFunc, arch: Arch, tags: Dict = {}) -> None: + self.arch = arch + self.prim_func_node = PrimFuncNode(func, tags) + self.ordered_nodes = [self.prim_func_node] + self.output_nodes = [self.prim_func_node] + + def emit_config(self, topk: int) -> List[Config]: + base_tile = self.get_base_tile() + if base_tile is None: + return [] + + rstep_map = self._assign_reduce_step(self.prim_func_node) + smem_tile_condidates = self.dfs_smem_tile(base_tile, rstep_map) + results = [] + for td in smem_tile_condidates: + if not self.check_tile_shape_isvalid(td): + continue + + self._expand_reduce_axis(td) + for codegen_dicts in self.assign_block_size(td): + results.append(codegen_dicts) + if len(results) >= topk: + break + if len(results) >= topk: + break + return results + + def dfs_smem_tile(self, init_tile, rstep_map) -> Iterable[TileDict]: + _steps = [get_all_factors(n) for n in self.prim_func_node.get_space_dim()] + steps = [step[step.index(t) :] for step, t in zip(_steps, init_tile)] + for i in range(len(steps)): + added = list( + filter( + lambda s: s < steps[i][-1] and s > steps[i][0] and s not in steps[i], + [2, 4, 8, 16, 32], + ) + ) + steps[i].extend(added) + steps[i] = sorted(steps[i]) + visited_tiles = {} + queue = PriorityQueue() + + def prio(td: TileDict): + return (td.traffic + 1) * td.num_wave + + def add_to_queue(tile): + if tuple(tile) in visited_tiles: + return + td = self.compute_tile_dict(tile, rstep_map) + visited_tiles[tuple(tile)] = td + if td.valid: + queue.put([prio(td), tile]) + + add_to_queue(init_tile) + while not (queue.empty() or len(visited_tiles) > 2000): + _, tile = queue.get() + dim_ids = [step.index(t) for step, t in zip(steps, tile)] + for i in reversed(range(len(dim_ids))): + if dim_ids[i] + 1 < len(steps[i]): + new_tile = tile.copy() + new_tile[i] = steps[i][dim_ids[i] + 1] + add_to_queue(new_tile) + + visited_tiles = filter(lambda td: td.valid, visited_tiles.values()) + sorted_tiles = sorted(visited_tiles, key=lambda td: prio(td)) + return sorted_tiles + + def get_base_tile(self): + """ + Gets the minimum tile configuration that satisfies no redundancy in computation. + + Returns + ------- + List[int] + The base tile configuration, which is a list of 1s equal in length to the space dimensions + of the primary function node. + """ + shape = self.prim_func_node.get_space_dim() + base_tile = [1 for _ in shape] + + return base_tile + + # handles multiple output cases + def _get_output_tile_map(self, tile): + """ + Handles multiple output cases by mapping output nodes to their respective tile configurations. + + Parameters + ---------- + tile : List[int] + The tile configuration. + + Returns + ------- + Dict + A dictionary mapping the primary function node to its corresponding tile configuration + based on the output nodes' space dimensions. 
+ """ + tile_map = {} + tile_map[self.prim_func_node] = [ + tile[i] + * self.prim_func_node.get_space_dim()[i] + // self.output_nodes[0].get_space_dim()[i] + for i in range(len(tile)) + ] + return tile_map + + def score_block_size(self, n): + """ + Scores a block size based on its efficiency and fit relative to the architecture's warp size and SM partition. + + Parameters + ---------- + n : int + The block size to score. + + Returns + ------- + Tuple[float, float] + A tuple containing two scores representing efficiency and fit, respectively. + """ + num_wrap = (n + self.arch.warp_size - 1) // self.arch.warp_size + r1 = max(num_wrap / self.arch.sm_partition, self.arch.sm_partition / num_wrap) + r2 = (num_wrap * self.arch.warp_size - n) / n + return (r1, r2) + + def get_block_size(self, n): + """ + Determines the optimal block size for a given constraint, based on scoring various factors. + + Parameters + ---------- + n : int + The constraint size. + + Returns + ------- + int + The optimal block size chosen from the factors of n, constrained by a maximum of 1024 and + scored by the `score_block_size` method. + """ + factors = get_all_factors(n) + factors = list(filter(lambda x: x <= 1024, factors)) + factor_ordered = sorted(factors, key=self.score_block_size) + return factor_ordered[0] + + def get_node_reduce_step_candidates(self, node: PrimFuncNode): + """ + Calculates reduction step candidates for each reduction axis in a PrimFuncNode. General idea : use factor first, since it does not require extra boundary check. for large prime number, which is rare case, use power of 2. + + Parameters + ---------- + node : PrimFuncNode + The node for which to calculate reduction step candidates. It contains reduction axes (raxis) + with their domains (dom.extent). + + Returns + ------- + Dict[str, List[int]] + A dictionary mapping axis variable names to lists of step candidates. For each axis in the node, + this function calculates possible step sizes. For axes with a large prime domain, it uses powers of 2 + as step candidates; for others, it uses all factors of the domain. + """ + + results = {} + for k_iter in node.raxis: + all_factors = get_all_factors(int(k_iter.dom.extent)) + if len(all_factors) == 2 and int(k_iter.dom.extent) > 64: + all_factors = [1] + while all_factors[-1] * 2 < int(k_iter.dom.extent): + all_factors.append(all_factors[-1] * 2) + results[k_iter.var.name] = all_factors + return results + + def _assign_reduce_step(self, node: PrimFuncNode): + """ + Assigns an optimal reduction step for the given PrimFuncNode. + + Parameters + ---------- + node : PrimFuncNode + The node for which the reduction step is to be assigned. + + Returns + ------- + Dict + A dictionary mapping reduction axis variable names to their optimal reduction steps. 
+ """ + if node.reduction_block is None: + return {} + + raxis = node.raxis + tile = [1] * len(node.get_space_dim()) + all_steps = self.get_node_reduce_step_candidates(node) + + def sim(a: int, b: int): + return (2 * a * b) / (a * a + b * b) + + def _score(rstep_id): + rstep = {k: all_steps[k][rstep_id[k]] for k in rstep_id} + score = 0 + shape = node.propogate_inputs(tile, rstep=rstep) + for i, input_buffer in enumerate(node.input_buffers): + read_transaction_elements = self.arch.transaction_size[1] // ( + (node.get_buffer_dtype(input_buffer).bits + 7) // 8 + ) + score += sim( + int(coalesced_factor(shape[i], input_buffer.shape)), + read_transaction_elements, + ) + return score + + def _enlarge(rstep_id): + candidates = [] + candidates.append((rstep_id, _score(rstep_id))) + for ax in rstep_id: + if rstep_id[ax] + 1 == len(all_steps[ax]): + continue + r = rstep_id.copy() + r[ax] += 1 + candidates.append((r, _score(r))) + best = max(candidates, key=lambda x: x[1]) + return best + + # enlarge rstep to ensure read is coaleased + cur_rstep_id = {ax.var.name: 0 for ax in raxis} + cur_score = _score(cur_rstep_id) + while True: + if cur_score == 0: + break + new_rstep, new_score = _enlarge(cur_rstep_id) + if new_score <= cur_score: + break + else: + cur_rstep_id, cur_score = new_rstep, new_score + rstep = {k: all_steps[k][cur_rstep_id[k]] for k in cur_rstep_id} + return rstep + + def _expand_reduce_axis(self, td: TileDict): + """ + Expands the reduction axis in the TileDict based on shared memory limits. + + Parameters + ---------- + td : TileDict + The TileDict object to be optimized. + + Returns + ------- + None + This function modifies the TileDict in place. + """ + smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap) + rstep_map = td.rstep_map.copy() + + def _optimize(node, rstep): + all_steps = self.get_node_reduce_step_candidates(node) + for k in all_steps: + all_steps[k] = list(filter(lambda x: x % rstep[k] == 0, all_steps[k])) + + def _score(rstep_id): + rstep = { + k.var.name: all_steps[k.var.name][rstep_id[k.var.name]] for k in node.raxis + } + score = 0 + shape = node.propogate_inputs(td.get_tile(node), rstep=rstep) + for i, input_buffer in enumerate(node.input_buffers): + score += coalesced_factor(shape[i], input_buffer.shape) + return score + + def _enlarge(rstep_id): + candidates = [] + for ax in rstep_id: + if rstep_id[ax] + 1 == len(all_steps[ax]): + continue + r = rstep_id.copy() + r[ax] += 1 + candidates.append((r, _score(r))) + if len(candidates) == 0: + return None + return max(candidates, key=lambda x: x[1])[0] + + cur_rstep_id = { + k.var.name: all_steps[k.var.name].index(rstep[k.var.name]) for k in node.raxis + } + new_rstep_map = rstep_map.copy() + while True: + new_rstep_id = _enlarge(cur_rstep_id) + if new_rstep_id is None: + break + new_rstep_map = { + k.var.name: all_steps[k.var.name][new_rstep_id[k.var.name]] for k in node.raxis + } + old_rstep_map = td.rstep_map + td.rstep_map = new_rstep_map + smem_usage, _ = self._compute_shared_memory_usage(td) + td.rstep_map = old_rstep_map + if smem_usage > smem_limit: + break + else: + cur_rstep_id = new_rstep_id + rstep = { + k.var.name: all_steps[k.var.name][cur_rstep_id[k.var.name]] for k in node.raxis + } + return rstep + + for node in self.ordered_nodes: + if len(node.raxis) > 0: + rstep = _optimize(node, rstep_map) + rstep_map = rstep + td.rstep_map = rstep_map + td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td) + + def _compute_memory_traffic(self, output_tile): + 
""" + Computes the memory traffic for a given output tile configuration. + + Parameters + ---------- + output_tile : List[int] + The output tile configuration. + + Returns + ------- + Tuple[int, Dict] + The total memory traffic and a map of operation tiles. + """ + op_tile_map = self._get_output_tile_map(output_tile) + traffic = 0 + for node in reversed(self.ordered_nodes): + tile = op_tile_map[node] + input_shapes = node.propogate_inputs(tile) + output_shapes = node.propogate_outputs(tile) + for i, buffer in enumerate(node.input_buffers): + nbytes = (node.get_buffer_dtype(buffer).bits + 7) // 8 + read_transaction_elements = self.arch.transaction_size[1] // nbytes + traffic += ( + coalesced_tensor_shape(input_shapes[i], buffer.shape, read_transaction_elements) + * nbytes + ) + for i, buffer in enumerate(node.output_buffers): + nbytes = (node.get_buffer_dtype(buffer).bits + 7) // 8 + write_transaction_elements = self.arch.transaction_size[0] // nbytes + traffic += ( + coalesced_tensor_shape( + output_shapes[i], buffer.shape, write_transaction_elements + ) + * nbytes + ) + return traffic, op_tile_map + + def infer_node_smem_usage(self, td: TileDict, node: PrimFuncNode): + """ + Infers the shared memory usage of a node given a TileDict configuration. + + Parameters + ---------- + td : TileDict + The TileDict object containing the tile configuration. + node : PrimFuncNode + The node for which to infer the shared memory usage. + + Returns + ------- + int + The estimated amount of shared memory used by the node. + """ + return node.footprint(td.get_tile(node), td.get_rstep(node), td.tensor_strides_map[node]) + + def _compute_shared_memory_usage(self, td: TileDict): + """ + Computes the stride map for a given node and TileDict configuration. + + Parameters + ---------- + node : PrimFuncNode + The node for which to compute the stride map. + td : TileDict + The TileDict object containing the tile configuration. + + Returns + ------- + Tuple[Dict, Dict] + The output strides and tensor strides. + """ + self._compute_stride_map(td) + allocator = BestFit() + block_map = {} + cached_tensors_map = {} + + node_internal_bytes, cached_tensors_map[self.prim_func_node] = self.infer_node_smem_usage( + td, self.prim_func_node + ) + block = allocator.malloc(node_internal_bytes) + allocator.free(block) + assert len(block_map) == 0 + return allocator.limit, cached_tensors_map + + def compute_node_stride_map(self, node: PrimFuncNode, td: TileDict): + """ + Computes the stride map for a given node based on the TileDict configuration. + + Parameters + ---------- + node : PrimFuncNode + The node for which to compute the stride map. + td : TileDict + The TileDict object containing the tile configuration. + + Returns + ------- + Tuple[Dict, Dict] + A tuple of dictionaries containing the output strides and tensor strides. + """ + output_strides = { + int(i + len(node.input_buffers)): Stride() for i, _ in enumerate(node.output_buffers) + } + tensor_strides = {} + return output_strides, tensor_strides + + def _compute_stride_map(self, td: TileDict): + """ + Computes the stride map for all nodes in a TileDict. + + Parameters + ---------- + td : TileDict + The TileDict object for which to compute the stride maps. + + Returns + ------- + None + This function updates the TileDict object in place with the computed stride maps. 
+ """ + output_strides_map = {} + tensor_strides_map = {} + for node in self.ordered_nodes: + output_strides_map[node], tensor_strides_map[node] = self.compute_node_stride_map( + node, td + ) + td.output_strides_map, td.tensor_strides_map = output_strides_map, tensor_strides_map + + def compute_tile_dict(self, output_tile: List[int], rstep_map) -> TileDict: + """ + Computes and returns a TileDict object for a given output tile configuration and reduction step map. + + Parameters + ---------- + output_tile : List[int] + The output tile configuration. + rstep_map : Dict + The reduction step map. + + Returns + ------- + TileDict + A TileDict object containing the computed tile configuration, memory traffic, shared memory cost, + grid size, and other related parameters. + """ + td = TileDict(output_tile) + td.rstep_map = rstep_map + td.traffic, td.tile_map = self._compute_memory_traffic(output_tile) + td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td) + if td.smem_cost > self.arch.smem_cap: + td.valid = False + return td + output_shape = self.output_nodes[0].get_space_dim() + td.grid_size = int(np.prod([(y + x - 1) // x for x, y in zip(output_tile, output_shape)])) + # estimated reg usage + reg_usage = int( + 2 + * max( + [ + np.prod(td.get_tile(node)) * node.get_dtype().bits / 32 + for node in self.ordered_nodes + ] + ) + ) + if reg_usage > self.arch.reg_cap: + td.valid = False + return td + td.block_per_SM = min( + self.arch.max_smem_usage // max(td.smem_cost, 1), + self.arch.reg_cap // max(reg_usage, 1), + self.arch.sm_partition, + ) + td.num_wave = int(np.ceil(td.grid_size / int(td.block_per_SM * self.arch.compute_max_core))) + return td + + def check_tile_shape_isvalid(self, td: TileDict) -> bool: + """ + Checks if the tile shapes in the TileDict are valid for the nodes in this context. + + Parameters: + - td (TileDict): The TileDict object containing tile shapes and other configurations. + + Returns: + - bool: True if all tile shapes are valid, False otherwise. + """ + for node in self.ordered_nodes: + if np.prod(td.get_tile(node)) == 0: + return False + node_grid_size = np.prod( + [(y + x - 1) // x for x, y in zip(td.get_tile(node), node.get_space_dim())] + ) + if node_grid_size != td.grid_size: + return False + if ( + hasattr(node, "reduce_op") + and node.reduce_op is not None + and len(node.reduce_op.axis) == len(td.output_tile) + ): + for i, tile_extent in enumerate(td.output_tile): + if node.reduce_op.axis[i].dom.extent % tile_extent: + return False + + return True + + def recommend_block_size(self, td: TileDict) -> List[int]: + """ + Recommends optimal block sizes based on the TileDict configuration. + + Parameters + ---------- + td : TileDict + The TileDict object containing the tile configuration. + + Returns + ------- + List[int] + A list of recommended block sizes sorted based on their score. 
+ """ + node_space_sizes = [int(np.prod(td.get_tile(node))) for node in self.ordered_nodes] + max_block_size = functools.reduce(math.gcd, node_space_sizes) + + if max_block_size < self.arch.warp_size * self.arch.sm_partition and max_block_size == min( + node_space_sizes + ): + node_reduce_sizes = [ + int(np.prod(list(td.get_rstep(node).values()))) for node in self.ordered_nodes + ] + total_sizes = [x * y for x, y in zip(node_space_sizes, node_reduce_sizes)] + max_possible_size = functools.reduce(math.gcd, total_sizes) + possible_block_sizes = list( + filter( + lambda x: x % max_block_size == 0 and x <= 1024, + get_all_factors(max_possible_size), + ) + ) + possible_block_sizes = list( + filter( # either be a factor of space or cover fully cover the space + lambda x: all([x % s == 0 or s % x == 0 for s in node_space_sizes]), + possible_block_sizes, + ) + ) + factor_ordered = sorted(possible_block_sizes, key=self.score_block_size) + return factor_ordered + else: + possible_block_sizes = get_all_factors(max_block_size) + possible_block_sizes = list(filter(lambda x: x <= 1024, possible_block_sizes)) + factor_ordered = sorted(possible_block_sizes, key=self.score_block_size) + return factor_ordered + + def assign_block_size(self, td: TileDict, topk=1): + """ + Assigns block sizes to the TileDict based on the recommended block sizes. + + Parameters + ---------- + td : TileDict + The TileDict object to assign block sizes to. + topk : int, optional + The number of top block sizes to consider. + + Yields + ------- + Dict + The block size assignment for the primary function node. + """ + block_size_ordered = self.recommend_block_size(td) + for block_size in block_size_ordered: + result = {} + failed = False + result = self._assign_block_size(self.prim_func_node, td, block_size) + if result is None: + failed = True + break + if failed: + continue + else: + yield result + topk -= 1 + if topk == 0: + break + + def _assign_block_size(self, node: PrimFuncNode, td: TileDict, block_size: int): + """ + Assigns a block size to a given PrimFuncNode based on the TileDict configuration and the specified block size. + + Parameters + ---------- + node : PrimFuncNode + The node to assign the block size to. + td : TileDict + The TileDict object containing the tile configuration. + block_size : int + The block size to be assigned. + + Returns + ------- + Config + A Config object containing the assigned block size and other related settings. 
+ """ + tile, rsteps = td.get_tile(node), td.get_rstep(node) + factors = factorize(block_size) + cur_threads = [1 for _ in tile] + reduce_thread = {k: 1 for k in rsteps} + ndim = len(tile) + + def _score(node, thread): # small is better + score = 0 + block_tile = [int(np.ceil(tile[i] / thread[i])) for i in range(ndim)] + shape = node.propogate_inputs(block_tile) + for i, buffer in enumerate(node.input_buffers): + score += np.prod(shape[i]) / self.arch.bandwidth[1] + for buffer in node.output_buffers: + score += coalesced_tensor_shape(thread, buffer.shape, 8) / self.arch.bandwidth[0] + return score + + for factor in reversed(factors): + score_map = {} + for i in range(ndim): + if cur_threads[i] >= tile[i]: + continue + if (tile[i] % (cur_threads[i] * factor)) != 0: + continue + cur_threads[i] *= factor + score_map[i] = (_score(node, cur_threads), i) + cur_threads[i] //= factor + if len(score_map) > 0: + # assign to space axis + dim_order = sorted(score_map.keys(), key=lambda x: score_map[x]) + cur_threads[dim_order[0]] *= factor + else: + # assign to reduce axis + target_ax = None + for ax, ax_len in reversed(list(rsteps.items())): + if ax_len % (reduce_thread[ax] * factor) == 0: + target_ax = ax + break + assert target_ax + reduce_thread[target_ax] *= factor + + codegen_dict = Config() + codegen_dict.compute_capability = self.arch.compute_capability + codegen_dict.block = tile + codegen_dict.thread = cur_threads + codegen_dict.rstep = [rsteps[ax.var.name] for ax in node.raxis] + codegen_dict.reduce_thread = [reduce_thread[ax.var.name] for ax in node.raxis] + codegen_dict.cached_tensors = td.cached_tensors_map[node] + codegen_dict.rasterization_plan = self.plan_rasterization(td) + + if node.get_dtype().bits == 16: # set step=2 for 16bit case to ensure coalesced access + codegen_dict._step = [1 for _ in range(ndim)] + for i in reversed(range(ndim)): + if codegen_dict.block[i] // codegen_dict.thread[i] % 2 == 0: + codegen_dict._step[i] = 2 + break + elif node.get_dtype().bits == 8: # set step=4 for 8bit case to ensure coalesced access + codegen_dict._step = [1 for _ in range(ndim)] + for i in reversed(range(ndim)): + if codegen_dict.block[i] // codegen_dict.thread[i] % 4 == 0: + codegen_dict._step[i] = 4 + break + # Plan vectorize + codegen_dict.vectorize = self._plan_vectorize(node, td, block_size) + codegen_dict.arch = self.arch + codegen_dict.opt_shapes = self.prim_func_node.get_tag("opt_shapes") + return codegen_dict + + def _plan_vectorize(self, node: PrimFuncNode, td: TileDict, block_size: int): + """ + Plans vectorization for a given PrimFuncNode based on the TileDict configuration and block size. + + Parameters + ---------- + node : PrimFuncNode + The node for which to plan vectorization. + td : TileDict + The TileDict object containing the tile configuration. + block_size : int + The block size used for vectorization planning. + + Returns + ------- + Dict + A dictionary mapping tensors to their vectorization size. 
+ """ + + def is_cont(shape, vec): + if len(shape) == 0: + return vec == 1 + last = shape[-1] + if last == 1: + return is_cont(shape[0:-1], vec // last) + else: + return last % vec == 0 + + def is_shape_aligned(shape, factor): + return int(np.prod(shape)) % factor == 0 + + def is_type_allowed(dtype, vec): + return dtype.bits * vec <= 128 + + vectorize_sizes = [16, 8, 4, 2] + dtypes = node.get_reduce_inputs_dtype() + shapes = node.propogate_reduction_inputs(td.get_tile(node), td.get_rstep(node)) + vectorize_result = {} + for tensor, shape in shapes.items(): + for v in vectorize_sizes: + if ( + is_shape_aligned(shape, block_size * v) + and is_cont(shape, v) + and is_type_allowed(dtypes[tensor], v) + ): + vectorize_result[tensor] = v + break + return vectorize_result + + def plan_rasterization(self, td: TileDict): # pylint: disable=unused-argument + """ + Plans the rasterization for the given TileDict. This function is not implemented yet. + + Parameters + ---------- + td : TileDict + The TileDict object to plan rasterization for. + + Raises + ------- + RasterRationPlan + This function is not implemented yet. + """ + return NoRasterization() diff --git a/python/bitblas/base/roller/policy/tensorcore.py b/python/bitblas/base/roller/policy/tensorcore.py new file mode 100644 index 0000000000..1171a79156 --- /dev/null +++ b/python/bitblas/base/roller/policy/tensorcore.py @@ -0,0 +1,338 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Policy for tensorcore schedule""" +import tvm +from typing import Dict, List, Tuple +import numpy as np + +from ..arch import Arch +from ..config import Config, Stride, TileDict, IntrinInfo +from ..node import PrimFuncNode +from .common import coalesced_factor, factorize, get_all_factors +from .default import DefaultPolicy +from ..rasterization import * + + +class TensorCorePolicy(DefaultPolicy): + def __init__(self, func: tvm.tir.PrimFunc, arch: Arch, tags: Dict = {}) -> None: + super().__init__(func, arch, tags) + # this is the trick for wmma. + # However, for int8 mma, the wmma_k should be 32. 
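+        # Note: the value below always uses the fp16 tensor-core reduction tile; per
+        # the comment above, an int8 mma path would instead require wmma_k = 32,
+        # which is not handled here.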
+ self.wmma_k = 16 + self.pipeline_stage: int = 1 + self.use_async_copy: bool = False + self._legalize_info() + + def _legalize_info(self): + pipleline_stage = self.prim_func_node.get_tag("pipeline_stage") + if pipleline_stage: + self.pipeline_stage = pipleline_stage + else: + if self.arch.compute_capability == "sm_80": + self.pipeline_stage = 2 + else: + self.pipeline_stage = 1 + use_async_copy = self.prim_func_node.get_tag("use_async_copy") + if use_async_copy: + self.use_async_copy = use_async_copy + else: + if self.arch.compute_capability == "sm_80": + self.use_async_copy = 1 + else: + self.use_async_copy = 0 + + def _compute_tc_strides( + self, node: PrimFuncNode, tile: List[int], rstep: Dict[str, int] = {} + ) -> Tuple[Stride, Stride, Stride]: + # strides was used for shared memory padding. which is necessary for avoiding + # shared memory load bank conflict when we do not applying tensorcore layout. + shapes = node.propogate_reduction_inputs(tile, rstep) + AS_shape, BS_shape = shapes.values() + CS_shape = tile + A_ax_m, A_ax_k, B_ax_k, B_ax_n, C_ax_m, C_ax_n = node.infer_tensorcore_axis() + + # applying strides + # TODO(leiwang1999): offset should be dynamically set. we can use tag -> enable_offset to control this option.. + offset = 8 + A_high_ax = min(A_ax_m, A_ax_k) + B_high_ax = min(B_ax_n, B_ax_k) + C_high_ax = min(C_ax_m, C_ax_n) + A_stride = Stride(stride=np.prod(AS_shape[A_high_ax + 1 :]) + offset, ax=A_high_ax) + B_stride = Stride(stride=np.prod(BS_shape[B_high_ax + 1 :]) + offset, ax=B_high_ax) + C_stride = Stride(stride=np.prod(CS_shape[C_high_ax + 1 :]) + offset, ax=C_high_ax) + return A_stride, B_stride, C_stride + + def infer_node_smem_usage(self, td: TileDict, node: PrimFuncNode): + value, cached_tensors = super().infer_node_smem_usage(td, node) + value *= self.pipeline_stage + return value, cached_tensors + + def _assign_reduce_step(self, node): + if not node.get_tag("tensorcore_config"): + return super()._assign_reduce_step(node) + # get reduce input size + target_transaction = self.arch.transaction_size[0] * 2 + # 512 bytes // type bits + reduce_input_dtype = node.get_buffer_dtype( + node.block_analyzer.get_input_buffers(node.reduction_block)[0] + ) + basic = (target_transaction * 8) // reduce_input_dtype.bits + + result = {} + for iter_info in node.raxis: + iter_name = iter_info.var.name + iter_dom = iter_info.dom.extent + if iter_dom % 16 > 0: + result[iter_name] = 16 if iter_dom < basic else basic # for the case of padding + elif iter_dom % basic == 0: + result[iter_name] = basic + else: + return super()._assign_reduce_step(node) + return result + + def _expand_reduce_axis(self, td: TileDict): + # For tensorcore program, if we got a small tilesize, we should consider expand the reduce axis + # to improve compute efficiency. 
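+        # _check_small_tile below flags tiles where any space dimension is <= 32.
+        # For such tiles, _optimize repeatedly moves a reduce axis to its next larger
+        # step candidate (a multiple of wmma_k for tensor-core configs) and keeps the
+        # enlargement only while the per-block shared-memory footprint stays within
+        # smem_limit.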
+ def _check_small_tile(td: TileDict): + minimal_threadhold = 32 + for node in self.ordered_nodes: + tile = td.get_tile(node) + if any([t <= minimal_threadhold for t in tile]): + return True + return False + + if not _check_small_tile(td): + return None + + smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap) + rstep_map = td.rstep_map.copy() + + def _optimize(node, rstep): + all_steps = self.get_node_reduce_step_candidates(node) + # todo(lei): optimzie the all_steps enlarge policy to be a multiple of the original all_steps[k] + for k in all_steps: + all_steps[k] = list(filter(lambda x: x % rstep[k] == 0, all_steps[k])) + if any([v == [] for v in all_steps.values()]): + return rstep + + def _shared_memory_usage(td: TileDict): + return node.footprint(td.output_tile, new_rstep_map, td.tensor_strides_map[node]) + + def _score(rstep_id): + rstep = { + k.var.name: all_steps[k.var.name][rstep_id[k.var.name]] for k in node.raxis + } + score = 0 + shape = node.propogate_inputs(td.get_tile(node), rstep=rstep) + for i, input_buffer in enumerate(node.input_buffers): + score += coalesced_factor(shape[i], input_buffer.shape) + return score + + def _enlarge(rstep_id): + candidates = [] + for ax in rstep_id: + if rstep_id[ax] + 1 == len(all_steps[ax]): + continue + r = rstep_id.copy() + r[ax] += 1 + candidates.append((r, _score(r))) + if len(candidates) == 0: + return None + return max(candidates, key=lambda x: x[1])[0] + + cur_rstep_id = { + k.var.name: all_steps[k.var.name].index(rstep[k.var.name]) for k in node.raxis + } + new_rstep_map = rstep_map.copy() + while True: + new_rstep_id = _enlarge(cur_rstep_id) + if new_rstep_id is None: + break + new_rstep_map = { + k.var.name: all_steps[k.var.name][new_rstep_id[k.var.name]] for k in node.raxis + } + old_rstep_map = td.rstep_map + td.rstep_map = new_rstep_map + smem_usage, _ = _shared_memory_usage(td) + td.rstep_map = old_rstep_map + if smem_usage > smem_limit: + break + else: + cur_rstep_id = new_rstep_id + rstep = { + k.var.name: all_steps[k.var.name][cur_rstep_id[k.var.name]] for k in node.raxis + } + return rstep + + for node in self.ordered_nodes: + if len(node.raxis) > 0: + rstep = _optimize(node, rstep_map) + rstep_map = rstep + td.rstep_map = rstep_map + td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td) + return + + def get_node_reduce_step_candidates(self, node): + if not node.get_tag("tensorcore_config"): + return super().get_node_reduce_step_candidates(node) + else: + # must be a a multiple of wmma_k + return { + k.var.name: [ + x * self.wmma_k for x in get_all_factors(int(k.dom.extent) // self.wmma_k) + ] + for k in node.raxis + } + + def check_tile_shape_isvalid(self, td: TileDict): + for node in self.ordered_nodes: + if node.get_tag("tensorcore_config"): + ax_m, ax_n = node.get_tag("tensorcore_config") + block_m, block_n = td.tile_map[node][ax_m], td.tile_map[node][ax_n] + # check the tile size is valid + wmma_invalid = [ + block_m < wmma_m or block_n < wmma_n + for wmma_m, wmma_n in self.arch.get_avaliable_tensorintrin_shapes() + ] + if all(wmma_invalid): + return False + if any([y % x for x, y in zip(td.tile_map[node], node.get_space_dim())]): + return False + return super().check_tile_shape_isvalid(td) + + def _can_implement_layout(self, node: PrimFuncNode, td: TileDict): + # Not implemented yet + # This function is used to check whether we can implement swizzling + # layout under this tile config + return False + + def compute_node_stride_map(self, node: PrimFuncNode, td: 
TileDict): + if not node.get_tag("tensorcore_config"): + return super().compute_node_stride_map(node, td) + use_layout = self._can_implement_layout(node, td) + + AS_stride, BS_stride, C_stride = self._compute_tc_strides( + node, td.get_tile(node), td.get_rstep(node) + ) + A_stride, B_stride, _ = self._compute_tc_strides(node, td.get_tile(node)) + tensor_strides = {} + output_strides = { + int(i + len(node.input_buffers)): Stride() for i, _ in enumerate(node.output_buffers) + } + tensor_strides = {} + # when connected to shared input, should use full stride without rstep + for i, (stride, stride_full) in enumerate( + zip([AS_stride, BS_stride], [A_stride, B_stride]) + ): + if use_layout: + continue + _ = node.block_analyzer.get_input_buffers(node.reduction_block)[i].name + # TODO(lei): should dig further for shared memory connection case. + + return output_strides, tensor_strides + + def _assign_block_size(self, node: PrimFuncNode, td: TileDict, block_size: int): + if not node.get_tag("tensorcore_config"): + return super()._assign_block_size(node, td, block_size) + ax_m, ax_n = node.get_tag("tensorcore_config") + if block_size % self.arch.warp_size != 0: + return None + tile, rsteps = td.get_tile(node), td.get_rstep(node) + warps = block_size // self.arch.warp_size + ndim = len(tile) + + wmma = self.arch.get_avaliable_tensorintrin_shapes()[-1] + wmma_tile = [1 for _ in range(ndim)] + wmma_tile[ax_m] = wmma[0] + wmma_tile[ax_n] = wmma[1] + + space = [tile[i] // wmma_tile[i] for i in range(ndim)] + if tile[ax_m] < wmma_tile[ax_m] or tile[ax_n] < wmma_tile[ax_n]: + # allow pad, otherwise, we can not get a valid tile shape + return None + if np.prod(space) % warps != 0: + return None + factors = factorize(np.prod(space) // warps) + + def _score(node, thread): # small is better + score = 0 + block_tile = [int(np.ceil(tile[i] / thread[i])) for i in range(ndim)] + shape = node.propogate_inputs(block_tile) + for i, _ in enumerate(node.input_buffers): + score += np.prod(shape[i]) / self.arch.bandwidth[1] + return score + + warp_tile = wmma_tile.copy() + for factor in reversed(factors): + score_map = {} + for i in range(ndim): + if tile[i] % (warp_tile[i] * factor) != 0: + continue + warp_tile[i] *= factor + score_map[i] = (_score(node, warp_tile), i) + warp_tile[i] //= factor + if len(score_map) == 0: + return None + dim_order = sorted(score_map.keys(), key=lambda x: score_map[x]) + warp_tile[dim_order[0]] *= factor + + codegen_dict = Config() + codegen_dict.block = tile + codegen_dict.warp = warp_tile + codegen_dict.use_tc = True + codegen_dict.pipeline_stage = self.pipeline_stage + codegen_dict.use_async = self.use_async_copy + codegen_dict.rstep = [int(rsteps[ax.var.name]) for ax in node.raxis] + codegen_dict.cached_tensors = td.cached_tensors_map[node] + codegen_dict.rasterization_plan = self.plan_rasterization(td) + codegen_dict.wmma = wmma + [self.wmma_k] + + intrin_info = node.get_tag("intrin_info") + if intrin_info: + codegen_dict.intrin_info = IntrinInfo(**intrin_info) + + codegen_dict.complete_config(node) + codegen_dict.vectorize = self._plan_vectorize(self.prim_func_node, td, block_size) + codegen_dict.arch = self.arch + codegen_dict.opt_shapes = self.prim_func_node.get_tag("opt_shapes") + return codegen_dict + + def plan_rasterization(self, td: TileDict): + conditions = [] + # only support single node for now + conditions.append(len(self.ordered_nodes) > 1) + # small op don't need this + conditions.append(td.num_wave < 4) + # only on Ampere+ arch + 
conditions.append(self.arch.compute_capability < "80") + + def _check_memory_size(): + overall_gmem_size_in_bytes: int = 0 + for node in self.ordered_nodes: + for arg in node.args: + overall_gmem_size_in_bytes += ( + int(np.prod(arg.shape)) * tvm.DataType(arg.dtype).bits // 8 + ) + return overall_gmem_size_in_bytes < (self.arch.l2_cache_size_bytes * 4) + + conditions.append(_check_memory_size()) + if any(conditions): + return NoRasterization() + # otherwise, simply provide a block rasterization factor + raster_factor = int(self.arch.compute_max_core**0.5) + + return Rasterization2DColumn(raster_factor) diff --git a/python/bitblas/base/roller/rasterization.py b/python/bitblas/base/roller/rasterization.py new file mode 100644 index 0000000000..3244a748e8 --- /dev/null +++ b/python/bitblas/base/roller/rasterization.py @@ -0,0 +1,98 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Rasteration Plan For L2 Cache Locality""" + +from typing import List + + +class Rasterization: + def __init__(self) -> None: + pass + + def get_code(self) -> List[str]: + raise NotImplementedError() + + +class NoRasterization(Rasterization): + def __init__(self) -> None: + super().__init__() + + def __repr__(self) -> str: + return "" + + def get_code(self) -> List[str]: + return [] + + +class Rasterization2DRow(Rasterization): + """ + Rasterization by Row, each Row line width is panel_width + _________ + _________| + |_________ + __________| + """ + + def __init__(self, panel_width=4) -> None: + super().__init__() + self.panel_width_ = panel_width + + def __repr__(self) -> str: + return f"" + + def get_code(self) -> List[str]: + raise NotImplementedError() + + +class Rasterization2DColumn(Rasterization): + """ + Rasterization by Column, each column line width is panel_width + _ + | | | | + | | | | + |_| |_| + """ + + def __init__(self, panel_width=4) -> None: + super().__init__() + self.panel_width_ = panel_width + + def __repr__(self) -> str: + return f"" + + def get_device_function(self) -> str: + return """ +__device__ dim3 rasterization2DColumn(const int panel_width) { + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +panel_width * gridDim.x - 1) / (panel_width * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (panel_width *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?panel_width : (totalBlock - panelIdx * (panel_width *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * panel_width * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * panel_width *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * panel_width *gridDim.x) % strideLd + panelIdx * panel_width; + const auto bz = blockIdx.z; + + dim3 blockIdx(bx, by, bz); + return blockIdx; +} + """ + + def get_code(self) -> List[str]: + return [ + self.get_device_function(), + "const dim3 blockIdx(rasterization2DColumn({});".format(self.panel_width_), + ] diff --git a/python/bitblas/base/roller/shape_inference/__init__.py b/python/bitblas/base/roller/shape_inference/__init__.py new file mode 100644 index 0000000000..25d0caf480 --- /dev/null +++ b/python/bitblas/base/roller/shape_inference/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from .tir import get_analyzer_by_tir # pylint: disable=unused-import diff --git a/python/bitblas/base/roller/shape_inference/common.py b/python/bitblas/base/roller/shape_inference/common.py new file mode 100644 index 0000000000..5f04884fa4 --- /dev/null +++ b/python/bitblas/base/roller/shape_inference/common.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
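+# Illustrative usage sketch of the rasterization classes defined in
+# roller/rasterization.py above; the import path is assumed from this patch's
+# file layout and is not part of the original code.
+#
+#   from bitblas.base.roller.rasterization import NoRasterization, Rasterization2DColumn
+#   raster = Rasterization2DColumn(panel_width=4)
+#   lines = raster.get_code()     # [CUDA device function source, invocation statement]
+#   NoRasterization().get_code()  # [] -> the emitted kernel is left unchanged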
+from collections import OrderedDict +from typing import Dict, List + +from tvm import arith + + +class Statement(): + def __init__(self, output: str, dependent_region: dict, var_map: OrderedDict, range_map: OrderedDict): + self.output = output + self.dependent_region = dependent_region + self.var_map = var_map + self.range_map = range_map + +def _merge_two_bounds(x: arith.ConstIntBound, y: arith.ConstIntBound): + return arith.ConstIntBound(min(x.min_value, y.min_value), max(x.max_value, y.max_value)) + +class InputShapeInference(): + def __init__(self, deps: List[Statement]): + self.deps = deps + + def _infer(self, shape: Dict[str, List[arith.ConstIntBound]], rstep: Dict[str, int]): + shape = shape.copy() + ana = arith.Analyzer() + for dep in reversed(self.deps): + for var, bound in zip(dep.var_map.values(), shape[dep.output]): + ana.update(var, bound) + for var, bound in dep.range_map.items(): + if var.name in rstep: + bound = arith.ConstIntBound(0, min(bound.max_value, rstep[var.name] - 1)) + ana.update(var, bound) + for name, regions in dep.dependent_region.items(): + for region in regions: + bounds = [ana.const_int_bound(index) for index in region] + if name in shape: # simply merge two bounds + bounds = [_merge_two_bounds(x, y) for x, y in zip(shape[name], bounds)] + shape[name] = bounds + + for name, bounds in shape.items(): + shape[name] = [c.max_value - c.min_value + 1 for c in bounds] + return shape + + def infer(self, shape, rstep: Dict[str, int] = {}): + if isinstance(shape, (list, tuple)): + shape = {"output0" : [arith.ConstIntBound(0, val - 1) for val in shape]} + shape = self._infer(shape, rstep) + return shape + + def get_input_exprs(self, output_exprs): + result = output_exprs.copy() + ana = arith.Analyzer() + for dep in reversed(self.deps): + for var, expr in zip(dep.var_map.values(), result[dep.output]): + ana.bind(var, expr) + for var in dep.range_map: + ana.bind(var, 0) + for name, regions in dep.dependent_region.items(): + if name in result: + continue + region = regions[0] + input_expr = [ana.simplify(index) for index in region] + result[name] = input_expr + return result + diff --git a/python/bitblas/base/roller/shape_inference/tir.py b/python/bitblas/base/roller/shape_inference/tir.py new file mode 100644 index 0000000000..aa6f90b6c7 --- /dev/null +++ b/python/bitblas/base/roller/shape_inference/tir.py @@ -0,0 +1,412 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
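+# Illustrative sketch of the bound handling used by InputShapeInference in
+# shape_inference/common.py above (assumed example values, not part of the
+# original code):
+#
+#   from tvm import arith
+#   a = arith.ConstIntBound(0, 15)
+#   b = arith.ConstIntBound(8, 31)
+#   merged = arith.ConstIntBound(min(a.min_value, b.min_value),
+#                                max(a.max_value, b.max_value))  # covers [0, 31]
+#
+# InputShapeInference.infer() also accepts a plain output shape such as (128, 128),
+# which it rewrites to {"output0": [ConstIntBound(0, 127), ConstIntBound(0, 127)]}
+# before propagating the bounds backwards through the dependency statements.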
+from typing import Dict, List, Tuple, Set, Mapping +from tvm.tir.schedule.schedule import BlockRV +from tvm.ir import structural_equal +from tvm import arith, tir + + +class Statement: + def __init__(self, block_analyzer, block: BlockRV): + self.block_analyzer = block_analyzer + self.block = block + # assume one tir block only has one output buffer + self.dep_name = block_analyzer.get_output_buffers(block)[0].name + self.dependent_region = _extract_dependent_region(block_analyzer, block) + + self.reverse_bound_inference = {} + + def make_reverse(self, input_name: str, input_iter: List[tir.PrimExpr]): + if len(self.block_analyzer.get_reduce_axis(self.block)) > 0: + return None + if len(self.dependent_region[input_name]) != 1: + return None + indices = self.dependent_region[input_name][0] + iter_map_range = { + _iter.var: _iter.dom for _iter in self.block_analyzer.get_spatial_axis(self.block) + } + iter_map_result = arith.detect_iter_map( + indices, + iter_map_range, + check_level=arith.iter_affine_map.IterMapLevel.Surjective, + simplify_trivial_iterators=False, + ) + if len(iter_map_result.errors) > 0: + return None + results = arith.iter_affine_map.inverse_affine_iter_map(iter_map_result.indices, input_iter) + output_indices = [] + for _iter in self.block_analyzer.get_spatial_axis(self.block): + if _iter.var in results: + output_indices.append(results[_iter.var]) + else: + # not Bijective mapping case + output_indices.append(tir.Var("undefined", dtype="int32") % int(_iter.dom.extent)) + return output_indices + + +def _merge_two_bounds(x: arith.ConstIntBound, y: arith.ConstIntBound): + return arith.ConstIntBound(min(x.min_value, y.min_value), max(x.max_value, y.max_value)) + + +class TensorDepNode(object): + """ + For tensor dependency analysis. + """ + + def __init__(self, name): + self.name = name + self._next = [] + self._prev = [] + + def add_next(self, node): + self._next.append(node) + self.deduplicate(self._next) + + def add_prev(self, node): + self._prev.append(node) + self.deduplicate(self._prev) + + def deduplicate(self, lst): + seen = set() + lst[:] = [n for n in lst if not (n in seen or seen.add(n))] + + def __str__(self): + return self.name + + def __repr__(self): + return self.name + + +class DependencyAnalysis(object): + def __init__(self, deps): + self.deps = deps + # issue: duplicate name when we have two same ops. + self.name2dep = self._construct_unique_name2dep(deps) + self.mapping = {} # name -> TensorDepNode + + def _construct_unique_name2dep(self, deps): + """ + This is a workaround for the issue that we have two same ops' fuse case. 
+ See https://github.com/apache/tvm/issues/16433 + """ + _names:Set = set() + name2dep:Mapping = {} + for dep in deps: + output_buffer = dep.block_analyzer.get_output_buffers(dep.block)[0] + base_name = output_buffer.name + if base_name not in _names: + _names.add(base_name) + else: + i = 1 + while f"{base_name}_{i}" in _names: + i += 1 + base_name = f"{base_name}_{i}" + _names.add(base_name) + name2dep[base_name] = dep + return name2dep + + def get_or_create_node(self, name): + if name not in self.mapping: + self.mapping[name] = TensorDepNode(name) + return self.mapping[name] + + def traverse_dependencies(self, compute): + if isinstance(compute, Statement): + node = self.get_or_create_node( + compute.block_analyzer.get_output_buffers(compute.block)[0].name + ) + # Loop through input tensors + for input_buffer in compute.block_analyzer.get_input_buffers(compute.block): + # Get the input node + input_node = self.traverse_dependencies(input_buffer) + input_node.add_next(node) + node.add_prev(input_node) + elif isinstance(compute, tir.Buffer): + node = self.get_or_create_node(compute.name) + return node + + def analyze(self): + # Starting point for traversal + for _, compute in self.name2dep.items(): + self.traverse_dependencies(compute) + + def print_dependencies(self): + for name, node in self.mapping.items(): + print(f"{name} depends on {', '.join([prev.name for prev in node._prev])}") + + def find_path_from_source(self, start_name, target_name): + """ + Finds the path (if it exists) from a starting node (source) to a target node. + Returns the path as a list of nodes. + """ + visited = set() + path = [] + if self._find_path_recursive(self.mapping[start_name], target_name, visited, path): + return path + return [] + + def _find_path_recursive(self, current_node, target_name, visited, path): + """ + Recursive helper function for find_path_from_source. 
+ """ + if current_node.name == target_name: + path.append(current_node) + return True + + if current_node.name in visited: + return False + + visited.add(current_node.name) + path.append(current_node) + + for next_node in current_node._next: + if self._find_path_recursive(next_node, target_name, visited, path): + return True + + path.pop() + return False + + +class InputShapeInference: + def __init__(self, deps: List[Statement]): + self.deps = deps + self.target_mapping = {} + self.buffer_mapping = {} + self.reduce_axes = [] + for dep in self.deps: + for ax in dep.block_analyzer.get_reduce_axis(dep.block): + self.reduce_axes.append(ax) + self.dep_analysis = DependencyAnalysis(self.deps) + self.dep_analysis.analyze() + + def construct_dependency_target(self, targets: Tuple[str]): + if targets in self.target_mapping: + return self.target_mapping[targets] + # should be buffer name instead of block name + name2dep = { + dep.block_analyzer.get_output_buffers(dep.block)[0].name: dep for dep in self.deps + } + mapping = {} + input_vars = [] + for target in targets: + vars = [ + iter.var + for iter in name2dep[target].block_analyzer.get_spatial_axis(name2dep[target].block) + ] + input_vars.append(vars) + mapping[target] = [vars] + ana = arith.Analyzer() + + for dep in self.deps: + for name in dep.dependent_region: + if name not in mapping: + continue + dep_name = dep.dep_name + indices = mapping[name][0] + output_indices = dep.make_reverse(name, indices) + if dep_name in targets: + continue + if dep_name not in mapping: + mapping[dep_name] = [output_indices] + elif not region_exist_in_list(output_indices, mapping[dep_name]): + mapping[dep_name].append(output_indices) + + for dep in reversed(self.deps): + indices_list = mapping[dep.dep_name] + ax_vars = [iter.var for iter in dep.block_analyzer.get_spatial_axis(dep.block)] + for input_name, regions in dep.dependent_region.items(): + if input_name in targets: + continue + if input_name not in mapping: + mapping[input_name] = [] + for indices in indices_list: + for region in regions: + vmap = { + k: (tir.Cast(k.dtype, v) if v.dtype != k.dtype else v) + for k, v in zip(ax_vars, indices) + } + region = [ + ana.simplify(tir.stmt_functor.substitute(ax, vmap)) for ax in region + ] + if not region_exist_in_list(region, mapping[input_name]): + mapping[input_name].append(region) + buffers = [] + for dep in self.deps: + for buffer in dep.block_analyzer.get_buffers(dep.block): + buffers.append(buffer) + + for buffer in buffers: + self.buffer_mapping[buffer.name] = buffer + + self.target_mapping[targets] = input_vars, mapping + return input_vars, mapping + + def infer( + self, shape: Dict[str, List[arith.ConstIntBound]], rstep: Dict[str, int] = {}, targets=None + ): + compute_targets = tuple(shape.keys()) + input_vars, mapping = self.construct_dependency_target(compute_targets) + ana = arith.Analyzer() + results = {} + intermediate_bind = {} + for vars, bounds in zip(input_vars, shape.values()): + for var, bound in zip(vars, bounds): + ana.update(var, bound, True) + for ax in self.reduce_axes: + # assume the dom.min is always 0, maybe we can extend the IterInfo to include the min value. 
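+            # Clamp the bound of this reduce axis to a single reduction tile
+            # (at most `rstep` elements) when a step is given for it; otherwise
+            # use the full extent of the axis.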
+ if ax.var.name in rstep: + bound = arith.ConstIntBound( + int(ax.dom.min), int(ax.dom.min + min(ax.dom.extent, rstep[ax.var.name]) - 1) + ) + else: + bound = arith.ConstIntBound(int(ax.dom.min), int(ax.dom.min + ax.dom.extent - 1)) + ana.update(ax.var, bound, True) + + for name, regions in mapping.items(): + if targets is not None and name not in targets: + continue + if compute_targets[0:1] == compute_targets: + (compute_target,) = compute_targets + path = self.dep_analysis.find_path_from_source(name, compute_target) + if len(path) > 2: + intermediate_nodes = path[1:-1] + for node in intermediate_nodes: + iters = mapping[node.name] + if len(iters) != len(regions) or len(iters) != 1: + continue + if len(*iters) != len(*regions): + break + regions = iters + intermediate_bind[name] = compute_target + + for region in regions: + bound = [ana.const_int_bound(indice) for indice in region] + if name in results: # simply merge two bounds + bound = [_merge_two_bounds(x, y) for x, y in zip(results[name], bound)] + results[name] = bound + else: + for region in regions: + bound = [ana.const_int_bound(indice) for indice in region] + if name in results: # simply merge two bounds + bound = [_merge_two_bounds(x, y) for x, y in zip(results[name], bound)] + results[name] = bound + + for name, bounds in results.items(): + results[name] = [c.max_value - c.min_value + 1 for c in bounds] + return results, intermediate_bind + + def get_input_exprs(self, output_exprs): + input_vars, mapping = self.construct_dependency_target(tuple(output_exprs.keys())) + ana = arith.Analyzer() + for ax in self.reduce_axes: + ana.bind(ax.var, 0) + vmap = {} + for vars, exprs in zip(input_vars, output_exprs.values()): + for var, expr in zip(vars, exprs): + if expr.dtype != var.dtype: + expr = tir.Cast(var.dtype, expr) + vmap[var] = expr + result = {} + + for name, regions in mapping.items(): + region = regions[0] + result[name] = [ + ana.simplify(tir.stmt_functor.substitute(index, vmap)) for index in region + ] + return result + + +def region_exist_in_list(a, list) -> bool: + def expr_is_same(a, b) -> bool: + if isinstance(a, tir.IntImm) and isinstance(b, tir.IntImm): + return a.value == b.value + return structural_equal(a, b) + + def region_is_same(a, b) -> bool: + for indice_a, indice_b in zip(a, b): + if not expr_is_same(indice_a, indice_b): + return False + return True + + return any([region_is_same(a, x) for x in list]) + + +def walk_indice(expr): + if isinstance(expr, tir.expr.BinaryOpExpr): + a = walk_indice(expr.a) + b = walk_indice(expr.b) + if a is not None and b is not None: + return expr + else: + return None + elif isinstance(expr, tir.expr.ConstExpr): + return expr + elif isinstance(expr, tir.Var): + return expr + elif isinstance(expr, tir.ProducerLoad): + return None + elif isinstance(expr, tir.Cast): + a = walk_indice(expr.value) + if a is not None: + return expr + return None + elif isinstance(expr, tir.Call): + return None + else: + raise Exception("Unhandled node type in walk_indice(): %s" % expr) + + +def _extract_dependent_region(block_analyzer, block: BlockRV) -> Dict[str, List[tir.PrimExpr]]: + input_buffers = block_analyzer.get_input_buffers(block) + dependent_region = {buffer.name: [] for buffer in input_buffers} + + def fvisit(x): + if not isinstance(x, tir.BufferLoad): + return + if x.buffer.name not in dependent_region: + return + index = [] + for indice, shape_limit in zip(x.indices, x.buffer.shape): + expr = walk_indice(indice) + if expr is None: + expr = tir.Var("undefined", dtype="int8") % 
shape_limit + if isinstance(expr, tir.IntImm) and expr.value == 0: + """for tensor ir zero dim smplification case. + for ax0, ax1, ax2 in T.grid(T.int64(1024), T.int64(1024), T.int64(1024)): + with T.block("T_dense"): + v0, v1, v2 = T.axis.remap("SSR", [ax0, ax1, ax2]) + T.reads(A_reindex[T.int64(0), v0, v2], B_reindex[T.int64(0), v1, v2]) + T.writes(T_dense_reindex[T.int64(0), v0, v1]) + with T.init(): + T_dense_reindex[T.int64(0), v0, v1] = T.float16(0) + T_dense_reindex[T.int64(0), v0, v1] = T_dense_reindex[T.int64(0), v0, v1] + A_reindex[T.int64(0), v0, v2] * B_reindex[T.int64(0), v1, v2] + For exmaple, the T_dense_reindex has three dims, however there're only two spatial loops. + """ + continue + index.append(expr) + if not region_exist_in_list(index, dependent_region[x.buffer.name]): + dependent_region[x.buffer.name].append(index) + + stmt = block_analyzer.sch.get(block) + tir.stmt_functor.post_order_visit(stmt, fvisit=fvisit) + return dependent_region + + +def get_analyzer_by_tir(block_analyzer, args) -> InputShapeInference: + deps = [Statement(block_analyzer, block) for block in args] + + return InputShapeInference(deps) diff --git a/python/bitblas/base/schedule_rule.py b/python/bitblas/base/schedule_rule.py new file mode 100644 index 0000000000..a1dc4ea349 --- /dev/null +++ b/python/bitblas/base/schedule_rule.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A lightweight wrapper on an arbitrary function that can be used to schedule a TIR PrimFunc.""" +from typing import Callable, List, Union + +from tvm import tir +from tvm.target import Target + + +class ScheduleRule: # pylint: disable=too-few-public-methods + """A thin wrapper on an arbitrary function that can be used to schedule a TIR PrimFunc. + + Given a PrimFunc, a target, and a tunable flag, the apply method of a ScheduleRule + returns either a Schedule, a list of Schedules, or None, where None means that the rule + is not applicable to the given PrimFunc. If the tunable flag is True, the ScheduleRule is + allowed to return either a Schedule or a list of Schedules, and the Schedules are allowed to + contain tunable instructions. If the tunable flag is False, the ScheduleRule is only allowed to + return a Schedule, and the Schedule is not allowed to contain tunable instructions. + """ + + def apply( + self, + func: tir.PrimFunc, + target: Target, + tunable: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + """Apply the ScheduleRule to the given PrimFunc. + + Parameters + ---------- + func : tir.PrimFunc + The PrimFunc to apply the ScheduleRule to. + target : Target + The compilation target the schedule is supposed to be built for. + tunable : bool + Whether the schedule is allowed to contain tunable instructions. 
+ + Returns + ------- + results : Union[None, tir.Schedule, List[tir.Schedule]] + Either a Schedule, a list of Schedules, or None, where None means that the rule + is not applicable to the given PrimFunc. + """ + raise NotImplementedError + + def apply_config( + self, + func: tir.PrimFunc, + config, + ): + """Apply the ScheduleRule to the given PrimFunc. + + Parameters + ---------- + func : tir.PrimFunc + The PrimFunc to apply the ScheduleRule to. + target : Target + The compilation target the schedule is supposed to be built for. + configs : + # todo: Discribe the configs + Returns + ------- + results : Union[None, tir.Schedule, List[tir.Schedule]] + Either a Schedule, a list of Schedules, or None, where None means that the rule + is not applicable to the given PrimFunc. + """ + raise NotImplementedError + + @staticmethod + def from_callable( + name, + ) -> Callable[ + [ + Callable[ + [tir.PrimFunc, Target, bool], + Union[None, tir.Schedule, List[tir.Schedule]], + ], + ], + "ScheduleRule", + ]: + """Create a ScheduleRule from a callable. + + Parameters + ---------- + name : str + + Returns + ------- + decorator : Callable + A decorator that takes a callable and returns a ScheduleRule. + + Examples + -------- + .. code-block:: python + + @ScheduleRule.from_callable("MyRule") + def my_rule(func: tir.PrimFunc, target: Target, tunable: bool) -> Union[None, Schedule] + # Do something with func and target + """ + + def decorator(f) -> "ScheduleRule": # pylint: disable=invalid-name + class _Rule(ScheduleRule): + def apply( + self, + func: tir.PrimFunc, + target: Target, + tunable: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + return f(func, target, tunable) + + _Rule.__name__ = name + return _Rule() + + return decorator + + def is_target_available(self, target: Target) -> bool: # pylint: disable=unused-argument + """Check whether the rule is available for the given target. + + Parameters + ---------- + target : Target + The compilation target the schedule is supposed to be built for. + + Returns + ------- + available : bool + Whether the rule is available for the given target. + """ + return True diff --git a/python/bitblas/base/transform.py b/python/bitblas/base/transform.py new file mode 100644 index 0000000000..c0f8e87441 --- /dev/null +++ b/python/bitblas/base/transform.py @@ -0,0 +1,220 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
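+# Illustrative sketch of ScheduleRule.from_callable from schedule_rule.py above;
+# the rule body and the import path are placeholders, not part of the original patch.
+#
+#   from typing import List, Union
+#   from tvm import tir
+#   from tvm.target import Target
+#   from bitblas.base.schedule_rule import ScheduleRule
+#
+#   @ScheduleRule.from_callable("NoopRule")
+#   def noop_rule(func: tir.PrimFunc, target: Target,
+#                 tunable: bool) -> Union[None, tir.Schedule, List[tir.Schedule]]:
+#       return None  # None means the rule is not applicable to this PrimFunc
+#
+#   # noop_rule is now a ScheduleRule instance whose apply() delegates to the function.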
+""" +Apply ScheduleRules onto an IRModule to generate default schedules without tuning, +or a space for MetaSchedule tuning +""" +from typing import List, Optional, Dict +import os +import shutil +import tempfile +import os.path as osp +import tvm +from tvm import tir +from tvm import meta_schedule as ms +from tvm.ir import IRModule +from tvm.ir.transform import PassContext, module_pass +from tvm.target import Target +from .roller.policy import DefaultPolicy, TensorCorePolicy +from .roller.arch import CUDA +from .schedule_rule import ScheduleRule +from ..gpu.matmul_analysis import get_tensorized_func_and_tags +from ..base.analysis import check_func_with_dynamic +from .utils import apply_and_build, fast_tune, fast_tune_with_dynamic_range + + +def _is_scheduled(func: tir.PrimFunc) -> bool: + if not isinstance(func, tir.PrimFunc): + return False + if not func.attrs: + return False + if "tir.is_scheduled" not in func.attrs: + return False + return func.attrs["tir.is_scheduled"] == 1 + + +@module_pass(opt_level=0, name="ApplyDefaultSchedule") +class ApplyDefaultSchedule: # pylint: disable=too-few-public-methods + """A IRModule pass that applies a list of ScheduleRules to all PrimFuncs in the module.""" + + def __init__(self, *rules: ScheduleRule): + """Construct a new ApplyDefaultSchedule pass. + + Parameters + ---------- + *rules : ScheduleRule + The ScheduleRules to apply to all PrimFuncs in the module. + """ + self.rules = list(rules) + + def transform_module( # pylint: disable=missing-function-docstring + self, + mod: IRModule, + _: PassContext, + ) -> IRModule: + target = Target.current(allow_none=False) + + updated_functions = {} + for g_var, func in mod.functions_items(): + if isinstance(func, tir.PrimFunc) and not _is_scheduled(func): + sch = _apply_rules(func, target, self.rules, tunable=False) + if sch is not None: + assert len(sch) == 1 + updated_functions[g_var] = sch[0].mod["main"].with_attr("tir.is_scheduled", 1) + for g_var, func in updated_functions.items(): + mod[g_var] = func + return mod + + +@module_pass(opt_level=0, name="ApplyFastTuning") +class ApplyFastTuning: # pylint: disable=too-few-public-methods + """A IRModule pass that applies a list of ScheduleRules to all PrimFuncs in the module.""" + + def __init__( + self, + topk: int = 10, + target: Optional[Target] = None, + parallel_build: bool = True, + meta_database_dir: str = None, + dynamic_range: Dict[str, List[int]] = {}, + ): + """Construct a new ApplyFastTuning pass. + + Parameters + ---------- + meta_database : str + The path of database. + dynamic_range : Dict[str, List[int]] + Use for generate kernel based on dynamic range. 
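+        topk : int
+            The number of top-ranked candidate configs to build and profile for each PrimFunc.
+        target : Optional[Target]
+            The compilation target; defaults to the target of the current context.
+        parallel_build : bool
+            Whether candidate kernels are built in parallel.
+
+        Examples
+        --------
+        A minimal sketch of how this pass is expected to be invoked; the target tag
+        "nvidia/nvidia-a100" and the module name `mod` are placeholders, not part of this API.
+
+        .. code-block:: python
+
+            import tvm
+            from bitblas.base import ApplyFastTuning
+
+            # placeholder target and IRModule; substitute your own
+            with tvm.target.Target("nvidia/nvidia-a100"):
+                mod = ApplyFastTuning(topk=10)(mod)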
+ """ + self.topk = topk + self.target = Target.current() if target is None else target + self.parallel_build = parallel_build + self.meta_database_dir = meta_database_dir + self.dynamic_range = dynamic_range + self.temp_dir = tempfile.TemporaryDirectory() + print(f"[FastDlight] Using meta database dir {self.temp_dir}") + path_workload = osp.join(self.temp_dir.name, "database_workload.json") + path_tuning_record = osp.join(self.temp_dir.name, "database_tuning_record.json") + self.cache_meta_database = ms.database.JSONDatabase( + path_workload, path_tuning_record, module_equality="structural" + ) + + def transform_module( # pylint: disable=missing-function-docstring + self, + mod: IRModule, + _: PassContext, + ) -> IRModule: + target = self.target + updated_functions = {} + + for g_var, func in mod.functions_items(): + if isinstance(func, tir.PrimFunc) and not _is_scheduled(func): + # if g_var.name_hint not in ["extend_te"]: + # continue + print(f"[FastDlight] Start to apply fast tuning for {g_var}") + normalize_mod_func_ = tvm._ffi.get_global_func("tvm.meta_schedule.normalize_mod") + _normalized_func_mod = normalize_mod_func_(func) + + if self.cache_meta_database.has_workload(_normalized_func_mod): + tuning_record = self.cache_meta_database.query_tuning_record( + _normalized_func_mod, + target, + g_var.name_hint, + ) + if tuning_record: + trace = tuning_record.trace + sch = tvm.tir.Schedule(func) + trace.apply_to_schedule(sch, remove_postproc=False) + print(f"[FastDlight] Find Cache for {g_var}") + updated_functions[g_var] = sch.mod["main"].with_attr("tir.is_scheduled", 1) + continue + + if check_func_with_dynamic(func): + try: + dispatch_mod = fast_tune_with_dynamic_range( + func, + target=target, + topk=self.topk, + parallel_build=self.parallel_build, + global_symbol=g_var.name_hint, + dynamic_range=self.dynamic_range, + ) + except: + continue + if dispatch_mod: + for g, f in dispatch_mod.functions_items(): + if g.name_hint == g_var.name_hint: + # avoid duplicated global symbol + updated_functions[g_var] = f.without_attr("global_symbol").with_attr("tir.is_scheduled", 1) + else: + updated_functions[g] = f.with_attr("tir.is_scheduled", 1) + # cannot reuse meta database as it canot be recorvered from the trace + workload = self.cache_meta_database.commit_workload(_normalized_func_mod) + else: + # otherwise is static shape analysis + try: + _, best = fast_tune( + func, target=target, topk=self.topk, parallel_build=self.parallel_build + ) + except: + continue + if best is not None: + updated_functions[g_var] = best.sch.mod["main"].with_attr("tir.is_scheduled", 1) + workload = self.cache_meta_database.commit_workload(_normalized_func_mod) + # only record the best schedule + self.cache_meta_database.commit_tuning_record( + ms.database.TuningRecord( + best.sch.trace, + workload, + [best.latency], + target, + ms.arg_info.ArgInfo.from_prim_func(func=best.sch.mod["main"]), + ) + ) + + for g_var, func in updated_functions.items(): + mod[g_var] = func + + # copy database + if self.meta_database_dir is not None: + if not osp.exists(self.meta_database_dir): + os.makedirs(self.meta_database_dir) + # TODO(lei): maybe another way to copy the database + shutil.copytree(self.temp_dir.name, self.meta_database_dir, dirs_exist_ok=True) + + return mod + + def __del__(self): + # clean up the temp cache + self.temp_dir.cleanup() + + +def _apply_rules( + func: tir.PrimFunc, + target: Target, + rules: List[ScheduleRule], + tunable: bool, +) -> Optional[List[tir.Schedule]]: + for rule in rules: + space = 
rule.apply(func, target, tunable) + if space is None: + continue + if isinstance(space, tir.Schedule): + space = [space] + return space + return None diff --git a/python/bitblas/base/utils.py b/python/bitblas/base/utils.py new file mode 100644 index 0000000000..f8edd2159b --- /dev/null +++ b/python/bitblas/base/utils.py @@ -0,0 +1,479 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +import os +from tvm.contrib.popen_pool import PopenPoolExecutor, StatusKind, MapResult +from concurrent.futures import ThreadPoolExecutor, as_completed +import numpy as np +from typing import List, Tuple, Optional, Dict +from tvm import tir, IRModule +from tvm.runtime import Module +from tvm.tir import Schedule +from tvm import dlight as dl +from .analysis import get_root_block, get_reduction_blocks, find_var_from_func +from .roller.arch import Arch +from bitblas.base.roller.arch import CUDA +from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy +from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags +from bitblas.base.roller.rasterization import NoRasterization +import tempfile +import re +import itertools +from tvm.ir.supply import GlobalVarSupply + + +def match_global_kernel(source: str) -> int: + pattern = r"__global__\s+void\s+[__launch_bounds__\(\d+\)\s+]\w+" + matched = re.findall(pattern, source) + assert len(matched) > 1 # may have statement before kernel + return source.index(matched[0]) + + +def get_rasterization_code(pannel_width: int = 8) -> str: + return f""" + const int MAX_BLOCK_N = {pannel_width}; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + """ + + +class CompileResult: + """ + Class to store the result of compilation + """ + + def __init__(self, config, sch, mod: Module): + self.config = config + self.sch = sch + self.mod = mod + self.code = mod.imported_modules[0].get_source() if mod else None + self.latency = 1e9 + self.profile_tensors = [] + self.time_evaluator = None + + def profile(self): + return self.time_evaluator(*self.profile_tensors).mean + + +def _apply_config( + func: tir.PrimFunc, + config=None, # todo(lei): update typing +) -> Optional[List[tir.Schedule]]: + """ + find rules: + case 1. if the main block has no reduce op, then use the Elementwise rule. + case 2. if the config enabled tensorcore, then use the TensorCore rule. + case 3. if any([t > 1 for t in config.reduce_thread]), we should use the InnerThread Reduction Rule. + case 4. else we should use general reduction rule. + """ + print("[FastDlight] Apply config ", config) + + sch = tir.Schedule(func) + root_block = get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + reduction_blocks = get_reduction_blocks(sch, blocks) + try: + if not reduction_blocks: + return dl.gpu.ElementWise().apply_config(func, config) + elif config.use_tc: + if config.arch.sm_version >= 80: + # For A100(sm_80) or more advanced gpu, use MMA tensorization. + return dl.gpu.MatmulTensorizationMMA().apply_config(func, config) + else: + # For other GPUs, use WMMA tensorization. + return dl.gpu.MatmulTensorizationWMMA().apply_config(func, config) + else: + _reduction_rules = [] + + _reduction_rules.append(dl.gpu.GEMV()) + if not any([t > 1 for t in config.reduce_thread]): + # Matrix multiplication template doesn't support inner thread reduction + _reduction_rules.append(dl.gpu.Matmul()) + _reduction_rules.append(dl.gpu.GeneralReduction()) + + for rule in _reduction_rules: + try: + sch = rule.apply_config(func, config) + except: + continue + if sch is not None: + return sch + except Exception as e_msg: + print("[FastDlight] Apply config failed: ", e_msg) + return None + + +def apply_and_build_parallel(func, configs, arch, num_repeats=5, max_workers=10) -> CompileResult: + cpresults = [] + + def var_warpper(v): + if isinstance(v, tvm.tir.Var): + assert "opt_shapes" in func.attrs + assert v.name in func.attrs["opt_shapes"] + return func.attrs["opt_shapes"][v.name].value + elif isinstance(v, tvm.tir.IntImm): + return v.value + else: + raise RuntimeError("Not supported type: ", type(v)) + + profile_tensors = [] + for param in func.params: + if param not in func.buffer_map: + # in case of dynamic symbolic may in params + continue + arg = func.buffer_map[param] + if arg.dtype == "int8": + profile_tensors.append( + tvm.nd.array( + np.random.randint(-127, 128, [var_warpper(i) for i in arg.shape]).astype( + arg.dtype + ), + device=arch.device, + ) + ) + else: + profile_tensors.append( + tvm.nd.array( + np.random.uniform(0, 1, [var_warpper(i) for i in arg.shape]).astype(arg.dtype), + device=arch.device, + ) + ) + + max_workers = min(len(configs), os.cpu_count(), max_workers) + + # apply config in thread parallel + _sched: List[Schedule] = [] + with ThreadPoolExecutor(max_workers=4) as schduler: + futures = { + schduler.submit(lambda f, c: _apply_config(f, c), func, config) for 
config in configs + } + for future in as_completed(futures): + _sched.append(future.result()) + + builder = PopenPoolExecutor(max_workers=max_workers) + + # build in process parallel + def _build(context) -> str: + idx, mod, arch = context + + # TODO(lei): + # this is a trick to implement rasteration, will be removed in the future + config = configs[idx] + @tvm.register_func(func_name="tvm_callback_cuda_postproc", override=True) + def tvm_callback_cuda_postproc(code, _): + index = code.index("{", match_global_kernel(code)) + if not isinstance(config.rasterization_plan, NoRasterization): + factor = config.rasterization_plan.panel_width_ + rasterization_code = get_rasterization_code(factor) + code = code[: index + 2] + rasterization_code + code[index + 2 :] + return code + + with tvm.transform.PassContext(config={"tir.use_async_copy": True}): + rt_mod = tvm.build(mod["main"], target=arch.target) + + from tvm.contrib.tar import tar # pylint: disable=import-outside-toplevel + + artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format) + code = rt_mod.imported_modules[0].get_source() + rt_mod.export_library(artifact_path, fcompile=tar) + return idx, code, artifact_path + + for map_result in builder.map_with_error_catching( + _build, + [(i, sch.mod, arch) for i, sch in enumerate(_sched)], + ): + if map_result.status == StatusKind.TIMEOUT: + print("[FastDlight] LocalBuilder: Timeout") + elif map_result.status == StatusKind.EXCEPTION: + # TODO(lei): redirect the exception to file if needed + print("[FastDlight] LocalBuilder: An exception occurred ", map_result.value) + continue + elif map_result.status == StatusKind.COMPLETE: + idx, code, artifact_path = map_result.value + assert artifact_path is not None, "artifact_path is None" + + sch = _sched[idx] + config = configs[idx] + rt_mod = tvm.runtime.load_module(artifact_path) + cpresult = CompileResult(config, sch, rt_mod) + timer_cuda_mod = rt_mod.time_evaluator( + rt_mod.entry_name, arch.device, number=num_repeats + ) + cpresult.profile_tensors = profile_tensors + cpresult.time_evaluator = timer_cuda_mod + cpresult.code = code + cpresults.append(cpresult) + else: + raise ValueError(f"Unreachable: unexpected result: {map_result}") + + del builder + + best = None + best_latency = 1e9 + for cpresult in cpresults: + config = cpresult.config + try: + latency = cpresult.profile() + except Exception as e_mesg: + print("[FastDlight] Evaluation with config failed: ", e_mesg) + continue + print("[FastDlight] Evaluation with config ", config) + print("[FastDlight] Time cost of this config: {:.3f} ms".format(latency * 1e3)) + + cpresult.latency = latency + if latency < best_latency: + best_latency = latency + best = cpresult + + return cpresults, best + + +def apply_and_build( + func, + configs, + arch, + parallel_build=False, +) -> Tuple[List[CompileResult], CompileResult]: + max_workers = 10 if parallel_build else 1 + return apply_and_build_parallel(func, configs, arch, max_workers) + + +def fast_tune( + func: tir.PrimFunc, + target: tvm.target.Target, + topk: int = 10, + parallel_build: bool = True, +): + if target.kind.name != "cuda": + print("[FastDlight] Only support CUDA target") + return None, None + + specilized_func = func + if func.attrs is not None and "opt_shapes" in func.attrs: + opt_shapes = func.attrs["opt_shapes"] + # should be int value + if not all([isinstance(v.value, int) for v in opt_shapes.values()]): + print("[FastDlight] The opt_shapes should be int value") + return None, None + # currently only support one 
dynmaic range + if len(opt_shapes) > 1: + print("[FastDlight] Currently only support one dynamic range") + return None, None + + for buffer in func.buffer_map.values(): + for axis in buffer.shape: + if isinstance(axis, tvm.tir.Var): + if axis.name not in opt_shapes: + raise NotImplementedError( + "Currently do not support fast tune with none-dynamic range set" + ) + if opt_shapes: + for name, shape in opt_shapes.items(): + var = find_var_from_func(func, name) + specilized_func = func.specialize({var: shape.astype(var.dtype)}).with_attr( + "is_specialized" + ) + + arch = CUDA(target) + + policy = DefaultPolicy(func=func, arch=arch) + try: + specilized_func, tags = get_tensorized_func_and_tags(specilized_func, arch.target) + except Exception as e_msg: + print("[FastDlight] Get tensorized func and tags failed: ", e_msg) + tags = None + if tags: + policy = TensorCorePolicy(func=specilized_func, arch=arch, tags=tags) + + configs = policy.emit_config(topk) + cpresults, best = apply_and_build(func, configs, arch, parallel_build=parallel_build) + + return cpresults, best + + +# always use the first function as the base +def collect_buffers_to_declare(func): + params = [] + # collect dynamic symbolic + dyn_symbolic: List[tvm.tir.Var] = [] + buffers_to_declare = [] + for param in func.params: + if param not in func.buffer_map: + continue + buffer = func.buffer_map[param] + for axis in buffer.shape: + if isinstance(axis, tvm.tir.Var) and axis not in dyn_symbolic: + dyn_symbolic.append(axis) + buffers_to_declare.append(buffer) + params.append(buffer.data) + + # the args should be buffers + dynamic symbolic + params += list(dyn_symbolic) + + return params, buffers_to_declare + + +def refactor_specialized_func(g_var, func, params, buffers_to_declare): + body = func.body + attrs = func.attrs + global_symbol = g_var + if "opt_shapes" in func.attrs: + opt_shapes = func.attrs["opt_shapes"] + + def serialize_name(opt_shapes: Dict): + return "_opt_" + "_".join([f"{k}_{v}" for k, v in opt_shapes.items()]) + + global_symbol += serialize_name(opt_shapes) + ret_type = func.ret_type + for buf in buffers_to_declare: + body = tvm.tir.DeclBuffer(buf, body=body) + + # devide func must be private + device_func = tvm.tir.PrimFunc(params, body, ret_type, attrs=attrs).without_attr( + "global_symbol" + ) + return global_symbol, device_func + + +def create_dispatch_func(g_var: str, func: tir.PrimFunc, refactored_funcs: List[str]): + global_symbol = g_var + attrs = func.attrs + buffer_map = func.buffer_map + params = func.params + ret_type = func.ret_type + + # collect dynamic symbolic + dyn_symbolic: List[tvm.tir.Var] = [] + _invoke_params = [] + for param in func.params: + if param not in func.buffer_map: + continue + buffer = func.buffer_map[param] + for axis in buffer.shape: + if isinstance(axis, tvm.tir.Var) and axis not in dyn_symbolic: + dyn_symbolic.append(axis) + _invoke_params.append(buffer.data) + _invoke_params += list(dyn_symbolic) + + func_range: List[int] = [] + global_symbols = [] + for g_var, refactor_func in refactored_funcs: + opt_shapes = refactor_func.attrs["opt_shapes"] + func_range.append(list(opt_shapes.values())[0]) + global_symbols.append(g_var) + + # TODO(lei): general the dispatch function to support multiple dynamic symbolics + assert len(dyn_symbolic) == 1, "Only support one dyanmic symbolics currently" + + ib = tvm.tir.ir_builder.create() + syb = list(dyn_symbolic)[-1] + last_range = 0 + for i, (_range, g_var) in enumerate(zip(func_range, global_symbols)): + if i == 0: + with ib.if_scope(syb <= 
_range): + ib.emit(tvm.tir.Call(None, g_var, _invoke_params)) + else: + with ib.if_scope(tvm.tir.all(syb > last_range, syb <= _range)): + ib.emit(tvm.tir.Call(None, g_var, _invoke_params)) + last_range = _range + with ib.if_scope(syb > last_range): + ib.emit(tvm.tir.Call(None, g_var, _invoke_params)) + stmt = ib.get() + dispatch_func = tvm.tir.PrimFunc(params, stmt, ret_type, buffer_map, attrs).with_attrs( + {"tir.is_global_func": True, "global_symbol": global_symbol} + ) + return dispatch_func + + +def create_dispatch_mod( + g_var: str, original_func: tir.PrimFunc, specialized_funcs: List[tir.PrimFunc] +) -> IRModule: + dispatch_mod: IRModule = tvm.IRModule() + g_var_supply = GlobalVarSupply(dispatch_mod) + refactored_funcs = [] + for func in specialized_funcs: + params, buffers_to_declare = collect_buffers_to_declare(func) + global_symbol, device_func = refactor_specialized_func( + g_var, func, params, buffers_to_declare + ) + global_symbol = g_var_supply.fresh_global(global_symbol, add_prefix=False) + dispatch_mod[global_symbol] = device_func + refactored_funcs.append((global_symbol, device_func)) + dispatch_func = create_dispatch_func(g_var, original_func, refactored_funcs=refactored_funcs) + dispatch_mod.update(tvm.IRModule.from_expr(dispatch_func)) + return dispatch_mod + + +def fast_tune_with_dynamic_range( + func: tir.PrimFunc, + target: tvm.target.Target, + topk: int = 10, + parallel_build: bool = True, + global_symbol: Optional[str] = None, + dynamic_range: Dict[str, List[int]] = {}, +) -> IRModule: + if target.kind.name != "cuda": + print("[FastDlight] Only support CUDA target") + return None + if not global_symbol: + global_symbol = func.attrs["global_symbol"] + + # set opt_shapes for the primfunc with dynamc symbolic + opt_shapes: Dict[str, List[int]] = {} + for buffer in func.buffer_map.values(): + for axis in buffer.shape: + if isinstance(axis, tvm.tir.Var): + if axis.name in dynamic_range: + opt_shapes[axis.name] = dynamic_range[axis.name] + else: + raise ValueError(f"[FastDlight] The axis {axis.name} is not in dynamic_range") + func = func.with_attr("opt_shapes", opt_shapes) + + if "opt_shapes" not in func.attrs: + print("[FastDlight] The primfunc has no opt_shapes, please set opt_shapes for the primfunc") + return None + else: + # should be list value + if not all([isinstance(v, tvm.ir.Array) for v in func.attrs["opt_shapes"].values()]): + print("[FastDlight] The opt_shapes should be list value") + return None + + print("[FastDlight] Start fast tuning with dynamic range") + opt_shapes = func.attrs["opt_shapes"] + + # Step 1.Calculate the Cartesian product using itertools.product + product_list = list(itertools.product(*(opt_shapes[key] for key in opt_shapes))) + + # Convert the Cartesian product to a list of dictionaries + specialize_items: List[Dict] = [dict(zip(opt_shapes.keys(), values)) for values in product_list] + + specilized_tuned_funcs: List[tir.PrimFunc] = [] + for item in specialize_items: + func = func.with_attr("opt_shapes", item) + _, best = fast_tune(func, target, topk, parallel_build) + if best is None: + return None + specilized_tuned_funcs.append(best.sch.mod["main"]) + + return create_dispatch_mod(global_symbol, func, specilized_tuned_funcs) From 028c7710169168303363b7ac142e056a770f4553 Mon Sep 17 00:00:00 2001 From: LeiWang Date: Tue, 6 Feb 2024 10:54:09 -0400 Subject: [PATCH 002/286] gpu schedule --- python/bitblas/gpu/__init__.py | 32 + python/bitblas/gpu/base.py | 40 + python/bitblas/gpu/element_wise.py | 110 +++ python/bitblas/gpu/fallback.py | 91 
++ python/bitblas/gpu/gemv.py | 905 +++++++++++++++++++ python/bitblas/gpu/general_reduction.py | 478 ++++++++++ python/bitblas/gpu/intrin/lop3.py | 315 +++++++ python/bitblas/gpu/matmul.py | 383 ++++++++ python/bitblas/gpu/matmul_analysis.py | 631 ++++++++++++++ python/bitblas/gpu/matmul_mma.py | 692 +++++++++++++++ python/bitblas/gpu/matmul_mma_dequantize.py | 540 ++++++++++++ python/bitblas/gpu/matmul_wmma.py | 922 ++++++++++++++++++++ python/bitblas/gpu/reduction.py | 298 +++++++ python/bitblas/gpu/rmsnorm.py | 140 +++ python/bitblas/gpu/transpose.py | 129 +++ python/bitblas/gpu/utils.py | 99 +++ 16 files changed, 5805 insertions(+) create mode 100644 python/bitblas/gpu/__init__.py create mode 100644 python/bitblas/gpu/base.py create mode 100644 python/bitblas/gpu/element_wise.py create mode 100644 python/bitblas/gpu/fallback.py create mode 100644 python/bitblas/gpu/gemv.py create mode 100644 python/bitblas/gpu/general_reduction.py create mode 100644 python/bitblas/gpu/intrin/lop3.py create mode 100644 python/bitblas/gpu/matmul.py create mode 100644 python/bitblas/gpu/matmul_analysis.py create mode 100644 python/bitblas/gpu/matmul_mma.py create mode 100644 python/bitblas/gpu/matmul_mma_dequantize.py create mode 100644 python/bitblas/gpu/matmul_wmma.py create mode 100644 python/bitblas/gpu/reduction.py create mode 100644 python/bitblas/gpu/rmsnorm.py create mode 100644 python/bitblas/gpu/transpose.py create mode 100644 python/bitblas/gpu/utils.py diff --git a/python/bitblas/gpu/__init__.py b/python/bitblas/gpu/__init__.py new file mode 100644 index 0000000000..1824700dd9 --- /dev/null +++ b/python/bitblas/gpu/__init__.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +GPU-generic schedule rules. +For CUDA/ROCm/Vulkan/Metal-specific rules, use `tvm.dlight.cuda/rocm/vulkan/metal` instead +""" +from .fallback import Fallback +from .element_wise import ElementWise +from .gemv import GEMV +from .general_reduction import GeneralReduction +from .matmul import ( + Matmul, + MatmulTensorizationMMA, + MatmulTensorizationWMMA, + MatmulTensorizationLegacy, +) +from .reduction import Reduction +from .transpose import Transpose diff --git a/python/bitblas/gpu/base.py b/python/bitblas/gpu/base.py new file mode 100644 index 0000000000..b5cf0bb7a9 --- /dev/null +++ b/python/bitblas/gpu/base.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Base schedule rule for GPU operators.""" + +from tvm.target import Target + +from ..base import ScheduleRule + + +class GPUScheduleRule(ScheduleRule): # pylint: disable=too-few-public-methods + """The Schedule Rule specific to GPU targets, will return None if the target is not GPU.""" + + def is_target_available(self, target: Target) -> bool: + """Check whether the target is available for gpu rule. + + Parameters + ---------- + target : Target + The compilation target to check. + + Returns + ------- + available : bool + Whether the target is available for this rule. + """ + return super().is_target_available(target) and "gpu" in target.keys diff --git a/python/bitblas/gpu/element_wise.py b/python/bitblas/gpu/element_wise.py new file mode 100644 index 0000000000..e415ea76e5 --- /dev/null +++ b/python/bitblas/gpu/element_wise.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +"""A fallback schedule rule for GPU operators.""" +from typing import List + +from tvm import tir + +from ..base import ScheduleRule, normalize_prim_func, try_inline + + +class ElementWise(ScheduleRule): + """ + An elementwise schedule rule for GPU operators. 
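+
+    `apply_config` expects one factor per spatial axis in `config.block`, `config.thread`
+    and `config.step`: each spatial loop is split into block / vthread / thread / step
+    levels, which are then bound to `blockIdx.x`, `vthread.*` and `threadIdx.x`.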
+ """ + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> tir.Schedule: + block_factors = config.block + thread_factors = config.thread + step_factors = config.step + + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + + if block_infos is None: + return None + + block_infos = try_inline(sch, block_infos) + + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block) + ] + ) + or len(sch.get_loops(block)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block)) + sch.reorder(*s_loops, *r_loops, *o_loops) + + block_loops = [] + vthread_loops = [] + thread_loops = [] + inner_loops = [] + for s_loop, block_factor, step_factor, thread_factor in zip( + s_loops, block_factors, step_factors, thread_factors + ): + block_loop, inner_loop = sch.split(s_loop, factors=[None, block_factor]) + vthread_loop, inner_loop = sch.split( + inner_loop, factors=[None, thread_factor * step_factor] + ) + thread_loop, inner_loop = sch.split( + inner_loop, factors=[None, step_factor] + ) + block_loops.append(block_loop) + vthread_loops.append(vthread_loop) + thread_loops.append(thread_loop) + inner_loops.append(inner_loop) + + # inner virtual thread first + vthread_loops = list(reversed(vthread_loops)) + sch.reorder( + *block_loops, + *vthread_loops, + *thread_loops, + *inner_loops, + *r_loops, + *o_loops + ) + sch.bind(sch.fuse(*block_loops), "blockIdx.x") + sch.bind(sch.fuse(*thread_loops), "threadIdx.x") + if len(vthread_loops) > 3: + vthread_loops = vthread_loops[0:2] + [sch.fuse(*vthread_loops[2:])] + + for i, ax in enumerate(vthread_loops): + sch.bind(ax, "vthread" + [".x", ".y", ".z"][i]) + + return sch diff --git a/python/bitblas/gpu/fallback.py b/python/bitblas/gpu/fallback.py new file mode 100644 index 0000000000..84ed35ff95 --- /dev/null +++ b/python/bitblas/gpu/fallback.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +"""A fallback schedule rule for GPU operators.""" +from typing import List, Tuple + +from tvm import tir +from tvm.target import Target + +from ..base import normalize_prim_func, try_inline +from . import utils +from .base import GPUScheduleRule + + +class Fallback(GPUScheduleRule): + """ + A fallback schedule rule for all GPU operators. 
It will try to inline all the blocks first, + and then apply a simple block/grid mapping to the spatial loops on top of the remaining blocks. + """ + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> tir.Schedule: + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + max_threads_per_block = utils.max_threads_per_block(target) + + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + + if block_infos is None: + return None + + block_infos = try_inline(sch, block_infos) + reduction_blocks: List[Tuple[tir.schedule.BlockRV, tir.schedule.LoopRV]] = [] + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block) + ] + ) + or len(sch.get_loops(block)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block)) + sch.reorder(*s_loops, *r_loops, *o_loops) + bx, tx = sch.split( # pylint: disable=invalid-name + sch.fuse(*s_loops), + factors=[None, max_threads_per_block], + ) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + if len(r_loops) > 0: + reduction_blocks.append((block, r_loops[0])) + + for block, r_loop in reduction_blocks: + sch.decompose_reduction(block, r_loop) + + return sch + \ No newline at end of file diff --git a/python/bitblas/gpu/gemv.py b/python/bitblas/gpu/gemv.py new file mode 100644 index 0000000000..83381d4453 --- /dev/null +++ b/python/bitblas/gpu/gemv.py @@ -0,0 +1,905 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A rule for GEMV and DecodeGEMV.""" +import re +from functools import reduce +from typing import List, Optional, Union + +from tvm.tir.function import PrimFunc +from tvm import DataType, arith, ir, tir +from tvm.target import Target + +from ..base import ( + BlockInfo, + collect_block_iter_vars_used_in_access_region, + collect_vars_used_in_prim_expr, + detect_dominant_read, + is_broadcast_epilogue, + normalize_prim_func, + try_inline_contiguous_spatial, + get_output_blocks, + get_block, +) +from .base import GPUScheduleRule + + +def _get_reduction_expr(block: tir.Block) -> Optional[tir.PrimExpr]: + # Detect and return `Y` in `X[...] = X[...] 
+ Y` + buffer_store = block.body + if not isinstance(buffer_store, tir.BufferStore): + return None + if not isinstance(buffer_store.value, tir.Add): + return None + if not ir.structural_equal( + buffer_store.value.a, + tir.BufferLoad(buffer_store.buffer, block.body.indices), + map_free_vars=True, + ): + return None + return buffer_store.value.b + + +def get_extent(sch: tir.Schedule, loop_rv: tir.schedule.LoopRV): + loop: tir.For = sch.get(loop_rv) + return loop.extent.value if isinstance(loop.extent, tir.IntImm) else loop.extent + + +def get_bytes(dtype: Union[DataType, str]) -> int: + num = re.findall(r"\d+", dtype) + if len(num) != 1: + raise ValueError(f"Cannot get bytes from {dtype}") + return int(num[0]) // 8 + + +def is_gemv(sch: tir.Schedule, block_info: BlockInfo) -> Optional[List[tir.Buffer]]: + """Check if the block is a GEMV. + + Parameters + ---------- + + sch : tir.Schedule + The schedule + + block_info : BlockInfo + The block info to be checked + + + Returns + ------- + ret : Optional[List[tir.Buffer]] + The vector buffers used in the GEMV if it is a GEMV, otherwise None. + """ + block = block_info.block_rv + block_stmt = sch.get(block) + conditions = [] + conditions.append(block_info.is_reduction()) + conditions.append(len(block_stmt.reads) >= 2) + conditions.append(len(block_stmt.writes) == 1) + conditions.append(_get_reduction_expr(block_stmt) is not None) + conditions.append( + len(collect_block_iter_vars_used_in_access_region(block_stmt, block_stmt.writes[0].region)) + > 0 + ) + if not all(conditions): + return None + + iter_num = len(block_stmt.iter_vars) + ret = [ + read.buffer + for read in block_stmt.reads + if len(collect_block_iter_vars_used_in_access_region(block_stmt, read.region)) < iter_num + and len(collect_block_iter_vars_used_in_access_region(block_stmt, read.region)) > 0 + ] + return ret if 0 < len(ret) < len(block_stmt.reads) else None + + +def normalize( + sch: tir.Schedule, + block_info: BlockInfo, +) -> Optional[bool]: + """Normalize the main block.""" + block_stmt: tir.Block = sch.get(block_info.block_rv) + access = arith.normalize_to_iter_sum( + detect_dominant_read(block_stmt), + input_iters={i.var: i.dom for i in block_stmt.iter_vars}, + ) + buffers_use_vars = [ + collect_block_iter_vars_used_in_access_region(block_stmt, buf.region) + for buf in block_stmt.writes + ] + buffers_use_vars.extend( + [ + collect_block_iter_vars_used_in_access_region(block_stmt, buf.region) + for buf in block_stmt.reads + ] + ) + if collect_vars_used_in_prim_expr(access.base) & set( + iter_var.var for iter_var in block_stmt.iter_vars + ): + return None + iter_to_info = {i.var: i for i in block_info.iters} + batch_loops, s_loops, r_loops, c_loops = [], [], [], [] + inner_axis = access.args[-1].source.source + is_inner_reduction = iter_to_info[inner_axis].kind == "R" + + for split_expr in access.args: + var = split_expr.source.source + info = iter_to_info.get(var) + loop = info.loop_rv + is_reduction = info.kind == "R" + if split_expr.lower_factor > 1: + if c_loops: + return None + loop, c_loop = sch.split(loop, factors=[None, split_expr.lower_factor]) + # we only support the reduction dim being grouped atm + if not is_reduction: + return None + c_loops.append(c_loop) + if is_reduction: + r_loops.append(loop) + elif all([var in buf_vars for buf_vars in buffers_use_vars]): + batch_loops.append(loop) + else: + s_loops.append(loop) + + assert s_loops + assert r_loops + if not c_loops: + c_loops = [sch.add_unit_loop(block_info.block_rv)] + if not batch_loops: + batch_loops = 
[sch.add_unit_loop(block_info.block_rv)] + sch.reorder(*batch_loops, *s_loops, *r_loops, *c_loops) + sch.fuse(*batch_loops) + sch.fuse(*s_loops) + sch.fuse(*r_loops) + return is_inner_reduction + + +class GEMVWithDequantizeInfo(GPUScheduleRule): + """A rule for Dequantized GEMV.""" + + def sch_inner_reduction_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + sch = tir.Schedule(func) + from .intrin.lop3 import get_lop3_intrin_group + + # TODO(leiwang): this is a hack to get the configuaration, should write a pass to analysis + dequantize_info = func.attrs["dequantize_info"] + + def check_dequantize_info(dequantize_info): + conditions = [] + conditions.append(len(dequantize_info) == 1) + # more conditions, e.g. check the format is in [fp, nf, int] + # check if the dequantize value name is weight + return all(conditions) + + assert check_dequantize_info(dequantize_info) + + (B_decode_info,) = list(dequantize_info.values()) + block_infos = normalize_prim_func(sch) + + if block_infos is None: + return None + + reduction_block: tir.schedule.BlockRV = None + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block) + ] + ) + or len(sch.get_loops(block)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block)) + if len(r_loops) > 0: + reduction_block = block + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + vec = list(config.vectorize.values())[-1] + + num_warps = int(prod(config.thread)) + warp_size = int(prod(config.reduce_thread)) + + block_b = reduction_block + output_blocks = get_output_blocks(sch, block_infos) + B_decode_block = get_block(sch, block_infos, B_decode_info["decode_block"]) + + # compute inline + for block_info in reversed(block_infos): + block = block_info.block_rv + if block not in (reduction_block, *output_blocks, B_decode_block): + sch.compute_inline(block) + + block_decode_B = sch.cache_read(block_b, 1, "local") + sch.compute_inline(B_decode_block) + + j, k = sch.get_loops(block_b)[-2:] + + block_shared_local_A = sch.cache_read(block_b, 0, "local") + block_shared_local_B = sch.cache_read(block_decode_B, 0, "local") + block_local_C = sch.cache_write(block_b, 0, "local") + # reverse inline + if reduction_block != None and reduction_block != output_blocks[0]: + sch.reverse_compute_inline(output_blocks[0]) + + bx, j = sch.split(j, factors=[None, num_warps]) + k, tx, vk = sch.split(k, factors=[None, warp_size, vec]) + sch.reorder(bx, j, k, tx) + + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.bind(j, "threadIdx.y") + + self.block_size = [sch.get(tx).extent, sch.get(j).extent, 1] + self.grid_size = [sch.get(bx).extent, 1, 1] + + sch.compute_at(block_decode_B, tx, preserve_unit_loops=True) + sch.compute_at(block_shared_local_A, tx, preserve_unit_loops=True) + sch.compute_at(block_shared_local_B, tx, preserve_unit_loops=True) + sch.reverse_compute_at(block_local_C, j, preserve_unit_loops=True) + + block_local_a_v = sch.get_loops(block_shared_local_A)[-1] + sch.vectorize(block_local_a_v) + block_local_b_v = sch.get_loops(block_shared_local_B)[-1] 
+ sch.vectorize(block_local_b_v) + if "fast_decoding" in B_decode_info and B_decode_info["fast_decoding"]: + intrin_info = get_lop3_intrin_group( + in_dtype="int8", out_dtype="float16", storage_nbit=4, with_scale=False + ) + sch.tensorize(sch.get_loops(block_decode_B)[-1], intrin_info["compute"]) + sch.annotate(block_b, ann_key="pragma_import_c", ann_val=intrin_info["c_source"]) + return sch + + def apply_config(self, func: PrimFunc, config): + if any([t > 1 for t in config.reduce_thread]): + return self.sch_inner_reduction_with_config(func, config) + else: + return None + + +class GEMV(GPUScheduleRule): + """A rule for GEMV and DecodeGEMV.""" + + def apply( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if len(block_infos) == 1: + epilogue = None + elif len(block_infos) == 2: + epilogue = block_infos[1] + if not epilogue.is_injective(): + return None + else: + return None + + block_info = block_infos[0] + if len(block_info.iters) not in [2, 3]: + # either [B, S, R] = [B, S, R] * [B, R] + # or [S, R] = [S, R] * [R] + return None + block = block_info.block_rv + vector_input_buffers = is_gemv(sch, block_info) + if vector_input_buffers is None: + return None + + # Step 1. Normalize the block, merge spatial and reduction iters + is_inner_reduction = normalize(sch, block_info) + + # Step 2. Do the scheduling + if is_inner_reduction is None: + return None + elif is_inner_reduction: + self.sch_inner_reduction(sch, target, block, vector_input_buffers, epilogue) + return sch + else: + return self.sch_outer_reduction(sch, target, block, vector_input_buffers, epilogue) + + def sch_inner_reduction( # pylint: disable=too-many-arguments, invalid-name, unused-argument + self, + sch: tir.Schedule, + target: Target, + block: tir.schedule.BlockRV, + vector_input_buffers: List[tir.Buffer], + epilogue_info: Optional[BlockInfo], + ): + """Schedule the inner reduction block.""" + + def get_max_factor(n, factors): + factors = sorted(factors, reverse=True) + for factor in factors: + if n % factor == 0: + return factor + return 1 + + def apply( + sch: tir.Schedule, + gemv, + TAG_S, + TAG_R, + TS, + TR, + TILE_S, + TILE_R, + VEC_LOAD, + VEC_C, + LOAD_V_SHARED, + LOAD_V_VEC, + UNROLL, + ): + # rfactor: reduce to tx * vec_c + _, s, r, c = sch.get_loops(block=gemv) + s = sch.fuse(_, s) + r = sch.fuse(r, c) + bx, ts, tile_s = sch.split(s, factors=[None, TS, TILE_S], preserve_unit_iters=True) + r, tr, tile_r_vec_n, vec_c = sch.split( + r, factors=[None, TR, TILE_R // VEC_C, VEC_C], preserve_unit_iters=True + ) + sch.reorder(r, tile_r_vec_n, tr, vec_c) + tr_vec_c = sch.fuse(tr, vec_c) + rf = sch.rfactor(tr_vec_c, 0) + + # rfactor: reduce to tx + bx, ts, tile_s, tr_vec_c = sch.get_loops(block=gemv) + tr, vec_c = sch.split(tr_vec_c, factors=[TR, None], preserve_unit_iters=True) + rf2 = sch.rfactor(tr, 0) + + # bind, vectorize compute + bx, ts, tile_s, r, tile_r_vec_n, tr_vec_c = sch.get_loops(block=rf) + tr, vec_c = sch.split(tr_vec_c, factors=[TR, None], preserve_unit_iters=True) + sch.reorder(bx, ts, tr, r, tile_s, tile_r_vec_n, vec_c) + sch.bind(bx, "blockIdx.x") + sch.bind(ts, TAG_S) + sch.bind(tr, TAG_R) + sch.vectorize(vec_c) + + shared_mem_usage = 0 
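+            # Sum the byte sizes of the vector operands; LOAD_V_SHARED stays enabled below
+            # only when the total is a static size that fits the target's shared memory budget.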
+ for buf in vector_input_buffers: + buf_size = reduce( + lambda x, y: x * y, buf.shape, tir.IntImm(buf.shape[0].dtype, 1) + ) * get_bytes(buf.dtype) + shared_mem_usage += buf_size + LOAD_V_SHARED = ( + LOAD_V_SHARED + and isinstance(shared_mem_usage, tir.IntImm) + and shared_mem_usage.value <= target.max_shared_memory_per_block + ) + + # vectorize load A + # (TODO) this is now actually problematic since the number of loops is dependent on the + # number of dimensions of A_q + Aq_local = sch.cache_read(rf, read_buffer_index=1, storage_scope="local") + sch.compute_at(Aq_local, r, preserve_unit_loops=True) + s_local, r_local = sch.get_loops(block=Aq_local)[-2:] + s_local, vec_load = sch.split( + s_local, factors=[None, VEC_LOAD], preserve_unit_iters=True + ) + sch.reorder(s_local, r_local, vec_load) # either s_local or r_local should be 1 + sch.vectorize(vec_load) + + # load vector into shared memory, shape should be the whole vector + if LOAD_V_SHARED: + assert len(vector_input_buffers) == 1 + V_shared = sch.cache_read(rf, read_buffer_index=0, storage_scope="shared") + sch.compute_at(V_shared, tr, preserve_unit_loops=True) + l = sch.get_loops(block=V_shared)[-1] + loop: tir.For = sch.get(l) + if isinstance(loop.extent, tir.IntImm): + # avoid introducing predicates when vector length is too large + vec_length = max( + min( + get_max_factor( + (int)(loop.extent), + [TS * TR * 1, TS * TR * 2, TS * TR * 4, TS * TR * 8], + ) + // TS + // TR, + LOAD_V_VEC, + ), + 1, + ) + else: + vec_length = LOAD_V_VEC + if TAG_R == "threadIdx.x": + _, ty, tx, vec = sch.split( + l, factors=[None, TS, TR, vec_length], preserve_unit_iters=True + ) + else: + _, ty, tx, vec = sch.split( + l, factors=[None, TR, TS, vec_length], preserve_unit_iters=True + ) + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + sch.vectorize(vec) + + # reduce tile_s * tr * vec to tile_s * tr + sch.reverse_compute_at(rf2, loop=bx, preserve_unit_loops=True) + tr, vec_c, *ts_tile_s = sch.get_loops(block=rf2)[1:] + ts_tile_s = sch.fuse(*ts_tile_s) + ts, tile_s = sch.split(ts_tile_s, factors=[TS, None], preserve_unit_iters=True) + tile_s, vec_s = sch.split( + tile_s, + factors=[None, get_max_factor(TILE_S, [1, 2, 4, 8])], + preserve_unit_iters=True, + ) + sch.reorder(ts, tr, tile_s, vec_s, vec_c) + sch.bind(ts, TAG_S) + sch.bind(tr, TAG_R) + sch.vectorize(vec_s) + + # reduce tile_s * tr to tile_s + sch.reverse_compute_at(gemv, loop=bx, preserve_unit_loops=True) + tr, *ts_tile_s = sch.get_loops(block=gemv)[1:] + ts_tile_s = sch.fuse(*ts_tile_s) + ts, tile_s = sch.split(ts_tile_s, factors=[TS, None], preserve_unit_iters=True) + sch.reorder(tile_s, ts, tr) + sch.bind(ts, TAG_S) + sch.bind(tr, TAG_R) + + sch.decompose_reduction(rf, loop=sch.get_loops(block=rf)[3]) + sch.decompose_reduction(rf2, loop=sch.get_loops(block=rf2)[-1]) + + sch.set_scope(rf, buffer_index=0, storage_scope="local") + sch.set_scope(rf2, buffer_index=0, storage_scope="local") + + unroll_factor = UNROLL + + sch.annotate( + block_or_loop=sch.get_loops(rf)[3], + ann_key="pragma_auto_unroll_max_step", + ann_val=unroll_factor, + ) + sch.annotate( + block_or_loop=sch.get_loops(rf)[3], ann_key="pragma_unroll_explicit", ann_val=1 + ) + + sch.annotate( + block_or_loop=sch.get_loops(rf2)[3], + ann_key="pragma_auto_unroll_max_step", + ann_val=unroll_factor, + ) + sch.annotate( + block_or_loop=sch.get_loops(rf2)[3], ann_key="pragma_unroll_explicit", ann_val=1 + ) + + if LOAD_V_SHARED: + sch.annotate( + block_or_loop=sch.get_loops(V_shared)[-4], + 
ann_key="pragma_unroll_explicit", + ann_val=unroll_factor, + ) + sch.annotate( + block_or_loop=sch.get_loops(V_shared)[-4], ann_key="pragma_vectorize", ann_val=1 + ) + + # Schedule epilogue + if epilogue_info is not None: + epilogue = epilogue_info.block_rv + if is_broadcast_epilogue(sch, block, epilogue): + sch.reverse_compute_at(epilogue, bx) + sch.set_scope(block, 0, "shared") + _, _, *s = sch.get_loops(epilogue) # pylint: disable=invalid-name + _, tx = sch.split(sch.fuse(*s), factors=[None, TX]) + sch.bind(tx, "threadIdx.x") + else: + sch.reverse_compute_at(epilogue, bx, preserve_unit_loops=True) + ts_tile_s = sch.fuse(*sch.get_loops(epilogue)[1:]) + ts_tile_s = sch.get_loops(epilogue)[-1] + ts, tile_s = sch.split(ts_tile_s, factors=[TS, None], preserve_unit_iters=True) + sch.bind(ts, TAG_S) + sch.set_scope(block, 0, "local") + # pylint: enable=invalid-name + return sch + + # Specify the `len_tx` and `len_ty` according to the loop extent + batch, s, r, c = sch.get_loops(block=block) + len_batch, len_s, len_r, len_c = ( + get_extent(sch, batch), + get_extent(sch, s), + get_extent(sch, r), + get_extent(sch, c), + ) + len_S = len_batch * len_s + len_R = len_r * len_c + + TAG_S, TAG_R = "threadIdx.y", "threadIdx.x" + if target.kind.name == "cuda": + VEC_C = 4 + LOAD_V_SHARED = True + LOAD_V_VEC = 8 + UNROLL = 256 + if isinstance(len_S, int): + if len_S > len_R: + TS, TR = 4, 64 + else: + TS, TR = 16, 32 + elif target.kind.name == "metal": + # Note that the following tile size is tuned on M2 Ultra for 7B + TAG_S, TAG_R = "threadIdx.x", "threadIdx.y" + VEC_C = 1 + LOAD_V_SHARED = False + LOAD_V_VEC = -1 + UNROLL = 256 + if isinstance(len_S, int): + if len_S > len_R: + TS, TR = 4, 16 + else: + TS, TR = 2, 64 + elif target.kind.name == "rocm": + VEC_C = 4 + LOAD_V_SHARED = True + LOAD_V_VEC = 8 + UNROLL = 256 + if isinstance(len_S, int): + if len_S > len_R: + TS, TR = 1, 128 + else: + TS, TR = 8, 64 + elif target.kind.name == "opencl" and "android" in str(target.host): + TAG_S, TAG_R = "threadIdx.x", "threadIdx.y" + VEC_C = 8 + LOAD_V_SHARED = False + LOAD_V_VEC = -1 + UNROLL = 8 + TS, TR = 2, 32 + elif target.kind.name == "vulkan": + VEC_C = 4 + LOAD_V_SHARED = True + LOAD_V_VEC = 4 + UNROLL = 256 + if isinstance(len_S, int): + if len_S > len_R: + TS, TR = 4, 32 + else: + TS, TR = 16, 32 + elif target.kind.name == "opencl" and "mali" in str(target.attrs): + VEC_C = 8 + LOAD_V_SHARED = False + LOAD_V_VEC = -1 + UNROLL = 64 + TS, TR = 1, 64 + else: + VEC_C = 1 + LOAD_V_SHARED = False + LOAD_V_VEC = -1 + UNROLL = 64 + TS, TR = 1, 64 + + if not isinstance(len_S, int): + TS, TR = 1, 64 + + while TS * TR > target.max_num_threads: + if TS > 1: + TS //= 2 + else: + TR //= 2 + + TILE_S, TILE_R = ( + 1, + len_c + if len_c > 1 + else max(get_max_factor(len_r, [TR * 1, TR * 2, TR * 4, TR * 8]) // TR, 1), + ) + VEC_C = min(get_max_factor(TILE_R, [1, 2, 4, 8]), VEC_C) + VEC_LOAD = 1 + + return apply( + sch, + gemv=block, + TAG_S=TAG_S, + TAG_R=TAG_R, + TS=TS, + TR=TR, + TILE_S=TILE_S, + TILE_R=TILE_R, + VEC_LOAD=VEC_LOAD, + VEC_C=VEC_C, + LOAD_V_SHARED=LOAD_V_SHARED, + LOAD_V_VEC=LOAD_V_VEC, + UNROLL=UNROLL, + ) + + def sch_outer_reduction( # pylint: disable=too-many-arguments, invalid-name, unused-argument + self, + sch: tir.Schedule, + target: Target, + block: tir.schedule.BlockRV, + vector_input_buffers: List[tir.Buffer], + epilogue_info: Optional[BlockInfo], + ): + """Schedule the outer reduction block.""" + # NOTE: Only Android is supported so far + if not (target.kind.name == "opencl" and "android" 
in str(target.host)): + return None + batch, s, r, c = sch.get_loops(block) + len_s = get_extent(sch, s) + + # The config is designed for Adreno + tx_len = 64 + vec_len = (4 if len_s > 4096 else 2) if isinstance(len_s, int) else 1 + inner_r = 4 + + bx, tx, vec = sch.split(s, factors=[None, tx_len, vec_len]) + r0, r1 = sch.split(r, factors=[None, inner_r]) + sch.bind(batch, "blockIdx.y") + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.reorder(bx, tx, r0, r1, c, vec) + + sch.annotate(tx, ann_key="pragma_auto_unroll_max_step", ann_val=8) + sch.annotate(tx, ann_key="pragma_unroll_explicit", ann_val=1) + + cache_v = sch.cache_read(block, vector_input_buffers[0], "local") + sch.compute_at(cache_v, r1, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(cache_v)[-1]) + + sch.vectorize(vec) + + # Schedule epilogue + if epilogue_info is not None: + sch.reverse_compute_at(epilogue_info.block_rv, tx) + + sch.set_scope(block, 0, "local") + + sch.decompose_reduction(block, r0) + + return sch + + def sch_inner_reduction_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + sch = tir.Schedule(func) + + block_infos = normalize_prim_func(sch) + + if block_infos is None: + return None + + reduction_block: tir.schedule.BlockRV = None + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block) + ] + ) + or len(sch.get_loops(block)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block)) + if len(r_loops) > 0: + reduction_block = block + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + vec = list(config.vectorize.values())[-1] + + num_warps = int(prod(config.thread)) + warp_size = int(prod(config.reduce_thread)) + + block_b = reduction_block + output_blocks = get_output_blocks(sch, block_infos) + # compute inline + for block_info in reversed(block_infos): + block = block_info.block_rv + if block not in (reduction_block, *output_blocks): + sch.compute_inline(block) + try: + i, j, k = sch.get_loops(block_b) + except: + j, k = sch.get_loops(block_b) + block_shared_local_A = sch.cache_read(block_b, 0, "local") + block_shared_local_B = sch.cache_read(block_b, 1, "local") + block_local_C = sch.cache_write(block_b, 0, "local") + # reverse inline + if reduction_block != None and reduction_block != output_blocks[0]: + sch.reverse_compute_inline(output_blocks[0]) + + bx, j = sch.split(j, factors=[None, num_warps]) + k, tx, vk = sch.split(k, factors=[None, warp_size, vec]) + sch.reorder(bx, j, k, tx) + + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.bind(j, "threadIdx.y") + + self.block_size = [sch.get(tx).extent, sch.get(j).extent, 1] + self.grid_size = [sch.get(bx).extent, 1, 1] + + sch.compute_at(block_shared_local_A, tx, preserve_unit_loops=True) + sch.compute_at(block_shared_local_B, tx, preserve_unit_loops=True) + sch.reverse_compute_at(block_local_C, j, preserve_unit_loops=True) + + block_local_a_v = sch.get_loops(block_shared_local_A)[-1] + sch.vectorize(block_local_a_v) + block_local_b_v = sch.get_loops(block_shared_local_B)[-1] + 
sch.vectorize(block_local_b_v) + + return sch + + def sch_outer_reduction_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + + if block_infos is None: + return None + + reduction_block: tir.schedule.BlockRV = None + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block) + ] + ) + or len(sch.get_loops(block)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block)) + if len(r_loops) > 0: + reduction_block = block + + C = reduction_block + CL = sch.cache_write(reduction_block, 0, "local") + + blck_axis = [] + vthd_axis = [] + thrd_axis = [] + tile_axis = [] + for i, loop in enumerate(s_loops): + if sch.get(loop).extent % config.block[i]: + raise NotImplementedError("Undivisible block in TIR schedule is still buggy.") + bx, _t = sch.split(loop, factors=[None, config.block[i]]) + blck_axis.append(bx) + if config.step[i] > 1: + _t, tn = sch.split(_t, factors=[None, config.step[i]]) + tile_axis.append(tn) + if config.block[i] <= config.thread[i] * config.step[i]: + tx = _t + else: + vx, tx = sch.split(_t, factors=[None, config.thread[i]]) + vthd_axis.append(vx) + thrd_axis.append(tx) + + reduce_outer_axis, reduce_inner_axis = [], [] + for i in config.raxis_order: + loop = r_loops[i] + ro, ri = sch.split(loop, factors=[None, config.rstep[i]]) + reduce_outer_axis.append(ro) + reduce_inner_axis.append(ri) + + vthd_axis = list(reversed(vthd_axis)) # inner virtual thread first + axis_order = ( + blck_axis + vthd_axis + thrd_axis + reduce_outer_axis + reduce_inner_axis + tile_axis + ) + + sch.reorder(*axis_order) + blck_fused = sch.fuse(*blck_axis) + thrd_fused = sch.fuse(*thrd_axis) + sch.bind(blck_fused, "blockIdx.x") + sch.bind(thrd_fused, "threadIdx.x") + if len(vthd_axis) > 3: + vthd_axis = vthd_axis[0:2] + [sch.fuse(*vthd_axis[2:])] + for i, ax in enumerate(vthd_axis): + sch.bind(ax, "vthread" + [".x", ".y", ".z"][i]) + for ax in tile_axis: + sch.unroll(ax) + + sch.reverse_compute_at(CL, thrd_fused) + if len(tile_axis) > 0: + for ax in sch.get_loops(CL)[-len(tile_axis) :]: + sch.unroll(ax) + + sch.decompose_reduction(C, reduce_outer_axis[0]) + + try_inline_contiguous_spatial(sch, block_infos) + + return sch + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> tir.Schedule: + if not isinstance(func, tir.PrimFunc): + return None + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if len(block_infos) == 1: + epilogue = None + elif len(block_infos) == 2: + epilogue = block_infos[1] + if not epilogue.is_injective(): + return None + else: + return None + + block_info = block_infos[0] + if len(block_info.iters) not in [2, 3]: + # either [B, S, R] = [B, S, R] * [B, R] + # or [S, R] = [S, R] * [R] + return None + + if is_gemv(sch, block_info) is None: + return None + + if "dequantize_info" in func.attrs: + dequantize_rule = GEMVWithDequantizeInfo() + return dequantize_rule.apply_config(func, 
config) + + if any([t > 1 for t in config.reduce_thread]): + return self.sch_inner_reduction_with_config(func, config) + + return self.sch_outer_reduction_with_config(func, config) diff --git a/python/bitblas/gpu/general_reduction.py b/python/bitblas/gpu/general_reduction.py new file mode 100644 index 0000000000..779f5d9875 --- /dev/null +++ b/python/bitblas/gpu/general_reduction.py @@ -0,0 +1,478 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Reduction rule for operators including softmax, layer norm, RMS norm, etc""" +from typing import List, Union +from functools import reduce + +from tvm import tir +from tvm.target import Target + +from ..base import normalize_prim_func, try_inline_contiguous_spatial +from ..base.analysis import get_root_block, get_reduction_blocks, BlockInfo +from .base import GPUScheduleRule + + +class GeneralReduction(GPUScheduleRule): + """General Reduction rule for operators including softmax, layer norm, RMS norm, etc""" + + def apply( # pylint: disable=too-many-locals + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + + if target.kind.name == "cuda": + len_tx = 256 + unroll_depth = 256 + else: + len_tx = 64 + unroll_depth = 64 + + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if block_infos is None or len(block_infos) == 0: + return None + + dom_kind = block_infos[0].dom_kind() + num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) + num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) + + # Align the number of block iters of the last block. 
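+        # Illustrative example: if dom_kind is "SSR" but the trailing block only
+        # carries two iters, the index map below prepends constant zero iters,
+        # i.e. (i, j) -> (0, i, j), so every block exposes the same number of
+        # leading spatial axes before the shared thread bindings are applied.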
+ num_last_block_iter = len(block_infos[-1].dom_kind()) + if num_last_block_iter < len(dom_kind): + index_map = tir.IndexMap.from_func( + lambda *iters: ( + [tir.const(0, iters[0].dtype)] * (len(dom_kind) - num_last_block_iter) + + list(iters) + ), + ndim=num_last_block_iter, + ) + sch.transform_block_layout(block_infos[-1].block_rv, index_map) + + try: + # TODO: fix num_leading_s = 0 case + assert num_trailing_r > 0 + for block in block_infos[1:-1]: + assert block.dom_kind() == dom_kind + assert block_infos[-1].is_injective() + assert len(block_infos[-1].dom_kind()) <= len(dom_kind) + except AssertionError: + return None + + loops = sch.get_loops(block_infos[-1].block_rv) + bx = sch.fuse(*loops[:num_leading_s]) + r_loop, tx = sch.split(loops[-1], [None, len_tx]) + sch.reorder(tx, r_loop) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.annotate(r_loop, ann_key="pragma_auto_unroll_max_step", ann_val=unroll_depth) + sch.annotate(r_loop, ann_key="pragma_unroll_explicit", ann_val=1) + + for block in reversed(block_infos[:-1]): + block = block.block_rv + for i, _ in enumerate(sch.get(block).writes): + sch.set_scope(block, buffer_index=i, storage_scope="shared") + sch.compute_at(block, bx, preserve_unit_loops=True) + r_loop = sch.fuse(*sch.get_loops(block)[-num_trailing_r:]) + r_loop, tx = sch.split(r_loop, [None, len_tx]) + sch.reorder(tx, r_loop) + sch.bind(tx, "threadIdx.x") + sch.annotate(r_loop, ann_key="pragma_auto_unroll_max_step", ann_val=unroll_depth) + sch.annotate(r_loop, ann_key="pragma_unroll_explicit", ann_val=1) + + # TODO: It's just a workaround to avoid unroll spatial loops, because of the bug of + # the pass lower-thread-allreduce. We should fix it in the future. + # sch.annotate(bx, ann_key="pragma_auto_unroll_max_step", ann_val=unroll_depth) + # sch.annotate(bx, ann_key="pragma_unroll_explicit", ann_val=1) + return sch + + def sch_inner_reduction_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + block_factors = config.block + thread_factors = config.thread + reduce_therad_factors = config.reduce_thread + + # For inter thread reduction case, one thread must only compute one element + assert thread_factors == block_factors + + # inline all the other blocks + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + + schedule_block: tir.schedule.BlockRV = None + reduction_blocks: List[tir.schedule.BlockRV] = [] + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block_rv = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block_rv) + ] + ) + or len(sch.get_loops(block.block_rv)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block_rv), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block_rv)) + if len(r_loops) > 0: + # always use the last reduction block for scheduling + schedule_block = block + reduction_blocks.append(block_rv) + + # Align the number of block iters of the last block. 
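+        # Cross-thread reduction layout: the leading "S" iters of the chosen
+        # reduction block are fused and bound to blockIdx.x / threadIdx.y, while
+        # the per-axis config.reduce_thread splits are fused and bound to
+        # threadIdx.x below, so each thread computes exactly one output element.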
+ dom_kind = schedule_block.dom_kind() + num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) + num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) + + schedule_block = schedule_block.block_rv + loops = sch.get_loops(schedule_block) + s_loops = loops[:num_leading_s] + r_loops = loops[-num_trailing_r:] + + block_axis = [] + thread_axis = [] + + for s_loop, block_factor in zip(s_loops, block_factors): + block_loop, thread_loop = sch.split(s_loop, factors=[None, block_factor]) + block_axis.append(block_loop) + thread_axis.append(thread_loop) + + axis_order = block_axis + thread_axis + + sch.reorder(*axis_order) + blck_fused = sch.fuse(*block_axis) + thrd_fused = sch.fuse(*thread_axis) + sch.bind(blck_fused, "blockIdx.x") + sch.bind(thrd_fused, "threadIdx.y") + + reduce_outer_axis, reduce_inner_axis, reduce_inter_threads = [], [], [] + for i in config.raxis_order: + loop = r_loops[i] + ro, ri = sch.split(loop, factors=[None, config.rstep[i]]) + ri, thd = sch.split(ri, factors=[None, config.reduce_thread[i]]) + reduce_inter_threads.append(thd) + reduce_outer_axis.append(ro) + reduce_inner_axis.append(ri) + + axis_order = reduce_inter_threads + reduce_outer_axis + reduce_inner_axis + sch.reorder(*axis_order) + fused_reduce_inter_threads = sch.fuse(*reduce_inter_threads) + sch.bind(fused_reduce_inter_threads, "threadIdx.x") + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + reg_tile = sch.cache_write(schedule_block, 0, "local") + + # todo(lei): should add the shared_inputs/stride memory pad analysis at shared memory fusion stage. + for i, input_region in enumerate(sch.get(schedule_block).reads): + if input_region.buffer.name not in config.cached_tensors: + continue + + # otherwise cooperative fetch in shared memory. + cache_shared = sch.cache_read(schedule_block, i, "shared") + sch.compute_at(cache_shared, reduce_outer_axis[-1]) + + dim_offset = ( + len(reduce_inner_axis) + len(reduce_outer_axis) + 2 + ) # outer loops are: blck_fused, thrd_fused, vthread_axis, reduce_outer_axis + if input_region.buffer.name in config.vectorize: + vectorize = config.vectorize[input_region.buffer.name] + else: + vectorize = 1 + + loops = sch.get_loops(cache_shared) + if len(loops) == dim_offset: + # handle fetching only one element + loops.append(sch.add_unit_loop(schedule_block)) + assert len(loops) > dim_offset + + _, ty, tx, tv = sch.split( + sch.fuse(*loops[dim_offset:]), + factors=[ + None, + int(prod(thread_factors)), + int(prod(reduce_therad_factors)), + vectorize, + ], + ) + sch.vectorize(tv) + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + sch.reverse_compute_at(reg_tile, thrd_fused) + + # resolve compute_at + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if block_infos is None or len(block_infos) == 0: + return None + return sch + + def sch_outer_reduction_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + block_factors = config.block + thread_factors = config.thread + step_factors = config.step + + # inline all the other blocks + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + + schedule_block: BlockInfo = None + for block in block_infos: + s_loops: List[tir.schedule.LoopRV] = [] + r_loops: List[tir.schedule.LoopRV] = [] + o_loops: List[tir.schedule.LoopRV] = [] + dom_kind = block.dom_kind() + block_rv = block.block_rv + + if ( + any( + [ + sch.get(loop_rv).thread_binding is not None + for loop_rv in sch.get_loops(block_rv) + ] 
+ ) + or len(sch.get_loops(block.block_rv)) == 0 + ): + continue + + for loop, iter_type in zip(sch.get_loops(block_rv), dom_kind): + {"S": s_loops, "R": r_loops, "O": o_loops}[iter_type].append(loop) + + if not s_loops: + s_loops.append(sch.add_unit_loop(block_rv)) + if len(r_loops) > 0: + # always use the last reduction block for scheduling + schedule_block = block + + # Align the number of block iters of the last block. + dom_kind = schedule_block.dom_kind() + num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) + num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) + + num_last_block_iter = len(block_infos[-1].dom_kind()) + if num_last_block_iter < len(dom_kind): + index_map = tir.IndexMap.from_func( + lambda *iters: ( + [tir.const(0, iters[0].dtype)] * (len(dom_kind) - num_last_block_iter) + + list(iters) + ), + ndim=num_last_block_iter, + ) + sch.transform_block_layout(block_infos[-1].block_rv, index_map) + + schedule_block = schedule_block.block_rv + loops = sch.get_loops(schedule_block) + s_loops = loops[:num_leading_s] + r_loops = loops[-num_trailing_r:] + + reg_tile = sch.cache_write(schedule_block, 0, "local") + + block_axis = [] + vthread_axis = [] + thread_axis = [] + inner_axis = [] + for s_loop, block_factor, step_factor, thread_factor in zip( + s_loops, block_factors, step_factors, thread_factors + ): + block_loop, inner_loop = sch.split(s_loop, factors=[None, block_factor]) + vthread_loop, inner_loop = sch.split( + inner_loop, factors=[None, thread_factor * step_factor] + ) + thread_loop, inner_loop = sch.split(inner_loop, factors=[None, step_factor]) + block_axis.append(block_loop) + vthread_axis.append(vthread_loop) + thread_axis.append(thread_loop) + inner_axis.append(inner_loop) + + reduce_outer_axis, reduce_inner_axis = [], [] + for i in config.raxis_order: + loop = r_loops[i] + ro, ri = sch.split(loop, factors=[None, config.rstep[i]]) + reduce_outer_axis.append(ro) + reduce_inner_axis.append(ri) + + vthread_axis = list(reversed(vthread_axis)) # inner virtual thread first + axis_order = ( + block_axis + + vthread_axis + + thread_axis + + reduce_outer_axis + + reduce_inner_axis + + inner_axis + ) + + sch.reorder(*axis_order) + blck_fused = sch.fuse(*block_axis) + thrd_fused = sch.fuse(*thread_axis) + sch.bind(blck_fused, "blockIdx.x") + sch.bind(thrd_fused, "threadIdx.x") + if len(vthread_axis) > 3: + vthread_axis = vthread_axis[0:2] + [sch.fuse(*vthread_axis[2:])] + for i, ax in enumerate(vthread_axis): + sch.bind(ax, "vthread" + [".x", ".y", ".z"][i]) + + # todo(lei): should add the shared_inputs/stride memory pad analysis at shared memory fusion stage. + for i, input_region in enumerate(sch.get(schedule_block).reads): + if input_region.buffer.name not in config.cached_tensors: + continue + + # otherwise cooperative fetch in shared memory. 
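+            # Cooperative fetch: the shared-memory stage is computed at the last
+            # outer reduction loop, and its fused loops are split across
+            # threadIdx.x (vectorized by config.vectorize when available) so all
+            # threads of the block load the tile together.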
+ cache_shared = sch.cache_read(schedule_block, i, "shared") + sch.compute_at(cache_shared, reduce_outer_axis[-1]) + + dim_offset = ( + len(vthread_axis) + len(reduce_outer_axis) + 2 + ) # outer loops are: blck_fused, thrd_fused, vthread_axis, reduce_outer_axis + if input_region.buffer.name in config.vectorize: + vectorize = config.vectorize[input_region.buffer.name] + else: + vectorize = 1 + + loops = sch.get_loops(cache_shared) + if len(loops) == dim_offset: + # handle fetching only one element + loops.append(sch.add_unit_loop(schedule_block)) + assert len(loops) > dim_offset + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + _, tx, tv = sch.split( + sch.fuse(*loops[dim_offset:]), factors=[None, int(prod(thread_factors)), vectorize] + ) + sch.vectorize(tv) + sch.bind(tx, "threadIdx.x") + + sch.reverse_compute_at(reg_tile, thrd_fused) + + sch.decompose_reduction(schedule_block, reduce_outer_axis[0]) + + # resolve compute_at + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if block_infos is None or len(block_infos) == 0: + return None + + return sch + + def sch_mutiple_reductions_with_config( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + config, + ): + block_factors = config.block + thread_factors = config.thread + reduce_therad_factors = config.reduce_thread + + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if block_infos is None or len(block_infos) == 0: + return None + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + len_tx = prod(thread_factors) * prod(reduce_therad_factors) + block_factor = prod(block_factors) + + dom_kind = block_infos[0].dom_kind() + num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) + num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) + + # Align the number of block iters of the last block. 
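+        # (Same alignment trick as in `apply` above: a shorter trailing block gets
+        # leading constant-zero iters so its layout matches dom_kind.)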
+ num_last_block_iter = len(block_infos[-1].dom_kind()) + if num_last_block_iter < len(dom_kind): + index_map = tir.IndexMap.from_func( + lambda *iters: ( + [tir.const(0, iters[0].dtype)] * (len(dom_kind) - num_last_block_iter) + + list(iters) + ), + ndim=num_last_block_iter, + ) + sch.transform_block_layout(block_infos[-1].block_rv, index_map) + + try: + # TODO: fix num_leading_s = 0 case + assert num_trailing_r > 0 + for block in block_infos[1:-1]: + assert block.dom_kind() == dom_kind + assert block_infos[-1].is_injective() + assert len(block_infos[-1].dom_kind()) <= len(dom_kind) + except AssertionError: + return None + + loops = sch.get_loops(block_infos[-1].block_rv) + bx, _ = sch.split(sch.fuse(*loops[:num_leading_s]), factors=[None, block_factor]) + r_loop, tx = sch.split(loops[-1], [None, len_tx]) + sch.reorder(tx, r_loop) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + for block in reversed(block_infos[:-1]): + block = block.block_rv + for i, _ in enumerate(sch.get(block).writes): + sch.set_scope(block, buffer_index=i, storage_scope="shared") + sch.compute_at(block, bx, preserve_unit_loops=True) + r_loop = sch.fuse(*sch.get_loops(block)[-num_trailing_r:]) + r_loop, tx = sch.split(r_loop, [None, len_tx]) + sch.reorder(tx, r_loop) + sch.bind(tx, "threadIdx.x") + + return sch + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> tir.Schedule: + # check the number of reduction blocks + sch = tir.Schedule(func) + root_block = get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + reduction_blocks = get_reduction_blocks(sch, blocks) + if len(reduction_blocks) > 1: + # schedule for multiple reduction blocks (e.g. softmax) + return self.sch_mutiple_reductions_with_config(func, config) + + if any([t > 1 for t in config.reduce_thread]): + # todo(lei) should implement block reduction schedule + return self.sch_inner_reduction_with_config(func, config) + else: + return self.sch_outer_reduction_with_config(func, config) diff --git a/python/bitblas/gpu/intrin/lop3.py b/python/bitblas/gpu/intrin/lop3.py new file mode 100644 index 0000000000..db49bdfcc6 --- /dev/null +++ b/python/bitblas/gpu/intrin/lop3.py @@ -0,0 +1,315 @@ +import tvm +from tvm.runtime import convert +from tvm.tir.function import TensorIntrin +from tvm.script import tir as T +import numpy as np +from typing import Dict, Literal + +lift = convert + +decode_i4_to_f16 = """ +template +__device__ void decode_i4s_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x000f000f; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + uint const i4s = *reinterpret_cast(_i4s); +#pragma unroll + // decode 2 elems at one time. 
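+    // Each lop3 selects two 4-bit lanes via BOTTOM_MASK and ORs in 0x6400
+    // (fp16 1024.0), so each half-word holds 1024 + nibble; the following
+    // sub.f16x2 removes the 1024 bias, leaving the decoded values as fp16.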
+ for (int i = 0; i < (N / 2); i++) + { + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" + : "=r"(h[i]) + : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(FP16_TOP_MAGIC_NUM)); + } +} +""" + + +decode_i1s_to_i8s_l16 = """template +__device__ void decode_i1s_to_i8s_l16(T1 *_i1s, T2 *_i8s, const int N = 16) +{ + int *i8s = reinterpret_cast(_i8s); + int16_t i1s_i16 = *reinterpret_cast(_i1s); + // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} + // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} + int i1s = (i1s_i16 & 0x0f0f); + i1s |= ((i1s_i16 & 0xf0f0) << 12); + // i1s {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // First, we extract the i1s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 + static constexpr uint I8s_MAGIC_NUM = 0x00000000; + + for (int i = 0; i < N / 4; i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" + : "=r"(i8s[i]) + : "r"(i1s >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut)); + } +} +""" + +decode_i2s_to_i8s = """template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +""" + +decode_i4s_to_i8s = """template +__device__ void decode_i4s_to_i8s(T1 *_i4s, T2 *_i8s, const int N = 8) +{ + uint *i8s = reinterpret_cast(_i8s); + uint i4s = *_i4s; + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 4); i++) + { + // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" + : "=r"(i8s[i]) + : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +""" + + +def get_fast_decode_intrin( + storage_nbit=4, storage_dtype="int8", target_dtype="float16", loops_extent=8 +): + """ + loops extent is the number of elements to be decoded in one stage + for memory friendly process, the loops_extent should be a multiple of (sizeof(int) // 8). + However, for the case of int1b, it is not possible to decode 8 elements in one stage, so we have to use 16. 
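+    For example, with storage_nbit=4, storage_dtype="int8" and loops_extent=8,
+    eight 4-bit values packed into four int8 storage elements are decoded per
+    intrinsic invocation.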
+ """ + if target_dtype == "float16": + d4f = "f16" + elif target_dtype == "int8": + d4f = "i8s" + else: + raise ValueError("Unsupported target dtype: {}".format(target_dtype)) + func_name = "decode_i{}s_to_{}".format(storage_nbit, d4f) + + def _tir_u8_to_int_to_float( + nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str + ): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) + + assert storage_dtype == "int8" + elem_per_i8 = 8 // storage_nbit + n_storage_elems = loops_extent // elem_per_i8 + + @T.prim_func + def fast_decode_desc(compressed: T.handle, decompressed: T.handle) -> None: + Compressed = T.match_buffer( + compressed, + [ + n_storage_elems, + ], + dtype=storage_dtype, + scope="local", + ) + Decompressed = T.match_buffer( + decompressed, + [ + loops_extent, + ], + dtype=target_dtype, + scope="local", + ) + + with T.block("root"): + T.reads(Compressed[0:n_storage_elems]) + T.writes(Decompressed[0:loops_extent]) + for i in T.grid(loops_extent): + with T.block("decode"): + vi = T.axis.remap("S", [i]) + Decompressed[vi] = _tir_u8_to_int_to_float( + storage_nbit, + Compressed[vi // elem_per_i8], + vi % elem_per_i8, + dtype=target_dtype, + ) + + @T.prim_func + def fast_decode_impl(compressed: T.handle, decompressed: T.handle) -> None: + Compressed = T.match_buffer( + compressed, + [ + n_storage_elems, + ], + dtype=storage_dtype, + scope="local", + ) + Decompressed = T.match_buffer( + decompressed, + [ + loops_extent, + ], + dtype=target_dtype, + scope="local", + ) + + with T.block("root"): + T.reads(Compressed[0:n_storage_elems]) + T.writes(Decompressed[0:loops_extent]) + T.call_extern( + "handle", func_name, Compressed.data, Decompressed.data, loops_extent + ) + + return fast_decode_desc, fast_decode_impl + + +LOP3_FAST_DECODE_INT4_TO_FP16_L8_INTRIN = "lop3_fast_decode_i4_to_f16_l8_" +TensorIntrin.register( + LOP3_FAST_DECODE_INT4_TO_FP16_L8_INTRIN, + *get_fast_decode_intrin( + storage_nbit=4, storage_dtype="int8", target_dtype="float16", loops_extent=8 + ), +) + +LOP3_FAST_DECODE_INT4_TO_INT8_L8_INTRIN = "lop3_fast_decode_i4_to_i8_l8_" +TensorIntrin.register( + LOP3_FAST_DECODE_INT4_TO_INT8_L8_INTRIN, + *get_fast_decode_intrin( + storage_nbit=4, storage_dtype="int8", target_dtype="int8", loops_extent=8 + ), +) + + +LOP3_FAST_DECODE_INT2_TO_INT8_L16_INTRIN = "lop3_fast_decode_i2_to_i8_l16_" +TensorIntrin.register( + LOP3_FAST_DECODE_INT2_TO_INT8_L16_INTRIN, + *get_fast_decode_intrin( + storage_nbit=2, storage_dtype="int8", target_dtype="int8", loops_extent=16 + ), +) + +LOP3_FAST_DECODE_INT1_TO_INT8_L16_INTRIN = "lop3_fast_decode_int1_to_i8_l16_" +TensorIntrin.register( + LOP3_FAST_DECODE_INT1_TO_INT8_L16_INTRIN, + *get_fast_decode_intrin( + storage_nbit=1, storage_dtype="int8", target_dtype="int8", loops_extent=16 + ), +) + + +def get_lop3_intrin_group( + in_dtype: Literal["int8"], + out_dtype: Literal["float16", "int8"], + storage_nbit: int = 4, +) -> Dict[str, str]: + """ + This function is used to get the intrinsic group of the LOP3 operation to avoid the overhead of fast decoding. + LOP3 is a type of logic operation that takes three inputs. The intrinsic group refers to the set of + intrinsic operations that can be performed on these inputs. This function retrieves and returns this group. + + Parameters + ---------- + in_dtype : Literal["int8"] + The data type of the input. It should be "int8". + + out_dtype : Literal["float16", "int8"] + The data type of the output. 
It can be either "float16" or "int8". + + storage_nbit : int, optional + The number of bits used for storage. By default, it is 4. + + with_scale : bool, optional + A boolean parameter that indicates whether scaling should be applied. By default, it is False. + + Returns + ------- + Dict[str, str] + A dictionary mapping the names of the intrinsics to their corresponding implementations. + """ + assert in_dtype in ["int8"] + assert out_dtype in ["float16", "int8"] + + dtype_mapping = {"float16": "f16", "int8": "i8", "int32": "i32"} + target_dtype = dtype_mapping[out_dtype] + target_bits = tvm.DataType(out_dtype).bits + loop_extent = min(128 // target_bits, 32 // storage_nbit) + _intrin = f"lop3_fast_decode_i{storage_nbit}_to_{target_dtype}_l{loop_extent}_" + import_c_map = { + "i4_to_f16": decode_i4_to_f16, + "i1_to_i8": decode_i1s_to_i8s_l16, + "i2_to_i8": decode_i2s_to_i8s, + "i4_to_i8": decode_i4s_to_i8s, + } + return { + "c_source": import_c_map[f"i{storage_nbit}_to_{target_dtype}"], + "compute": _intrin, + } + + +# interleave weight numpy code +def interleave_weight(qweight, nbits=4, target_dtype="float16"): + assert target_dtype in ["float16", "int8"] + # reinterpret the data type of qweight to int32 + qweight = qweight.view(np.int32) + new_qweight = np.zeros_like(qweight) + bits_stride = 8 if target_dtype == "int8" else 16 + mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f + num_groups = 32 // bits_stride + elems_per_group = bits_stride // nbits + for i in range(num_groups): + for j in range(elems_per_group): + offset = i * elems_per_group + j + shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits + new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift + + if nbits == 1 and target_dtype == "int8": + # special handling for 1b interleave + n16_weight = new_qweight & np.int32(0xF0F00F0F) + n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 + n16_weight |= ((new_qweight & np.int32(0x0000F000)) >> 12) << 24 + n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 + n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 + return n16_weight.view(np.int8) + elif nbits == 2 and target_dtype == "float16": + n8_weight = new_qweight & np.int32(0xFF0000FF) + n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 + n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 + return n8_weight.view(np.int8) + elif nbits == 1 and target_dtype == "float16": + n8_weight = new_qweight & 0xF000000F + n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 + n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 + n8_weight |= ((new_qweight & 0x0000F000) >> 12) << 24 + n8_weight |= ((new_qweight & 0x000F0000) >> 16) << 4 + n8_weight |= ((new_qweight & 0x00F00000) >> 20) << 12 + n8_weight |= ((new_qweight & 0x0F000000) >> 24) << 20 + + return new_qweight.view(np.int8) diff --git a/python/bitblas/gpu/matmul.py b/python/bitblas/gpu/matmul.py new file mode 100644 index 0000000000..ce4881816a --- /dev/null +++ b/python/bitblas/gpu/matmul.py @@ -0,0 +1,383 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring, invalid-name +"""A GEMM schedule rule for GPU operators.""" +from dataclasses import dataclass +from typing import Optional + +from tvm import tir +from tvm.target import Target +from tvm.tir.stmt import ForKind + +from ..base import analysis +from .base import GPUScheduleRule +from . import utils +from .matmul_analysis import ( + auto_inline_consumer_chain, + auto_inline_producers, + get_in_out_dtypes, + get_index_map, + normalize_to_matmul, + get_reduction_blocks, +) +from .matmul_mma import MatmulTensorizationMMA +from .matmul_wmma import MatmulInt8Tensorization, MatmulTensorizationWMMA, MatmulTensorizationLegacy +from functools import reduce + +class Matmul(GPUScheduleRule): + """The schedule rule for matmul-like computation""" + + @dataclass + class Config: + block_size_x: int = 8 + block_size_y: int = 8 + vthread_x: int = 1 + vthread_y: int = 1 + micro_size_x: int = 4 + micro_size_y: int = 4 + micro_size_k: int = 8 + vector_size: int = 1 + unroll: int = 256 # 0 means no unroll + use_shared: bool = True + storage_align: bool = False + inner_x: bool = False + + def get_configs(self, target: Target) -> Config: + """Get the schedule config for the target""" + if target.kind.name == "cuda" or target.kind.name == "rocm": + return Matmul.Config( + block_size_x=8, + block_size_y=16, + vthread_x=1, + vthread_y=1, + micro_size_x=4, + micro_size_y=4, + micro_size_k=16, + vector_size=2, + unroll=256, + use_shared=True, + storage_align=True, + inner_x=False, + ) + elif target.kind.name == "opencl" and "android" in str(target.host): + return Matmul.Config( + block_size_x=8, + block_size_y=8, + vthread_x=1, + vthread_y=1, + micro_size_x=8, + micro_size_y=2, + micro_size_k=16, + vector_size=8, + unroll=64, + use_shared=False, + storage_align=False, + inner_x=True, + ) + else: + return Matmul.Config() + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Optional[tir.Schedule]: + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + block_stmt = sch.get(main_block) + sch = normalize_to_matmul(sch, main_block) + if sch is None: + return None + + # Step 1. Check Tensor Core support + # Tensorization config: + # If any value of I, J, K is fixed and less than this threshold, + # tensorization rule will not be applied. + minimal_tensorize_threshold = 64 + block_stmt = sch.get(main_block) + if target.kind.name == "cuda" and utils.get_sm_version(target) >= 70: + apply_tensorization: bool = True + # the batch dimension is not taken into consideration. + # Analyze read/write buffers and choose correct tensorizer: int8 or fp16. 
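+            # Dispatch: int8 inputs with an int32 accumulator use the int8
+            # tensorization rule, sm_80+ uses the MMA path, and earlier
+            # tensor-core GPUs fall back to WMMA. Tensorization is skipped when
+            # the input dtype is unsupported or any non-batch extent is
+            # statically <= minimal_tensorize_threshold.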
+ in_dtype, out_dtype = get_in_out_dtypes(block_stmt) + if in_dtype not in ["int8", "float16"]: + apply_tensorization = False + for item_var in block_stmt.iter_vars[1:]: + extent = item_var.dom.extent + if isinstance(extent, tir.expr.IntImm): + if extent.value <= minimal_tensorize_threshold: + apply_tensorization = False + if apply_tensorization: + if in_dtype == "int8" and out_dtype == "int32": + tensorize_sch = MatmulInt8Tensorization().apply(func, target, _) + elif utils.get_sm_version(target) >= 80: + # For A100(sm_80) or more advanced gpu, use MMA tensorization. + tensorize_sch = MatmulTensorizationMMA().apply(func, target, _) + else: + # For other GPUs, use WMMA tensorization. + tensorize_sch = MatmulTensorizationWMMA().apply(func, target, _) + if tensorize_sch is not None: + return tensorize_sch + + # Step 2. Get schedule config. + config = self.get_configs(target) + + # Step 3. Schedule matmul + y_kernel_size = config.vthread_y * config.block_size_y * config.micro_size_y + x_kernel_size = config.vthread_x * config.block_size_x * config.micro_size_x + if config.inner_x: + sch.pad_einsum( + main_block, + [1, y_kernel_size, x_kernel_size, config.micro_size_k], + ) + batch, y, x, k = sch.get_loops(main_block) + else: + sch.pad_einsum( + main_block, + [1, x_kernel_size, y_kernel_size, config.micro_size_k], + ) + batch, x, y, k = sch.get_loops(main_block) + by, vy, ty, yi = sch.split( + y, [None, config.vthread_y, config.block_size_y, config.micro_size_y] + ) + bx, vx, tx, xi = sch.split( + x, [None, config.vthread_x, config.block_size_x, config.micro_size_x] + ) + ko, ki = sch.split(k, factors=[None, config.micro_size_k]) + sch.reorder(by, bx, vy, vx, ty, tx, ko, ki, yi, xi) + by = sch.fuse(batch, by) + sch.bind(bx, "blockIdx.x") + sch.bind(by, "blockIdx.y") + sch.bind(vy, "vthread.y") + sch.bind(vx, "vthread.x") + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + inner_loop = config.micro_size_x if config.inner_x else config.micro_size_y + if inner_loop % config.vector_size == 0: + _, v = sch.split(xi, [None, config.vector_size]) + sch.vectorize(v) + + if config.unroll > 0: + sch.annotate(tx, ann_key="pragma_auto_unroll_max_step", ann_val=config.unroll) + sch.annotate(tx, ann_key="pragma_unroll_explicit", ann_val=1) + + l2g = sch.cache_write(main_block, 0, "local") + sch.reverse_compute_at(l2g, tx, preserve_unit_loops=True) + if config.micro_size_x % config.vector_size == 0: + _, v = sch.split(sch.get_loops(l2g)[-1], [None, config.vector_size]) + sch.vectorize(v) + + if config.use_shared: + + def _cooperative_fetch(index, vec_len): + block = sch.cache_read(main_block, index, "shared") + num_loops = len(sch.get_loops(block)) + sch.compute_at(block, ko, preserve_unit_loops=True) + loops = sch.get_loops(block)[-num_loops:] + ty, tx, _, vec = sch.split( + sch.fuse(*loops), + factors=[config.block_size_y, config.block_size_x, None, vec_len], + ) + sch.vectorize(vec) + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + if config.storage_align: + sch.storage_align(block, 0, axis=1, factor=8, offset=vec_len) + return block + + a_g2s = _cooperative_fetch(0, vec_len=config.vector_size) + b_g2s = _cooperative_fetch(1, vec_len=config.vector_size) + + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + else: + auto_inline_producers(sch, main_block) + + auto_inline_consumer_chain(sch, l2g) + sch.decompose_reduction(main_block, ko) + + # Step 4. Check if there are unbound blocks. Execute fallback scheduling to them. 
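+        # A block counts as scheduled once any of its loops has a non-serial kind
+        # (already bound, vectorized or unrolled); every remaining block gets a
+        # simple fallback: fuse all loops, split into blockIdx.x and 256 threads,
+        # and bind them accordingly.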
+ def is_scheduled(block: tir.schedule.BlockRV) -> bool: + loops = sch.get_loops(block) + loop_kinds = {sch.get(loop).kind for loop in loops} + return loop_kinds != {ForKind.SERIAL} + + blocks = sch.get_child_blocks(root_block) + max_threads_per_block = utils.max_threads_per_block(target) + for block in blocks: + if is_scheduled(block): + continue + # no axis of the block is bound to thread or block + s_loops = sch.get_loops(block) + bx, tx = sch.split( + sch.fuse(*s_loops), + factors=[ + None, + 256, + ], + ) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + return sch + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> tir.Schedule: + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + # in some case conv template will use this rule, but the tile config is not + # analyzed by matmul expr. + assert len(config.block) == 2, "Matmul Only support 2D block" + + main_block = reduction_blocks[0] + + block_stmt = sch.get(main_block) + + # cuda core prefer b is [k, j] layout without swizzling. + index_maps = get_index_map(block_stmt, ["n", "n", "n"]) + if index_maps is None: + return None + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # Step 0. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K] + block = sch.reindex(main_block, ("read", 0)) + sch.transform_layout(block, ("write", 0), a_index_map) + block = sch.reindex(main_block, ("read", 1)) + sch.transform_layout(block, ("write", 0), b_index_map) + block = sch.reindex(main_block, ("write", 0)) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + + # Step 2. Get schedule config. + block_row_warps = config.block[0] // (config.thread[0] * config.step[0]) + block_col_warps = config.block[1] // (config.thread[1] * config.step[1]) + thread_row_tiles = config.thread[1] // (config.step[0] * 2) + thread_col_tiles = config.thread[1] // (config.step[1] * 2) + vthread_row_tiles = config.step[0] * 2 # expand vtrhead to avoid load band conflict + vthread_col_tiles = config.step[1] * 2 # expand vtrhead to avoid load band conflict + chunk = config.rstep[0] + + # Step 3. 
Schedule matmul + BM = block_row_warps * vthread_row_tiles * thread_row_tiles + BN = block_col_warps * vthread_col_tiles * thread_col_tiles + BK = chunk + + sch.pad_einsum( + main_block, + [1, BM, BN, BK], + ) + batch, y, x, k = sch.get_loops(main_block) + by, vy, ty, yi = sch.split(y, [None, vthread_row_tiles, block_row_warps, thread_row_tiles]) + bx, vx, tx, xi = sch.split(x, [None, vthread_col_tiles, block_col_warps, thread_col_tiles]) + ko, ki = sch.split(k, factors=[None, BK]) + sch.reorder(by, bx, vy, vx, ty, tx, ko, ki, yi, xi) + by = sch.fuse(batch, by) + sch.bind(bx, "blockIdx.x") + sch.bind(by, "blockIdx.y") + sch.bind(vy, "vthread.y") + sch.bind(vx, "vthread.x") + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + def prod(iterable): + return reduce(lambda x, y: x * y, iterable, 1) + + l2g = sch.cache_write(main_block, 0, "local") + sch.reverse_compute_at(l2g, tx, preserve_unit_loops=True) + + def _cooperative_fetch(index, vec_len): + block = sch.cache_read(main_block, index, "shared") + num_loops = len(sch.get_loops(block)) + block_local = sch.cache_read(main_block, index, "local") + sch.compute_at(block_local, ki, preserve_unit_loops=True) + sch.compute_at(block, ko, preserve_unit_loops=True) + loops = sch.get_loops(block)[-num_loops:] + _, ty, tx, vec = sch.split( + sch.fuse(*loops), + factors=[None, block_row_warps, block_col_warps, vec_len], + ) + + auto_inline_producers(sch, block) + + def is_trivial_load(block): + # avoid vectorize under global[v2, v1]] shared[v1, v2] case + reads = sch.get(block).reads + writes = sch.get(block).writes + if len(reads) != 1 or len(writes) != 1: + return False + return all( + read.region[-1] == write.region[-1] for read, write in zip(reads, writes) + ) + + if is_trivial_load(block): + sch.vectorize(vec) + + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + _, vec = sch.split( + sch.fuse(*sch.get_loops(block_local)[-2:]), + [None, vec_len // prod(config.step)], + ) + sch.vectorize(vec) + + return block + + for i, input_region in enumerate(sch.get(main_block).reads): + _buffer_name = input_region.buffer.name.replace("_reindex", "").replace("_pad", "") + if _buffer_name not in config.cached_tensors: + print( + f"Warning: {_buffer_name} is not in cached_tensors {config.cached_tensors}, skip." + ) + continue + + # otherwise cooperative fetch in shared memory. + if _buffer_name in config.vectorize: + vectorize = config.vectorize[_buffer_name] + else: + vectorize = 1 + + _cooperative_fetch(i, vec_len=vectorize) + + auto_inline_consumer_chain(sch, l2g) + + _, vec = sch.split( + sch.fuse(*sch.get_loops(l2g)[-2:]), [None, vectorize // prod(config.step)] + ) + sch.vectorize(vec) + + sch.decompose_reduction(main_block, ko) + return sch diff --git a/python/bitblas/gpu/matmul_analysis.py b/python/bitblas/gpu/matmul_analysis.py new file mode 100644 index 0000000000..1610fb3e14 --- /dev/null +++ b/python/bitblas/gpu/matmul_analysis.py @@ -0,0 +1,631 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring, invalid-name +"""A GEMM schedule rule for GPU operators.""" +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Set, Union, Tuple, Dict + +from tvm import tir +from tvm.ir import Range +from tvm.tir import IterVar, PrimExpr, Var +from tvm.tir.analysis import undefined_vars +from tvm.tir.schedule.schedule import BlockRV +from ..base.analysis import ( + collect_block_iter_vars_used_in_access_region, + get_root_block, + get_reduction_blocks, +) +from tvm.target.target import Target +from tvm.tir import IndexMap + + +def _is_one(x: PrimExpr) -> bool: + return isinstance(x, tir.IntImm) and x.value == 1 + + +def _collect_producers(sch: tir.Schedule, block: tir.schedule.BlockRV): + result = [] + for producer in sch.get_producers(block): + result.append(producer) + result.extend(_collect_producers(sch, producer)) + return result + + +def _collect_consumers(sch: tir.Schedule, block: tir.schedule.BlockRV): + result = [] + for consumer in sch.get_consumers(block): + result.append(consumer) + result.extend(_collect_consumers(sch, consumer)) + return result + + +def auto_inline_producers( + sch: tir.Schedule, + block: tir.schedule.BlockRV, + skip_blocks: Optional[List[tir.schedule.BlockRV]] = None, +): + skip_blocks = skip_blocks or [] + while True: + inlined_cnt = 0 + producers = _collect_producers(sch, block) + for producer in producers: + if any(sch.get(producer) == sch.get(skip_block) for skip_block in skip_blocks): + continue + try: + sch.compute_inline(producer) + inlined_cnt += 1 + except: # pylint: disable=bare-except + continue + if inlined_cnt == 0: + return + + +def auto_inline_consumers( + sch: tir.Schedule, + block: tir.schedule.BlockRV, +): + while True: + inlined_cnt = 0 + consumers = _collect_consumers(sch, block) + for consumer in consumers: + try: + sch.compute_inline(consumer) + inlined_cnt += 1 + except: # pylint: disable=bare-except + continue + for consumer in consumers: + try: + sch.reverse_compute_inline(consumer) + inlined_cnt += 1 + except: # pylint: disable=bare-except + continue + if inlined_cnt == 0: + return + + +def auto_inline_consumer_chain( + sch: tir.Schedule, + block: tir.schedule.BlockRV, +): + auto_inline_consumers(sch, block) + remaining_consumers = sch.get_consumers(block) + + if len(remaining_consumers) != 0: + # Some blocks have failed to be inlined to the producer cache-write stage. + # This could be due to another producer block that has not been scheduled. + for c in remaining_consumers: + for p in sch.get_producers(c): + if sch.get(p) != sch.get(block): + sch.compute_inline(p) + + # Try inlining into the cache-write stage again, this time it should succeed. + auto_inline_consumers(sch, block) + + +class IterKind(Enum): + """Iter kinds for GEMM-liked programs. + We can simplify the computation to C[S, I, J] += A[S, I, K] * B[S, J, K], + where `I, J, K` are fundamental axes for gemm and `S` represents all + other spatial axes (e.g. batches) + kIter_S: spatial axes + kIter_I: I axes + kIter_J: J axes + kIter_K: K axes + kIter_T: trivial axes (i.e. 
with extent 1) + """ + + kIter_S = 0 + kIter_I = 1 + kIter_J = 2 + kIter_K = 3 + kIter_T = 4 + + +@dataclass +class IterTrait: + kind: IterKind + extent: PrimExpr + + +def make_iter_fusion_index_map( + traits: List[IterTrait], + kind_order: List[IterKind], +) -> tir.IndexMap: + fused_iters: Dict[IterKind, PrimExpr] = {} + input_iters: List[tir.Var] = [] + for i, trait in enumerate(traits): + v_i = tir.Var(f"i{i}", trait.extent.dtype) + input_iters.append(v_i) + if trait.kind == IterKind.kIter_T: + continue + if trait.kind not in kind_order: + raise ValueError(f"Unknown iter kind {trait.kind}") + if trait.kind in fused_iters: + fused_iters[trait.kind] = fused_iters[trait.kind] * trait.extent + v_i + else: + fused_iters[trait.kind] = v_i + + final_indices: List[tir.PrimExpr] = [ + fused_iters.get(kind, tir.IntImm(traits[0].extent.dtype, 0)) for kind in kind_order + ] + + return tir.IndexMap(input_iters, final_indices, None) + + +def detect_iter_traits(block: tir.Block) -> Optional[Tuple[List[IterTrait]]]: + """Detect iter traits based on the pattern C[S, I, J] += A[S, I, K] * B[S, J, K] + + Parameters + ---------- + block : tir.Block + The block to be analyzed + + Returns + ------- + traits : Optional[Tuple[List[IterTrait]]] + The detected iter traits for axes in A, B and C. None if the block + does not match the pattern. + + """ + + if len(block.reads) != 2 or len(block.writes) != 1: + return None + + def get_access_axes(region: List[Range]) -> Set[Var]: + axes: Set[Var] = set() + for r in region: + if not _is_one(r.extent): + raise ValueError("Expect elemwise block access") + axes = axes.union(set(undefined_vars(r.min))) + return axes + + try: + A_axes = get_access_axes(block.reads[0].region) + B_axes = get_access_axes(block.reads[1].region) + C_axes = get_access_axes(block.writes[0].region) + except ValueError: + return None + + traits: Dict[Var, IterTrait] = {} + for iter_var in block.iter_vars: + var = iter_var.var + kind: IterKind + if _is_one(iter_var.dom.extent): + if iter_var.iter_type == tir.IterVar.CommReduce: + # for simplified case (e.g. 
1x1 conv kernel) + kind = IterKind.kIter_K + else: + kind = IterKind.kIter_T + elif iter_var.iter_type == iter_var.DataPar: + if var in A_axes and var in B_axes and var in C_axes: + kind = IterKind.kIter_S + elif var in A_axes and var in C_axes: + kind = IterKind.kIter_I + elif var in B_axes and var in C_axes: + kind = IterKind.kIter_J + else: + return None + elif iter_var.iter_type == tir.IterVar.CommReduce: + if var in A_axes and var in B_axes and var not in C_axes: + kind = IterKind.kIter_K + else: + return None + else: + return None + traits[var] = IterTrait(kind, iter_var.dom.extent) + + # A Gemm-kernel requires have I, J and K axes + gemm_traits = {IterKind.kIter_I, IterKind.kIter_J, IterKind.kIter_K} + if {x.kind for x in traits.values()}.intersection(gemm_traits) != gemm_traits: + return None + + A_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in A_axes] + B_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in B_axes] + C_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in C_axes] + block_traits = [traits[i.var] for i in block.iter_vars] + return A_traits, B_traits, C_traits, block_traits + + +def get_index_map( + block: tir.Block, layout: List[str] = ["n", "t", "n"] +) -> Optional[Tuple[tir.IndexMap, ...]]: + """Get index maps for the block + + Parameters + ---------- + block : tir.Block + The block to be analyzed + + layout : List[str] + the target layout index map to be used. + 'n' for [i, k] layout + 't' for [k, j] layout + 'a' for auto inference based on whether the last axis is reduction. + + Returns + ------- + index_maps : Optional[Tuple[tir.IndexMap]] + The index maps for the block, or None if the block is not a gemm-liked kernel + """ + traits = detect_iter_traits(block) + if traits is None: + return None + A_traits, B_traits, C_traits, block_traits = traits + + def get_ordered_axes(region: List[Range]) -> Set[Var]: + axes: List[Var] = [] + for r in region: + if not _is_one(r.extent): + raise ValueError("Expect elemwise block access") + axes.append(r.min) + return axes + + def is_common_reduce(var: Var) -> bool: + for iter_var in block.iter_vars: + if iter_var.var == var and iter_var.iter_type == IterVar.CommReduce: + return True + return False + + def check_last_trait(region: List[Range]): + axes = get_ordered_axes(region) + return is_common_reduce(axes[-1]) + + def infer_layout(layout: str, region: List[Range], kind: str = "A"): + """ + Infer the layout based on the region and the kind of buffer + kind: "A", "B", "C" + """ + primary_iter, secondary_iter, reduction_iter = { + "A": (IterKind.kIter_I, IterKind.kIter_K, IterKind.kIter_K), + "B": (IterKind.kIter_K, IterKind.kIter_J, IterKind.kIter_K), + "C": (IterKind.kIter_I, IterKind.kIter_J, None), + }[kind] + + spatial_iter = { + "A": IterKind.kIter_I, + "B": IterKind.kIter_J, + "C": None, + }[kind] + + if layout == "n": + return [IterKind.kIter_S, primary_iter, secondary_iter] + elif layout == "t": + return [IterKind.kIter_S, secondary_iter, primary_iter] + elif layout == "a": + # auto inference layout + # for buffer with reduction axis, we put it as the last axis + # otherwise, we put it as the first axis + if kind == "C": + return [IterKind.kIter_S, primary_iter, secondary_iter] + else: + return ( + [IterKind.kIter_S, spatial_iter, reduction_iter] + if check_last_trait(region) + else [IterKind.kIter_S, reduction_iter, spatial_iter] + ) + else: + raise ValueError(f"Unknown layout {layout}") + + A_index_map = 
make_iter_fusion_index_map( + A_traits, infer_layout(layout[0], block.reads[0].region, kind="A") + ) + B_index_map = make_iter_fusion_index_map( + B_traits, infer_layout(layout[1], block.reads[1].region, kind="B") + ) + C_index_map = make_iter_fusion_index_map( + C_traits, infer_layout(layout[2], block.writes[0].region, kind="C") + ) + + matmul_index_map = make_iter_fusion_index_map( + block_traits, [IterKind.kIter_S, IterKind.kIter_I, IterKind.kIter_J, IterKind.kIter_K] + ) + + return ( + matmul_index_map, + A_index_map, + B_index_map, + C_index_map, + ) + + +def get_in_out_dtypes(block: tir.Block) -> Tuple[str]: + """ + Detect In/Out data types for the given block based on the analysis if read/write buffers. + """ + assert len(block.reads) > 0 and len(block.writes) > 0 + in_dtype = block.reads[0].buffer.dtype + out_dtype = block.writes[0].buffer.dtype + return (in_dtype, out_dtype) + + +def get_dequantize_block(sch, blocks) -> Optional[BlockRV]: + # check at least two input and one output + # at lease one input has uint dtype, and the output dtype is float + def is_dequantize(block: BlockRV) -> bool: + block_stmt = sch.get(block) + if len(block_stmt.reads) < 2: + return False + has_uint_input = any("uint" in str(region.buffer.dtype) for region in block_stmt.reads) + if not has_uint_input: + return False + if len(block_stmt.writes) != 1 or "float" not in str(block_stmt.writes[0].buffer.dtype): + return False + return True + + dequantize_blocks = [block for block in blocks if is_dequantize(block)] + return dequantize_blocks[0] if len(dequantize_blocks) == 1 else None + + +def is_identity_or_transpose_block(block_stmt: tir.Block) -> bool: + iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} + if iter_types != {IterVar.DataPar}: + return False, False + if not isinstance(block_stmt.body, tir.BufferStore): + return False, False + if not isinstance(block_stmt.body.value, tir.BufferLoad): + return False, False + + def get_access_vars(region: List[Range]) -> List[Var]: + axes: List[Var] = [] + for r in region: + if not _is_one(r.extent): + return None + axes.extend(undefined_vars(r.min)) + # remove trivial axis + trivial_vars = set( + iter_var.var for iter_var in block_stmt.iter_vars if _is_one(iter_var.dom.extent) + ) + axes = [axis for axis in axes if axis not in trivial_vars] + # remove duplicate axis + axes = [var for i, var in enumerate(axes) if i == 0 or var != axes[i - 1]] + return axes + + lhs_access_vars = get_access_vars(block_stmt.reads[0].region)[-2:] + rhs_access_vars = get_access_vars(block_stmt.writes[0].region)[-2:] + is_identity = list(lhs_access_vars) == list(rhs_access_vars) + is_transpose = list(lhs_access_vars) != list(rhs_access_vars) and set(lhs_access_vars) == set( + rhs_access_vars + ) + return is_identity, is_transpose + + +def is_identity_block(block_stmt: tir.Block) -> bool: + return is_identity_or_transpose_block(block_stmt)[0] + + +def is_transpose_block(block_stmt: tir.Block) -> bool: + return is_identity_or_transpose_block(block_stmt)[1] + + +def inline_transpose_block(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV]): + result_blocks = [] + for block in blocks: + if not is_transpose_block(sch.get(block)): + result_blocks.append(block) + continue + try: + sch.compute_inline(block) + except: + try: + sch.reverse_compute_inline(block) + except: + result_blocks.append(block) + return result_blocks + + +def normalize_to_matmul( + sch: tir.Schedule, main_block: BlockRV, layout: List[str] = ["n", "t", "n"] +) -> Optional[tir.Schedule]: + 
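+    """Normalize a GEMM-like block into a canonical matmul form.
+
+    The read/write buffers of `main_block` are reindexed according to `layout`
+    (interpreted by `get_index_map`, with "a" meaning auto-inference) and the
+    block iteration space is rewritten to the fused [S, I, J, K] order, e.g.
+    C[S, I, J] += A[S, I, K] * B[S, J, K] for the default layout. Returns None
+    if no suitable index map can be inferred.
+    """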
block_stmt = sch.get(main_block) + + # let layout be 'a' to auto inference the layout + index_maps = get_index_map(block_stmt, layout=layout) + if index_maps is None: + print("[WARNING] Cannot find the appropriate index map for tensorcore") + return None + + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # `skip_simplify` to avoid the bug in the 1x1 conv + block = sch.reindex(main_block, ("read", 0), skip_simplify=True) + sch.transform_layout(block, ("write", 0), a_index_map) + block = sch.reindex(main_block, ("read", 1), skip_simplify=True) + sch.transform_layout(block, ("write", 0), b_index_map) + block = sch.reindex(main_block, ("write", 0), skip_simplify=True) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + sch.mod["main"] = sch.mod["main"].with_attr("dlight.tensorcore_prenormlized", True) + return sch + + +def get_tensorized_func_and_tags( + func: tir.PrimFunc, + target: Target, +) -> Tuple[tir.PrimFunc, Dict[str, Union[List[int], int]]]: + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_wmma_intrin_group, + ) + + """ + transform function to matmul if necessary (e.g. transform conv2d with im2col) + """ + # step1. detect whether the function can utilize tensorcore + sch = tir.Schedule(func) + root_block = get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + reduction_blocks = get_reduction_blocks(sch, blocks) + if not reduction_blocks or len(reduction_blocks) != 1: + return func, None + + def _can_be_tensorized(sch: tir.Schedule, block: BlockRV) -> bool: + block_stmt = sch.get(block) + conditions = [] + conditions.append(len(block_stmt.reads) == 2) + conditions.append(len(block_stmt.writes) == 1) + conditions.append( + len( + collect_block_iter_vars_used_in_access_region( + block_stmt, block_stmt.writes[0].region + ) + ) + > 0 + ) + if not all(conditions): + return False + return True + + # step2. transform function to tensorcore matmul (e.g. conv2d with im2col) + def check_sm_version(arch: str) -> int: + sm_version = arch.replace("sm_", "") + return int(sm_version) if sm_version.isdigit() else -1 + + def analysis_tensorcore_tags(sch: tir.Schedule, block: BlockRV, target: Target) -> bool: + tags: Dict[str, Union[List[int], int]] = {} + block_stmt = sch.get(block) + + # analysis tensorcore axis + # todo(lei): maybe we can remove this in the future + (write_buffer_region,) = block_stmt.writes + out_axis = len(write_buffer_region.buffer.shape) + tags["tensorcore_config"] = [out_axis - 2, out_axis - 1] + + # analysis pipeline stage + # todo(lei): maybe we can integrate this into policy in the future + tags["pipeline_stage"] = 1 + if target.kind.name == "cuda" and check_sm_version(target.arch) >= 80: + # enable pipleline stage only for sm_80 devices + tags["pipeline_stage"] = 2 + + # analysis async copy + # todo(lei): maybe we can integrate this into policy in the future + tags["use_async_copy"] = 0 + if tags["pipeline_stage"] == 2 and check_sm_version(target.arch) >= 80: + # async copy only works in software pipeline. 
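+            # On sm_80+ this flag typically maps to cp.async-based global-to-shared
+            # copies, overlapping data movement with compute across pipeline stages.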
+            tags["use_async_copy"] = 1
+
+        # analyze intrin information
+        def get_ordered_axes(region: List[Range]) -> List[Var]:
+            axes: List[Var] = []
+            for r in region:
+                if not _is_one(r.extent):
+                    raise ValueError("Expect elemwise block access")
+                axes.append(r.min)
+            return axes
+
+        def is_common_reduce(var: Var) -> bool:
+            for iter_var in block_stmt.iter_vars:
+                if iter_var.var == var and iter_var.iter_type == IterVar.CommReduce:
+                    return True
+            return False
+
+        def check_last_trait(region: List[Range]) -> bool:
+            axes = get_ordered_axes(region)
+            return is_common_reduce(axes[-1])
+
+        intrin_info: dict = {}
+        in_dtype, out_dtype = get_in_out_dtypes(block_stmt)
+        intrin_info["in_dtype"] = in_dtype
+        intrin_info["out_dtype"] = out_dtype
+        # if the last dimension is a reduction axis, B is transposed
+        intrin_info["trans_b"] = check_last_trait(block_stmt.reads[1].region)
+        if "smooth_a" in func.attrs:
+            intrin_info["smooth_a"] = func.attrs["smooth_a"]
+        if "smooth_b" in func.attrs:
+            intrin_info["smooth_b"] = func.attrs["smooth_b"]
+        tags["intrin_info"] = intrin_info
+
+        return tags
+
+    (main_block,) = reduction_blocks
+    if not _can_be_tensorized(sch, main_block):
+        return func, None
+
+    minimal_tensorize_threshold = 16
+    block_stmt = sch.get(main_block)
+    if target.kind.name == "cuda" and check_sm_version(target.arch) >= 70:
+        in_dtype, out_dtype = get_in_out_dtypes(block_stmt)
+        try:
+            _ = get_wmma_intrin_group(
+                in_dtype=in_dtype,
+                out_dtype=out_dtype,
+            )
+        except:  # pylint: disable=bare-except
+            print("[FastDlight][WARNING] Cannot find the corresponding wmma intrin group")
+            return func, None
+
+        # reindex and transform functions
+        # Normalize tensor functions to C[S, I, J] += A[S, I, K] * B[S, J, K]
+        # or C[S, I, J] += A[S, I, K] * B[S, K, J]
+        sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"])
+        if sch is None:
+            return func, None
+
+        block_stmt = sch.get(main_block)
+
+        # the batch dimension is not taken into consideration.
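+        # skip tensorization when any static spatial/reduction extent is smaller than
+        # one 16x16x16 fragment; dynamic extents are handled later by padding (pad_einsum)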
+        for item_var in block_stmt.iter_vars[1:]:
+            extent = item_var.dom.extent
+            if isinstance(extent, tir.expr.IntImm):
+                if extent.value < minimal_tensorize_threshold:
+                    return func, None
+        tags = analysis_tensorcore_tags(sch, main_block, target)
+        return sch.mod["main"], tags
+
+    return func, None
+
+
+def get_propagate_map(trans: bool = True, dtype="float16"):
+    from tvm.tir.tensor_intrin.cuda import (  # pylint: disable=import-outside-toplevel
+        ldmatrix_32x8_to_shared_16x16_layout,
+        ldmatrix_trans_32x8_to_shared_16x16_layout,
+    )
+
+    assert dtype in ["float16"], "Only float16 is supported for now"
+
+    ldmatrix_layout = ldmatrix_32x8_to_shared_16x16_layout
+    ldmatrix_layout_trans = ldmatrix_trans_32x8_to_shared_16x16_layout
+
+    # ldmatrix induces an intra-warp memory layout, so we lift the ldmatrix permutation out
+    def ldmatrix_permutation_16x16_32x8_16x16(kernel_i, kernel_j):
+        thread_id = kernel_i * 2 + kernel_j // 8
+        local_id = kernel_j % 8
+        return ldmatrix_layout(thread_id, local_id)
+
+    def ldmatrix_trans_permutation_16x16_32x8_16x16(kernel_i, kernel_j):
+        thread_id = kernel_i * 2 + kernel_j // 8
+        local_id = kernel_j % 8
+        return ldmatrix_layout_trans(thread_id, local_id)
+
+    ldmatrix_index_map = (
+        ldmatrix_trans_permutation_16x16_32x8_16x16
+        if trans
+        else ldmatrix_permutation_16x16_32x8_16x16
+    )
+
+    def permutation(i, j, kernel_i, kernel_j):
+        return (
+            i,
+            j,
+            *ldmatrix_index_map(kernel_i, kernel_j),
+        )
+
+    # TODO(lei): index_dtype should be analyzed from the schedule
+    inversed_index_map = IndexMap.from_func(
+        ldmatrix_index_map, index_dtype="int32"
+    ).inverse([16, 16])
+    return permutation, inversed_index_map
diff --git a/python/bitblas/gpu/matmul_mma.py b/python/bitblas/gpu/matmul_mma.py
new file mode 100644
index 0000000000..7e3dab360b
--- /dev/null
+++ b/python/bitblas/gpu/matmul_mma.py
@@ -0,0 +1,692 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License. 
+# pylint: disable=missing-docstring, invalid-name +"""A GEMM schedule rule for GPU operators.""" +from typing import Literal, Optional, List + +from tvm import tir +from tvm.target import Target + +from ..base.roller.rasterization import NoRasterization +from ..base import analysis +from .base import GPUScheduleRule +from .matmul_mma_dequantize import MatmulTensorizationMMAWithDequantizeInfo +from .matmul_analysis import ( + auto_inline_consumer_chain, + is_transpose_block, + is_identity_block, + _collect_producers, + inline_transpose_block, + auto_inline_producers, + get_index_map, + get_reduction_blocks, + get_dequantize_block, + normalize_to_matmul, + get_propagate_map, +) + + +def get_index_map_3d(index_map, l=16, r=16): + def index_map_3d(b, i, j): + return ( + b, + i // l, + j // r, + *index_map(i % l, j % r), + ) + + return index_map_3d + + +def get_index_map_5d(index_map): + """ + for layout transformed gemm, the index map should be 5d + """ + + def index_map_5d(b, i, j, ii, jj): + return ( + b, + i, + j, + *index_map(ii, jj), + ) + + return index_map_5d + + +def get_warp_index_map(index_map, l=16, r=16, is_5d=False): + if is_5d: + return get_index_map_5d(index_map) + return get_index_map_3d(index_map, l, r) + + +class MatmulTensorizationMMA(GPUScheduleRule): + """ + The schedule rule for float16 tensor core matmul computation. + func with attr 'dlight.do_not_tensorize' will not be tensorized. + """ + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Optional[tir.Schedule]: + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + # We first inline all transpose blocks for later analysis of transposed A and B + blocks = inline_transpose_block(sch, blocks) + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + dequantize_block = get_dequantize_block(sch, blocks) + + main_block = reduction_blocks[0] + main_block_stmt = sch.get(main_block) + + # Supported data types: + # fp16, fp16, fp16: fp16 precision + # fp16, fp16, fp32: fp16 mixed precision + dtype_a = main_block_stmt.reads[0].buffer.dtype + dtype_b = main_block_stmt.reads[1].buffer.dtype + dtype_c = main_block_stmt.writes[0].buffer.dtype + if dtype_a != dtype_b: + return None + + # Get index maps + index_maps = get_index_map(main_block_stmt) + if index_maps is None: + return None + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + # Tensorization by hardware intrinsics + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_mma_intrin_group, + shared_16x16_to_mma_32x8_layout, + ) + + # tile size + block_m, block_n, block_k = 128, 128, 32 + + # tensor core intrinsic size + micro_size_m, micro_size_n, micro_size_k = 16, 16, 16 + + # thread size + # thread_x == warp_size + thread_z, thread_y, thread_x = 2, 2, 32 + + vector_size = 8 + unroll_depth = 4 + + # Step 1. 
Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K] + block = sch.reindex(main_block, ("read", 0)) + sch.transform_layout(block, ("write", 0), a_index_map) + is_transpose_a = is_transpose_block(sch.get(block)) + block = sch.reindex(main_block, ("read", 1)) + sch.transform_layout(block, ("write", 0), b_index_map) + is_transpose_b = is_identity_block(sch.get(block)) + block = sch.reindex(main_block, ("write", 0)) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + + batch, i, j, k = sch.get_loops(main_block) + + swizzle_factor_for_l2_m = [1, None] + swizzle_factor_for_l2_n = [1, None] + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + swizzle_factor_for_l2_m[0] * block_m, + swizzle_factor_for_l2_n[0] * block_n, + block_k, + ], + ) + + # Step 3. Reorder loops for tiling + + # Step 3.1 inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_m]) + j, j_inner = sch.split(j, factors=[None, micro_size_n]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = main_block + block_outer = sch.blockize(i_inner) + + # Step 3.2 outer loops for tiling + # split factors for i, j, and k + micro_block_cnt_in_warp_m = block_m // thread_z // micro_size_m + micro_block_cnt_in_warp_n = block_n // thread_y // micro_size_n + micro_block_cnt_in_warp_k = block_k // micro_size_k + + i_factors = swizzle_factor_for_l2_m + [thread_z, micro_block_cnt_in_warp_m] + j_factors = swizzle_factor_for_l2_n + [thread_y, micro_block_cnt_in_warp_n] + k_factors = [None, micro_block_cnt_in_warp_k] + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, factors=k_factors) + + sch.reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3) + + block_axis = sch.fuse(batch, i0, j0, i1, j1) + sch.bind(block_axis, "blockIdx.x") + + sch.bind(i2, "threadIdx.z") + sch.bind(j2, "threadIdx.y") + + # Step 4. 
Read/write to shared mem and register + def fetch_input(block_outer, read_buffer_idx, tensor_name: Literal["A", "B"], is_transpose): + # 1) Read to shared memory + block_read_smem = sch.cache_read(block_outer, read_buffer_idx, "shared.dyn") + sch.compute_at(block_read_smem, k0) + auto_inline_producers( + sch, block_read_smem, [dequantize_block] if dequantize_block else [] + ) + + # For transposed read, we directly load transposed tensor from global + # Then use ldmatrix.trans to handle transpose later + if (tensor_name == "A" and is_transpose) or (tensor_name == "B" and not is_transpose): + # specifical handle transpose read (for NN matmul or TT matmul) + v0, v1 = sch.get_loops(block_read_smem)[-2:] + sch.reorder(v1, v0) + sch.transform_layout(block_read_smem, ("write", 0), lambda b, i, j: (b, j, i)) + + # bind loops + fused = sch.fuse(*sch.get_loops(block_read_smem)[-2:]) + f0, f1, f2, f3, f4 = sch.split(fused, [None, thread_z, thread_y, thread_x, vector_size]) + sch.bind(f1, "threadIdx.z") + sch.bind(f2, "threadIdx.y") + sch.bind(f3, "threadIdx.x") + sch.vectorize(f4) + + # swizzling + sch.annotate(block_read_smem, ann_key="permuted_layout", ann_val=1) + + # 2) Read to register + block_read_reg = sch.cache_read(block_outer, read_buffer_idx, "warp") + sch.compute_at(block_read_reg, k1) + + # bind_loops + micro_size_spatial = micro_size_m if tensor_name == "A" else micro_size_n + micro_size_1, micro_size_2 = ( + (micro_size_spatial, micro_size_k) + if not is_transpose + else (micro_size_k, micro_size_spatial) + ) + v00, v01 = sch.split(sch.get_loops(block_read_reg)[-2], [None, micro_size_1]) + v10, v11 = sch.split(sch.get_loops(block_read_reg)[-1], [None, micro_size_2]) + sch.reorder(v00, v10, v01, v11) + + # reorder read axis to match the layout of ldmatrix + sch.transform_layout( + block_read_reg, + ("write", 0), + lambda v0, v1, v2: ( + v0, + v1 // micro_size_1, + v2 // micro_size_2, + *shared_16x16_to_mma_32x8_layout(v1 % micro_size_1, v2 % micro_size_2), + ), + ) + + # swizzling + mma_read_block = sch.blockize(sch.get_loops(block_read_reg)[-2]) + sch.annotate(mma_read_block, ann_key="permuted_layout", ann_val=1) + + return block_read_smem, block_read_reg + + block_read_a, block_read_reg_a = fetch_input(block_outer, 0, "A", is_transpose_a) + block_read_b, block_read_reg_b = fetch_input(block_outer, 1, "B", is_transpose_b) + + # Write to register, and then smem + def store_output(block_outer, write_buffer_idx): + # 1) Write to shared memory + block_write_smem = sch.cache_write(block_outer, write_buffer_idx, "shared.dyn") + sch.reverse_compute_at(block_write_smem, block_axis) + auto_inline_consumer_chain(sch, block_write_smem) + + # bind loops + fused = sch.fuse(*sch.get_loops(block_write_smem)[-2:]) + f0, f1, f2 = sch.split(fused, [None, thread_x, vector_size]) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + + # 2) Write to register + block_write_reg = sch.cache_write(block_outer, write_buffer_idx, "warp") + + # bind loops + v0, v1, v2 = sch.get_loops(block_write_reg)[-3:] + v11, v12, v13 = sch.split(v1, factors=[thread_z, None, micro_size_m]) + v21, v22, v23 = sch.split(v2, factors=[thread_y, None, micro_size_n]) + sch.reorder(v11, v21, v12, v22, v13, v23) + sch.bind(v11, "threadIdx.z") + sch.bind(v21, "threadIdx.y") + + # reorder write axis to match the layout of ldmatrix + sch.transform_layout( + block_write_reg, + ("read", 0), + lambda v0, v1, v2: ( + v0, + v1 // micro_size_m, + v2 // micro_size_n, + *shared_16x16_to_mma_32x8_layout(v1 % micro_size_m, v2 % micro_size_n), + ), + 
) + + return block_write_smem, block_write_reg + + block_write_smem, block_write_reg = store_output(block_outer, 0) + + # Step 5. Schedule tensor core computation + block_init = sch.decompose_reduction(block_outer, k0) + block_init_inner = sch.get_child_blocks(block_init)[0] + + intrin_group = get_mma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype=str(dtype_a), + out_dtype=str(dtype_c), + trans_a=is_transpose_a, + trans_b=is_transpose_b, + not_use_mma_store_intrinic=False, + ) + + sch.tensorize(sch.get_loops(block_init_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(block_read_reg_a)[-2], intrin_group["load_a"]) + sch.tensorize(sch.get_loops(block_read_reg_b)[-2], intrin_group["load_b"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + sch.tensorize(sch.get_loops(block_write_reg)[-2], intrin_group["store"]) + + # Step 6. Async pipeline + sch.annotate(k0, ann_key="software_pipeline_stage", ann_val=[0, 0, 3]) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) + sch.annotate(k0, ann_key="software_pipeline_async_stages", ann_val=[0]) + + # Step 7. Handle dequantize block + # Now we just add a dummy kernel to compute dequantize + if dequantize_block is not None: + auto_inline_producers(sch, dequantize_block) + loops = sch.get_loops(dequantize_block) + loop = sch.fuse(*loops) + v0, v1, v2, v3 = sch.split(loop, [None, 128, 2, 4]) + sch.bind(v0, "blockIdx.x") + sch.bind(v1, "threadIdx.x") + sch.unroll(v2) + sch.vectorize(v3) + return sch + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> Optional[tir.Schedule]: + if "dequantize_info" in func.attrs: + dequantize_rule = MatmulTensorizationMMAWithDequantizeInfo() + return dequantize_rule.apply_config(func, config) + + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_mma_intrin_group, + ) + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + output_blocks = [sch.get(block) for block in sch.get_output_blocks(root_block)] + + def check_require_cache(func: tir.PrimFunc): + conditions: List[bool] = [] + + # check if has dynamic symbolic + def check_has_dynamic(func: tir.PrimFunc): + for param in func.params: + if param not in func.buffer_map: + continue + arg = func.buffer_map[param] + for i in arg.shape: + if isinstance(i, tir.Var): + return True + return False + + conditions.append(check_has_dynamic(func)) + # check if has post process + conditions.append(sch.get(main_block) not in output_blocks) + return any(conditions) + + cache_write_required = check_require_cache(func) + + shared_scope = "shared" + + intrin_info = config.intrin_info + intrin_group = get_mma_intrin_group( + load_scope=shared_scope, + store_scope=shared_scope if cache_write_required else "global", + in_dtype=intrin_info.in_dtype, + out_dtype=intrin_info.out_dtype, + trans_a=intrin_info.trans_a, + trans_b=intrin_info.trans_b, + smooth_a=intrin_info.smooth_a, + smooth_b=intrin_info.smooth_b, + not_use_mma_store_intrinic=False, + ) + + # Start Schedule + # Step 0. Get schedule config. 
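+        # The tuning config supplies the tile sizes (config.block / config.warp), the
+        # reduction step (config.rstep), the software pipeline depth (config.pipeline_stage),
+        # the async-copy flag (config.use_async), the vectorization widths (config.vectorize)
+        # and the tensor core intrin_info consumed below.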
+ # NOTE: we can analyze the config by the hardware spec in the future + + # tensor core intrinsic size + warp_row_tiles = config.warp[0] + warp_col_tiles = config.warp[1] + block_row_warps = config.block[0] // warp_row_tiles + block_col_warps = config.block[1] // warp_col_tiles + stage = config.pipeline_stage + use_async = config.use_async + chunk = config.rstep[0] + + micro_size_x, micro_size_y, micro_size_k = intrin_group["micro_kernel"] + + # get the axis for layout transform + def get_axis(l, r, trans): + return (r, l) if trans else (l, r) + + a_lr = get_axis(micro_size_x, micro_size_k, intrin_info.trans_a) + b_lr = get_axis(micro_size_k, micro_size_y, intrin_info.trans_b) + + def can_enable_swizzle(dtype: str, smooth: bool): + # inject_permuted_layout only support float16 currently + if dtype == "float16": + # if we use smooth layout, we don't need to do swizzling + return not smooth + return False + + can_swizzle_a = can_enable_swizzle(intrin_info.in_dtype, intrin_info.smooth_a) + can_swizzle_b = can_enable_swizzle(intrin_info.in_dtype, intrin_info.smooth_b) + + warp_size = 32 + + i_factors, j_factors, k_factors = ( + [None, 1, block_row_warps, warp_row_tiles // micro_size_x], + [1, None, block_col_warps, warp_col_tiles // micro_size_y], + [None, chunk // micro_size_k], + ) + + num_ty = i_factors[2] + num_tz = j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] + if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + + sch.reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3) + + block_idy = sch.fuse(i0, j0) + block_idx = sch.fuse(i1, j1) + thread_idy = i2 + thread_idz = j2 + + # plan rasteration + if ( + not isinstance(config.rasterization_plan, NoRasterization) + and sch.get(batch).extent.value == 1 + ): + device_func, invoke_func = config.rasterization_plan.get_code() + factor = config.rasterization_plan.panel_width_ + + # TODO(lei): this is a trick for rasterization implementation + # is not optimal. 
(5% performance loss) + # require a solution for general block rasterization + factor = 8 # should be divisible by block_idx + if sch.get(block_idx).extent.value % factor == 0: + block_k, block_idx = sch.split(block_idx, factors=[None, factor]) + sch.reorder(block_k, block_idy, block_idx) + sch.bind(block_k, "blockIdx.z") + else: + sch.bind(batch, "blockIdx.z") + + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + sch.bind(thread_idz, "threadIdx.z") + + # rewrite smooth layout of shared memory + def smooth_smem_layout_rewrite(block, scope, l=16, r=16, enable=True): + if not enable: + return + sch.transform_layout( + block, + scope, + lambda b, i, j: ( + b, + i // l, + j // r, + i % l, + j % r, + ), + ) + + smooth_smem_layout_rewrite(block_outer, ("read", 0), *a_lr, enable=intrin_info.smooth_a) + smooth_smem_layout_rewrite(block_outer, ("read", 1), *b_lr, enable=intrin_info.smooth_b) + smooth_smem_layout_rewrite(block_outer, ("write", 0), enable=True) + + def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False, trans=False): + block_read = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_read, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_read).iter_vars) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + f_0, f_1, f_2, f_3, f_4 = sch.split( + fused, factors=[num_ty, num_tz, None, warp_size, vec_len] + ) + + sch.bind(f_3, "threadIdx.x") + sch.bind(f_1, "threadIdx.z") + sch.bind(f_0, "threadIdx.y") + sch.vectorize(f_4) + sch.unroll(f_2) + # Apply Swizzling + sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) + # if not, apply padding to alleviate bank conflict + if not (can_swizzle or is_smooth): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) + sch.annotate(f_2, "pragma_unroll_explicit", False) + return block_read + + a_g2s = fetch_to_shared( + block_outer, + 0, + vec_len=list(config.vectorize.values())[0], + can_swizzle=can_swizzle_a, + is_smooth=intrin_info.smooth_a, + trans=intrin_info.trans_a, + ) + b_g2s = fetch_to_shared( + block_outer, + 1, + vec_len=list(config.vectorize.values())[1], + can_swizzle=can_swizzle_b, + is_smooth=intrin_info.smooth_b, + trans=intrin_info.trans_b, + ) + + # rewrite global smooth layout + def smooth_gmem_layout_rewrite(sch, block, enable=True, trans=False): + if not enable: + return + # step1: find the first producer block + # Notes: we assume the layout propagate happens in the first producer block + # otherwise, the layout transform will have no effect as it will transform both + # read and write buffer + producers = _collect_producers(sch, block) + + propagate_block: tir.Block = producers[-1] + + # step2: transform the layout with inverse permutation + _, inverse_indexmap = get_propagate_map(trans=trans, dtype=intrin_info.in_dtype) + + def inverse_permutation(i, j, ii, jj): + return (i, j, *inverse_indexmap.map_indices([ii, jj])) + + sch.transform_layout(propagate_block, ("read", 0), inverse_permutation) + + smooth_gmem_layout_rewrite(sch, a_g2s, intrin_info.smooth_a, intrin_info.trans_a) + smooth_gmem_layout_rewrite(sch, b_g2s, intrin_info.smooth_b, intrin_info.trans_b) + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "warp") + B_mat = sch.cache_read(block_outer, 1, "warp") + 
sch.compute_at(A_mat, k1) + sch.compute_at(B_mat, k1) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + if cache_write_required: + accumulator_shared_to_global = sch.cache_write(block_outer, 0, shared_scope) + + store = sch.cache_write(block_outer, 0, "warp") + sch.reverse_compute_at(store, j2) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, micro_size_x]) + j0, j1 = sch.split(j, factors=[None, micro_size_y]) + sch.reorder(i0, j0, i1, j1) + + if cache_write_required: + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + sch.reverse_compute_at( + accumulator_shared_to_global, sch.get_loops(store)[-3], preserve_unit_loops=True + ) + + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) + f0, f1, f2 = sch.split( + fused, factors=[None, warp_size, max(list(config.vectorize.values()))] + ) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + sch.unroll(f0) + sch.annotate(f0, "pragma_unroll_explicit", False) + else: + auto_inline_consumer_chain(sch, store) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + index_map_a, index_map_b, index_map_c = intrin_group["index_map"] + + sch.transform_layout( + A_mat, ("write", 0), get_warp_index_map(index_map_a, *a_lr, intrin_info.smooth_a) + ) + sch.transform_layout( + B_mat, ("write", 0), get_warp_index_map(index_map_b, *b_lr, intrin_info.smooth_b) + ) + sch.transform_layout( + store, + ("read", 0), + get_warp_index_map(index_map_c, is_5d=True), + ) + + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, a_lr[0]]) + j0, j1 = sch.split(j, factors=[None, a_lr[1]]) + sch.reorder(i0, j0, i1, j1) + ba = sch.blockize(i1) + sch.annotate(ba, ann_key="permuted_layout", ann_val=can_swizzle_a) + sch.tensorize(ba, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, b_lr[0]]) + j0, j1 = sch.split(j, factors=[None, b_lr[1]]) + sch.reorder(i0, j0, i1, j1) + bb = sch.blockize(i1) + sch.annotate(bb, ann_key="permuted_layout", ann_val=can_swizzle_b) + sch.tensorize(bb, intrin_group["load_b"]) + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + tensorize_init_store_compute() + + if stage > 1: + sch.annotate(k0, ann_key="software_pipeline_stage", ann_val=[0, 0, stage - 1]) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) + if use_async: + sch.annotate(k0, "software_pipeline_async_stages", [0]) + + return sch diff --git a/python/bitblas/gpu/matmul_mma_dequantize.py b/python/bitblas/gpu/matmul_mma_dequantize.py new file mode 100644 index 0000000000..3f902a7ac6 --- /dev/null +++ b/python/bitblas/gpu/matmul_mma_dequantize.py @@ -0,0 +1,540 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring, invalid-name +"""A GEMM schedule rule for GPU operators.""" +from typing import Literal, Optional + +from tvm import tir +from tvm.target import Target + +from ..base.roller.rasterization import NoRasterization +from ..base import analysis +from .base import GPUScheduleRule +from .matmul_analysis import ( + auto_inline_consumer_chain, + auto_inline_producers, + get_reduction_blocks, + get_dequantize_block, + normalize_to_matmul, +) + + +def get_index_map_3d(index_map, l=16, r=16): + def index_map_3d(b, i, j): + return ( + b, + i // l, + j // r, + *index_map(i % l, j % r), + ) + + return index_map_3d + + +def get_index_map_5d(index_map): + """ + for layout transformed gemm, the index map should be 5d + """ + + def index_map_5d(b, i, j, ii, jj): + return ( + b, + i, + j, + *index_map(ii, jj), + ) + + return index_map_5d + + +def get_index_map(index_map, l=16, r=16, is_5d=False): + if is_5d: + return get_index_map_5d(index_map) + return get_index_map_3d(index_map, l, r) + + +class MatmulTensorizationMMAWithDequantizeInfo(GPUScheduleRule): + """ + The schedule rule for float16 tensor core matmul computation. + func with attr 'dlight.do_not_tensorize' will not be tensorized. + """ + + def sch_dequantize_in_register_with_config( + self, + func: tir.PrimFunc, + config, + ): + """ + Simple dequantize schedule without shared memory prefetch. + quantized weight + | + V + dequantized in register + | + V + save into shared memory + | + V + compute + """ + + return None + + def sch_shared_memory_prefetch_with_config( + self, + func: tir.PrimFunc, + config, + ): + """ + For A100 Like devices, the shared memory prefetch(async) is required + to achieve optimal performance. + quantized weight + | + V + shared memory prefetch (with async copy) + | + V + dequantized into shared memory + | + V + compute + """ + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_mma_intrin_group, + ) + from .intrin.lop3 import get_lop3_intrin_group + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + # always enable shared memory rewrite + cache_write_required = True + + # Check Dequantize Info + # TODO(leiwang): this is a hack to get the configuaration, can be improved by writing a pass to analysis the dequantize block. 
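+        # `dequantize_info` is expected to map the decode buffer name to a dict roughly like
+        #   {"source_format": {"format": "int", "bits": 4}, "target_format": "float16",
+        #    "fast_decoding": True, ...}
+        # (illustrative only; the exact keys are validated by the checks below)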
+ dequantize_info = func.attrs["dequantize_info"] + + def check_dequantize_info(dequantize_info): + conditions = [] + # currently only support weight only dequantization + conditions.append(len(dequantize_info) == 1) + # TODO(@lei) check if the dequantize value name is weight + return all(conditions) + + assert check_dequantize_info(dequantize_info) + + (B_decode_info,) = list(dequantize_info.values()) + + def check_b_decode_info(B_decode_info): + conditions = [] + # check source format in ["int", "fp", "af"] + conditions.append("source_format" in B_decode_info) + conditions.append(B_decode_info["source_format"]["format"] in ["int", "fp", "af"]) + # check source bits in [1, 2, 4, 8] + conditions.append(B_decode_info["source_format"]["bits"] in [1, 2, 4, 8]) + # check target format in ["float16", "int8"] + conditions.append("target_format" in B_decode_info) + conditions.append(B_decode_info["target_format"] in ["float16", "int8"]) + return all(conditions) + + assert check_b_decode_info(B_decode_info) + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + # tensor core intrinsic size + intrin_info = config.intrin_info + shared_scope = "shared" + + intrin_info = config.intrin_info + intrin_group = get_mma_intrin_group( + load_scope=shared_scope, + store_scope=shared_scope if cache_write_required else "global", + in_dtype=intrin_info.in_dtype, + out_dtype=intrin_info.out_dtype, + trans_a=intrin_info.trans_a, + trans_b=intrin_info.trans_b, + smooth_a=intrin_info.smooth_a, + smooth_b=intrin_info.smooth_b, + not_use_mma_store_intrinic=False, + ) + + warp_row_tiles = config.warp[0] + warp_col_tiles = config.warp[1] + block_row_warps = config.block[0] // warp_row_tiles + block_col_warps = config.block[1] // warp_col_tiles + stage = config.pipeline_stage + use_async = config.use_async + chunk = config.rstep[0] + + micro_size_x, micro_size_y, micro_size_k = intrin_group["micro_kernel"] + + # get the axis for layout transform + def get_axis(l, r, trans): + return (r, l) if trans else (l, r) + + a_lr = get_axis(micro_size_x, micro_size_k, intrin_info.trans_a) + b_lr = get_axis(micro_size_k, micro_size_y, intrin_info.trans_b) + + def can_enable_swizzle(dtype: str, smooth: bool): + # inject_permuted_layout only support float16 currently + if dtype == "float16": + # if we use smooth layout, we don't need to do swizzling + return not smooth + return False + + can_swizzle_a = can_enable_swizzle(intrin_info.in_dtype, intrin_info.smooth_a) + can_swizzle_b = can_enable_swizzle(intrin_info.in_dtype, intrin_info.smooth_b) + + warp_size = 32 + + i_factors, j_factors, k_factors = ( + [None, 1, block_row_warps, warp_row_tiles // micro_size_x], + [1, None, block_col_warps, warp_col_tiles // micro_size_y], + [None, chunk // micro_size_k], + ) + + num_ty = i_factors[2] + num_tz = j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] + if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. 
Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + + sch.reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3) + + block_idy = sch.fuse(i0, j0) + block_idx = sch.fuse(i1, j1) + thread_idy = i2 + thread_idz = j2 + + # plan rasteration + if ( + not isinstance(config.rasterization_plan, NoRasterization) + and sch.get(batch).extent.value == 1 + ): + device_func, invoke_func = config.rasterization_plan.get_code() + factor = config.rasterization_plan.panel_width_ + + # TODO(lei): this is a trick for rasterization implementation + # is not optimal. + # require a solution for general block rasterization + factor = 8 # should be divisible by block_idy + if sch.get(block_idx).extent.value % factor == 0: + block_k, block_idx = sch.split(block_idx, factors=[None, factor]) + sch.bind(block_k, "blockIdx.z") + else: + sch.bind(batch, "blockIdx.z") + + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + sch.bind(thread_idz, "threadIdx.z") + + def smooth_layout_recover(block, scope, l=16, r=16, enable=True): + if not enable: + return + sch.transform_layout( + block, + scope, + lambda b, i, j: ( + b, + i // l, + j // r, + i % l, + j % r, + ), + ) + + smooth_layout_recover(block_outer, ("read", 0), *a_lr, enable=intrin_info.smooth_a) + smooth_layout_recover( + block_outer, + ("read", 1), + *b_lr, + enable=intrin_info.smooth_b, + ) + smooth_layout_recover(block_outer, ("write", 0), enable=True) + + def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): + block_read = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_read, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_read).iter_vars) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + f_0, f_1, f_2, f_3, f_4 = sch.split( + fused, factors=[num_ty, num_tz, None, warp_size, vec_len] + ) + + sch.bind(f_3, "threadIdx.x") + sch.bind(f_1, "threadIdx.z") + sch.bind(f_0, "threadIdx.y") + sch.vectorize(f_4) + sch.unroll(f_2) + # Apply Swizzling + sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) + # if not, apply padding to alleviate bank conflict + if not (can_swizzle or is_smooth): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) + sch.annotate(f_2, "pragma_unroll_explicit", False) + return block_read + + a_g2s = fetch_to_shared( + block_outer, + 0, + vec_len=list(config.vectorize.values())[0], + can_swizzle=can_swizzle_a, + is_smooth=intrin_info.smooth_a, + ) + + auto_inline_producers(sch, a_g2s) + + def decode_fetch_to_shared(block, idx): + # step1. create memory hierarchy + # global -> local -> shared + block_shared = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_shared, k0, preserve_unit_loops=True) + + # TODO(lei): the factor shoule be analyzed more deeper. 
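+            # the [None, 1, 8] split keeps an 8-element innermost axis so that the
+            # shared-memory store can be vectorized and the fast LOP3 decode can be
+            # tensorized on the dequantize block further below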
+ _, B_shared_vi, _ = sch.split(sch.get_loops(block_shared)[-1], factors=[None, 1, 8]) + block_shared_local = sch.cache_read(block_shared, 0, "local") + # global -> dequantzed_local -> shared + # step2. inline to local block + auto_inline_producers(sch, block_shared_local) + + # get target dequantize buffer's idx + def get_idx(): + # for LUT dequantize, the expr is LUT(w), the idx is 1 + # maybe we can use a more general and structual based way + # to analysis the idx + if B_decode_info["source_format"]["format"] == "af": + return 1 + return 0 + + b_idx = get_idx() + # global -> prefetch_local -> dequantzed_local -> shared + block_shared_local_local = sch.cache_read(block_shared_local, b_idx, "local") + # global -> prefetch_shared -> vector load -> dequantzed_local -> shared + block_shared_local_local_shared = sch.cache_read( + block_shared_local_local, 0, shared_scope + ) + sch.compute_at(block_shared_local, B_shared_vi, preserve_unit_loops=True) + sch.compute_at(block_shared_local_local, B_shared_vi, preserve_unit_loops=True) + + dequantize_block = block_shared_local + # fast type conversion + if "fast_decoding" in B_decode_info and B_decode_info["fast_decoding"]: + intrin_group = get_lop3_intrin_group( + in_dtype="int8", out_dtype="float16", storage_nbit=4, with_scale=False + ) + sch.tensorize(sch.get_loops(dequantize_block)[-1], intrin_group["compute"]) + sch.annotate( + thread_idz, ann_key="pragma_import_c", ann_val=intrin_group["c_source"] + ) + + sch.annotate(block_shared, ann_key="permuted_layout", ann_val=can_swizzle_b) + union_len = (2 + 4) if intrin_info.smooth_b else (2 + 2) + B_shared_fused = sch.fuse(*sch.get_loops(block_shared)[-union_len:-2]) + _, B_shared_ty, B_shared_tz, B_shared_tx = sch.split( + B_shared_fused, factors=[None, num_ty, num_tz, warp_size] + ) + if not (can_swizzle_b or intrin_info.smooth_b): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_shared, 0, axis=-2, factor=16, offset=pad_offset) + sch.bind(B_shared_tx, "threadIdx.x") + sch.bind(B_shared_ty, "threadIdx.y") + sch.bind(B_shared_tz, "threadIdx.z") + sch.vectorize(sch.get_loops(block_shared)[-1]) + sch.vectorize(sch.get_loops(block_shared_local_local)[-1]) + + sch.compute_at(block_shared_local_local_shared, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_shared_local_local_shared).iter_vars) + fused = sch.fuse(*sch.get_loops(block_shared_local_local_shared)[-ndim:]) + + f_0, f_1, f_2, f_3, f_4 = sch.split( + fused, factors=[None, num_tz, num_ty, warp_size, 16] # int8x16 = 128bits + ) + + sch.bind(f_3, "threadIdx.x") + sch.bind(f_2, "threadIdx.y") + sch.bind(f_1, "threadIdx.z") + sch.vectorize(f_4) + sch.unroll(f_0) + sch.annotate(f_0, "pragma_unroll_explicit", False) + + # cache small tensors, e.g. 
LUT + if b_idx: + block_shared_lut = sch.cache_read(dequantize_block, 0, shared_scope) + sch.reverse_compute_at(block_shared_lut, j2) + _, B_shared_tx = sch.split( + sch.get_loops(block_shared_lut)[-1], factors=[None, warp_size] + ) + sch.bind(B_shared_tx, "threadIdx.x") + return block_shared_local + + dequantize_block = decode_fetch_to_shared(block_outer, 1) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "warp") + B_mat = sch.cache_read(block_outer, 1, "warp") + sch.compute_at(A_mat, k1, preserve_unit_loops=True) + sch.compute_at(B_mat, k1, preserve_unit_loops=True) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + if cache_write_required: + accumulator_shared_to_global = sch.cache_write(block_outer, 0, shared_scope) + + store = sch.cache_write(block_outer, 0, "warp") + sch.reverse_compute_at(store, j2) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, micro_size_x]) + j0, j1 = sch.split(j, factors=[None, micro_size_y]) + sch.reorder(i0, j0, i1, j1) + + if cache_write_required: + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + sch.reverse_compute_at( + accumulator_shared_to_global, sch.get_loops(store)[-3], preserve_unit_loops=True + ) + + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) + f0, f1, f2 = sch.split( + fused, factors=[None, warp_size, max(list(config.vectorize.values()))] + ) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + sch.unroll(f0) + sch.annotate(f0, "pragma_unroll_explicit", False) + else: + auto_inline_consumer_chain(sch, store) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + + index_map_a, index_map_b, index_map_c = intrin_group["index_map"] + + sch.transform_layout( + A_mat, ("write", 0), get_index_map(index_map_a, *a_lr, intrin_info.smooth_a) + ) + sch.transform_layout( + B_mat, ("write", 0), get_index_map(index_map_b, *b_lr, intrin_info.smooth_b) + ) + sch.transform_layout( + store, + ("read", 0), + get_index_map(index_map_c, is_5d=True), + ) + + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, a_lr[0]]) + j0, j1 = sch.split(j, factors=[None, a_lr[1]]) + sch.reorder(i0, j0, i1, j1) + ba = sch.blockize(i1) + sch.annotate(ba, ann_key="permuted_layout", ann_val=can_swizzle_a) + sch.tensorize(ba, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, b_lr[0]]) + j0, j1 = sch.split(j, factors=[None, b_lr[1]]) + sch.reorder(i0, j0, i1, j1) + bb = sch.blockize(i1) + sch.annotate(bb, ann_key="permuted_layout", ann_val=can_swizzle_b) + sch.tensorize(bb, intrin_group["load_b"]) + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + tensorize_init_store_compute() + + if stage > 1: + sch.annotate( + k0, ann_key="software_pipeline_stage", ann_val=[0, 0, stage - 1, stage - 1] + ) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2, 3]) + if use_async: + sch.annotate(k0, "software_pipeline_async_stages", [0]) + return sch + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) 
-> Optional[tir.Schedule]: + def check_sm_version(arch: str) -> int: + sm_version = arch.replace("sm_", "") + return int(sm_version) if sm_version.isdigit() else -1 + + if check_sm_version(config.arch.target.arch) < 80: + """MMA Template only support sm_80 and above""" + return None + + if ( + config.arch.target.kind.name == "cuda" + and check_sm_version(config.arch.target.arch) == 80 + ): + return self.sch_shared_memory_prefetch_with_config(func, config) + else: + return self.sch_with_config(func, config) diff --git a/python/bitblas/gpu/matmul_wmma.py b/python/bitblas/gpu/matmul_wmma.py new file mode 100644 index 0000000000..682c40d1e1 --- /dev/null +++ b/python/bitblas/gpu/matmul_wmma.py @@ -0,0 +1,922 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring, invalid-name +"""A GEMM schedule rule for GPU operators.""" +import math +from typing import Literal, Optional + +from tvm import DataType, tir +from tvm.target import Target +from tvm.tir.stmt import ForKind + +from ..base.roller.rasterization import NoRasterization +from ..base import analysis +from .base import GPUScheduleRule +from .matmul_analysis import ( + auto_inline_consumer_chain, + auto_inline_consumers, + auto_inline_producers, + get_index_map, + get_reduction_blocks, + normalize_to_matmul, +) + + +class MatmulTensorizationWMMA(GPUScheduleRule): + """ + The schedule rule for float16 tensor core matmul computation. + func with attr 'dlight.do_not_tensorize' will not be tensorized. + """ + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Optional[tir.Schedule]: + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + block_stmt = sch.get(main_block) + index_maps = get_index_map(block_stmt) + if index_maps is None: + return None + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + block_m = 128 + block_n = 128 + block_k = 32 + + # tensor core intrinsic size + micro_size_m = 16 + micro_size_n = 16 + micro_size_k = 16 + + thread_z = 2 + thread_y = 2 + warp_size = 32 + thread_cnt = thread_y * thread_z * warp_size + + vector_size = 8 + unroll_depth = 256 + + # Step 1. 
Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K] + block = sch.reindex(main_block, ("read", 0)) + sch.transform_layout(block, ("write", 0), a_index_map) + block = sch.reindex(main_block, ("read", 1)) + sch.transform_layout(block, ("write", 0), b_index_map) + block = sch.reindex(main_block, ("write", 0)) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + + # Step 2. Padding for dynamic shape kernels + + # # Step 2.1 Swizzle for l2, for better performance on inputs exceeding l2 size + # # Get input shape + batch, i, j, k = sch.get_loops(main_block) + # input_b, input_m, input_n, input_k = [sch.get(loop).extent for loop in [batch, i, j, k]] + + # # Get input/output dtype + dtype_a, dtype_b = [DataType(region.buffer.dtype) for region in sch.get(main_block).reads] + dtype_c = DataType(sch.get(main_block).writes[0].buffer.dtype) + # dtype_a_bytes, dtype_b_bytes = [math.ceil(d.bits / 8) for d in [dtype_a, dtype_b]] + + # # Get l2 size + # l2_size = target.l2_cache_size_bytes + + # # Analyse swizzle factor + # def get_swizzle_factor(l2_size, input_k, dtype_bytes, input_spatial, block_size): + # if l2_size != 0 and isinstance(input_k, (int, tir.IntImm)): + # # div by 3: suppose the two inputs and the output uses the same amount of l2 + # swizzle_factor = l2_size / 3 / int(input_k) / dtype_bytes / block_size + # # optimization: try find the best swizzle factor (aka the least additional padding) + # if isinstance(input_spatial, (int, tir.IntImm)): + # block_cnt = math.ceil(int(input_spatial) / block_size) + # swizzle_factor = math.ceil(block_cnt / math.ceil(block_cnt / swizzle_factor)) + # else: + # swizzle_factor = math.floor(swizzle_factor) + # return [None, swizzle_factor] + # else: + # return [4, None] + + # swizzle_factor_m = get_swizzle_factor(l2_size, input_k, dtype_a_bytes, input_m, block_m) + # swizzle_factor_n = get_swizzle_factor(l2_size, input_k, dtype_b_bytes, input_n, block_n) + + swizzle_factor_m = [4, None] + swizzle_factor_n = [4, None] + + # Step 2.2 Add padding + sch.pad_einsum( + main_block, + [ + 1, + (swizzle_factor_m[0] or swizzle_factor_m[1]) * block_m, + (swizzle_factor_n[0] or swizzle_factor_n[1]) * block_n, + block_k, + ], + ) + + # Step 3. Reorder loops for tiling + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_m]) + j, j_inner = sch.split(j, factors=[None, micro_size_n]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = main_block + block_outer = sch.blockize(i_inner) + + # split factors for i, j, and k + in_wrap_block_cnt_m = block_m // thread_z // micro_size_m + in_wrap_block_cnt_n = block_n // thread_y // micro_size_n + in_wrap_block_cnt_k = block_k // micro_size_k + + i_factors = swizzle_factor_m + [thread_z, in_wrap_block_cnt_m] + j_factors = swizzle_factor_n + [thread_y, in_wrap_block_cnt_n] + k_factors = [None, in_wrap_block_cnt_k] + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, factors=k_factors) + + sch.reorder(i0, j0, i1, j1, k0, i2, j2, k1, i3, j3) + block_axis = sch.fuse(batch, i0, j0, i1, j1) + + sch.bind(block_axis, "blockIdx.x") + sch.bind(i2, "threadIdx.z") + sch.bind(j2, "threadIdx.y") + + # Step 4. 
Read to/write from shared mem, and from/to wmma fragments + def fetch_input(block_outer, read_buffer_idx, tensor_name: Literal["A", "B"], wmma_name): + block_read = sch.cache_read(block_outer, read_buffer_idx, "shared.dyn") + sch.compute_at(block_read, k0) + fused = sch.fuse(*sch.get_loops(block_read)[-2:]) + + f0, f1, f2, f3, f4 = sch.split( + fused, [None, thread_z, thread_y, warp_size, vector_size] + ) + + sch.bind(f1, "threadIdx.z") + sch.bind(f2, "threadIdx.y") + sch.bind(f3, "threadIdx.x") + sch.vectorize(f4) + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=8) + + auto_inline_producers(sch, block_read) + + wmma_read = sch.cache_read(block_outer, read_buffer_idx, wmma_name) + sch.compute_at(wmma_read, k1) + + micro_size_spatial = micro_size_m if tensor_name == "A" else micro_size_n + v0, v1 = sch.get_loops(wmma_read)[-2:] + sch.split(v0, factors=[None, micro_size_spatial]) + + return wmma_read + + wmma_read_a = fetch_input( + block_outer, 0, [block_m, block_k, micro_size_m, micro_size_k], "wmma.matrix_a" + ) + wmma_read_b = fetch_input( + block_outer, 1, [block_n, block_k, micro_size_n, micro_size_k], "wmma.matrix_b" + ) + + def store_output(block_outer, write_buffer_idx, wmma_name): + block_write = sch.cache_write(block_outer, write_buffer_idx, "shared.dyn") + sch.reverse_compute_at(block_write, block_axis) + + fused = sch.fuse(*sch.get_loops(block_write)[-2:]) + + f0, f1, f2, f3, f4 = sch.split( + fused, [None, thread_z, thread_y, warp_size, vector_size] + ) + + sch.bind(f1, "threadIdx.z") + sch.bind(f2, "threadIdx.y") + sch.bind(f3, "threadIdx.x") + sch.vectorize(f4) + # sch.storage_align(block_write, 0, axis=-2, factor=128, offset=16) + + auto_inline_consumer_chain(sch, block_write) + + wmma_store = sch.cache_write(block_outer, write_buffer_idx, wmma_name) + v0, v1 = sch.get_loops(wmma_store)[-2:] + v00, v01, v02 = sch.split(v0, factors=[thread_z, None, micro_size_m]) + v10, v11, v12 = sch.split(v1, factors=[thread_y, None, micro_size_n]) + sch.reorder(v00, v10, v01, v11, v02, v12) + sch.bind(v00, "threadIdx.z") + sch.bind(v10, "threadIdx.y") + return wmma_store + + wmma_store = store_output(block_outer, 0, "wmma.accumulator") + + block_init = sch.decompose_reduction(block_outer, k0) + block_init_inner = sch.get_child_blocks(block_init)[0] + + # unroll k + sch.unroll(k0) + + # Step 5. Schedule tensor core computation + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_wmma_intrin_group, + ) + + intrin_group = get_wmma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype=str(dtype_a), + out_dtype=str(dtype_c), + trans_b=True, + ) + + sch.tensorize(sch.get_loops(block_init_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(wmma_read_a)[-2], intrin_group["load_a"]) + sch.tensorize(sch.get_loops(wmma_read_b)[-2], intrin_group["load_b"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + sch.tensorize(sch.get_loops(wmma_store)[-2], intrin_group["store"]) + + return sch + + +class MatmulInt8Tensorization(GPUScheduleRule): + """ + The schedule rule for int8 tensor core matmul computation. + func with attr 'dlight.do_not_tensorize' will not be tensorized. 
+ """ + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Optional[tir.Schedule]: + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_wmma_intrin_group, + ) + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + block_stmt = sch.get(main_block) + index_maps = get_index_map(block_stmt) + if index_maps is None: + return None + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + # tensor core intrinsic size + micro_size_x = 16 + micro_size_y = 16 + micro_size_k = 16 + + warp_size = 32 + vector_size = 4 + + i_factors, j_factors, k_factors = ( + [None, 1, 4, 2], + [1, None, 4, 2], + [None, 1], + ) + + num_ty = i_factors[2] * j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K] + block = sch.reindex(main_block, ("read", 0)) + sch.transform_layout(block, ("write", 0), a_index_map) + block = sch.reindex(main_block, ("read", 1)) + sch.transform_layout(block, ("write", 0), b_index_map) + block = sch.reindex(main_block, ("write", 0)) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. 
Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + sch.annotate(k0, "software_pipeline_order", [0, 3, 1, 4, 5, 2, 6]) + sch.annotate(k0, "software_pipeline_stage", [0, 0, 0, 0, 0, 1, 1]) + sch.annotate(k1, "software_pipeline_order", [0, 1, 2]) + sch.annotate(k1, "software_pipeline_stage", [0, 0, 1]) + + sch.reorder(i0, j0, i1, j1, j2, i2, k0, k1, i3, j3) + + block_idx = sch.fuse(i0, j0) + block_idy = sch.fuse(i1, j1) + thread_idy = sch.fuse(j2, i2) + sch.bind(batch, "blockIdx.z") + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + + def fetch_to_shared(block, idx, ndim): + block_read = sch.cache_read(block, idx, "shared.dyn") + sch.compute_at(block_read, k0) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + _, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vector_size]) + + sch.bind(f_2, "threadIdx.x") + sch.bind(f_1, "threadIdx.y") + sch.vectorize(f_3) + + sch.storage_align(block_read, 0, axis=-2, factor=32, offset=16) + sch.annotate(block_read, "tir.manifest_shared_memory_local_stage", 1) + sch.annotate(block_read, "double_buffer_scope", 0) + return block_read + + a_g2s = fetch_to_shared(block_outer, 0, 2) + b_g2s = fetch_to_shared(block_outer, 1, 2) + + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "wmma.matrix_a") + B_mat = sch.cache_read(block_outer, 1, "wmma.matrix_b") + sch.compute_at(A_mat, k1) + sch.compute_at(B_mat, k1) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + accumulator_shared_to_global = sch.cache_write(block_outer, 0, "shared.dyn") + sch.storage_align(accumulator_shared_to_global, 0, -2, 16, 4) + + store = sch.cache_write(block_outer, 0, "wmma.accumulator") + sch.reverse_compute_at(store, thread_idy) + sch.reverse_compute_at(accumulator_shared_to_global, thread_idy) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + intrin_group = get_wmma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype="int8", + out_dtype="int32", + trans_b=True, + ) + + try: + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_b"]) + except: # pylint: 
disable=bare-except + return None + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + try: + tensorize_init_store_compute() + except: # pylint: disable=bare-except + return None + + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-2:]) + _, f1, f2 = sch.split(fused, factors=[None, warp_size, vector_size]) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + + return sch + + +class MatmulTensorizationLegacy(GPUScheduleRule): + """ + The schedule rule for float16 tensor core matmul computation. + func with attr 'dlight.do_not_tensorize' will not be tensorized. + """ + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Optional[tir.Schedule]: + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_wmma_intrin_group, + ) + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + block_stmt = sch.get(main_block) + index_maps = get_index_map(block_stmt) + if index_maps is None: + return None + matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + # tensor core intrinsic size + micro_size_x = 16 + micro_size_y = 16 + micro_size_k = 16 + + warp_size = 32 + vector_size = 4 + + i_factors, j_factors, k_factors = ( + [None, 1, 4, 2], + [1, None, 4, 2], + [None, 4], + ) + + num_ty = i_factors[2] * j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K] + block = sch.reindex(main_block, ("read", 0)) + sch.transform_layout(block, ("write", 0), a_index_map) + block = sch.reindex(main_block, ("read", 1)) + sch.transform_layout(block, ("write", 0), b_index_map) + block = sch.reindex(main_block, ("write", 0)) + sch.transform_layout(block, ("read", 0), c_index_map) + sch.transform_block_layout(main_block, matmul_index_map) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. 
Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + sch.annotate(k0, "software_pipeline_order", [0, 3, 1, 4, 5, 2, 6]) + sch.annotate(k0, "software_pipeline_stage", [0, 0, 0, 0, 0, 1, 1]) + sch.annotate(k1, "software_pipeline_order", [0, 1, 2]) + sch.annotate(k1, "software_pipeline_stage", [0, 0, 1]) + + sch.reorder(i0, j0, i1, j1, j2, i2, k0, k1, i3, j3) + + block_idx = sch.fuse(i0, j0) + block_idy = sch.fuse(i1, j1) + thread_idy = sch.fuse(j2, i2) + sch.bind(batch, "blockIdx.z") + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + + def fetch_to_shared(block, idx, ndim): + block_read = sch.cache_read(block, idx, "shared.dyn") + sch.compute_at(block_read, k0) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + _, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vector_size]) + + sch.bind(f_2, "threadIdx.x") + sch.bind(f_1, "threadIdx.y") + sch.vectorize(f_3) + + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=8) + sch.annotate(block_read, "tir.manifest_shared_memory_local_stage", 1) + sch.annotate(block_read, "double_buffer_scope", 0) + return block_read + + a_g2s = fetch_to_shared(block_outer, 0, 2) + b_g2s = fetch_to_shared(block_outer, 1, 2) + + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "wmma.matrix_a") + B_mat = sch.cache_read(block_outer, 1, "wmma.matrix_b") + sch.compute_at(A_mat, k1) + sch.compute_at(B_mat, k1) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + accumulator_shared_to_global = sch.cache_write(block_outer, 0, "shared.dyn") + sch.storage_align(accumulator_shared_to_global, 0, -2, 16, 4) + + store = sch.cache_write(block_outer, 0, "wmma.accumulator") + sch.reverse_compute_at(store, thread_idy) + sch.reverse_compute_at(accumulator_shared_to_global, thread_idy) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + intrin_group = get_wmma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype="float16", + out_dtype="float32", + trans_b=True, + ) + + try: + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_b"]) + except: # pylint: 
disable=bare-except + return None + + # Try to tensorize the init, store and compute block with f16 or f32 intrinsics + tensorize_success: bool = False + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + try: + tensorize_init_store_compute() + tensorize_success = True + except: # pylint: disable=bare-except + intrin_group = get_wmma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype="float16", + out_dtype="float16", + trans_b=True, + ) + + if not tensorize_success: + try: + tensorize_init_store_compute() + tensorize_success = True + except: # pylint: disable=bare-except + return None + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-2:]) + _, f1, f2 = sch.split(fused, factors=[None, warp_size, vector_size]) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + + return sch if tensorize_success else None + + def apply_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config, + ) -> Optional[tir.Schedule]: + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_wmma_intrin_group, + ) + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + + # Start Schedule + # Step 0. Get schedule config. + # NOTE: we can analyze the config by the hardware spec in the future + + # tensor core intrinsic size + intrin_info = config.intrin_info + warp_row_tiles = config.warp[0] + warp_col_tiles = config.warp[1] + block_row_warps = config.block[0] // warp_row_tiles + block_col_warps = config.block[1] // warp_col_tiles + stage = config.pipeline_stage + use_async = config.use_async + chunk = config.rstep[0] + + micro_size_x = 16 + micro_size_y = 16 + micro_size_k = 16 + + warp_size = 32 + + i_factors, j_factors, k_factors = ( + [None, 1, block_row_warps, warp_row_tiles // micro_size_x], + [1, None, block_col_warps, warp_col_tiles // micro_size_y], + [None, chunk // micro_size_k], + ) + + num_ty = i_factors[2] * j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] + if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. 
Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + + sch.reorder(i0, j0, i1, j1, j2, i2, k0, k1, i3, j3) + + block_idx = sch.fuse(i0, j0) + block_idy = sch.fuse(i1, j1) + thread_idy = sch.fuse(j2, i2) + # plan rasteration + if ( + not isinstance(config.rasterization_plan, NoRasterization) + and sch.get(batch).extent.value == 1 + ): + device_func, invoke_func = config.rasterization_plan.get_code() + factor = config.rasterization_plan.panel_width_ + + # TODO(lei): this is a trick for rasterization implementation + # wait for https://github.com/apache/tvm/pull/16113 to be merged + # require a solution for general block rasterization + factor = 8 # should be divisible by block_idy + if sch.get(block_idy).extent.value % factor == 0: + block_k, block_idy = sch.split(block_idy, factors=[None, factor]) + sch.bind(block_k, "blockIdx.z") + else: + sch.bind(batch, "blockIdx.z") + + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + + def fetch_to_shared(block, idx, ndim, vec_len, dtype="float16"): + block_read = sch.cache_read(block, idx, "shared.dyn") + sch.compute_at(block_read, k0) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + _, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vec_len]) + + sch.bind(f_2, "threadIdx.x") + sch.bind(f_1, "threadIdx.y") + sch.vectorize(f_3) + offset: int = 0 + if dtype == "float16": + offset = 8 + elif dtype == "int8": + offset = 16 + # todo(lei): the pad value should be varied according to the data type + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=offset) + return block_read + + a_g2s = fetch_to_shared( + block_outer, + 0, + 2, + vec_len=list(config.vectorize.values())[0], + dtype=intrin_info.in_dtype, + ) + b_g2s = fetch_to_shared( + block_outer, + 1, + 2, + vec_len=list(config.vectorize.values())[1], + dtype=intrin_info.in_dtype, + ) + + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "wmma.matrix_a") + B_mat = sch.cache_read(block_outer, 1, "wmma.matrix_b") + sch.compute_at(A_mat, k1) + sch.compute_at(B_mat, k1) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + accumulator_shared_to_global = sch.cache_write(block_outer, 0, "shared.dyn") + sch.storage_align(accumulator_shared_to_global, 0, -2, 16, 4) + + store = sch.cache_write(block_outer, 0, "wmma.accumulator") + sch.reverse_compute_at(store, thread_idy) + sch.reverse_compute_at(accumulator_shared_to_global, thread_idy) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + intrin_group = 
get_wmma_intrin_group( + load_scope="shared.dyn", + store_scope="shared.dyn", + in_dtype=intrin_info.in_dtype, + out_dtype=intrin_info.out_dtype, + trans_b=intrin_info.trans_b, + ) + + try: + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, 16]) + j0, j1 = sch.split(j, factors=[None, 16]) + sch.reorder(i0, j0, i1, j1) + sch.unroll(i0) + sch.unroll(j0) + sch.tensorize(i1, intrin_group["load_b"]) + except: # pylint: disable=bare-except + return None + + # Try to tensorize the init, store and compute block with f16 or f32 intrinsics + tensorize_success: bool = False + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + try: + tensorize_init_store_compute() + tensorize_success = True + except: # pylint: disable=bare-except + return None + + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-2:]) + _, f1, f2 = sch.split( + fused, factors=[None, warp_size, max(list(config.vectorize.values()))] + ) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + + if stage > 1: + sch.annotate(k0, ann_key="software_pipeline_stage", ann_val=[0, 0, stage - 1]) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) + if use_async: + sch.annotate(k0, "software_pipeline_async_stages", [0]) + + return sch if tensorize_success else None + diff --git a/python/bitblas/gpu/reduction.py b/python/bitblas/gpu/reduction.py new file mode 100644 index 0000000000..76fb0e07f2 --- /dev/null +++ b/python/bitblas/gpu/reduction.py @@ -0,0 +1,298 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A rule for reduction. """ +# TODO: combine reduction rule and general reduction rule into one file. +from typing import List, Optional, Tuple, Union + +from tvm import arith, ir, tir +from tvm.target import Target + +from ..base import ( + BlockInfo, + normalize_prim_func, + try_inline_contiguous_spatial, + detect_dominant_read, + is_broadcast_epilogue, +) +from . import utils +from .base import GPUScheduleRule + + +def _get_reduction_expr(block: tir.Block) -> Optional[tir.PrimExpr]: + # Detect and return `Y` in `X[...] = X[...] 
+ Y` + buffer_store = block.body + if not isinstance(buffer_store, tir.BufferStore): + return None + if not isinstance(buffer_store.value, tir.Add): + return None + if not ir.structural_equal( + buffer_store.value.a, + tir.BufferLoad(buffer_store.buffer, block.body.indices), + map_free_vars=True, + ): + return None + return buffer_store.value.b + + +class Reduction(GPUScheduleRule): + """A rule for Reduction.""" + + def apply( # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + sch = tir.Schedule(func) + block_infos = normalize_prim_func(sch) + if block_infos is None: + return None + block_infos = try_inline_contiguous_spatial(sch, block_infos) + if len(block_infos) == 1: + epilogue = None + elif len(block_infos) == 2: + epilogue = block_infos[1] + if not epilogue.is_injective(): + return None + else: + return None + + block_info = block_infos[0] + block = block_info.block_rv + block_stmt = sch.get(block) + + # Step 1. Check reduction block + if ( + (not block_info.is_reduction()) + or len(block_stmt.writes) != 1 + or _get_reduction_expr(block_stmt) is None + ): + return None + # Step 2. Normalize the block, merge spatial and reduction iters + is_inner_reduction, c_factor, loop_order, s_split_index = self._normalize( + sch, + block_info, + arith.normalize_to_iter_sum( + detect_dominant_read(block_stmt), + input_iters={i.var: i.dom for i in block_stmt.iter_vars}, + ), + ) + if is_inner_reduction is None and c_factor is None: + return None + # Step 3. Do the scheduling + if is_inner_reduction: + self._sch_inner_reduction( + sch, target, block, c_factor, epilogue, loop_order, s_split_index + ) + else: + self._sch_inner_spatial( + sch, target, block, block_info, c_factor, epilogue, loop_order, s_split_index + ) + return sch + + def _normalize( # pylint: disable=too-many-branches + self, + sch: tir.Schedule, + block_info: BlockInfo, + access: arith.IterSumExpr, + ) -> Tuple[Optional[bool], Optional[int]]: + if access.base != 0: + return None, None, None, None + iter_to_info = {i.var: i for i in block_info.iters} + s_loops, r_loops, c_loops, c_factor = [], [], [], None + s_split_loop, s_split_index = None, None + for split_expr in access.args: + var = split_expr.source.source + info = iter_to_info.pop(var) + loop = info.loop_rv + is_inner_reduction = info.kind == "R" + if split_expr.lower_factor > 1: + if c_loops: + return None, None, None, None + s_split_loop = loop + s_split_index = len(s_loops) + loop, c_loop = sch.split(loop, factors=[None, split_expr.lower_factor]) + c_loops.append(c_loop) + if not is_inner_reduction: + c_factor = split_expr.lower_factor + if is_inner_reduction: + r_loops.append(loop) + else: + s_loops.append(loop) + + if iter_to_info: + for var, info in iter_to_info.items(): + if info.kind == "S" and info.dom.extent == 1: + s_loops.append(info.loop_rv) + else: + return None, None, None, None + + loop_order = {} + s_block_var_loops = [] + for i in block_info.iters: + if i.loop_rv in s_loops or i.loop_rv == s_split_loop: + s_block_var_loops.append(i.loop_rv) + + for i in range(len(s_block_var_loops)): + for j in range(len(s_loops)): + if s_block_var_loops[i] == s_loops[j]: + loop_order[i] = j + break + if s_block_var_loops[i] == s_split_loop: + loop_order[i] = s_split_index + break + + assert s_loops + assert r_loops + if len(s_loops) != 
len([i for i in block_info.iters if i.kind == "S"]):
+            return None, None, None, None
+        if not c_loops:
+            c_loops = [sch.add_unit_loop(block_info.block_rv)]
+        sch.reorder(*s_loops, *r_loops, *c_loops)
+        sch.fuse(*s_loops)
+        sch.fuse(*r_loops)
+        return is_inner_reduction, c_factor, loop_order, s_split_index
+
+    def _sch_inner_reduction(  # pylint: disable=too-many-arguments
+        self,
+        sch: tir.Schedule,
+        target: Target,
+        block: tir.schedule.BlockRV,
+        unroll_spatial_factor: Optional[int],
+        epilogue_info: Optional[BlockInfo],
+        loop_order,
+        s_split_index,
+    ):
+        # pylint: disable=invalid-name
+        _, r, _ = sch.get_loops(block)
+        (len_tx,) = utils.suggest_threads_per_block(  # pylint: disable=unbalanced-tuple-unpacking
+            target, [sch.get(r)]
+        )
+
+        _, tx = sch.split(r, factors=[None, len_tx])
+        # Schedule the RF block
+        rf = sch.rfactor(tx, 0)
+        bx, r, tx, _ = sch.get_loops(rf)
+        sch.reorder(bx, tx, r)
+        sch.bind(bx, "blockIdx.x")
+        sch.bind(tx, "threadIdx.x")
+        sch.annotate(tx, ann_key="pragma_auto_unroll_max_step", ann_val=256)
+        sch.annotate(tx, ann_key="pragma_unroll_explicit", ann_val=1)
+        sch.set_scope(rf, 0, "local")
+        sch.decompose_reduction(rf, r)
+        # Schedule the write back block
+        sch.reverse_compute_at(block, bx, preserve_unit_loops=True)
+        _, tx, *s = sch.get_loops(block)
+
+        if unroll_spatial_factor:
+            assert len(s) == len(loop_order)
+            new_order_s = [s[loop_order[i]] for i in range(len(s))]
+            sch.reorder(*new_order_s)
+            new_order_s[s_split_index], c = sch.split(
+                new_order_s[s_split_index], factors=[None, unroll_spatial_factor]
+            )
+            sch.reorder(*new_order_s, c)
+            s = sch.fuse(*new_order_s)
+            sch.reorder(s, tx, c)
+        else:
+            s = sch.fuse(*s)
+            sch.reorder(s, tx)
+        sch.bind(tx, "threadIdx.x")
+        # Schedule epilogue
+        if epilogue_info is not None:
+            epilogue = epilogue_info.block_rv
+            sch.reverse_compute_at(epilogue, bx)
+            if is_broadcast_epilogue(sch, block, epilogue):
+                sch.set_scope(block, 0, "shared")
+                _, *s = sch.get_loops(epilogue)  # pylint: disable=invalid-name
+                _, tx = sch.split(sch.fuse(*s), factors=[None, len_tx])
+                sch.bind(tx, "threadIdx.x")
+            else:
+                sch.set_scope(block, 0, "local")
+        # pylint: enable=invalid-name
+
+    def _sch_inner_spatial(
+        self,
+        sch: tir.Schedule,
+        _: Target,
+        block: tir.schedule.BlockRV,
+        block_info: BlockInfo,
+        unroll_spatial_factor: Optional[int],
+        epilogue_info: Optional[BlockInfo],
+        loop_order,
+        s_split_index,
+    ):
+        # pylint: disable=invalid-name
+        s, r, _ = sch.get_loops(block)
+        len_tx, len_ty = 16, 16
+        s_factor = [i.dom.extent for i in block_info.iters if i.kind == "S"][-1]
+        # Pick a suitable spatial factor: it should divide the innermost spatial loop so that
+        # the block produced by rfactor can be reverse-compute-at back to the original scope.
+        while len_tx > 1:
+            if s_factor % len_tx == 0:
+                break
+            len_tx -= 1
+        _, _ = sch.split(s, factors=[None, len_tx])
+        _, ty = sch.split(r, factors=[None, len_ty])
+        # Schedule the RF block
+        rf = sch.rfactor(ty, 0)
+        bx, tx, r, ty, _ = sch.get_loops(rf)
+        sch.reorder(bx, tx, ty, r)
+        sch.bind(tx, "threadIdx.x")
+        sch.bind(ty, "threadIdx.y")
+        sch.bind(bx, "blockIdx.x")
+        sch.set_scope(rf, 0, "local")
+        sch.decompose_reduction(rf, r)
+        # Schedule the write back block
+        sch.reverse_compute_at(block, bx, preserve_unit_loops=True)
+        _, r, *s = sch.get_loops(block)
+        if unroll_spatial_factor:
+            assert len(s) == len(loop_order)
+            new_order_s = [s[loop_order[i]] for i in range(len(s))]
+            sch.reorder(*new_order_s)
+            new_order_s[s_split_index], c = sch.split(
+                new_order_s[s_split_index], factors=[None, 
unroll_spatial_factor] + ) + sch.reorder(*new_order_s, c) + s = sch.fuse(*new_order_s) + sch.reorder(s, c, r) + else: + s = sch.fuse(*s) + sch.reorder(s, r) + sch.bind(s, "threadIdx.x") + sch.bind(r, "threadIdx.y") + + # Schedule epilogue + if epilogue_info is not None: + epilogue = epilogue_info.block_rv + sch.reverse_compute_at(epilogue, bx) + if is_broadcast_epilogue(sch, block, epilogue): + sch.set_scope(block, 0, "shared") + _, *s = sch.get_loops(epilogue) # pylint: disable=invalid-name + _, tx, ty = sch.split(sch.fuse(*s), factors=[None, len_tx, len_ty]) + sch.bind(tx, "threadIdx.x") + sch.bind(ty, "threadIdx.y") + else: + # The epilogue is element-wise without broadcasting. + # Thus the remaining spatial part should be bind to tx. + sch.set_scope(block, 0, "local") + _, *s = sch.get_loops(epilogue) # pylint: disable=invalid-name + tx, _ = sch.split(sch.fuse(*s), factors=[len_tx, None]) + sch.bind(tx, "threadIdx.x") + # pylint: enable=invalid-name diff --git a/python/bitblas/gpu/rmsnorm.py b/python/bitblas/gpu/rmsnorm.py new file mode 100644 index 0000000000..f8b2bb4a17 --- /dev/null +++ b/python/bitblas/gpu/rmsnorm.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
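+#
+# Overview (inferred from the schedule below): the rule caches the input and
+# output in local scope, inlines any cast/load blocks, and then expects the
+# root to have exactly six child blocks (read, sqr, redsum, rsqrt, norm and
+# write), which it schedules with a fixed launch configuration of 512 threads
+# per block on CUDA (64 otherwise) and a vector length of 8; these factors
+# are heuristics rather than tuned values.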
+# pylint: disable=missing-docstring +"""A RMS norm schedule rule for GPU operators.""" + +import tvm +from tvm import tir +from tvm.tir import Block, BufferStore +from tvm.tir.expr import Cast, BufferLoad, Call +from tvm.target import Target + +from ..base import ScheduleRule + + +def identify_cast_or_load_block(block: Block) -> bool: + if len(block.reads) != 1 or len(block.writes) != 1: + return False + + if not isinstance(block.body, BufferStore): + return False + store = block.body + + # check types + if isinstance(store.value, BufferLoad): + load = store.value + elif isinstance(store.value, Cast): + load = store.value.value + if not isinstance(load, BufferLoad): + return False + else: + return False + + # check indices + if len(load.indices) != len(store.indices): + return False + + for lhs, rhs in zip(load.indices, store.indices): + if not lhs.same_as(rhs): + return False + + return True + + +def identify_rsqrt_block(block: Block) -> bool: + if len(block.reads) != 1 or len(block.writes) != 1: + return False + + if not isinstance(block.body, BufferStore): + return False + store = block.body + + if not isinstance(store.value, Call): + return False + call = store.value + op = call.op + + return op == tvm.ir.op.Op.get("tir.rsqrt") + + +class RMSNorm(ScheduleRule): + """A rule for RMS norm.""" + + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> tir.Schedule: + if target.kind.name == "cuda": + num_tx = 512 + else: + num_tx = 64 + + sch = tir.Schedule(func) + root = sch.get_block(name="root", func_name="main") + + blocks = sch.get_child_blocks(root) + + if not any([identify_rsqrt_block(sch.get(block)) for block in blocks]): + return None + + read = sch.cache_read(block=blocks[0], read_buffer_index=0, storage_scope="local") + write = sch.cache_write(block=blocks[-1], write_buffer_index=0, storage_scope="local") + + for block in blocks: + if identify_cast_or_load_block(sch.get(block)): + sch.compute_inline(block) + + blocks = sch.get_child_blocks(root) + + read, sqr, redsum, rsqrt, norm, write = blocks + + if not identify_rsqrt_block(sch.get(rsqrt)): + return None + + for name in [read, sqr, redsum, rsqrt, norm, write]: + loops = sch.get_loops(name) + sch.fuse(*loops[:-1]) + + block_loop, loops = sch.get_loops(block=read) + thread_loop, _, _ = sch.split( + loop=loops, factors=[num_tx, None, 8], preserve_unit_iters=True + ) + sch.bind(block_loop, thread_axis="blockIdx.x") + sch.bind(thread_loop, thread_axis="threadIdx.x") + sch.vectorize(sch.get_loops(block=read)[-1]) + sch.reverse_compute_at(block=sqr, loop=thread_loop) + sch.reverse_compute_at(block=redsum, loop=thread_loop) + + sch.reverse_compute_at(block=rsqrt, loop=block_loop, index=-1) + sch.reverse_compute_at(block=norm, loop=block_loop, index=-1) + block_loop, loops = sch.get_loops(block=norm) + thread_loop, _, _ = sch.split( + loop=loops, factors=[num_tx, None, 8], preserve_unit_iters=True + ) + sch.bind(thread_loop, thread_axis="threadIdx.x") + + sch.reverse_compute_at(block=write, loop=thread_loop, index=-1) + sch.vectorize(sch.get_loops(block=write)[-1]) + + sch.set_scope(block=sqr, buffer_index=0, storage_scope="local") + sch.set_scope(block=redsum, buffer_index=0, storage_scope="local") + sch.set_scope(block=rsqrt, buffer_index=0, storage_scope="shared") + sch.set_scope(block=norm, buffer_index=0, storage_scope="local") + + return sch diff --git a/python/bitblas/gpu/transpose.py b/python/bitblas/gpu/transpose.py new file mode 100644 index 
0000000000..f1e19ff364 --- /dev/null +++ b/python/bitblas/gpu/transpose.py @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Reduction rule for operators including softmax, layer norm, RMS norm, etc""" +from typing import List, Union + +from tvm import arith, tir +from tvm.target import Target +from tvm.tir import Schedule +from tvm.tir.schedule import BlockRV + +from ..base import ( + detect_dominant_read, + normalize_prim_func, + try_inline_contiguous_spatial, +) +from .base import GPUScheduleRule + + +class Transpose(GPUScheduleRule): + """Schedule rule for transpose""" + + def is_transpose(self, sch: Schedule, block_rv: BlockRV): + block = sch.get(block_rv) + if isinstance(block.body, tir.BufferStore): + rhs = block.body.value + if isinstance(rhs, tir.BufferLoad): + lhs_indices = block.body.indices + rhs_indices = rhs.indices + if list(lhs_indices) != list(rhs_indices) and set(lhs_indices) == set(rhs_indices): + return True + return False + + def apply( # pylint: disable=too-many-locals + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + if target.kind.name == "cuda": + len_tx = 16 + len_ty = 8 + unroll_depth = 256 + else: + len_tx = 8 + len_ty = 4 + unroll_depth = 64 + len_vec = 4 + + sch = tir.Schedule(func) + blocks = normalize_prim_func(sch) + transpose_block_idx = -1 + for idx, block in reversed(list(enumerate(blocks))): + if self.is_transpose(sch, block.block_rv): + transpose_block_idx = idx + break + if not block.is_injective(): + return None + if transpose_block_idx == -1: + return None + transpose_block = blocks[transpose_block_idx].block_rv + + prologue = None # the optional decoding block + if transpose_block_idx > 0: + spatials = try_inline_contiguous_spatial(sch, blocks[: transpose_block_idx - 1]) + assert len(spatials) == 0 + prologue = blocks[transpose_block_idx - 1].block_rv + + loops = sch.get_loops(transpose_block) + if len(loops) != 2: + # transpose with more than 2 axes is not supported + return None + + c_factor = 1 + if prologue is not None: + block_stmt = sch.get(prologue) + result = arith.normalize_to_iter_sum( + detect_dominant_read(block_stmt), + input_iters={i.var: i.dom.extent for i in block_stmt.iter_vars}, + ) + if len(result.args) > 0: + c_factor = int(result.args[0].lower_factor) + + i, j = loops + i, vi = sch.split(i, factors=[None, c_factor], preserve_unit_iters=True) + bi, ti = sch.split(i, factors=[None, len_ty], preserve_unit_iters=True) + bj, tj = sch.split(j, factors=[None, len_tx], preserve_unit_iters=True) + sch.reorder(bi, bj, ti, tj, vi) + sch.bind(bi, "blockIdx.y") + sch.bind(bj, "blockIdx.x") + 
sch.bind(ti, "threadIdx.y") + sch.bind(tj, "threadIdx.x") + len_vec = min(len_vec, c_factor) + _, vi = sch.split(vi, factors=[None, len_vec]) + if len_vec > 1: + sch.vectorize(vi) + + cache_read = sch.cache_read(transpose_block, read_buffer_index=0, storage_scope="shared") + sch.compute_at(cache_read, bj) + loops = sch.get_loops(cache_read)[2:] + fused = sch.fuse(*loops) + _, ty, tx, v = sch.split(fused, factors=[None, len_ty, len_tx, c_factor]) + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + sch.unroll(v) + sch.storage_align(block=cache_read, buffer_index=0, axis=0, factor=32, offset=1) + + sch.annotate(bi, ann_key="pragma_auto_unroll_max_step", ann_val=unroll_depth) + sch.annotate(bi, ann_key="pragma_unroll_explicit", ann_val=1) + + if prologue is not None: + sch.compute_inline(prologue) + return sch diff --git a/python/bitblas/gpu/utils.py b/python/bitblas/gpu/utils.py new file mode 100644 index 0000000000..d03d876595 --- /dev/null +++ b/python/bitblas/gpu/utils.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +"""Utility methods for generic GPU.""" +from typing import List, Optional + +from tvm import tir +from tvm.target import Target + + +def max_threads_per_block(target: Target) -> int: + """Get the maximum number of threads per block for a given target. + + Parameters + ---------- + target : Target + The target to get the maximum number of threads per block for. + + Returns + ------- + max_threads_per_block : int + The maximum number of threads per block for the given target. 
+ """ + for name in ["max_threads_per_block", "max_num_threads"]: + result = target.attrs.get(name, None) + if result is not None: + return result + if target.kind.name == "cuda": + return 1024 + return 256 + + +def suggest_threads_per_block( + target: Target, + loops: List[tir.For], + max_threads_for_dynamic_loop: int = 32, +) -> List[int]: + if target.kind.name == "cuda": + threads = 1024 + elif target.kind.name == "rocm": + threads = 256 + elif target.kind.name == "metal": + threads = 256 + else: + threads = 64 + results: List[Optional[int]] = [] + dynamic: List[int] = [] + for i, loop in enumerate(loops): + loop_extent = loop.extent + if isinstance(loop_extent, tir.IntImm): + loop_extent = loop_extent.value + extent = 1 + while extent <= loop_extent and extent <= threads: + extent *= 2 + extent //= 2 + assert extent >= 1 + assert threads % extent == 0 + threads //= extent + results.append(extent) + else: + results.append(None) + dynamic.append(i) + + for i in dynamic: + extent = 1 + while extent <= max_threads_for_dynamic_loop and extent <= threads: + extent *= 2 + extent //= 2 + assert extent >= 1 + assert threads % extent == 0 + threads //= extent + results[i] = extent + + if dynamic: + results[dynamic[0]] *= threads + + return results + + +def get_sm_version(target: Target) -> int: + if target.kind.name != "cuda": + return -1 + arch = target.arch + sm_version = arch.replace("sm_", "") + return int(sm_version) if sm_version.isdigit() else -1 From d08f67ade9bb27116c07e7eab9d0c50245f7d023 Mon Sep 17 00:00:00 2001 From: LeiWang Date: Tue, 6 Feb 2024 10:54:26 -0400 Subject: [PATCH 003/286] matmul ops --- python/bitblas/ops/__init__.py | 1 + python/bitblas/ops/gemv_impl.py | 71 +++++ python/bitblas/ops/matmul.py | 94 +++++++ python/bitblas/ops/matmul_impl.py | 438 ++++++++++++++++++++++++++++++ python/bitblas/ops/operator.py | 180 ++++++++++++ 5 files changed, 784 insertions(+) create mode 100644 python/bitblas/ops/__init__.py create mode 100644 python/bitblas/ops/gemv_impl.py create mode 100644 python/bitblas/ops/matmul.py create mode 100644 python/bitblas/ops/matmul_impl.py create mode 100644 python/bitblas/ops/operator.py diff --git a/python/bitblas/ops/__init__.py b/python/bitblas/ops/__init__.py new file mode 100644 index 0000000000..f370c84daa --- /dev/null +++ b/python/bitblas/ops/__init__.py @@ -0,0 +1 @@ +from .matmul import Matmul \ No newline at end of file diff --git a/python/bitblas/ops/gemv_impl.py b/python/bitblas/ops/gemv_impl.py new file mode 100644 index 0000000000..063d857984 --- /dev/null +++ b/python/bitblas/ops/gemv_impl.py @@ -0,0 +1,71 @@ +# pre-transformed tir expression of gemv +import tvm +from tvm.script import tir as T +from tvm import te + + +def gemv_i4(M, N, K, dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int_to_float( + nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str + ): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) + + A = te.placeholder((M, K), name="A", dtype=dtype) + B = te.placeholder((N, K // 8 * bit), name="B", dtype="int8") + + def decode_func(n, k): + w = _tir_u8_to_int_to_float( + bit, B[n, k // n_float_per_i8], k % n_float_per_i8, dtype=dtype + ) + return w + + B_decode = te.compute((N, K), decode_func, name="B_decode") + + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name="k") + C = te.compute( + (M, N), lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), name="C" + 
) + func = te.create_prim_func([A, B, C]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "fast_decoding": True, + "source_format": { + "bits": 4, + "format": "int", + }, + "target_format": { + "bits": 16, + "format": "float", + }, + } + }, + ) + return tvm.IRModule.from_expr(func) + + +def gemv(M, N, K, dtype="float16"): + @tvm.script.ir_module + class GEMV: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [M, K], dtype=dtype) + B = T.match_buffer(b, [N, K], dtype=dtype) + C = T.match_buffer(c, [M, N], dtype=dtype) + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + return GEMV diff --git a/python/bitblas/ops/matmul.py b/python/bitblas/ops/matmul.py new file mode 100644 index 0000000000..01cb133c45 --- /dev/null +++ b/python/bitblas/ops/matmul.py @@ -0,0 +1,94 @@ +import tvm +from tvm.target import Target +from bitblas.base.roller.arch.arch_base import Arch +from bitblas.base.roller.arch.cuda import CUDA +from bitblas.base.utils import fast_tune, fast_tune_with_dynamic_range +from typing import List, Dict + +from .operator import Operator +from .matmul_impl import matmul_impl_factory +from ..base.utils import match_global_kernel, get_rasterization_code + + +class Matmul(Operator): + def __init__( + self, + M, + N, + K, + a_dtype="float16", + b_dtype="float16", + c_dtype="float16", + propagate_a=False, + propagate_b=False, + layout="nt", + name="matmul", + target: Target = tvm.target.Target("cuda"), + ): + super().__init__(name) + + if target.kind.name != "cuda": + raise ValueError("Currently only support cuda target") + self.arch = CUDA(target) + assert propagate_a is False, "Currently only support propagate_a=False" + + self.M = M + self.N = N + self.K = K + self.a_dtype = a_dtype + self.b_dtype = b_dtype + self.c_dtype = c_dtype + self.propagate_a = propagate_a + self.propagate_b = propagate_b + self.layout = layout + self.prim_func_mod = self._select_impl() + self.optimized_func = self._optimize_default(self.prim_func_mod, target) + if isinstance(M, List): + self.dynamic_range = {"m": M} + self.prim_func_mod["main"] = self.prim_func_mod["main"].with_attrs( + {"opt_shapes": self.dynamic_range} + ) + else: + self.dynamic_range = None + self.target = target + + def _select_impl(self): + _impl_key = f"matmul_{self.layout}" + if isinstance(self.M, List): + _impl_key += "_dyn_m" + if self.propagate_a: + _impl_key += "_pa" + if self.propagate_b: + _impl_key += "_pb" + if isinstance(self.M, int): + args = (self.M, self.N, self.K, self.a_dtype, self.b_dtype) + else: + args = (self.N, self.K, self.a_dtype, self.b_dtype) + impl_handler = matmul_impl_factory[_impl_key] + return impl_handler(*args) + + def optimize(self, topk: int = 20): + dynamic_range = self.dynamic_range + if dynamic_range is not None: + self.optimized_func = self._optimize_fast_tune_with_dynamic_range( + self.prim_func_mod["main"], self.target, topk, dynamic_range + ) + else: + self.optimized_func = self._optimize_fast_tune( + self.prim_func_mod["main"], self.target, topk + ) + + def post_process(self, code: str) -> str: + index = code.index("{", match_global_kernel(code)) + # some tricky judge to decide whether to insert rasterization code + if self.N * self.K > 10**6: + rasterization_code = get_rasterization_code(10) + code = code[: index + 2] + 
rasterization_code + code[index + 2 :] + return code + + def forward(self, a, b, c): + adapater_a = self._tensor_adapter(a, self.arch.device) + adapater_b = self._tensor_adapter(b, self.arch.device) + adapater_c = self._tensor_adapter(c, self.arch.device) + self.rt_mod(adapater_a, adapater_b, adapater_c) + return adapater_c diff --git a/python/bitblas/ops/matmul_impl.py b/python/bitblas/ops/matmul_impl.py new file mode 100644 index 0000000000..230e8e6cea --- /dev/null +++ b/python/bitblas/ops/matmul_impl.py @@ -0,0 +1,438 @@ +# pre-transformed tir expression of matmul +import tvm +from tvm.script import tir as T +from tvm import te + + +def matmul_nt_dyn_m(N, K, in_dtype="float16", out_dtype="float16"): + @tvm.script.ir_module + class MatmulNT: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + m = T.int32() + A = T.match_buffer(a, [m, K], dtype=in_dtype) + B = T.match_buffer(b, [N, K], dtype=in_dtype) + C = T.match_buffer(c, [m, N], dtype=out_dtype) + + for i, j, k in T.grid(m, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B[ + vj, vk + ].astype(out_dtype) + + return MatmulNT + + +def matmul_nn_dyn_m(N, K, in_dtype="float16", out_dtype="float16"): + @tvm.script.ir_module + class MatmulNN: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + m = T.int32() + A = T.match_buffer(a, [m, K], dtype=in_dtype) + B = T.match_buffer(b, [K, N], dtype=in_dtype) + C = T.match_buffer(c, [m, N], dtype=out_dtype) + + for i, j, k in T.grid(m, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B[ + vk, vj + ].astype(out_dtype) + + return MatmulNN + + +def matmul_nn(M, N, K, in_dtype="float16", out_dtype="float16"): + @tvm.script.ir_module + class MatmulNN: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [M, K], dtype=in_dtype) + B = T.match_buffer(b, [K, N], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B[ + vk, vj + ].astype(out_dtype) + + return MatmulNN + +def matmul_nt(M, N, K, in_dtype="float16", out_dtype="float16"): + @tvm.script.ir_module + class MatmulNT: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [M, K], dtype=in_dtype) + B = T.match_buffer(b, [N, K], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B[ + vj, vk + ].astype(out_dtype) + + return MatmulNT + +def matmul_nt_propagate_b_f16_f16_mma(M, N, K, in_dtype="float16", out_dtype="float16"): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: 
T.handle, b: T.handle, c: T.handle): + T.func_attr( + {"global_symbol": "main", "tir.noalias": True, "smooth_b": True} + ) + A = T.match_buffer(a, [M, K], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[ + vj // wn, + vk // wk, + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wn // 8, + vj % 8 // 4 * 8 + vk % 8, + ] + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B_reindex[ + vj, vk + ].astype(out_dtype) + + return MyModule + +def matmul_nt_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr( + { + "global_symbol": "main", + "tir.noalias": True, + "smooth_a": True, + "smooth_b": True, + } + ) + A = T.match_buffer(a, [M // wm, K // wk, wm, wk], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + A_reindex = T.alloc_buffer([M, K], dtype=in_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + + for i, k in T.grid(M, K): + with T.block("A_reindex"): + vj, vk = T.axis.remap("SS", [i, k]) + A_reindex[vj, vk] = A[vj // wm, vk // wk, vj % wm, vk % wk] + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[vj // wn, vk // wk, vj % wn, vk % wk] + + for i, j, k in T.grid(M, N, K): + with T.block("C"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A_reindex[vi, vk].astype( + out_dtype + ) * B_reindex[vj, vk].astype(out_dtype) + + return MyModule + + +def matmul_nt_i4(M, N, K, in_dtype="float16", out_dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) + + A = te.placeholder((M, K), name='A', dtype=in_dtype) + B = te.placeholder((N, K // 8 * bit), name='B', dtype='int8') + + def decode_func(n, k): + w = _tir_u8_to_int_to_float(bit, B[n, k // n_float_per_i8], k % n_float_per_i8, dtype=in_dtype) + return w + + B_decode = te.compute( + (N, K), + decode_func, + name='B_decode' + ) + + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), + name='C' + ) + func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { + 'B': { + 'decode_block': 'B_decode', + 'fast_decoding': True, + 'source_format':{ + 'bits': 4, + 'format': 'int', + }, + 'target_format': "float16" + } + }) + return tvm.IRModule.from_expr(func) + + +def matmul_nt_i4_propagate_b(M, N, K, in_dtype="float16", out_dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + assert val.dtype == "int8" + mask = tvm.tir.const((1 
<< nbit) - 1, "int8") + return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) + + A = te.placeholder((M, K), name='A', dtype=in_dtype) + B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') + + def decode_func(n, k, nn, kk): + w = _tir_u8_to_int_to_float(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype) + return w + + B_decode = te.compute( + (N // 16, K // 16, 16, 16), + decode_func, + name='B_decode' + ) + + B_reindex = te.compute( + (N, K), + lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], + name="B_reindex" + ) + + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A[i, k] * B_reindex[j, k], axis=k), + name='C' + ) + func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { + 'B': { + 'decode_block': 'B_decode', + 'fast_decoding': True, + 'source_format':{ + 'bits': 4, + 'format': 'int', + }, + 'target_format': "float16" + } + }) + func = func.with_attr("smooth_b", True) + + return tvm.IRModule.from_expr(func) + + +def matmul_nt_i4_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) + + A = te.placeholder((M // 16, K // 16, 16, 16), name='A', dtype=in_dtype) + B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') + + def decode_func(n, k, nn, kk): + w = _tir_u8_to_int_to_float(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype) + return w + + B_decode = te.compute( + (N // 16, K // 16, 16, 16), + decode_func, + name='B_decode' + ) + + B_reindex = te.compute( + (N, K), + lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], + name="B_reindex" + ) + + A_reindex = te.compute( + (M, K), + lambda i, j: A[i // 16, j // 16, i % 16, j % 16], + name="A_reindex" + ) + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A_reindex[i, k] * B_reindex[j, k], axis=k), + name='C' + ) + func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { + 'B': { + 'decode_block': 'B_decode', + 'fast_decoding': True, + 'source_format':{ + 'bits': 4, + 'format': 'int', + }, + 'target_format': "float16" + } + }) + func = func.with_attr("smooth_a", True) + func = func.with_attr("smooth_b", True) + + return tvm.IRModule.from_expr(func) + + +def matmul_nt_af4(M, N, K, in_dtype="float16", out_dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return (val >> (pos * nbit).astype("int8")) & mask + + A = te.placeholder((M, K), name='A', dtype=in_dtype) + B = te.placeholder((N, K // 8 * bit), name='B', dtype='int8') + LUT = te.placeholder((1 << bit, ), name='LUT', dtype='float16') + + + def decode_func(n, k): + w = _tir_u8_to_int(bit, B[n, k // n_float_per_i8], k % n_float_per_i8) + return LUT[w] + + B_decode = te.compute( + (N, K), + decode_func, + name='B_decode' + ) + + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), + name='C' + ) + func 
= te.create_prim_func([A, B, LUT, C]).with_attr("dequantize_info", { + 'B': { + 'decode_block': 'B_decode', + 'source_format':{ + 'bits': 4, + 'format': 'af', + }, + 'target_format': "float16" + } + }) + return tvm.IRModule.from_expr(func) + +def matmul_nt_af4_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): + bit = 4 + n_float_per_i8 = 8 // bit + + def _tir_u8_to_int(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr): + assert val.dtype == "int8" + mask = tvm.tir.const((1 << nbit) - 1, "int8") + return (val >> (pos * nbit).astype("int8")) & mask + + A = te.placeholder((M // 16, K // 16, 16, 16), name='A', dtype=in_dtype) + B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') + LUT = te.placeholder((1 << bit, ), name='LUT', dtype='float16') + + def decode_func(n, k, nn, kk): + w = _tir_u8_to_int(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8) + return LUT[w] + + B_decode = te.compute( + (N // 16, K // 16, 16, 16), + decode_func, + name='B_decode' + ) + + B_reindex = te.compute( + (N, K), + lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], + name="B_reindex" + ) + + A_reindex = te.compute( + (M, K), + lambda i, j: A[i // 16, j // 16, i % 16, j % 16], + name="A_reindex" + ) + # Describe the matrix multiplication in TE + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A_reindex[i, k] * B_reindex[j, k], axis=k), + name='C' + ) + func = te.create_prim_func([A, B, LUT, C]).with_attr("dequantize_info", { + 'B': { + 'decode_block': 'B_decode', + 'source_format':{ + 'bits': 4, + 'format': 'af', + }, + 'target_format': "float16" + } + }) + func = func.with_attr("smooth_a", True) + func = func.with_attr("smooth_b", True) + + return tvm.IRModule.from_expr(func) + + +# register the func +matmul_impl_factory = { + 'matmul_nt': matmul_nt, + 'matmul_nt_dyn_m': matmul_nt_dyn_m, + 'matmul_nn': matmul_nn, + 'matmul_nn_dyn_m': matmul_nn_dyn_m, + 'matmul_nt_propagate_b_f16_f16_mma': matmul_nt_propagate_b_f16_f16_mma, + 'matmul_nt_propagate_a_b': matmul_nt_propagate_a_b, + 'matmul_nt_propagate_a_b_f16_f16_mma': matmul_nt_propagate_a_b, +} \ No newline at end of file diff --git a/python/bitblas/ops/operator.py b/python/bitblas/ops/operator.py new file mode 100644 index 0000000000..2109d7b402 --- /dev/null +++ b/python/bitblas/ops/operator.py @@ -0,0 +1,180 @@ +from abc import ABC, abstractmethod +import tvm +from tvm import IRModule +from tvm.target import Target +from tvm.tir import PrimFunc +import bitblas +from typing import List, Dict +import numpy as np +from ..base import fast_tune, fast_tune_with_dynamic_range +from copy import deepcopy +import torch + +class Operator(ABC): + def __init__(self, name): + self.name = name + self.prim_func_mod = None + self.optimized_func = None + self.rt_mod = None + self.time_evaluator = None + self.profile_tensors = None + self.arch = None + + def codegen(self, target: Target) -> str: + if self.optimized_func: + with tvm.transform.PassContext(config={"tir.use_async_copy": True}): + rt_mod = tvm.build(self.optimized_func, target=target) + if rt_mod: + self.rt_mod = rt_mod + self.time_evaluator = rt_mod.time_evaluator( + rt_mod.entry_name, self.arch.device, number=10 + ) + return ( + self.post_process(rt_mod.imported_modules[0].get_source()) + if rt_mod + else None + ) + + def _optimize_default(self, func_mod: IRModule, target: Target) -> IRModule: + mod_for_opt = deepcopy(func_mod) + with target: + optimized_mod = bitblas.ApplyDefaultSchedule( # pylint: 
disable=not-callable + bitblas.gpu.Matmul(), + bitblas.gpu.GEMV(), + bitblas.gpu.Reduction(), + bitblas.gpu.GeneralReduction(), + bitblas.gpu.Fallback(), + )(mod_for_opt) + + if optimized_mod is not None: + return optimized_mod + return None + + def post_process(self, code: str) -> str: + return code + + def _optimize_fast_tune( + self, func: PrimFunc, target: Target, topk: int = 20 + ) -> IRModule: + _, best = fast_tune(func, target, topk=topk, parallel_build=True) + if best is not None: + return best.sch.mod + return None + + def _optimize_fast_tune_with_dynamic_range( + self, + func: PrimFunc, + target: Target, + topk: int = 20, + dynamic_range: Dict[str, List[int]] = None, + ): + optimized_mod = fast_tune_with_dynamic_range( + func, target, topk=topk, parallel_build=True, dynamic_range=dynamic_range + ) + if optimized_mod is not None: + return optimized_mod + return None + + def profile_latency(self) -> str: + if self.dynamic_range is not None: + return self._profile_latency_with_dynamic_range() + func = self.prim_func_mod["main"] + device = self.arch.device + + def var_warpper(v): + if isinstance(v, tvm.tir.Var): + assert "opt_shapes" in func.attrs + assert v.name in func.attrs["opt_shapes"] + return func.attrs["opt_shapes"][v.name].value + elif isinstance(v, tvm.tir.IntImm): + return v.value + else: + raise RuntimeError("Not supported type: ", type(v)) + + profile_tensors = [] + for param in func.params: + if param not in func.buffer_map: + # in case of dynamic symbolic may in params + continue + arg = func.buffer_map[param] + if arg.dtype == "int8": + profile_tensors.append( + tvm.nd.array( + np.random.randint( + -127, 128, [var_warpper(i) for i in arg.shape] + ).astype(arg.dtype), + device=device, + ) + ) + else: + profile_tensors.append( + tvm.nd.array( + np.random.uniform( + 0, 1, [var_warpper(i) for i in arg.shape] + ).astype(arg.dtype), + device=device, + ) + ) + self.profile_tensors = profile_tensors + latency = self.time_evaluator(*profile_tensors).mean * 1e3 + # ms + return latency + + def _profile_latency_with_dynamic_range(self) -> List: + func = self.prim_func_mod["main"] + device = self.arch.device + + def var_warpper(v, m): + if isinstance(v, tvm.tir.Var): + assert "opt_shapes" in func.attrs + assert v.name in func.attrs["opt_shapes"] + return m + elif isinstance(v, tvm.tir.IntImm): + return v.value + else: + raise RuntimeError("Not supported type: ", type(v)) + + benchmark_latencies = [] + for m in self.dynamic_range["m"]: + profile_tensors = [] + for param in func.params: + if param not in func.buffer_map: + # in case of dynamic symbolic may in params + continue + arg = func.buffer_map[param] + if arg.dtype == "int8": + profile_tensors.append( + tvm.nd.array( + np.random.randint( + -127, 128, [var_warpper(i, m) for i in arg.shape] + ).astype(arg.dtype), + device=device, + ) + ) + else: + profile_tensors.append( + tvm.nd.array( + np.random.uniform( + 0, 1, [var_warpper(i, m) for i in arg.shape] + ).astype(arg.dtype), + device=device, + ) + ) + self.profile_tensors = profile_tensors + latency = self.time_evaluator(*profile_tensors).mean * 1e3 + benchmark_latencies.append({ + "m": m, + "latency": latency + }) + # ms + return benchmark_latencies + + def _tensor_adapter(self, tensor, device): + if isinstance(tensor, tvm.te.Tensor): + return tensor + elif isinstance(tensor, torch.Tensor): + return tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(tensor)) + elif isinstance(tensor, np.ndarray): + return tvm.nd.array(tensor, device=device) + else: + raise RuntimeError("Not 
supported type: ", type(tensor)) From 595627ee42b77d5ef1c14d63fe50bd909b34c069 Mon Sep 17 00:00:00 2001 From: LeiWang Date: Tue, 6 Feb 2024 10:54:55 -0400 Subject: [PATCH 004/286] initial commit --- .gitignore | 59 +++++++ CODE_OF_CONDUCT.md | 9 + LICENSE | 21 +++ README.md | 0 SECURITY.md | 41 +++++ VERSION | 1 + python/bitblas/__init__.py | 29 ++++ python/bitblas/generator.py | 15 ++ python/bitblas_cli.py | 0 testing/operators/test_matmul_codegen.py | 162 ++++++++++++++++++ testing/type_conversion/int4b_fp16_convert.py | 80 +++++++++ 11 files changed, 417 insertions(+) create mode 100644 .gitignore create mode 100644 CODE_OF_CONDUCT.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 VERSION create mode 100644 python/bitblas/__init__.py create mode 100644 python/bitblas/generator.py create mode 100644 python/bitblas_cli.py create mode 100644 testing/operators/test_matmul_codegen.py create mode 100644 testing/type_conversion/int4b_fp16_convert.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..1a4c7500d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,59 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.obj +*.pyc + +# Precompiled Headers +*.gch +*.pch + +# emacs +*~ + +# vim +*.swp +*.swo + +build/ +dist/ +__pycache__ +nnfusion.tar.gz + +# makeenv and test intermediate files +tmp/ + +venv/ +.vscode/ +.vs/ + +# VisualGDB files +VisualGDB/ +toolchain.cmake + +# docbuild artifacts +doc/sphinx/build/* +doc/doxygen/*.xml +doc/doxygen/*.html +doc/doxygen/man/* +doc/doxygen/latex/* +doc/doxygen/xml/* +doc/doxygen/html/* + +# git merge +*.orig +\#* +\.#* + +# idea +.idea/* + +# python egg +*.egg-info + +# Macos +**/.DS_Store + +nnfusion_rt/ +models/frozenmodels/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..f9ba8cf65f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..79656060de --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000..aea9c97253
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
+
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
+
+
\ No newline at end of file
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000000..b9f8bf2855
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.0.0.dev
\ No newline at end of file
diff --git a/python/bitblas/__init__.py b/python/bitblas/__init__.py
new file mode 100644
index 0000000000..678896310b
--- /dev/null
+++ b/python/bitblas/__init__.py
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""DLight package provides efficient schedules out-of-box for deep learning workloads."""
+from . import gpu
+from .base import (
+    fast_tune,
+    ApplyDefaultSchedule,
+    ApplyFastTuning,
+    BlockInfo,
+    IterInfo,
+    ScheduleRule,
+    normalize_prim_func,
+    try_inline,
+    try_inline_contiguous_spatial,
+)
diff --git a/python/bitblas/generator.py b/python/bitblas/generator.py
new file mode 100644
index 0000000000..30d68fc576
--- /dev/null
+++ b/python/bitblas/generator.py
@@ -0,0 +1,15 @@
+
+class BitBLASGenerator:
+    def __init__(self, input_size, data_type='float', optimization_level=1):
+        self.input_size = input_size
+        self.data_type = data_type
+        self.optimization_level = optimization_level
+        # Other initialization code
+
+    def generate_cuda_code(self):
+        # Logic for generating the CUDA code
+        pass
+
+    def generate_header(self):
+        # Logic for generating the header file
+        pass
diff --git a/python/bitblas_cli.py b/python/bitblas_cli.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/testing/operators/test_matmul_codegen.py b/testing/operators/test_matmul_codegen.py
new file mode 100644
index 0000000000..b911f4a849
--- /dev/null
+++ b/testing/operators/test_matmul_codegen.py
@@ -0,0 +1,162 @@
+import tvm
+import bitblas
+from bitblas.ops import Matmul
+import numpy as np
+import torch
+
+
+def test_matmul_codegen_static_shape_default():
+    M = 16384
+    N = 16384
+    K = 16384
+
+    target = tvm.target.Target("nvidia/nvidia-a100")
+
+    matmul = Matmul(
+        M=M,
+        N=N,
+        K=K,
+        a_dtype="float16",
+        b_dtype="float16",
+        c_dtype="float16",
+        propagate_a=False,
+        propagate_b=False,
+        layout="nt",
+        target=target,
+    )
+    code = matmul.codegen(target=target)
+    assert code
+
+
+def test_matmul_codegen_static_shape_optimize():
+    M = 16384
+    N = 16384
+    K = 16384
+
+    target = tvm.target.Target("nvidia/nvidia-a100")
+
+    matmul = Matmul(
+        M=M,
+        N=N,
+        K=K,
+        a_dtype="float16",
+        b_dtype="float16",
+        c_dtype="float16",
+        propagate_a=False,
+        propagate_b=False,
+        layout="nt",
+        target=target,
+    )
+    matmul.optimize()
+    code = matmul.codegen(target=target)
+    assert code
+
+
+def test_matmul_codegen_dynamic_range_optimize():
+    M = [1024]
+    N = 1024
+    K = 1024
+
+    target = tvm.target.Target("nvidia/nvidia-a100")
+
+    matmul = Matmul(
+        M=M,
+        N=N,
+        K=K,
+        a_dtype="float16",
+        b_dtype="float16",
+        c_dtype="float16",
+        propagate_a=False,
+        propagate_b=False,
+        layout="nt",
+        target=target,
+    )
+    matmul.optimize()
+    code = 
matmul.codegen(target=target) + print(code) + assert code + + +def test_matmul_profile_static_shape_default(): + M = 16384 + N = 16384 + K = 16384 + + target = tvm.target.Target("nvidia/nvidia-a100") + + matmul = Matmul( + M=M, + N=N, + K=K, + a_dtype="float16", + b_dtype="float16", + c_dtype="float16", + propagate_a=False, + propagate_b=False, + layout="nt", + target=target, + ) + code = matmul.codegen(target=target) + latency = matmul.profile_latency() + print(latency) + + +def test_matmul_profile_dynamic_shape_default(): + M = [16, 32, 64, 128] + N = 16384 + K = 16384 + + target = tvm.target.Target("nvidia/nvidia-a100") + + matmul = Matmul( + M=M, + N=N, + K=K, + a_dtype="float16", + b_dtype="float16", + c_dtype="float16", + propagate_a=False, + propagate_b=False, + layout="nt", + target=target, + ) + code = matmul.codegen(target=target) + latency = matmul.profile_latency() + print(latency) + + +def test_matmul_invoke_static_shape_default(): + M = 16384 + N = 16384 + K = 16384 + + target = tvm.target.Target("nvidia/nvidia-a100") + + matmul = Matmul( + M=M, + N=N, + K=K, + a_dtype="float16", + b_dtype="float16", + c_dtype="float16", + propagate_a=False, + propagate_b=False, + layout="nt", + target=target, + ) + code = matmul.codegen(target=target) + latency = matmul.profile_latency() + a = torch.rand((M, K), dtype=torch.float16).cuda() + b = torch.rand((N, K), dtype=torch.float16).cuda() + c = torch.empty((M, N), dtype=torch.float16).cuda() + matmul.forward(a, b, c) + + +if __name__ == "__main__": + # test_matmul_codegen_static_shape_default() # passed + # test_matmul_codegen_static_shape_optimize() # passed + # test_matmul_codegen_dynamic_range_optimize() # passed + # test_matmul_profile_static_shape_default() # passed + # test_matmul_profile_dynamic_shape_default() # passed + # test_matmul_invoke_static_shape_default() + test_matmul_codegen_dynamic_range_optimize() diff --git a/testing/type_conversion/int4b_fp16_convert.py b/testing/type_conversion/int4b_fp16_convert.py new file mode 100644 index 0000000000..f0123e3055 --- /dev/null +++ b/testing/type_conversion/int4b_fp16_convert.py @@ -0,0 +1,80 @@ +import tvm +import torch +import numpy as np +import tvm.testing +from tvm.script import tir as T +import os +from tvm import te + +import numpy as np + +def compress_int4_to_int8(int4_weight): + if int4_weight.dtype == np.float16: + int4_weight = int4_weight.astype(dtype=np.int8) + int8_weight = np.zeros( + (*int4_weight.shape[:-1], int4_weight.shape[-1] // 2), dtype=np.int8 + ) + for j in range(int4_weight.shape[-1] // 2): + for k in range(2): + int8_weight[:, j] |= int4_weight[:, j * 2 + k] << (4 * k) + return int8_weight + + +def interleave_weight_int4(qweight): + nbits = 4 + qweight = qweight.view(np.int32) + new_qweight = np.zeros_like(qweight) + bits_stride = 16 + mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f + num_groups = 32 // bits_stride + elems_per_group = bits_stride // nbits + for i in range(num_groups): + for j in range(elems_per_group): + offset = i * elems_per_group + j + shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits + new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift + + return new_qweight.view(np.int8) + + +N = 2 +K = 16 +torch.manual_seed(0) +raw_data = torch.randint(0, 7, (N, K), dtype=torch.int8).cpu().numpy() +compressed_b = compress_int4_to_int8(raw_data) +interleaved_weight = interleave_weight_int4(compressed_b) + +print(f"raw_data: \n", raw_data) +print(f"interleaved_weight: \n", interleaved_weight) + + +def 
tir_interleave_weight_int4_f16(N=2, K=16, bits=4): + QK = K * bits // 32 + bits_stride = 16 + mask = (1 << bits) - 1 # for 4bit the val is 0x0000000f + num_groups = 32 // bits_stride + elems_per_group = bits_stride // bits + + @T.prim_func + def main(A: T.Buffer((N, QK), "int32"), B: T.Buffer((N, QK), "int32")): + for ax0, ax1, ax2, ax3 in T.grid(N, QK, num_groups, elems_per_group): + with T.block("B"): + v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + offset = v2 * elems_per_group + v3 + shift = (offset % num_groups) * bits_stride + (offset // num_groups) * bits + B[v0, v1] = B[v0, v1] | (((A[v0, v1] >> (bits * offset)) & mask) << shift) + + return main + +interleave_func = tir_interleave_weight_int4_f16() + +ref_func = tvm.build(interleave_func, target="llvm") +ctx = tvm.cpu(0) +compressed_b_cast_32 = compressed_b.view(np.int32) +print("compressed_b_cast_32: \n", compressed_b_cast_32) +tvm_compress_b = tvm.nd.array(compressed_b_cast_32, ctx) +tvm_interleaved_b = tvm.nd.array(np.zeros_like(compressed_b_cast_32), ctx) +ref_func(tvm_compress_b, tvm_interleaved_b) +tvm_interleaved_b_np = tvm_interleaved_b.asnumpy() +tvm_interleaved_b_np_int8 = tvm_interleaved_b_np.view(np.int8) +print("tvm_interleaved_b_np_int8: \n", tvm_interleaved_b_np_int8) From c72090a27d86367f4f4ef033c0ae59fdb4ada1ba Mon Sep 17 00:00:00 2001 From: LeiWang Date: Tue, 6 Feb 2024 11:07:48 -0400 Subject: [PATCH 005/286] refactor fast dlight to bit blas --- python/bitblas/base/utils.py | 56 ++++++++++++------------ testing/operators/test_matmul_codegen.py | 4 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/python/bitblas/base/utils.py b/python/bitblas/base/utils.py index f8edd2159b..38717e13d9 100644 --- a/python/bitblas/base/utils.py +++ b/python/bitblas/base/utils.py @@ -23,7 +23,7 @@ from tvm import tir, IRModule from tvm.runtime import Module from tvm.tir import Schedule -from tvm import dlight as dl +import bitblas from .analysis import get_root_block, get_reduction_blocks, find_var_from_func from .roller.arch import Arch from bitblas.base.roller.arch import CUDA @@ -93,34 +93,34 @@ def _apply_config( root_block = get_root_block(sch) blocks = sch.get_child_blocks(root_block) reduction_blocks = get_reduction_blocks(sch, blocks) - try: - if not reduction_blocks: - return dl.gpu.ElementWise().apply_config(func, config) - elif config.use_tc: - if config.arch.sm_version >= 80: - # For A100(sm_80) or more advanced gpu, use MMA tensorization. - return dl.gpu.MatmulTensorizationMMA().apply_config(func, config) - else: - # For other GPUs, use WMMA tensorization. - return dl.gpu.MatmulTensorizationWMMA().apply_config(func, config) + # try: + if not reduction_blocks: + return bitblas.gpu.ElementWise().apply_config(func, config) + elif config.use_tc: + if config.arch.sm_version >= 80: + # For A100(sm_80) or more advanced gpu, use MMA tensorization. + return bitblas.gpu.MatmulTensorizationMMA().apply_config(func, config) else: - _reduction_rules = [] - - _reduction_rules.append(dl.gpu.GEMV()) - if not any([t > 1 for t in config.reduce_thread]): - # Matrix multiplication template doesn't support inner thread reduction - _reduction_rules.append(dl.gpu.Matmul()) - _reduction_rules.append(dl.gpu.GeneralReduction()) - - for rule in _reduction_rules: - try: - sch = rule.apply_config(func, config) - except: - continue - if sch is not None: - return sch - except Exception as e_msg: - print("[FastDlight] Apply config failed: ", e_msg) + # For other GPUs, use WMMA tensorization. 
+ return bitblas.gpu.MatmulTensorizationWMMA().apply_config(func, config) + else: + _reduction_rules = [] + + _reduction_rules.append(bitblas.gpu.GEMV()) + if not any([t > 1 for t in config.reduce_thread]): + # Matrix multiplication template doesn't support inner thread reduction + _reduction_rules.append(bitblas.gpu.Matmul()) + _reduction_rules.append(bitblas.gpu.GeneralReduction()) + + for rule in _reduction_rules: + try: + sch = rule.apply_config(func, config) + except: + continue + if sch is not None: + return sch + # except Exception as e_msg: + # print("[FastDlight] Apply config failed: ", e_msg) return None diff --git a/testing/operators/test_matmul_codegen.py b/testing/operators/test_matmul_codegen.py index b911f4a849..d19372b664 100644 --- a/testing/operators/test_matmul_codegen.py +++ b/testing/operators/test_matmul_codegen.py @@ -155,8 +155,8 @@ def test_matmul_invoke_static_shape_default(): if __name__ == "__main__": # test_matmul_codegen_static_shape_default() # passed # test_matmul_codegen_static_shape_optimize() # passed - # test_matmul_codegen_dynamic_range_optimize() # passed + test_matmul_codegen_dynamic_range_optimize() # passed # test_matmul_profile_static_shape_default() # passed # test_matmul_profile_dynamic_shape_default() # passed # test_matmul_invoke_static_shape_default() - test_matmul_codegen_dynamic_range_optimize() + # test_matmul_codegen_dynamic_range_optimize() From 3cdebe907df38c791ebb21101411b30e0355e99d Mon Sep 17 00:00:00 2001 From: LeiWang Date: Wed, 7 Feb 2024 00:55:47 -0400 Subject: [PATCH 006/286] support i8 swizzle --- python/bitblas/base/utils.py | 1 + python/bitblas/gpu/matmul_analysis.py | 92 ++++++++++---- python/bitblas/gpu/matmul_mma.py | 10 +- python/bitblas/ops/matmul.py | 4 +- python/bitblas/ops/matmul_impl.py | 42 ++++++- testing/operators/test_int8xint8_gemm.py | 33 +++++ testing/operators/test_matmul_codegen.py | 26 +++- testing/tir_expr/float16xfloat16_gemm.py | 65 ++++++++++ testing/tir_expr/int8xint8_gemm.py | 76 ++++++++++++ .../test_numpy_compress_convert.py | 0 testing/weight_only/inverse_index_map.py | 117 ++++++++++++++++++ 11 files changed, 430 insertions(+), 36 deletions(-) create mode 100644 testing/operators/test_int8xint8_gemm.py create mode 100644 testing/tir_expr/float16xfloat16_gemm.py create mode 100644 testing/tir_expr/int8xint8_gemm.py create mode 100644 testing/type_conversion/test_numpy_compress_convert.py create mode 100644 testing/weight_only/inverse_index_map.py diff --git a/python/bitblas/base/utils.py b/python/bitblas/base/utils.py index 38717e13d9..1b277efbea 100644 --- a/python/bitblas/base/utils.py +++ b/python/bitblas/base/utils.py @@ -232,6 +232,7 @@ def tvm_callback_cuda_postproc(code, _): best = None best_latency = 1e9 for cpresult in cpresults: + # print(cpresult.code) config = cpresult.config try: latency = cpresult.profile() diff --git a/python/bitblas/gpu/matmul_analysis.py b/python/bitblas/gpu/matmul_analysis.py index 1610fb3e14..7af32a71bd 100644 --- a/python/bitblas/gpu/matmul_analysis.py +++ b/python/bitblas/gpu/matmul_analysis.py @@ -64,7 +64,9 @@ def auto_inline_producers( inlined_cnt = 0 producers = _collect_producers(sch, block) for producer in producers: - if any(sch.get(producer) == sch.get(skip_block) for skip_block in skip_blocks): + if any( + sch.get(producer) == sch.get(skip_block) for skip_block in skip_blocks + ): continue try: sch.compute_inline(producer) @@ -161,7 +163,8 @@ def make_iter_fusion_index_map( fused_iters[trait.kind] = v_i final_indices: List[tir.PrimExpr] = [ - 
fused_iters.get(kind, tir.IntImm(traits[0].extent.dtype, 0)) for kind in kind_order + fused_iters.get(kind, tir.IntImm(traits[0].extent.dtype, 0)) + for kind in kind_order ] return tir.IndexMap(input_iters, final_indices, None) @@ -234,9 +237,15 @@ def get_access_axes(region: List[Range]) -> Set[Var]: if {x.kind for x in traits.values()}.intersection(gemm_traits) != gemm_traits: return None - A_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in A_axes] - B_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in B_axes] - C_traits = [traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in C_axes] + A_traits = [ + traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in A_axes + ] + B_traits = [ + traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in B_axes + ] + C_traits = [ + traits[iter_var.var] for iter_var in block.iter_vars if iter_var.var in C_axes + ] block_traits = [traits[i.var] for i in block.iter_vars] return A_traits, B_traits, C_traits, block_traits @@ -332,7 +341,8 @@ def infer_layout(layout: str, region: List[Range], kind: str = "A"): ) matmul_index_map = make_iter_fusion_index_map( - block_traits, [IterKind.kIter_S, IterKind.kIter_I, IterKind.kIter_J, IterKind.kIter_K] + block_traits, + [IterKind.kIter_S, IterKind.kIter_I, IterKind.kIter_J, IterKind.kIter_K], ) return ( @@ -360,10 +370,14 @@ def is_dequantize(block: BlockRV) -> bool: block_stmt = sch.get(block) if len(block_stmt.reads) < 2: return False - has_uint_input = any("uint" in str(region.buffer.dtype) for region in block_stmt.reads) + has_uint_input = any( + "uint" in str(region.buffer.dtype) for region in block_stmt.reads + ) if not has_uint_input: return False - if len(block_stmt.writes) != 1 or "float" not in str(block_stmt.writes[0].buffer.dtype): + if len(block_stmt.writes) != 1 or "float" not in str( + block_stmt.writes[0].buffer.dtype + ): return False return True @@ -388,7 +402,9 @@ def get_access_vars(region: List[Range]) -> List[Var]: axes.extend(undefined_vars(r.min)) # remove trivial axis trivial_vars = set( - iter_var.var for iter_var in block_stmt.iter_vars if _is_one(iter_var.dom.extent) + iter_var.var + for iter_var in block_stmt.iter_vars + if _is_one(iter_var.dom.extent) ) axes = [axis for axis in axes if axis not in trivial_vars] # remove duplicate axis @@ -398,9 +414,9 @@ def get_access_vars(region: List[Range]) -> List[Var]: lhs_access_vars = get_access_vars(block_stmt.reads[0].region)[-2:] rhs_access_vars = get_access_vars(block_stmt.writes[0].region)[-2:] is_identity = list(lhs_access_vars) == list(rhs_access_vars) - is_transpose = list(lhs_access_vars) != list(rhs_access_vars) and set(lhs_access_vars) == set( - rhs_access_vars - ) + is_transpose = list(lhs_access_vars) != list(rhs_access_vars) and set( + lhs_access_vars + ) == set(rhs_access_vars) return is_identity, is_transpose @@ -494,7 +510,9 @@ def check_sm_version(arch: str) -> int: sm_version = arch.replace("sm_", "") return int(sm_version) if sm_version.isdigit() else -1 - def analysis_tensorcore_tags(sch: tir.Schedule, block: BlockRV, target: Target) -> bool: + def analysis_tensorcore_tags( + sch: tir.Schedule, block: BlockRV, target: Target + ) -> bool: tags: Dict[str, Union[List[int], int]] = {} block_stmt = sch.get(block) @@ -565,7 +583,9 @@ def check_last_trait(region: List[Range]): out_dtype=out_dtype, ) except: - print("[FastDlight][WARNING] Cannot find the corresponding wmma intrin group") + print( + "[FastDlight][WARNING] 
Cannot find the corresponding wmma intrin group" + ) return func, None # reindex and transform functions @@ -577,7 +597,7 @@ def check_last_trait(region: List[Range]): block_stmt = sch.get(main_block) - # the batch dimension is not taken into consideration. + # the batch dimension is not taken into consideration. for item_var in block_stmt.iter_vars[1:]: extent = item_var.dom.extent if isinstance(extent, tir.expr.IntImm): @@ -589,16 +609,27 @@ def check_last_trait(region: List[Range]): return func, None -def get_propagate_map(trans: bool = True, dtype="float16"): +def get_propagate_map(trans: bool = True, dtype="float16", matrix_name="A"): from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel ldmatrix_32x8_to_shared_16x16_layout, ldmatrix_trans_32x8_to_shared_16x16_layout, + ldmatrix_32x16_to_shared_16x32_layout_a, + ldmatrix_32x16_to_shared_16x32_layout_b, ) - assert dtype in ["float16"], "Only support float16 for now" - - ldmatrix_layout = ldmatrix_32x8_to_shared_16x16_layout - ldmatrix_layout_trans = ldmatrix_trans_32x8_to_shared_16x16_layout + assert dtype in ["float16", "int8"], "Only support float16 for now" + if dtype == "float16": + ldmatrix_layout = ldmatrix_32x8_to_shared_16x16_layout + ldmatrix_layout_trans = ldmatrix_trans_32x8_to_shared_16x16_layout + elif dtype == "int8": + # int8 mma only support 32x16 to 16x32 layout + if matrix_name == "A" and trans == False: + ldmatrix_layout = ldmatrix_32x16_to_shared_16x32_layout_a + elif matrix_name == "B" and trans == True: + ldmatrix_layout = ldmatrix_32x16_to_shared_16x32_layout_b + else: + print("trans", trans) + raise ValueError("Unknown matrix name ", matrix_name) # IntraWarp memory layout was occurred by ldmatrix, we should lift the ld_matrix out def ldmatrix_permutation_16x16_32x8_16x16(kernel_i, kernel_j): @@ -611,11 +642,19 @@ def ldmatrix_trans_permutation_16x16_32x8_16x16(kernel_i, kernel_j): local_id = kernel_j % 8 return ldmatrix_layout_trans(thread_id, local_id) - ldmatrix_index_map = ( - ldmatrix_trans_permutation_16x16_32x8_16x16 - if trans - else ldmatrix_permutation_16x16_32x8_16x16 - ) + def ldmatrix_permutation_16x32_32x16_32x16(kernel_i, kernel_j): + thread_id = kernel_i * 2 + kernel_j // 16 + local_id = kernel_j % 16 + return ldmatrix_layout(thread_id, local_id) + + if dtype == "float16": + ldmatrix_index_map = ( + ldmatrix_trans_permutation_16x16_32x8_16x16 + if trans + else ldmatrix_permutation_16x16_32x8_16x16 + ) + else: + ldmatrix_index_map = ldmatrix_permutation_16x32_32x16_32x16 def permutation(i, j, kernel_i, kernel_j): return ( @@ -625,7 +664,8 @@ def permutation(i, j, kernel_i, kernel_j): ) # TODO(lei): index_dtype should be analyzed from the schedule + row, col = [16, 16] if dtype == "float16" else [16, 32] inversed_index_map = IndexMap.from_func( ldmatrix_index_map, index_dtype="int32" - ).inverse([16, 16]) + ).inverse([row, col]) return permutation, inversed_index_map diff --git a/python/bitblas/gpu/matmul_mma.py b/python/bitblas/gpu/matmul_mma.py index 7e3dab360b..a0c4ecca21 100644 --- a/python/bitblas/gpu/matmul_mma.py +++ b/python/bitblas/gpu/matmul_mma.py @@ -433,7 +433,7 @@ def get_axis(l, r, trans): def can_enable_swizzle(dtype: str, smooth: bool): # inject_permuted_layout only support float16 currently - if dtype == "float16": + if dtype == "float16" or dtype == "int8": # if we use smooth layout, we don't need to do swizzling return not smooth return False @@ -582,7 +582,7 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False, tra ) # 
rewrite global smooth layout - def smooth_gmem_layout_rewrite(sch, block, enable=True, trans=False): + def smooth_gmem_layout_rewrite(sch, block, enable=True, trans=False, matrix_name="A"): if not enable: return # step1: find the first producer block @@ -594,15 +594,15 @@ def smooth_gmem_layout_rewrite(sch, block, enable=True, trans=False): propagate_block: tir.Block = producers[-1] # step2: transform the layout with inverse permutation - _, inverse_indexmap = get_propagate_map(trans=trans, dtype=intrin_info.in_dtype) + _, inverse_indexmap = get_propagate_map(trans=trans, dtype=intrin_info.in_dtype, matrix_name=matrix_name) def inverse_permutation(i, j, ii, jj): return (i, j, *inverse_indexmap.map_indices([ii, jj])) sch.transform_layout(propagate_block, ("read", 0), inverse_permutation) - smooth_gmem_layout_rewrite(sch, a_g2s, intrin_info.smooth_a, intrin_info.trans_a) - smooth_gmem_layout_rewrite(sch, b_g2s, intrin_info.smooth_b, intrin_info.trans_b) + smooth_gmem_layout_rewrite(sch, a_g2s, intrin_info.smooth_a, intrin_info.trans_a, matrix_name="A") + smooth_gmem_layout_rewrite(sch, b_g2s, intrin_info.smooth_b, intrin_info.trans_b, matrix_name="B") auto_inline_producers(sch, a_g2s) auto_inline_producers(sch, b_g2s) diff --git a/python/bitblas/ops/matmul.py b/python/bitblas/ops/matmul.py index 01cb133c45..c5355922cc 100644 --- a/python/bitblas/ops/matmul.py +++ b/python/bitblas/ops/matmul.py @@ -61,9 +61,9 @@ def _select_impl(self): if self.propagate_b: _impl_key += "_pb" if isinstance(self.M, int): - args = (self.M, self.N, self.K, self.a_dtype, self.b_dtype) + args = (self.M, self.N, self.K, self.a_dtype, self.c_dtype) else: - args = (self.N, self.K, self.a_dtype, self.b_dtype) + args = (self.N, self.K, self.a_dtype, self.c_dtype) impl_handler = matmul_impl_factory[_impl_key] return impl_handler(*args) diff --git a/python/bitblas/ops/matmul_impl.py b/python/bitblas/ops/matmul_impl.py index 230e8e6cea..71884f51cd 100644 --- a/python/bitblas/ops/matmul_impl.py +++ b/python/bitblas/ops/matmul_impl.py @@ -92,7 +92,45 @@ def main(a: T.handle, b: T.handle, c: T.handle): return MatmulNT -def matmul_nt_propagate_b_f16_f16_mma(M, N, K, in_dtype="float16", out_dtype="float16"): +def matmul_nt_propagate_b_s8_s8_s32_mma(M, N, K, in_dtype="int8", out_dtype="int32"): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr( + {"global_symbol": "main", "tir.noalias": True, "smooth_b": True} + ) + A = T.match_buffer(a, [M, K], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[ + vj // wn, + vk // wk, + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, + vj % 8 // 4 * 16 + vk % 16 + ] + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B_reindex[ + vj, vk + ].astype(out_dtype) + + return MyModule + +def matmul_nt_propagate_b_f16_f16_f16_mma(M, N, K, in_dtype="float16", out_dtype="float16"): wm, wn, wk = 16, 16, 16 if in_dtype == "int8": wm, wn, wk = 16, 16, 32 @@ -432,7 +470,7 @@ def decode_func(n, k, nn, kk): 'matmul_nt_dyn_m': matmul_nt_dyn_m, 
'matmul_nn': matmul_nn, 'matmul_nn_dyn_m': matmul_nn_dyn_m, - 'matmul_nt_propagate_b_f16_f16_mma': matmul_nt_propagate_b_f16_f16_mma, + 'matmul_nt_propagate_b_f16_f16_mma': matmul_nt_propagate_b_f16_f16_f16_mma, 'matmul_nt_propagate_a_b': matmul_nt_propagate_a_b, 'matmul_nt_propagate_a_b_f16_f16_mma': matmul_nt_propagate_a_b, } \ No newline at end of file diff --git a/testing/operators/test_int8xint8_gemm.py b/testing/operators/test_int8xint8_gemm.py new file mode 100644 index 0000000000..145dd40b60 --- /dev/null +++ b/testing/operators/test_int8xint8_gemm.py @@ -0,0 +1,33 @@ +import tvm +import bitblas +from bitblas.ops import Matmul +import numpy as np +import torch + + +def test_matmul_codegen_static_shape_optimize_s8(): + M = 16384 + N = 16384 + K = 16384 + + target = tvm.target.Target("nvidia/nvidia-a100") + + matmul = Matmul( + M=M, + N=N, + K=K, + a_dtype="int8", + b_dtype="int8", + c_dtype="int32", + propagate_a=False, + propagate_b=False, + layout="nt", + target=target, + ) + matmul.optimize() + code = matmul.codegen(target=target) + assert code + + +if __name__ == "__main__": + test_matmul_codegen_static_shape_optimize_s8() \ No newline at end of file diff --git a/testing/operators/test_matmul_codegen.py b/testing/operators/test_matmul_codegen.py index d19372b664..4602cad370 100644 --- a/testing/operators/test_matmul_codegen.py +++ b/testing/operators/test_matmul_codegen.py @@ -50,6 +50,29 @@ def test_matmul_codegen_static_shape_optimize(): matmul.optimize() code = matmul.codegen(target=target) assert code + +def test_matmul_codegen_static_shape_optimize_s8(): + M = 16384 + N = 16384 + K = 16384 + + target = tvm.target.Target("nvidia/nvidia-a100") + + matmul = Matmul( + M=M, + N=N, + K=K, + a_dtype="int8", + b_dtype="int8", + c_dtype="int32", + propagate_a=False, + propagate_b=False, + layout="nt", + target=target, + ) + matmul.optimize() + code = matmul.codegen(target=target) + assert code def test_matmul_codegen_dynamic_range_optimize(): @@ -155,7 +178,8 @@ def test_matmul_invoke_static_shape_default(): if __name__ == "__main__": # test_matmul_codegen_static_shape_default() # passed # test_matmul_codegen_static_shape_optimize() # passed - test_matmul_codegen_dynamic_range_optimize() # passed + test_matmul_codegen_static_shape_optimize_s8() + # test_matmul_codegen_dynamic_range_optimize() # passed # test_matmul_profile_static_shape_default() # passed # test_matmul_profile_dynamic_shape_default() # passed # test_matmul_invoke_static_shape_default() diff --git a/testing/tir_expr/float16xfloat16_gemm.py b/testing/tir_expr/float16xfloat16_gemm.py new file mode 100644 index 0000000000..dca307b0ff --- /dev/null +++ b/testing/tir_expr/float16xfloat16_gemm.py @@ -0,0 +1,65 @@ +import tvm +from tvm.script import tir as T +from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy +from bitblas.base.roller.arch import CUDA +from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags +from bitblas.base.utils import apply_and_build +from bitblas.ops.matmul_impl import ( + matmul_nt, + matmul_nt_propagate_b_s8_s8_s32_mma +) + + +def test_f16_f16_gemm(): + ir_module = matmul_nt(16384, 16384, 16384, "float16", "float16") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = 
policy.emit_config(1) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +def test_i8_i8_gemm_propagate_b(): + ir_module = matmul_nt_propagate_b_s8_s8_s32_mma(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(1) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + print(best.sch.mod) + +test_f16_f16_gemm() +# test_i8_i8_gemm_propagate_b() diff --git a/testing/tir_expr/int8xint8_gemm.py b/testing/tir_expr/int8xint8_gemm.py new file mode 100644 index 0000000000..f7ae7238cd --- /dev/null +++ b/testing/tir_expr/int8xint8_gemm.py @@ -0,0 +1,76 @@ +import tvm +import numpy as np +from tvm.script import tir as T +from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy +from bitblas.base.roller.arch import CUDA +from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags +from bitblas.base.utils import apply_and_build +from bitblas.ops.matmul_impl import ( + matmul_nt, + matmul_nt_propagate_b_s8_s8_s32_mma +) + + +def test_i8_i8_gemm_correctness(): + ir_module = matmul_nt(1024, 1024, 1024, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + + numpy_a = np.random.randint(-128, 127, (1024, 1024)).astype("int8") + numpy_b = np.random.randint(-128, 127, (1024, 1024)).astype("int8") + numpy_c = np.matmul(numpy_a.astype("int32"), numpy_b.T.astype("int32")) + ctx = tvm.cuda() + tvm_a = tvm.nd.array(numpy_a, device=ctx) + tvm_b = tvm.nd.array(numpy_b, device=ctx) + tvm_c = tvm.nd.array(np.zeros((1024, 1024), dtype="int32"), device=ctx) + best.mod(tvm_a, tvm_b, tvm_c) + np.testing.assert_allclose(tvm_c.asnumpy(), numpy_c, atol=1e-5) + print(best.code) + +def test_i8_i8_gemm_propagate_b(): + ir_module = matmul_nt_propagate_b_s8_s8_s32_mma(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, 
best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +test_i8_i8_gemm_correctness() +# test_i8_i8_gemm_propagate_b() diff --git a/testing/type_conversion/test_numpy_compress_convert.py b/testing/type_conversion/test_numpy_compress_convert.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/testing/weight_only/inverse_index_map.py b/testing/weight_only/inverse_index_map.py new file mode 100644 index 0000000000..14fee55585 --- /dev/null +++ b/testing/weight_only/inverse_index_map.py @@ -0,0 +1,117 @@ +import tvm +from tvm.script import tir as T +from tvm.tir import IndexMap +from tvm.tir.tensor_intrin.cuda import ( + ldmatrix_trans_32x8_to_shared_16x16_layout, + ldmatrix_32x16_to_shared_16x32_layout_a, + ldmatrix_32x16_to_shared_16x32_layout_b, +) + +def ldmatrix_trans_permutation_16x16_32x8_16x16(kernel_i, kernel_j): + thread_id = kernel_i * 2 + kernel_j // 8 + local_id = kernel_j % 8 + return ldmatrix_trans_32x8_to_shared_16x16_layout(thread_id, local_id) + +@tvm.script.ir_module +class LDMATRIX_16x16: + @T.prim_func + def main(a: T.handle, b: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [16, 16], dtype="float16") + B = T.match_buffer(b, [16, 16], dtype="float16") + + for i, j in T.grid(16, 16): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi, vj]) + T.writes(A[vi, vj]) + A[vi, vj] = B[vi, vj] + +ir_module = LDMATRIX_16x16 +sch = tvm.tir.Schedule(ir_module) + +block_b = sch.get_block("B") +sch.transform_layout(block_b, ('read', 0), ldmatrix_trans_permutation_16x16_32x8_16x16) +print("========================inject transform=============================") +print(sch.mod["main"].script()) + +index_map = IndexMap.from_func(ldmatrix_trans_permutation_16x16_32x8_16x16, index_dtype="int32") +inversed_index_map = index_map.inverse([16, 16]) +def inverse_permutation(i, j): + return inversed_index_map.map_indices([i, j]) +sch.transform_layout(block_b, ('read', 0), inverse_permutation) +print("========================inverse inject transform=============================") +print(sch.mod["main"].script()) + + +def ldmatrix_trans_permutation_16x32_16x32_16x32(kernel_i, kernel_j): + thread_id = kernel_i * 2 + kernel_j // 16 + local_id = kernel_j % 16 + return ldmatrix_32x16_to_shared_16x32_layout_a(thread_id, local_id) + +@tvm.script.ir_module +class LDMATRIX_16x32_A: + @T.prim_func + def main(a: T.handle, b: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [16, 32], dtype="float16") + B = T.match_buffer(b, [16, 32], dtype="float16") + + for i, j in T.grid(16, 32): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi, vj]) + T.writes(A[vi, vj]) + A[vi, vj] = B[vi, vj] + +ir_module = LDMATRIX_16x32_A +sch = tvm.tir.Schedule(ir_module) + +block_b = sch.get_block("B") +sch.transform_layout(block_b, ('read', 0), ldmatrix_trans_permutation_16x32_16x32_16x32) +print("========================inject transform=============================") +print(sch.mod["main"].script()) + +index_map = IndexMap.from_func(ldmatrix_trans_permutation_16x32_16x32_16x32, index_dtype="int32") +inversed_index_map = index_map.inverse([16, 32]) +def inverse_permutation(i, j): + return inversed_index_map.map_indices([i, j]) +sch.transform_layout(block_b, 
('read', 0), inverse_permutation) +print("========================inverse inject transform=============================") +print(sch.mod["main"].script()) + +def ldmatrix_trans_permutation_16x32_16x32_16x32(kernel_i, kernel_j): + thread_id = kernel_i * 2 + kernel_j // 16 + local_id = kernel_j % 16 + return ldmatrix_32x16_to_shared_16x32_layout_b(thread_id, local_id) + +@tvm.script.ir_module +class LDMATRIX_16x32_B: + @T.prim_func + def main(a: T.handle, b: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [16, 32], dtype="float16") + B = T.match_buffer(b, [16, 32], dtype="float16") + + for i, j in T.grid(16, 32): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi, vj]) + T.writes(A[vi, vj]) + A[vi, vj] = B[vi, vj] + +ir_module = LDMATRIX_16x32_B +sch = tvm.tir.Schedule(ir_module) + +block_b = sch.get_block("B") +sch.transform_layout(block_b, ('read', 0), ldmatrix_trans_permutation_16x32_16x32_16x32) +print("========================inject transform=============================") +print(sch.mod["main"].script()) + +index_map = IndexMap.from_func(ldmatrix_trans_permutation_16x32_16x32_16x32, index_dtype="int32") +inversed_index_map = index_map.inverse([16, 32]) +def inverse_permutation(i, j): + return inversed_index_map.map_indices([i, j]) +sch.transform_layout(block_b, ('read', 0), inverse_permutation) +print("========================inverse inject transform=============================") +print(sch.mod["main"].script()) From 6b485087fa737d9b0152882ea9640c1801ef967f Mon Sep 17 00:00:00 2001 From: LeiWang Date: Wed, 7 Feb 2024 12:59:22 -0400 Subject: [PATCH 007/286] int8xint2 gemm --- after_memory_rewrite.cu | 387 ++ python/bitblas/base/config.py | 26 - python/bitblas/base/roller/config.py | 28 +- python/bitblas/base/roller/policy/default.py | 1 - .../bitblas/base/roller/policy/tensorcore.py | 1 - python/bitblas/base/utils.py | 66 +- python/bitblas/gpu/intrin/lop3.py | 23 +- python/bitblas/gpu/matmul_analysis.py | 19 +- python/bitblas/gpu/matmul_mma.py | 13 +- python/bitblas/gpu/matmul_mma_dequantize.py | 84 +- python/bitblas/ops/matmul_impl.py | 496 +- run.log | 4573 +++++++++++++++++ testing/tir_expr/f16.swizzle | 261 + testing/tir_expr/float16xfloat16_gemm.py | 27 +- testing/tir_expr/i8_right_swizzle | 272 + testing/tir_expr/i8_wrong_swizzle | 261 + testing/tir_expr/int8xint8_gemm.py | 312 +- testing/tir_expr/test_tir_0.py | 187 + testing/tir_expr/test_tir_1.py | 177 + 19 files changed, 6931 insertions(+), 283 deletions(-) create mode 100644 after_memory_rewrite.cu delete mode 100644 python/bitblas/base/config.py create mode 100644 run.log create mode 100644 testing/tir_expr/f16.swizzle create mode 100644 testing/tir_expr/i8_right_swizzle create mode 100644 testing/tir_expr/i8_wrong_swizzle create mode 100644 testing/tir_expr/test_tir_0.py create mode 100644 testing/tir_expr/test_tir_1.py diff --git a/after_memory_rewrite.cu b/after_memory_rewrite.cu new file mode 100644 index 0000000000..0fab57ad6f --- /dev/null +++ b/after_memory_rewrite.cu @@ -0,0 +1,387 @@ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, 
int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, signed char* __restrict__ D); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, signed char* __restrict__ D) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + int C_reindex_shared_warp[128]; + __shared__ signed char A_reindex_reindex_shared[32768]; + __shared__ signed char B_shared[2048]; + __shared__ signed char B_reindex_reindex_shared[4096]; + signed char B_local[4]; + signed char B_reindex_reindex_local[16]; + signed char A_reindex_reindex_shared_warp[128]; + signed char B_reindex_reindex_shared_warp[32]; + signed char B_local_1[4]; + signed char B_reindex_reindex_local_1[16]; + signed char A_reindex_reindex_shared_warp_1[128]; + signed char B_reindex_reindex_shared_warp_1[32]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 8; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 2; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 16) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_reindex_shared + ((((ax0_ax1_ax2_ax3_ax4_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_reindex_shared + ((((ax0_ax1_ax2_ax3_ax4_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + (((((((int)blockIdx.y) * 4194304) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 524288)) + (((int)threadIdx.y) * 262144)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_shared + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_shared + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((int)blockIdx.x) * 262144) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0_1 = 0; ax0_ax1_ax2_ax3_ax4_fused_0_1 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0_1) { + + { + unsigned int addr; +#if 
TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_reindex_shared + (((((((ax3_0_0 + 1) & 1) * 16384) + (ax0_ax1_ax2_ax3_ax4_fused_0_1 * 2048)) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_reindex_shared + (((((((ax3_0_0 + 1) & 1) * 16384) + (ax0_ax1_ax2_ax3_ax4_fused_0_1 * 2048)) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + (((((((((int)blockIdx.y) * 4194304) + (ax0_ax1_ax2_ax3_ax4_fused_0_1 * 524288)) + (((int)threadIdx.y) * 262144)) + (ax3_0_0 * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 1024))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_shared + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_shared + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.x) * 262144) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 2; ++ax1_ax2_ax3_ax4_0_fused_0) { + *(int*)(B_local + 0) = *(int*)(B_shared + ((((((ax3_0_0 & 1) * 1024) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + for (int ax4 = 0; ax4 < 16; ++ax4) { + B_reindex_reindex_local[ax4] = ((B_local[(ax4 >> 2)] >> ((signed char)((ax4 & 3) * 2))) & (signed char)3); + } + *(int4*)(B_reindex_reindex_shared + ((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16))) = *(int4*)(B_reindex_reindex_local + 0); + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1 = 0; ax1 < 8; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_reindex_shared[(((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1 * 1024)) + (ax3_0_1 * 512))])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_reindex_shared[(((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1 * 1024)) + 
(ax3_0_1 * 512))])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 2; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_reindex_shared[(((((int)threadIdx.z) * 2048) + (ax1_1 * 1024)) + (ax3_0_1 * 512))])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_reindex_shared[(((((int)threadIdx.z) * 2048) + (ax1_1 * 1024)) + (ax3_0_1 * 512))])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 8; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 2; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_reindex_shared_warp + (ax1_0_3 * 
16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 2; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + *(int*)(B_local_1 + 0) = *(int*)(B_shared + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 1024)); + for (int ax4_1 = 0; ax4_1 < 16; ++ax4_1) { + B_reindex_reindex_local_1[ax4_1] = ((B_local_1[(ax4_1 >> 2)] >> ((signed char)((ax4_1 & 3) * 2))) & (signed char)3); + } + *(int4*)(B_reindex_reindex_shared + ((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16))) = *(int4*)(B_reindex_reindex_local_1 + 0); + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_2 = 0; ax1_2 < 8; ++ax1_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_reindex_shared[((((((int)threadIdx.y) * 8192) + (ax1_2 * 1024)) + (ax3_0_1_1 * 512)) + 16384)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_reindex_shared[((((((int)threadIdx.y) * 8192) + (ax1_2 * 1024)) + (ax3_0_1_1 * 512)) + 16384)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_2 * 16)))[0]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_2 * 16)))[1]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_2 * 16)))[2]), "=r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_2 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_3 = 0; ax1_3 < 2; ++ax1_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_reindex_shared[(((((int)threadIdx.z) * 2048) + (ax1_3 * 1024)) + (ax3_0_1_1 * 512))])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_reindex_shared[(((((int)threadIdx.z) * 2048) + (ax1_3 * 1024)) + (ax3_0_1_1 * 512))])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_3 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_3 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_3 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_3 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 8; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 2; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + 
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 8; ++ax0) { + for (int ax1_4 = 0; ax1_4 < 2; ++ax1_4) { + __syncthreads(); + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)A_reindex_reindex_shared)[((((int)threadIdx.y) * 6144) + (((int)threadIdx.z) * 512))]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[((ax0 * 16) + (ax1_4 * 8)) + local_id]; +} +; + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0_2 = 0; ax0_ax1_ax2_ax3_ax4_fused_0_2 < 2; ++ax0_ax1_ax2_ax3_ax4_fused_0_2) { + int __1; + int4 v_ = *(int4*)(((int*)A_reindex_reindex_shared) + ((((((int)threadIdx.y) * 6144) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_ax3_ax4_fused_0_2 * 128)) + (((int)threadIdx.x) * 4))); + __1=((signed char)(v_.x) << 0); + __1=__1 & ~(0x000000ff << 8) |((signed char)(v_.y) << 8); + __1=__1 & ~(0x000000ff << 16) |((signed char)(v_.z) << 16); + 
__1=__1 & ~(0x000000ff << 24) |((signed char)(v_.w) << 24); + *(int*)(D + (((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (ax0 * 262144)) + (ax0_ax1_ax2_ax3_ax4_fused_0_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.z) * 32)) + (ax1_4 * 16)) + ((((int)threadIdx.x) & 3) * 4))) = __1; + } + } + } + } +} + diff --git a/python/bitblas/base/config.py b/python/bitblas/base/config.py deleted file mode 100644 index 94f2031f81..0000000000 --- a/python/bitblas/base/config.py +++ /dev/null @@ -1,26 +0,0 @@ -from dataclasses import dataclass - -class ScheduleConfig: - """Configuration for dlight schedule""" - def __init__(self): - self._config = {} - self.block_factors = [] - self.thread_factors = [] - self.rstep = [] - self.reduce_thread = [] - self.pipeline_stage = 1 - self.vectorize = {} - - def __getattr__(self, name): - return self._config[name] - - def __setattr__(self, name, value): - self._config[name] = value - - def from_roller(self, roller_config): - self.block = roller_config.block - self.thread = roller_config.thread - self.rstep = roller_config.rstep - self.reduce_thread = roller_config.reduce_thread - self.pipeline_stage = roller_config.pipeline_stage - self.vectorize = roller_config.vectorize diff --git a/python/bitblas/base/roller/config.py b/python/bitblas/base/roller/config.py index 9255980d05..a81fb38a91 100644 --- a/python/bitblas/base/roller/config.py +++ b/python/bitblas/base/roller/config.py @@ -139,9 +139,7 @@ def __init__( self.smooth_b = smooth_b def __repr__(self) -> str: - return ( - f"" - ) + return f"" class Config(object): @@ -152,15 +150,12 @@ class Config(object): def __init__(self) -> None: self.arch = None self.use_tc = None # todo(lei): this should be renamed. 
- self.compute_capability = None # spacial axes tiling info self.block = [] self.thread = [] # special axes for tensorCore self.warp = [] - self.wmma = [] - self.tc_extra_conf: Optional[TensorCoreExtraConfig] = None # reduce axes tiling info self.rstep = [] self.reduce_thread = [] @@ -177,13 +172,14 @@ def __init__(self) -> None: self.use_async = False self.opt_shapes: Dict[str, int] = {} self.intrin_info = IntrinInfo("float16", "float16", True) + self.shared_scope: str = "shared" + self.pass_context: Dict = {} def to_dict(self) -> Dict: dic = {} dic["block"] = self.block if self.use_tc: dic["warp"] = self.warp - dic["wmma"] = self.wmma else: dic["thread"] = self.thread dic["rstep"] = self.rstep @@ -213,7 +209,6 @@ def from_dict(self, dic: Dict) -> "Config": self.block = dic["block"] if self.use_tc: self.warp = dic["warp"] - self.wmma = dic["wmma"] else: self.thread = dic["thread"] self.rstep = dic["rstep"] @@ -247,17 +242,10 @@ def __repr__(self) -> str: return str(self.to_dict()) def complete_config(self, node): - if not self.use_tc: - return self - _, _, wmma_k = self.wmma - tc_axis = node.infer_tensorcore_axis() - - shapes = node.propogate_reduction_inputs(self.block, {x: self.rstep[0] for x in node.raxis}) - AS_shape, BS_shape = shapes.values() - - shapes = node.propogate_reduction_inputs(self.warp, {x: wmma_k for x in node.raxis}) - AF_shape, BF_shape = shapes.values() - - self.tc_extra_conf = TensorCoreExtraConfig(AS_shape, BS_shape, AF_shape, BF_shape, tc_axis) + # analysis pass context, for int8 mma, we should merge static shared memory + merge_static_smem = False + if self.use_tc and self.intrin_info.in_dtype == "int8": + merge_static_smem = True + self.pass_context = {"tir.merge_static_smem": merge_static_smem} return self diff --git a/python/bitblas/base/roller/policy/default.py b/python/bitblas/base/roller/policy/default.py index 9d698540bf..31d1ffb5d4 100644 --- a/python/bitblas/base/roller/policy/default.py +++ b/python/bitblas/base/roller/policy/default.py @@ -692,7 +692,6 @@ def _score(node, thread): # small is better reduce_thread[target_ax] *= factor codegen_dict = Config() - codegen_dict.compute_capability = self.arch.compute_capability codegen_dict.block = tile codegen_dict.thread = cur_threads codegen_dict.rstep = [rsteps[ax.var.name] for ax in node.raxis] diff --git a/python/bitblas/base/roller/policy/tensorcore.py b/python/bitblas/base/roller/policy/tensorcore.py index 1171a79156..e0214a5646 100644 --- a/python/bitblas/base/roller/policy/tensorcore.py +++ b/python/bitblas/base/roller/policy/tensorcore.py @@ -299,7 +299,6 @@ def _score(node, thread): # small is better codegen_dict.rstep = [int(rsteps[ax.var.name]) for ax in node.raxis] codegen_dict.cached_tensors = td.cached_tensors_map[node] codegen_dict.rasterization_plan = self.plan_rasterization(td) - codegen_dict.wmma = wmma + [self.wmma_k] intrin_info = node.get_tag("intrin_info") if intrin_info: diff --git a/python/bitblas/base/utils.py b/python/bitblas/base/utils.py index 1b277efbea..6c767dc70d 100644 --- a/python/bitblas/base/utils.py +++ b/python/bitblas/base/utils.py @@ -124,7 +124,9 @@ def _apply_config( return None -def apply_and_build_parallel(func, configs, arch, num_repeats=5, max_workers=10) -> CompileResult: +def apply_and_build_parallel( + func, configs, arch, num_repeats=5, max_workers=10 +) -> CompileResult: cpresults = [] def var_warpper(v): @@ -146,16 +148,18 @@ def var_warpper(v): if arg.dtype == "int8": profile_tensors.append( tvm.nd.array( - np.random.randint(-127, 128, [var_warpper(i) for i 
in arg.shape]).astype( - arg.dtype - ), + np.random.randint( + -127, 128, [var_warpper(i) for i in arg.shape] + ).astype(arg.dtype), device=arch.device, ) ) else: profile_tensors.append( tvm.nd.array( - np.random.uniform(0, 1, [var_warpper(i) for i in arg.shape]).astype(arg.dtype), + np.random.uniform(0, 1, [var_warpper(i) for i in arg.shape]).astype( + arg.dtype + ), device=arch.device, ) ) @@ -166,7 +170,8 @@ def var_warpper(v): _sched: List[Schedule] = [] with ThreadPoolExecutor(max_workers=4) as schduler: futures = { - schduler.submit(lambda f, c: _apply_config(f, c), func, config) for config in configs + schduler.submit(lambda f, c: _apply_config(f, c), func, config) + for config in configs } for future in as_completed(futures): _sched.append(future.result()) @@ -180,6 +185,7 @@ def _build(context) -> str: # TODO(lei): # this is a trick to implement rasteration, will be removed in the future config = configs[idx] + @tvm.register_func(func_name="tvm_callback_cuda_postproc", override=True) def tvm_callback_cuda_postproc(code, _): index = code.index("{", match_global_kernel(code)) @@ -189,12 +195,16 @@ def tvm_callback_cuda_postproc(code, _): code = code[: index + 2] + rasterization_code + code[index + 2 :] return code - with tvm.transform.PassContext(config={"tir.use_async_copy": True}): + with tvm.transform.PassContext( + config={"tir.use_async_copy": True, **config.pass_context} + ): rt_mod = tvm.build(mod["main"], target=arch.target) from tvm.contrib.tar import tar # pylint: disable=import-outside-toplevel - artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format) + artifact_path = os.path.join( + tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format + ) code = rt_mod.imported_modules[0].get_source() rt_mod.export_library(artifact_path, fcompile=tar) return idx, code, artifact_path @@ -292,15 +302,17 @@ def fast_tune( if opt_shapes: for name, shape in opt_shapes.items(): var = find_var_from_func(func, name) - specilized_func = func.specialize({var: shape.astype(var.dtype)}).with_attr( - "is_specialized" - ) + specilized_func = func.specialize( + {var: shape.astype(var.dtype)} + ).with_attr("is_specialized") arch = CUDA(target) policy = DefaultPolicy(func=func, arch=arch) try: - specilized_func, tags = get_tensorized_func_and_tags(specilized_func, arch.target) + specilized_func, tags = get_tensorized_func_and_tags( + specilized_func, arch.target + ) except Exception as e_msg: print("[FastDlight] Get tensorized func and tags failed: ", e_msg) tags = None @@ -308,7 +320,9 @@ def fast_tune( policy = TensorCorePolicy(func=specilized_func, arch=arch, tags=tags) configs = policy.emit_config(topk) - cpresults, best = apply_and_build(func, configs, arch, parallel_build=parallel_build) + cpresults, best = apply_and_build( + func, configs, arch, parallel_build=parallel_build + ) return cpresults, best @@ -401,9 +415,9 @@ def create_dispatch_func(g_var: str, func: tir.PrimFunc, refactored_funcs: List[ with ib.if_scope(syb > last_range): ib.emit(tvm.tir.Call(None, g_var, _invoke_params)) stmt = ib.get() - dispatch_func = tvm.tir.PrimFunc(params, stmt, ret_type, buffer_map, attrs).with_attrs( - {"tir.is_global_func": True, "global_symbol": global_symbol} - ) + dispatch_func = tvm.tir.PrimFunc( + params, stmt, ret_type, buffer_map, attrs + ).with_attrs({"tir.is_global_func": True, "global_symbol": global_symbol}) return dispatch_func @@ -421,7 +435,9 @@ def create_dispatch_mod( global_symbol = g_var_supply.fresh_global(global_symbol, add_prefix=False) 
dispatch_mod[global_symbol] = device_func refactored_funcs.append((global_symbol, device_func)) - dispatch_func = create_dispatch_func(g_var, original_func, refactored_funcs=refactored_funcs) + dispatch_func = create_dispatch_func( + g_var, original_func, refactored_funcs=refactored_funcs + ) dispatch_mod.update(tvm.IRModule.from_expr(dispatch_func)) return dispatch_mod @@ -448,15 +464,21 @@ def fast_tune_with_dynamic_range( if axis.name in dynamic_range: opt_shapes[axis.name] = dynamic_range[axis.name] else: - raise ValueError(f"[FastDlight] The axis {axis.name} is not in dynamic_range") + raise ValueError( + f"[FastDlight] The axis {axis.name} is not in dynamic_range" + ) func = func.with_attr("opt_shapes", opt_shapes) if "opt_shapes" not in func.attrs: - print("[FastDlight] The primfunc has no opt_shapes, please set opt_shapes for the primfunc") + print( + "[FastDlight] The primfunc has no opt_shapes, please set opt_shapes for the primfunc" + ) return None else: # should be list value - if not all([isinstance(v, tvm.ir.Array) for v in func.attrs["opt_shapes"].values()]): + if not all( + [isinstance(v, tvm.ir.Array) for v in func.attrs["opt_shapes"].values()] + ): print("[FastDlight] The opt_shapes should be list value") return None @@ -467,7 +489,9 @@ def fast_tune_with_dynamic_range( product_list = list(itertools.product(*(opt_shapes[key] for key in opt_shapes))) # Convert the Cartesian product to a list of dictionaries - specialize_items: List[Dict] = [dict(zip(opt_shapes.keys(), values)) for values in product_list] + specialize_items: List[Dict] = [ + dict(zip(opt_shapes.keys(), values)) for values in product_list + ] specilized_tuned_funcs: List[tir.PrimFunc] = [] for item in specialize_items: diff --git a/python/bitblas/gpu/intrin/lop3.py b/python/bitblas/gpu/intrin/lop3.py index db49bdfcc6..65d9a1b389 100644 --- a/python/bitblas/gpu/intrin/lop3.py +++ b/python/bitblas/gpu/intrin/lop3.py @@ -82,22 +82,25 @@ """ decode_i4s_to_i8s = """template -__device__ void decode_i4s_to_i8s(T1 *_i4s, T2 *_i8s, const int N = 8) +__device__ void decode_i4s_to_i8s(T1 *_i4s, T2 *_i8s, const int N = 16) { uint *i8s = reinterpret_cast(_i8s); - uint i4s = *_i4s; + uint *i4s = reinterpret_cast(_i4s); // First, we extract the i4s and construct an intermediate fp16 number. 
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4 static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 - #pragma unroll - for (int i = 0; i < (N / 4); i++) + for (int i = 0; i < (N / 8); i++) { // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" : "=r"(i8s[i]) - : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + : "r"(i4s[0] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" + : "=r"(i8s[i + 2]) + : "r"(i4s[1] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); } } """ @@ -207,6 +210,14 @@ def fast_decode_impl(compressed: T.handle, decompressed: T.handle) -> None: ), ) +LOP3_FAST_DECODE_INT4_TO_INT8_L16_INTRIN = "lop3_fast_decode_i4_to_i8_l16_" +TensorIntrin.register( + LOP3_FAST_DECODE_INT4_TO_INT8_L16_INTRIN, + *get_fast_decode_intrin( + storage_nbit=4, storage_dtype="int8", target_dtype="int8", loops_extent=16 + ), +) + LOP3_FAST_DECODE_INT2_TO_INT8_L16_INTRIN = "lop3_fast_decode_i2_to_i8_l16_" TensorIntrin.register( @@ -260,7 +271,7 @@ def get_lop3_intrin_group( dtype_mapping = {"float16": "f16", "int8": "i8", "int32": "i32"} target_dtype = dtype_mapping[out_dtype] target_bits = tvm.DataType(out_dtype).bits - loop_extent = min(128 // target_bits, 32 // storage_nbit) + loop_extent = 128 // target_bits _intrin = f"lop3_fast_decode_i{storage_nbit}_to_{target_dtype}_l{loop_extent}_" import_c_map = { "i4_to_f16": decode_i4_to_f16, diff --git a/python/bitblas/gpu/matmul_analysis.py b/python/bitblas/gpu/matmul_analysis.py index 7af32a71bd..d967899f70 100644 --- a/python/bitblas/gpu/matmul_analysis.py +++ b/python/bitblas/gpu/matmul_analysis.py @@ -19,8 +19,7 @@ from dataclasses import dataclass from enum import Enum from typing import List, Optional, Set, Union, Tuple, Dict - -from tvm import tir +from tvm import tir, DataType from tvm.ir import Range from tvm.tir import IterVar, PrimExpr, Var from tvm.tir.analysis import undefined_vars @@ -420,6 +419,22 @@ def get_access_vars(region: List[Range]) -> List[Var]: return is_identity, is_transpose +def get_coalesced_veclen(block_stmt: tir.Block, target_bits: int = 128) -> int: + # gpu memory prefer 128 bits coalesced access (e.g. 
four banks) + # 128 bits + block_stmt + buffers: List[tir.Buffer] = [] + for read in block_stmt.reads: + buffers.append(read.buffer) + for write in block_stmt.writes: + buffers.append(write.buffer) + # pick the dtype with the largest bits + max_dtype_bits: int = 0 + for buffer in buffers: + max_dtype_bits = max(max_dtype_bits, DataType(buffer.dtype).bits) + return target_bits // max_dtype_bits + + def is_identity_block(block_stmt: tir.Block) -> bool: return is_identity_or_transpose_block(block_stmt)[0] diff --git a/python/bitblas/gpu/matmul_mma.py b/python/bitblas/gpu/matmul_mma.py index a0c4ecca21..35fc5aeabe 100644 --- a/python/bitblas/gpu/matmul_mma.py +++ b/python/bitblas/gpu/matmul_mma.py @@ -37,6 +37,7 @@ get_dequantize_block, normalize_to_matmul, get_propagate_map, + get_coalesced_veclen ) @@ -394,7 +395,7 @@ def check_has_dynamic(func: tir.PrimFunc): cache_write_required = check_require_cache(func) - shared_scope = "shared" + shared_scope = config.shared_scope intrin_info = config.intrin_info intrin_group = get_mma_intrin_group( @@ -621,19 +622,19 @@ def inverse_permutation(i, j, ii, jj): # split the store loop to match hardware intrinsic pattern i, j = sch.get_loops(store)[-2:] - i0, i1 = sch.split(i, factors=[None, micro_size_x]) - j0, j1 = sch.split(j, factors=[None, micro_size_y]) + i0, i1 = sch.split(i, factors=[None, micro_size_x], preserve_unit_iters=False) + j0, j1 = sch.split(j, factors=[None, micro_size_y], preserve_unit_iters=False) sch.reorder(i0, j0, i1, j1) if cache_write_required: auto_inline_consumer_chain(sch, accumulator_shared_to_global) sch.reverse_compute_at( - accumulator_shared_to_global, sch.get_loops(store)[-3], preserve_unit_loops=True + accumulator_shared_to_global, sch.get_loops(store)[-5], preserve_unit_loops=True ) - + vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global)) fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) f0, f1, f2 = sch.split( - fused, factors=[None, warp_size, max(list(config.vectorize.values()))] + fused, factors=[None, warp_size, vec_len] ) sch.bind(f1, "threadIdx.x") sch.vectorize(f2) diff --git a/python/bitblas/gpu/matmul_mma_dequantize.py b/python/bitblas/gpu/matmul_mma_dequantize.py index 3f902a7ac6..0c03ae977f 100644 --- a/python/bitblas/gpu/matmul_mma_dequantize.py +++ b/python/bitblas/gpu/matmul_mma_dequantize.py @@ -29,6 +29,7 @@ auto_inline_producers, get_reduction_blocks, get_dequantize_block, + get_coalesced_veclen, normalize_to_matmul, ) @@ -152,7 +153,9 @@ def check_b_decode_info(B_decode_info): conditions = [] # check source format in ["int", "fp", "af"] conditions.append("source_format" in B_decode_info) - conditions.append(B_decode_info["source_format"]["format"] in ["int", "fp", "af"]) + conditions.append( + B_decode_info["source_format"]["format"] in ["int", "fp", "af"] + ) # check source bits in [1, 2, 4, 8] conditions.append(B_decode_info["source_format"]["bits"] in [1, 2, 4, 8]) # check target format in ["float16", "int8"] @@ -190,7 +193,7 @@ def check_b_decode_info(B_decode_info): stage = config.pipeline_stage use_async = config.use_async chunk = config.rstep[0] - + micro_size_x, micro_size_y, micro_size_k = intrin_group["micro_kernel"] # get the axis for layout transform @@ -202,7 +205,7 @@ def get_axis(l, r, trans): def can_enable_swizzle(dtype: str, smooth: bool): # inject_permuted_layout only support float16 currently - if dtype == "float16": + if dtype == "float16" or dtype == "int8": # if we use smooth layout, we don't need to do swizzling return not smooth return False 
@@ -225,7 +228,10 @@ def can_enable_swizzle(dtype: str, smooth: bool): k_pad_factor = k_factors[1] # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] - if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + if not ( + func.attrs is not None + and "dlight.tensorcore_prenormlized" in func.attrs.keys() + ): sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) # Step 2. Padding for dynamic shape kernels @@ -276,10 +282,10 @@ def can_enable_swizzle(dtype: str, smooth: bool): # TODO(lei): this is a trick for rasterization implementation # is not optimal. # require a solution for general block rasterization - factor = 8 # should be divisible by block_idy - if sch.get(block_idx).extent.value % factor == 0: - block_k, block_idx = sch.split(block_idx, factors=[None, factor]) - sch.bind(block_k, "blockIdx.z") + # factor = 8 # should be divisible by block_idy + # if sch.get(block_idx).extent.value % factor == 0: + # block_k, block_idx = sch.split(block_idx, factors=[None, factor]) + # sch.bind(block_k, "blockIdx.z") else: sch.bind(batch, "blockIdx.z") @@ -303,7 +309,9 @@ def smooth_layout_recover(block, scope, l=16, r=16, enable=True): ), ) - smooth_layout_recover(block_outer, ("read", 0), *a_lr, enable=intrin_info.smooth_a) + smooth_layout_recover( + block_outer, ("read", 0), *a_lr, enable=intrin_info.smooth_a + ) smooth_layout_recover( block_outer, ("read", 1), @@ -319,14 +327,14 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) f_0, f_1, f_2, f_3, f_4 = sch.split( - fused, factors=[num_ty, num_tz, None, warp_size, vec_len] + fused, factors=[None, num_ty, num_tz, warp_size, vec_len] ) sch.bind(f_3, "threadIdx.x") - sch.bind(f_1, "threadIdx.z") - sch.bind(f_0, "threadIdx.y") + sch.bind(f_2, "threadIdx.z") + sch.bind(f_1, "threadIdx.y") sch.vectorize(f_4) - sch.unroll(f_2) + sch.unroll(f_0) # Apply Swizzling sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) # if not, apply padding to alleviate bank conflict @@ -353,7 +361,10 @@ def decode_fetch_to_shared(block, idx): sch.compute_at(block_shared, k0, preserve_unit_loops=True) # TODO(lei): the factor shoule be analyzed more deeper. - _, B_shared_vi, _ = sch.split(sch.get_loops(block_shared)[-1], factors=[None, 1, 8]) + decode_factor = 8 if B_decode_info["target_format"] == "float16" else 16 + _, B_shared_vi, _ = sch.split( + sch.get_loops(block_shared)[-1], factors=[None, 1, decode_factor] + ) block_shared_local = sch.cache_read(block_shared, 0, "local") # global -> dequantzed_local -> shared # step2. 
inline to local block @@ -370,23 +381,33 @@ def get_idx(): b_idx = get_idx() # global -> prefetch_local -> dequantzed_local -> shared - block_shared_local_local = sch.cache_read(block_shared_local, b_idx, "local") + block_shared_local_local = sch.cache_read( + block_shared_local, b_idx, "local" + ) # global -> prefetch_shared -> vector load -> dequantzed_local -> shared block_shared_local_local_shared = sch.cache_read( block_shared_local_local, 0, shared_scope ) sch.compute_at(block_shared_local, B_shared_vi, preserve_unit_loops=True) - sch.compute_at(block_shared_local_local, B_shared_vi, preserve_unit_loops=True) + sch.compute_at( + block_shared_local_local, B_shared_vi, preserve_unit_loops=True + ) dequantize_block = block_shared_local # fast type conversion if "fast_decoding" in B_decode_info and B_decode_info["fast_decoding"]: + storage_nbits = B_decode_info["source_format"]["bits"] + out_dtype = B_decode_info["target_format"] intrin_group = get_lop3_intrin_group( - in_dtype="int8", out_dtype="float16", storage_nbit=4, with_scale=False + in_dtype="int8", out_dtype=out_dtype, storage_nbit=storage_nbits + ) + sch.tensorize( + sch.get_loops(dequantize_block)[-1], intrin_group["compute"] ) - sch.tensorize(sch.get_loops(dequantize_block)[-1], intrin_group["compute"]) sch.annotate( - thread_idz, ann_key="pragma_import_c", ann_val=intrin_group["c_source"] + thread_idz, + ann_key="pragma_import_c", + ann_val=intrin_group["c_source"], ) sch.annotate(block_shared, ann_key="permuted_layout", ann_val=can_swizzle_b) @@ -397,19 +418,24 @@ def get_idx(): ) if not (can_swizzle_b or intrin_info.smooth_b): pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 - sch.storage_align(block_shared, 0, axis=-2, factor=16, offset=pad_offset) + sch.storage_align( + block_shared, 0, axis=-2, factor=16, offset=pad_offset + ) sch.bind(B_shared_tx, "threadIdx.x") sch.bind(B_shared_ty, "threadIdx.y") sch.bind(B_shared_tz, "threadIdx.z") sch.vectorize(sch.get_loops(block_shared)[-1]) sch.vectorize(sch.get_loops(block_shared_local_local)[-1]) - sch.compute_at(block_shared_local_local_shared, k0, preserve_unit_loops=True) + sch.compute_at( + block_shared_local_local_shared, k0, preserve_unit_loops=True + ) ndim = len(sch.get(block_shared_local_local_shared).iter_vars) fused = sch.fuse(*sch.get_loops(block_shared_local_local_shared)[-ndim:]) f_0, f_1, f_2, f_3, f_4 = sch.split( - fused, factors=[None, num_tz, num_ty, warp_size, 16] # int8x16 = 128bits + fused, + factors=[None, num_tz, num_ty, warp_size, 16], # int8x16 = 128bits ) sch.bind(f_3, "threadIdx.x") @@ -453,13 +479,13 @@ def get_idx(): if cache_write_required: auto_inline_consumer_chain(sch, accumulator_shared_to_global) sch.reverse_compute_at( - accumulator_shared_to_global, sch.get_loops(store)[-3], preserve_unit_loops=True + accumulator_shared_to_global, + sch.get_loops(store)[-5], + preserve_unit_loops=True, ) - + vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global)) fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) - f0, f1, f2 = sch.split( - fused, factors=[None, warp_size, max(list(config.vectorize.values()))] - ) + f0, f1, f2 = sch.split(fused, factors=[None, warp_size, vec_len]) sch.bind(f1, "threadIdx.x") sch.vectorize(f2) sch.unroll(f0) @@ -511,7 +537,9 @@ def tensorize_init_store_compute(): if stage > 1: sch.annotate( - k0, ann_key="software_pipeline_stage", ann_val=[0, 0, stage - 1, stage - 1] + k0, + ann_key="software_pipeline_stage", + ann_val=[0, 0, stage - 1, stage - 1], ) sch.annotate(k0, 
ann_key="software_pipeline_order", ann_val=[0, 1, 2, 3]) if use_async: diff --git a/python/bitblas/ops/matmul_impl.py b/python/bitblas/ops/matmul_impl.py index 71884f51cd..fee9219189 100644 --- a/python/bitblas/ops/matmul_impl.py +++ b/python/bitblas/ops/matmul_impl.py @@ -71,6 +71,7 @@ def main(a: T.handle, b: T.handle, c: T.handle): return MatmulNN + def matmul_nt(M, N, K, in_dtype="float16", out_dtype="float16"): @tvm.script.ir_module class MatmulNT: @@ -92,6 +93,127 @@ def main(a: T.handle, b: T.handle, c: T.handle): return MatmulNT + +def matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma( + M, N, K, in_dtype="int8", out_dtype="int32" +): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr( + { + "global_symbol": "main", + "tir.noalias": True, + "smooth_a": True, + "smooth_b": True, + } + ) + A = T.match_buffer(a, [M // wm, K // wk, wm, wk], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.match_buffer(c, [M, N], dtype=out_dtype) + A_reindex = T.alloc_buffer([M, K], dtype=in_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + + for i, k in T.grid(M, K): + with T.block("A_reindex"): + vi, vk = T.axis.remap("SS", [i, k]) + A_reindex[vi, vk] = A[ + vi // wn, + vk // wk, + vi % wn % 8 * 2 + vk % wk // 16, + vi % wn // 8 * 16 + vk % 16, + ] + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[ + vj // wn, + vk // wk, + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, + vj % 8 // 4 * 16 + vk % 16, + ] + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A_reindex[vi, vk].astype( + out_dtype + ) * B_reindex[vj, vk].astype(out_dtype) + + return MyModule + + +def matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma_cast_s8( + M, N, K, in_dtype="int8", out_dtype="int32" +): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr( + { + "global_symbol": "main", + "tir.noalias": True, + "smooth_a": True, + "smooth_b": True, + } + ) + A = T.match_buffer(a, [M // wm, K // wk, wm, wk], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.alloc_buffer([M, N], dtype=out_dtype) + A_reindex = T.alloc_buffer([M, K], dtype=in_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + D = T.match_buffer(c, [M, N], dtype="int8") + + for i, k in T.grid(M, K): + with T.block("A_reindex"): + vi, vk = T.axis.remap("SS", [i, k]) + A_reindex[vi, vk] = A[ + vi // wn, + vk // wk, + vi % wn % 8 * 2 + vk % wk // 16, + vi % wn // 8 * 16 + vk % 16, + ] + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[ + vj // wn, + vk // wk, + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, + vj % 8 // 4 * 16 + vk % 16, + ] + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A_reindex[vi, vk].astype( + out_dtype + ) * B_reindex[vj, vk].astype(out_dtype) + + for i, j in T.grid(M, N): + with T.block(""): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = C[vi, 
vj].astype("int8") + + return MyModule + + def matmul_nt_propagate_b_s8_s8_s32_mma(M, N, K, in_dtype="int8", out_dtype="int32"): wm, wn, wk = 16, 16, 16 if in_dtype == "int8": @@ -115,8 +237,50 @@ def main(a: T.handle, b: T.handle, c: T.handle): B_reindex[vj, vk] = B[ vj // wn, vk // wk, - vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, - vj % 8 // 4 * 16 + vk % 16 + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, + vj % 8 // 4 * 16 + vk % 16, + ] + + for i, j, k in T.grid(M, N, K): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = tvm.tir.const(0, out_dtype) + C[vi, vj] = C[vi, vj] + A[vi, vk].astype(out_dtype) * B_reindex[ + vj, vk + ].astype(out_dtype) + + return MyModule + + +def matmul_nt_propagate_b_s8_s8_s32_cast_s8_mma( + M, N, K, in_dtype="int8", out_dtype="int32" +): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + + @tvm.script.ir_module + class MyModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle): + T.func_attr( + {"global_symbol": "main", "tir.noalias": True, "smooth_b": True} + ) + A = T.match_buffer(a, [M, K], dtype=in_dtype) + B = T.match_buffer(b, [N // wn, K // wk, wn, wk], dtype=in_dtype) + C = T.alloc_buffer([M, N], dtype=out_dtype) + B_reindex = T.alloc_buffer([N, K], dtype=in_dtype) + D = T.match_buffer(c, [M, N], dtype="int8") + + for j, k in T.grid(N, K): + with T.block("B_reindex"): + vj, vk = T.axis.remap("SS", [j, k]) + B_reindex[vj, vk] = B[ + vj // wn, + vk // wk, + vj % wn // 8 * 8 + vj % 4 * 2 + vk % wk // 16, + vj % 8 // 4 * 16 + vk % 16, ] for i, j, k in T.grid(M, N, K): @@ -128,9 +292,17 @@ def main(a: T.handle, b: T.handle, c: T.handle): vj, vk ].astype(out_dtype) + for i, j in T.grid(M, N): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = C[vi, vj].astype("int8") + return MyModule -def matmul_nt_propagate_b_f16_f16_f16_mma(M, N, K, in_dtype="float16", out_dtype="float16"): + +def matmul_nt_propagate_b_f16_f16_f16_mma( + M, N, K, in_dtype="float16", out_dtype="float16" +): wm, wn, wk = 16, 16, 16 if in_dtype == "int8": wm, wn, wk = 16, 16, 32 @@ -168,6 +340,7 @@ def main(a: T.handle, b: T.handle, c: T.handle): return MyModule + def matmul_nt_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): wm, wn, wk = 16, 16, 16 if in_dtype == "int8": @@ -213,151 +386,170 @@ def main(a: T.handle, b: T.handle, c: T.handle): return MyModule -def matmul_nt_i4(M, N, K, in_dtype="float16", out_dtype="float16"): - bit = 4 +def matmul_nt_dequantize_b(M, N, K, in_dtype="float16", out_dtype="float16", bit=4): n_float_per_i8 = 8 // bit - def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + def _tir_u8_to_int_to_float( + nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str + ): assert val.dtype == "int8" mask = tvm.tir.const((1 << nbit) - 1, "int8") return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) - - A = te.placeholder((M, K), name='A', dtype=in_dtype) - B = te.placeholder((N, K // 8 * bit), name='B', dtype='int8') - + + A = te.placeholder((M, K), name="A", dtype=in_dtype) + B = te.placeholder((N, K // 8 * bit), name="B", dtype="int8") + def decode_func(n, k): - w = _tir_u8_to_int_to_float(bit, B[n, k // n_float_per_i8], k % n_float_per_i8, dtype=in_dtype) + w = _tir_u8_to_int_to_float( + bit, B[n, k // n_float_per_i8], k % n_float_per_i8, dtype=in_dtype + ) return w - B_decode = te.compute( - (N, K), - decode_func, - name='B_decode' - ) + B_decode = te.compute((N, 
K), decode_func, name="B_decode") # Describe the matrix multiplication in TE - k = te.reduce_axis((0, K), name='k') + k = te.reduce_axis((0, K), name="k") C = te.compute( (M, N), - lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), - name='C' + lambda i, j: te.sum( + A[i, k].astype(out_dtype) * B_decode[j, k].astype(out_dtype), axis=k + ), + name="C", + ) + func = te.create_prim_func([A, B, C]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "fast_decoding": True, + "source_format": { + "bits": bit, + "format": "int", + }, + "target_format": in_dtype, + } + }, ) - func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { - 'B': { - 'decode_block': 'B_decode', - 'fast_decoding': True, - 'source_format':{ - 'bits': 4, - 'format': 'int', - }, - 'target_format': "float16" - } - }) return tvm.IRModule.from_expr(func) -def matmul_nt_i4_propagate_b(M, N, K, in_dtype="float16", out_dtype="float16"): - bit = 4 +def matmul_nt_dequantize_b_propagate_b(M, N, K, in_dtype="float16", out_dtype="float16", cast_dtype="float16", bit=4, fast_decoding=False): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 + n_float_per_i8 = 8 // bit - def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + def _tir_u8_to_int_to_float( + nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str + ): assert val.dtype == "int8" mask = tvm.tir.const((1 << nbit) - 1, "int8") return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) - - A = te.placeholder((M, K), name='A', dtype=in_dtype) - B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') - + + A = te.placeholder((M, K), name="A", dtype=in_dtype) + B = te.placeholder((N // wn, K // wk, wn, wk // 8 * bit), name="B", dtype="int8") + def decode_func(n, k, nn, kk): - w = _tir_u8_to_int_to_float(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype) + w = _tir_u8_to_int_to_float( + bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype + ) return w - B_decode = te.compute( - (N // 16, K // 16, 16, 16), - decode_func, - name='B_decode' - ) - + B_decode = te.compute((N // wn, K // wk, wn, wk), decode_func, name="B_decode") + B_reindex = te.compute( (N, K), - lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], - name="B_reindex" + lambda i, j: B_decode[i // wn, j // wk, i % wn, j % wk], + name="B_reindex", ) # Describe the matrix multiplication in TE - k = te.reduce_axis((0, K), name='k') + k = te.reduce_axis((0, K), name="k") C = te.compute( (M, N), - lambda i, j: te.sum(A[i, k] * B_reindex[j, k], axis=k), - name='C' + lambda i, j: te.sum( + A[i, k].astype(out_dtype) * B_reindex[j, k].astype(out_dtype), axis=k + ), + name="C", + ) + D = te.compute( + (M, N), lambda i, j: C[i, j].astype(cast_dtype), name="D" + ) + func = te.create_prim_func([A, B, D]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "fast_decoding": fast_decoding, + "source_format": { + "bits": bit, + "format": "int", + }, + "target_format": in_dtype, + } + }, ) - func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { - 'B': { - 'decode_block': 'B_decode', - 'fast_decoding': True, - 'source_format':{ - 'bits': 4, - 'format': 'int', - }, - 'target_format': "float16" - } - }) func = func.with_attr("smooth_b", True) return tvm.IRModule.from_expr(func) +def matmul_nt_dequantize_b_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16", cast_dtype="float16", bit=4, 
fast_decoding=False): + wm, wn, wk = 16, 16, 16 + if in_dtype == "int8": + wm, wn, wk = 16, 16, 32 -def matmul_nt_i4_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): - bit = 4 n_float_per_i8 = 8 // bit - def _tir_u8_to_int_to_float(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str): + def _tir_u8_to_int_to_float( + nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr, dtype: str + ): assert val.dtype == "int8" mask = tvm.tir.const((1 << nbit) - 1, "int8") return ((val >> (pos * nbit).astype("int8")) & mask).astype(dtype) - - A = te.placeholder((M // 16, K // 16, 16, 16), name='A', dtype=in_dtype) - B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') - + + A = te.placeholder((M // wm, K // wk, wm, wk), name="A", dtype=in_dtype) + B = te.placeholder((N // wn, K // wk, wn, wk // 8 * bit), name="B", dtype="int8") + def decode_func(n, k, nn, kk): - w = _tir_u8_to_int_to_float(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype) + w = _tir_u8_to_int_to_float( + bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8, dtype=in_dtype + ) return w - B_decode = te.compute( - (N // 16, K // 16, 16, 16), - decode_func, - name='B_decode' - ) - + B_decode = te.compute((N // wn, K // wk, wn, wk), decode_func, name="B_decode") + B_reindex = te.compute( (N, K), - lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], - name="B_reindex" + lambda i, j: B_decode[i // wn, j // wk, i % wn, j % wk], + name="B_reindex", ) - + A_reindex = te.compute( - (M, K), - lambda i, j: A[i // 16, j // 16, i % 16, j % 16], - name="A_reindex" + (M, K), lambda i, j: A[i // wm, j // wk, i % wm, j % wk], name="A_reindex" ) # Describe the matrix multiplication in TE - k = te.reduce_axis((0, K), name='k') + k = te.reduce_axis((0, K), name="k") C = te.compute( - (M, N), - lambda i, j: te.sum(A_reindex[i, k] * B_reindex[j, k], axis=k), - name='C' + (M, N), lambda i, j: te.sum(A_reindex[i, k].astype(out_dtype) * B_reindex[j, k].astype(out_dtype), axis=k), name="C" + ) + D = te.compute( + (M, N), lambda i, j: C[i, j].astype(cast_dtype), name="D" + ) + func = te.create_prim_func([A, B, D]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "fast_decoding": fast_decoding, + "source_format": { + "bits": bit, + "format": "int", + }, + "target_format": in_dtype, + } + }, ) - func = te.create_prim_func([A, B, C]).with_attr("dequantize_info", { - 'B': { - 'decode_block': 'B_decode', - 'fast_decoding': True, - 'source_format':{ - 'bits': 4, - 'format': 'int', - }, - 'target_format': "float16" - } - }) func = func.with_attr("smooth_a", True) func = func.with_attr("smooth_b", True) @@ -372,41 +564,38 @@ def _tir_u8_to_int(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr): assert val.dtype == "int8" mask = tvm.tir.const((1 << nbit) - 1, "int8") return (val >> (pos * nbit).astype("int8")) & mask - - A = te.placeholder((M, K), name='A', dtype=in_dtype) - B = te.placeholder((N, K // 8 * bit), name='B', dtype='int8') - LUT = te.placeholder((1 << bit, ), name='LUT', dtype='float16') + A = te.placeholder((M, K), name="A", dtype=in_dtype) + B = te.placeholder((N, K // 8 * bit), name="B", dtype="int8") + LUT = te.placeholder((1 << bit,), name="LUT", dtype="float16") def decode_func(n, k): w = _tir_u8_to_int(bit, B[n, k // n_float_per_i8], k % n_float_per_i8) return LUT[w] - B_decode = te.compute( - (N, K), - decode_func, - name='B_decode' - ) + B_decode = te.compute((N, K), decode_func, name="B_decode") # Describe the matrix 
multiplication in TE - k = te.reduce_axis((0, K), name='k') + k = te.reduce_axis((0, K), name="k") C = te.compute( - (M, N), - lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), - name='C' + (M, N), lambda i, j: te.sum(A[i, k] * B_decode[j, k], axis=k), name="C" + ) + func = te.create_prim_func([A, B, LUT, C]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "source_format": { + "bits": 4, + "format": "af", + }, + "target_format": "float16", + } + }, ) - func = te.create_prim_func([A, B, LUT, C]).with_attr("dequantize_info", { - 'B': { - 'decode_block': 'B_decode', - 'source_format':{ - 'bits': 4, - 'format': 'af', - }, - 'target_format': "float16" - } - }) return tvm.IRModule.from_expr(func) + def matmul_nt_af4_propagate_a_b(M, N, K, in_dtype="float16", out_dtype="float16"): bit = 4 n_float_per_i8 = 8 // bit @@ -415,62 +604,57 @@ def _tir_u8_to_int(nbit: int, val: tvm.tir.PrimExpr, pos: tvm.tir.PrimExpr): assert val.dtype == "int8" mask = tvm.tir.const((1 << nbit) - 1, "int8") return (val >> (pos * nbit).astype("int8")) & mask - - A = te.placeholder((M // 16, K // 16, 16, 16), name='A', dtype=in_dtype) - B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name='B', dtype='int8') - LUT = te.placeholder((1 << bit, ), name='LUT', dtype='float16') + + A = te.placeholder((M // 16, K // 16, 16, 16), name="A", dtype=in_dtype) + B = te.placeholder((N // 16, K // 16, 16, 16 // 8 * bit), name="B", dtype="int8") + LUT = te.placeholder((1 << bit,), name="LUT", dtype="float16") def decode_func(n, k, nn, kk): w = _tir_u8_to_int(bit, B[n, k, nn, kk // n_float_per_i8], kk % n_float_per_i8) return LUT[w] - B_decode = te.compute( - (N // 16, K // 16, 16, 16), - decode_func, - name='B_decode' - ) - + B_decode = te.compute((N // 16, K // 16, 16, 16), decode_func, name="B_decode") + B_reindex = te.compute( (N, K), lambda i, j: B_decode[i // 16, j // 16, i % 16, j % 16], - name="B_reindex" + name="B_reindex", ) - + A_reindex = te.compute( - (M, K), - lambda i, j: A[i // 16, j // 16, i % 16, j % 16], - name="A_reindex" + (M, K), lambda i, j: A[i // 16, j // 16, i % 16, j % 16], name="A_reindex" ) # Describe the matrix multiplication in TE - k = te.reduce_axis((0, K), name='k') + k = te.reduce_axis((0, K), name="k") C = te.compute( - (M, N), - lambda i, j: te.sum(A_reindex[i, k] * B_reindex[j, k], axis=k), - name='C' + (M, N), lambda i, j: te.sum(A_reindex[i, k] * B_reindex[j, k], axis=k), name="C" + ) + func = te.create_prim_func([A, B, LUT, C]).with_attr( + "dequantize_info", + { + "B": { + "decode_block": "B_decode", + "source_format": { + "bits": 4, + "format": "af", + }, + "target_format": "float16", + } + }, ) - func = te.create_prim_func([A, B, LUT, C]).with_attr("dequantize_info", { - 'B': { - 'decode_block': 'B_decode', - 'source_format':{ - 'bits': 4, - 'format': 'af', - }, - 'target_format': "float16" - } - }) func = func.with_attr("smooth_a", True) func = func.with_attr("smooth_b", True) return tvm.IRModule.from_expr(func) -# register the func +# register the func matmul_impl_factory = { - 'matmul_nt': matmul_nt, - 'matmul_nt_dyn_m': matmul_nt_dyn_m, - 'matmul_nn': matmul_nn, - 'matmul_nn_dyn_m': matmul_nn_dyn_m, - 'matmul_nt_propagate_b_f16_f16_mma': matmul_nt_propagate_b_f16_f16_f16_mma, - 'matmul_nt_propagate_a_b': matmul_nt_propagate_a_b, - 'matmul_nt_propagate_a_b_f16_f16_mma': matmul_nt_propagate_a_b, -} \ No newline at end of file + "matmul_nt": matmul_nt, + "matmul_nt_dyn_m": matmul_nt_dyn_m, + "matmul_nn": matmul_nn, + "matmul_nn_dyn_m": matmul_nn_dyn_m, 
+ "matmul_nt_propagate_b_f16_f16_mma": matmul_nt_propagate_b_f16_f16_f16_mma, + "matmul_nt_propagate_a_b": matmul_nt_propagate_a_b, + "matmul_nt_propagate_a_b_f16_f16_mma": matmul_nt_propagate_a_b, +} diff --git a/run.log b/run.log new file mode 100644 index 0000000000..58ae899a50 --- /dev/null +++ b/run.log @@ -0,0 +1,4573 @@ +[FastDlight] Apply config {'block': [128, 64], 'warp': [64, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [128, 32], 'warp': [64, 16], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [256, 64], 'warp': [128, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [64, 128], 'warp': [32, 64], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [64, 64], 'warp': [32, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [128, 128], 'warp': [64, 64], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [64, 32], 'warp': [32, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [128, 16], 'warp': [32, 16], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [256, 32], 'warp': [128, 16], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [256, 128], 'warp': [128, 64], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [64, 16], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [32, 128], 'warp': [16, 64], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [64, 256], 'warp': [32, 128], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [32, 64], 'warp': [16, 32], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [128, 256], 'warp': [64, 128], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [32, 32], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [256, 16], 'warp': [64, 16], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [16, 128], 'warp': [16, 32], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [32, 256], 'warp': [16, 128], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Apply config {'block': [16, 64], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File 
"/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + 
using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[75776]; + int C_reindex_shared_warp[128]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[128]; + signed char B_reindex_reindex_shared_warp[32]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[128]; + signed char B_reindex_reindex_shared_warp_1[32]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 8; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 2; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 16) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 8; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 6144)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 6144))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; 
++ax0_ax1_ax2_ax3_fused_0) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 2097152) + (((int)blockIdx.x) * 262144)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 8; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 6144)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 6144))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.z) * 2097152) + (((int)blockIdx.x) * 262144)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 
256))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 2; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 1024) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 8; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 6144)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 6144)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 2; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 2048) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 2048) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 8; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 2; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + 
(ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 2; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 1024)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 8; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 22528)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 22528)])) + 0)) + ); +#endif + __asm__ __volatile__( + 
"ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 2; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 2048) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 2048) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 8; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 2; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned 
*)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 8; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 2; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 512)) + (ax1_2 * 256)) + 9728)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[((ax0 * 16) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 4; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 512)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.z) * 32)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 9728)); + } + } + } +} + + +Compilation error: +/tmp/tmph1ixa0cq/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmph1ixa0cq/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x12800 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s 
and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[55296]; + int C_reindex_shared_warp[32]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[64]; + signed char B_reindex_reindex_shared_warp[16]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[64]; + signed char B_reindex_reindex_shared_warp_1[16]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 4; ++ax1_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[(ax1_0_3_init * 8) + i] = 0.0;} +; + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 8; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 22528)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 22528))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + ((((int)threadIdx.x) & 7) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 127; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 8; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 
16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 22528)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 22528))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_1 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + (ax3_0_0 * 128)) + ((((int)threadIdx.x) & 7) * 16)) + 128))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (ax3_0_0 * 512)) + (((int)threadIdx.x) * 16)) + 512))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 2; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 1024) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 4; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 22528)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void 
*)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 22528)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3 = 0; ax1_0_3 < 4; ++ax1_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 
* 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3])); + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 2; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 1024)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 4; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 38912)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 38912)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 4; ++ax1_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), 
"r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3])); + } + } + } + for (int ax0 = 0; ax0 < 4; ++ax0) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 2048) + (ax0 * 512)) + (((int)threadIdx.z) * 256)) + 1536)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[(ax0 * 8) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 256)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + ((((((((int)threadIdx.y) * 2048) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 512)) + (((int)threadIdx.z) * 256)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 128)) + (((int)threadIdx.x) * 4)) + 1536)); + } + } +} + + +Compilation error: +/tmp/tmpx92caiz8/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpx92caiz8/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0xd800 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = 
_driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" 
__global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[69632]; + int C_reindex_shared_warp[128]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[64]; + signed char B_reindex_reindex_shared_warp[64]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[64]; + signed char B_reindex_reindex_shared_warp_1[64]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 4; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 4; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 32) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 4; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void 
*)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 4; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 4; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 2048) + 
(ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 12288)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 12288)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 4; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 4; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 4; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 
16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 4; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 2048)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 20480)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 20480)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + 
); + } + } + for (int ax1_1 = 0; ax1_1 < 4; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 4; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 4; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + 
(((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 4; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 4; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 1024)) + (ax1_2 * 256)) + 7168)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[((ax0 * 32) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 1024)) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.z) * 64)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 7168)); + } + } + } +} + + +Compilation error: +/tmp/tmp6v_wuey1/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmp6v_wuey1/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x11000 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[53248]; + int C_reindex_shared_warp[16]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[32]; + signed char B_reindex_reindex_shared_warp[16]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[32]; + signed char B_reindex_reindex_shared_warp_1[16]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 2; ++ax1_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[(ax1_0_3_init * 8) + i] = 0.0;} +; + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 8; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 20480)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 20480))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2 * 32768)) + ((((int)threadIdx.x) >> 4) * 16384)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 65536)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 63; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 8; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 
+ 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 20480)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + (((((int)threadIdx.x) & 15) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 20480))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2_1 * 32768)) + ((((int)threadIdx.x) >> 4) * 16384)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 65536)) + (ax3_0_0 * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 1024))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 4; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 2048) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 8; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 4096)) + ((((int)threadIdx.x) & 15) * 256)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 20480)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : 
"=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 4096)) + ((((int)threadIdx.x) & 15) * 256)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 20480)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 4096) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 4096) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "r"(((int 
*)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3])); + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 4; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 2048)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 8; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 4096)) + ((((int)threadIdx.x) & 15) * 256)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 36864)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 4096)) + ((((int)threadIdx.x) & 15) * 256)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 36864)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 4096) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 4096) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 2; ++ax1_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned 
*)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3])); + } + } + } + for (int ax0 = 0; ax0 < 2; ++ax0) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 1024) + (ax0 * 512)) + (((int)threadIdx.z) * 256)) + 3072)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[(ax0 * 8) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 4; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 256)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + ((((((((int)threadIdx.y) * 1024) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 512)) + (((int)threadIdx.z) * 256)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 128)) + (((int)threadIdx.x) * 4)) + 3072)); + } + } +} + + +Compilation error: +/tmp/tmpmlxj0z_y/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpmlxj0z_y/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0xd000 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", 
line 294, in build
+ rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
+ File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__
+ raise_last_ffi_error()
+ File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
+ raise py_err
+ File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile
+ ptx = compile_cuda(code, target_format="fatbin")
+ File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda
+ raise RuntimeError(msg)
+RuntimeError: template <typename T1, typename T2>
+__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16)
+{
+ // convert 8 int2b_t to 8 int8b_t -> 2 int32
+ uint *i8s = reinterpret_cast<uint *>(_i8s);
+
+ // i2s = {e7,e6,e5,e4,e3,e2,e1,e0}
+ // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0}
+ uint const i2s = *_i2s;
+
+ // First, we extract the i4s and construct an intermediate fp16 number.
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
+ static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3
+ static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024
+
+#pragma unroll
+ for (int i = 0; i < (N / 2); i++)
+ {
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+ : "=r"(i8s[i])
+ : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut));
+ }
+}
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610)
+#include
+
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST
+
+#undef __DEF_IF_HOST
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) {
+ int ret;
+ asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+ return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) {
+ int ret;
+ asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+ return ret;
+}
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#undef __SM_61_INTRINSICS_DECL__
+
+#endif
+__forceinline__ __device__ unsigned int
+cast_smem_ptr_to_int(const void* const smem_ptr)
+{
+ unsigned int smem_int;
+ asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }"
+ : "=r"(smem_int) : "l"(smem_ptr));
+ return smem_int;
+}
+
+#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
+ (__CUDACC_VER_MAJOR__ > 11))
+#define TVM_ENABLE_L2_PREFETCH 1
+#else
+#define TVM_ENABLE_L2_PREFETCH 0
+#endif
+
+#ifdef _WIN32
+ using uint = unsigned int;
+ using uchar = unsigned char;
+ using ushort = unsigned short;
+ using int64_t = long long;
+ using uint64_t = unsigned long long;
+#else
+ #define uint unsigned int
+ #define uchar unsigned char
+ #define ushort unsigned short
+ #define int64_t long long
+ #define uint64_t unsigned long long
+#endif
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800)
+#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1
+#else
+#define
TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[104448]; + int C_reindex_shared_warp[64]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[128]; + signed char B_reindex_reindex_shared_warp[16]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[128]; + signed char B_reindex_reindex_shared_warp_1[16]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 8; ++ax1_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[(ax1_0_3_init * 8) + i] = 0.0;} +; + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 16; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 8192)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 38912)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 8192)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 38912))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + ((((int)threadIdx.x) & 7) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + 
((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 127; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 16; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 32768) + (((int)threadIdx.y) * 16384)) + (((int)threadIdx.z) * 8192)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 38912)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 32768) + (((int)threadIdx.y) * 16384)) + (((int)threadIdx.z) * 8192)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 38912))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2_1 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + (ax3_0_0 * 128)) + ((((int)threadIdx.x) & 7) * 16)) + 128))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + if (((int)threadIdx.z) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((ax3_0_0 + 1) & 1) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 1048576) + (((int)blockIdx.x) * 131072)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (ax3_0_0 * 512)) + (((int)threadIdx.x) * 16)) + 512))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 2; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 1024) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, 
B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 4; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 8; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 32768) + (((int)threadIdx.y) * 16384)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 38912)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 32768) + (((int)threadIdx.y) * 16384)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 38912)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3 = 0; ax1_0_3 < 8; ++ax1_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, 
%5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3])); + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 2; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 1024)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 2048)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 4; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 8; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 16384) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 71680)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 16384) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 71680)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((int)threadIdx.z) * 2048) + (ax3_0_1_1 * 512)) + 2048)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), 
"=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 8; ++ax1_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3])); + } + } + } + for (int ax0 = 0; ax0 < 8; ++ax0) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 4096) + (ax0 * 512)) + (((int)threadIdx.z) * 256)) + 1536)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[(ax0 * 8) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 16; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 256)) + (((int)blockIdx.x) * 32)) + (((int)threadIdx.z) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + ((((((((int)threadIdx.y) * 4096) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 512)) + (((int)threadIdx.z) * 256)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 128)) + (((int)threadIdx.x) * 4)) + 1536)); + } + } +} + + +Compilation error: +/tmp/tmp_slfyv4d/tvm_kernels.cu(52): warning #177-D: function 
"__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmp_slfyv4d/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x19800 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[118784]; + int C_reindex_shared_warp[256]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[128]; + signed char B_reindex_reindex_shared_warp[64]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[128]; + signed char B_reindex_reindex_shared_warp_1[64]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 8; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 4; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 32) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 8; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 8; 
++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (((int)threadIdx.z) * 4096)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 12288))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (((int)threadIdx.z) * 1048576)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 4; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 2048) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 8; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) 
& 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 12288)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 16384) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 12288)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 4; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 8; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 4; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp 
+ (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 4; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 2048)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 4096)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 8; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 28672)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 28672)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 4; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 4096)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned 
*)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 8; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 4; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 8; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 4; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 1024)) + (ax1_2 * 256)) + 11264)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[((ax0 * 32) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 
8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 2097152)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 1024)) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.z) * 64)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 11264)); + } + } + } +} + + +Compilation error: +/tmp/tmp8_6jny4x/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmp8_6jny4x/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x1d000 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
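+  // Note: despite the fp16 wording in the comment above, this routine builds no
+  // fp16 intermediate. The lop3 immLut 0xEA ((0xF0 & 0xCC) | 0xAA) encodes
+  // (a & b) | c, so each iteration computes i8s[i] = (i2s >> (2*i)) & BOTTOM_MASK,
+  // i.e. it keeps the low 2 bits of every byte of the shifted word.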
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[81920]; + int C_reindex_shared_warp[128]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[32]; + signed char B_reindex_reindex_shared_warp[128]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[32]; + signed char B_reindex_reindex_shared_warp_1[128]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 2; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 8; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 64) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 2; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 2; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0 * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + 
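+    // Main K loop: shared-memory tiles are double-buffered via (ax3_0_0 + 1) & 1,
+    // so the cp.async prefetch of the next A/B tile overlaps with the
+    // decode_i2s_to_i8s / ldmatrix / mma.sync work on the tile fetched in the
+    // previous iteration (cp.async.wait_group 1 keeps one async group in flight).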
#pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 2; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 2; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0_1 * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 8; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + (((((((ax3_0_0 & 1) * 4096) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 8192)); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 16384)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) { + + { + unsigned int addr; +#if 
TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 8; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 16384)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 16384)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 8; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : 
"=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 8; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 12288)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 16384)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 2048) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 4096)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 2048) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 4096)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 8; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 16384)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1_1 * 1024)) + (ax3_0_1_1 * 
512)) + 16384)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 2; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 8; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 2; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 8; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 2048)) + (ax1_2 * 256)) + 8192)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = 
C_reindex_shared_warp[((ax0 * 64) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 16; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 2048)) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.z) * 128)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 8192) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 8192)); + } + } + } +} + + +Compilation error: +/tmp/tmpjic8osyc/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpjic8osyc/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x14000 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[122880]; + int C_reindex_shared_warp[256]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[64]; + signed char B_reindex_reindex_shared_warp[128]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[64]; + signed char B_reindex_reindex_shared_warp_1[128]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 4; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 8; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 64) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 4; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 2; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0 * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + 
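+    // Same double-buffered cp.async pipeline, here for a larger 4x8 warp tile;
+    // the 122880-byte (0x1e000) buf_shmem this candidate requires is what ptxas
+    // rejects below against the 49152-byte (0xc000) static shared-memory limit.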
__syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 4; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 2; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((ax3_0_0 + 1) & 1) * 4096) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((ax3_0_0 + 1) & 1) * 4096) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0_1 * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 8; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 4096) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 24576)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { + + { + unsigned int addr; +#if 
TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 8; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 4; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 8; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 64) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, 
%13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 64) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 8; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 4096)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 24576)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 16384)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 16384)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 8; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 8192) + (ax1_1 * 1024)) + 
(ax3_0_1_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 4; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 8; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 64) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 4; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 8; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 2048)) + (ax1_2 * 256)) + 10240)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + 
(local_id % 2)))] = C_reindex_shared_warp[((ax0 * 64) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 16; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + ((((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 2048)) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.z) * 128)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 16384) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 10240)); + } + } + } +} + + +Compilation error: +/tmp/tmpwy1too3e/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpwy1too3e/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x1e000 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[84992]; + int C_reindex_shared_warp[32]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[64]; + signed char B_reindex_reindex_shared_warp[16]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[64]; + signed char B_reindex_reindex_shared_warp_1[16]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 4; ++ax1_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[(ax1_0_3_init * 8) + i] = 0.0;} +; + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 16; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((int)threadIdx.y) * 8192) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 19456)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((int)threadIdx.y) * 8192) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 19456))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + (((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 1048576)) + (ax0_ax1_ax2_fused_2 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + ((((int)threadIdx.x) & 7) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + if (((int)threadIdx.y) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((int)blockIdx.z) * 524288) + (((int)blockIdx.x) * 65536)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 127; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 16; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 32768) + (((int)threadIdx.y) * 8192)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 
3))) * 16)) + 19456)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 32768) + (((int)threadIdx.y) * 8192)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((ax0_ax1_ax2_fused_2_1 & 3) * 4) + (((int)threadIdx.x) >> 3))) * 16)) + 19456))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + (((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 1048576)) + (ax0_ax1_ax2_fused_2_1 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + (ax3_0_0 * 128)) + ((((int)threadIdx.x) & 7) * 16)) + 128))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + if (((int)threadIdx.y) < 1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((ax3_0_0 + 1) & 1) * 512) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((ax3_0_0 + 1) & 1) * 512) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 524288) + (((int)blockIdx.x) * 65536)) + (ax3_0_0 * 512)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 512))), "n"(16) + ); + } + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((ax3_0_0 & 1) * 512) + (((int)threadIdx.y) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)) + 1024)) = B_reindex_reindex_local[0]; + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 4; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 32768) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 19456)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 32768) + (((int)threadIdx.y) * 8192)) + (ax1_0 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 19456)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + 
unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((ax3_0_1 * 512) + 1024)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((ax3_0_1 * 512) + 1024)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3 = 0; ax1_0_3 < 4; ++ax1_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 8) + 4)))[3])); + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((int)threadIdx.y) * 128) + (((int)threadIdx.x) * 4)) + 512)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((int)threadIdx.y) * 512) + (((int)threadIdx.x) * 16)) + 1024)) = B_reindex_reindex_local_1[0]; + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 4; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = 
static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 52224)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 8192) + (ax1_0_1 * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 52224)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((ax3_0_1_1 * 512) + 1024)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((ax3_0_1_1 * 512) + 1024)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 4; ++ax1_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 0))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax1_0_3_1 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned 
*)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + 8))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 8) + 4)))[3])); + } + } + } + for (int ax0 = 0; ax0 < 4; ++ax0) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[(((((int)threadIdx.y) * 1024) + (ax0 * 256)) + 768)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[(ax0 * 8) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + (((((((((int)blockIdx.y) * 4194304) + (((int)threadIdx.y) * 1048576)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 128)) + (((int)blockIdx.x) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + ((((((int)threadIdx.y) * 1024) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 768)); + } + } +} + + +Compilation error: +/tmp/tmpsits3gou/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpsits3gou/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x14c00 bytes, 0xc000 max) + + +[FastDlight] LocalBuilder: An exception occurred Traceback (most recent call last): + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/exec/popen_worker.py", line 87, in main + result = fn(*args, **kwargs) + File "/home/t-leiwang/ladder_workspace/BitBLAS/python/bitblas/base/utils.py", line 201, in _build + rt_mod = tvm.build(mod["main"], target=arch.target) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/driver/build_module.py", line 294, in build + rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__ + raise_last_ffi_error() + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error + raise py_err + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile + ptx = compile_cuda(code, target_format="fatbin") + File "/home/t-leiwang/mlc_workspace/unity/python/tvm/contrib/nvcc.py", line 128, in compile_cuda + raise RuntimeError(msg) +RuntimeError: template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[90112]; + int C_reindex_shared_warp[64]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[16]; + signed char B_reindex_reindex_shared_warp[128]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[16]; + signed char B_reindex_reindex_shared_warp_1[128]; + for (int var = 0; var < 1; ++var) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 8; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[(ax2_0_3_init * 8) + i] = 0.0;} +; + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 2; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((((int)threadIdx.z) * 8) + (ax0_ax1_ax2_fused_2 * 4)) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((((int)threadIdx.z) * 8) + (ax0_ax1_ax2_fused_2 * 4)) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 524288) + (((int)threadIdx.y) * 262144)) + (((int)threadIdx.z) * 131072)) + (ax0_ax1_ax2_fused_2 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + ((((int)threadIdx.x) & 7) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 4; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((ax0_ax1_ax2_ax3_fused_0 * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0 * 262144)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (((int)threadIdx.x) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 127; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 2; 
++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((((int)threadIdx.z) * 8) + (ax0_ax1_ax2_fused_2_1 * 4)) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 3) * 128)) + (((((int)threadIdx.x) & 7) ^ (((((int)threadIdx.z) * 8) + (ax0_ax1_ax2_fused_2_1 * 4)) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 524288) + (((int)threadIdx.y) * 262144)) + (((int)threadIdx.z) * 131072)) + (ax0_ax1_ax2_fused_2_1 * 65536)) + ((((int)threadIdx.x) >> 3) * 16384)) + (ax3_0_0 * 128)) + ((((int)threadIdx.x) & 7) * 16)) + 128))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 4; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 8192) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 8192) + (ax0_ax1_ax2_ax3_fused_0_1 * 2048)) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)) + 8192))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.z) * 8388608) + (((int)blockIdx.x) * 1048576)) + (ax0_ax1_ax2_ax3_fused_0_1 * 262144)) + (((int)threadIdx.z) * 131072)) + (((int)threadIdx.y) * 65536)) + (ax3_0_0 * 512)) + (((int)threadIdx.x) * 16)) + 512))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 16; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + (((((((ax3_0_0 & 1) * 8192) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 8192)); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 24576)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 4; ++ax3_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((ax3_0_0 & 1) * 
4096) + (((int)threadIdx.y) * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + 0))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + 0))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + 0))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + 0))[3]) + : "r"(addr) + ); + } + for (int ax1 = 0; ax1 < 8; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 16384) + (ax1 * 2048)) + (ax3_0_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 16384) + (ax1 * 2048)) + (ax3_0_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax2_0_3 = 0; ax2_0_3 < 8; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[1]), "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[1]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[2]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + 0))[0]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[1]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[2]), "r"(((unsigned *)(A_reindex_shared_warp + 0))[3]), "r"(((unsigned 
*)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[3])); + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 16; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 16384)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 24576)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 4; ++ax3_0_1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.y) * 2048) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 4096)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.y) * 2048) + ((((int)threadIdx.x) & 15) * 128)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ (((int)threadIdx.x) & 15)) * 16)) + 4096)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[3]) + : "r"(addr) + ); + } + for (int ax1_1 = 0; ax1_1 < 8; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 16384) + (ax1_1 * 2048)) + (ax3_0_1_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 16384) + (ax1_1 * 2048)) + (ax3_0_1_1 * 512)) + 24576)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 8; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[0]), "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[1]), "=r"(((int 
*)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[2]), "=r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[0]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[2]), "r"(((int *)(C_reindex_shared_warp + (ax2_0_3_1 * 8)))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + 0))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax2_0_3_1 * 8) + 4)))[3])); + } + } + } + for (int ax0 = 0; ax0 < 8; ++ax0) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0 * 256)) + 14336)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[(ax0 * 8) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 16; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + (((((((((((int)blockIdx.y) * 524288) + (((int)threadIdx.y) * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 2048)) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.z) * 128)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 14336)); + } + } +} + + +Compilation error: +/tmp/tmpg804_j9j/tvm_kernels.cu(52): warning #177-D: function "__dp4a(int, unsigned int, int)" was declared but never referenced + +Remark: The warnings can be suppressed with "-diag-suppress " + +/tmp/tmpg804_j9j/tvm_kernels.cu(46): warning #177-D: function "__dp4a(unsigned int, int, int)" was declared but never referenced + +ptxas error : Entry function 'default_function_kernel' uses too much shared data (0x16000 bytes, 0xc000 max) + + +[FastDlight] Evaluation with config {'block': [128, 64], 'warp': [64, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 27.207 ms 
+[FastDlight] Evaluation with config {'block': [256, 64], 'warp': [128, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 29.398 ms +[FastDlight] Evaluation with config {'block': [64, 64], 'warp': [32, 32], 'rstep': [64], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 35.068 ms +[FastDlight] Evaluation with config {'block': [64, 32], 'warp': [32, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 61.230 ms +[FastDlight] Evaluation with config {'block': [64, 16], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 64.637 ms +[FastDlight] Evaluation with config {'block': [32, 128], 'warp': [16, 64], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 35.104 ms +[FastDlight] Evaluation with config {'block': [32, 64], 'warp': [16, 32], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 39.622 ms +[FastDlight] Evaluation with config {'block': [32, 32], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 52.153 ms +[FastDlight] Evaluation with config {'block': [16, 128], 'warp': [16, 32], 'rstep': [128], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 49.908 ms +[FastDlight] Evaluation with config {'block': [16, 64], 'warp': [16, 16], 'rstep': [256], 'use_tc': True, 'vectorize': {'A_reindex': 16, 'B_reindex_reindex': 16}} +[FastDlight] Time cost of this config: 58.419 ms +[FastDlight] The best latency of top 1 is 27.207 ms +[FastDlight] The best latency of top 1 is 27.207 ms +template +__device__ void decode_i2s_to_i8s(T1 *_i2s, T2 *_i8s, const int N = 16) +{ + // convert 8 int2b_t to 8 int8b_t -> 2 int32 + uint *i8s = reinterpret_cast(_i8s); + + // i2s = {e7,e6,e5,e4,e3,e2,e1,e0} + // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} + uint const i2s = *_i2s; + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3 + static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); + } +} +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include + + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) __DEF_IF_HOST + +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +__SM_61_INTRINSICS_DECL__ int __dp4a(unsigned int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, unsigned int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif +__forceinline__ __device__ unsigned int +cast_smem_ptr_to_int(const void* const smem_ptr) +{ + unsigned int smem_int; + asm volatile ("{ .reg .u64 smem_int; cvta.to.shared.u64 smem_int, %1; cvt.u32.u64 %0, smem_int; }" + : "=r"(smem_int) : "l"(smem_ptr)); + return smem_int; +} + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C); +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? 
gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[45056]; + int C_reindex_shared_warp[64]; + int B_local[1]; + int4 B_reindex_reindex_local[1]; + signed char A_reindex_shared_warp[32]; + signed char B_reindex_reindex_shared_warp[64]; + int B_local_1[1]; + int4 B_reindex_reindex_local_1[1]; + signed char A_reindex_shared_warp_1[32]; + signed char B_reindex_reindex_shared_warp_1[64]; + for (int var = 0; var < 1; ++var) { + for (int ax1_0_3_init = 0; ax1_0_3_init < 2; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 4; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_shared_warp[((ax1_0_3_init * 32) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 2; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 4096)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 4096))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0 = 0; ax0_ax1_ax2_ax3_fused_0 < 1; ++ax0_ax1_ax2_ax3_fused_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((int)threadIdx.z) * 1024) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + ((((int)threadIdx.x) & 15) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 2; ++ax0_ax1_ax2_fused_2_1) { + + { + 
unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 4096)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 4096))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (((int)threadIdx.z) * 262144)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_ax3_fused_0_1 = 0; ax0_ax1_ax2_ax3_fused_0_1 < 1; ++ax0_ax1_ax2_ax3_fused_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (((int)threadIdx.y) * 512)) + (((int)threadIdx.x) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.z) * 4194304) + (((int)blockIdx.x) * 524288)) + (((int)threadIdx.z) * 262144)) + (((int)threadIdx.y) * 131072)) + ((((int)threadIdx.x) >> 4) * 65536)) + (ax3_0_0 * 256)) + ((((int)threadIdx.x) & 15) * 16)) + 256))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0 = 0; ax1_ax2_ax3_ax4_0_fused_0 < 4; ++ax1_ax2_ax3_ax4_0_fused_0) { + B_local[0] = *(int*)(((signed char*)buf_shmem) + ((((((ax3_0_0 & 1) * 2048) + (ax1_ax2_ax3_ax4_0_fused_0 * 512)) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4))); + decode_i2s_to_i8s(B_local, B_reindex_reindex_local, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 12288)) = B_reindex_reindex_local[0]; + } + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) 
>> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 4096)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax1_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 4096)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax1_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1 = 0; ax1 < 4; ++ax1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 12288)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1 * 1024)) + (ax3_0_1 * 512)) + 12288)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 4; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), 
"=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax1_ax2_ax3_ax4_0_fused_0_1 = 0; ax1_ax2_ax3_ax4_0_fused_0_1 < 4; ++ax1_ax2_ax3_ax4_0_fused_0_1) { + B_local_1[0] = *(int*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 512) + (((int)threadIdx.y) * 256)) + (((int)threadIdx.z) * 128)) + (((int)threadIdx.x) * 4)) + 2048)); + decode_i2s_to_i8s(B_local_1, B_reindex_reindex_local_1, 16); + *(int4*)(((signed char*)buf_shmem) + (((((ax1_ax2_ax3_ax4_0_fused_0_1 * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (((int)threadIdx.x) * 16)) + 12288)) = B_reindex_reindex_local_1[0]; + } + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 2048) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 2048) + (ax1_0_1 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_1 = 0; ax1_1 < 4; ++ax1_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 12288)])) + (((int)threadIdx.x) * 16)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((int)threadIdx.z) * 4096) + (ax1_1 * 1024)) + (ax3_0_1_1 * 512)) + 12288)])) + (((int)threadIdx.x) * 16))) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[0]), "=r"(((unsigned 
*)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax1_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 2; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 4; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_shared_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_shared_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 2; ++ax0) { + __syncthreads(); + for (int ax1_2 = 0; ax1_2 < 4; ++ax1_2) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(((int*)buf_shmem)[((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 1024)) + (ax1_2 * 256)) + 5120)]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_shared_warp[((ax0 * 32) + (ax1_2 * 8)) + local_id]; +} +; + } + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_ax3_ax4_fused_0 = 0; ax0_ax1_ax2_ax3_ax4_fused_0 < 8; ++ax0_ax1_ax2_ax3_ax4_fused_0) { + *(int4*)(C + 
((((((((((((int)blockIdx.y) * 1048576) + (((int)threadIdx.y) * 524288)) + (ax0 * 262144)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 & 1) * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (((int)blockIdx.z) * 1024)) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.z) * 64)) + ((ax0_ax1_ax2_ax3_ax4_fused_0 >> 1) * 16)) + ((((int)threadIdx.x) & 3) * 4))) = *(int4*)(((int*)buf_shmem) + (((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_ax3_ax4_fused_0 * 128)) + (((int)threadIdx.x) * 4)) + 5120)); + } + } + } +} + + diff --git a/testing/tir_expr/f16.swizzle b/testing/tir_expr/f16.swizzle new file mode 100644 index 0000000000..1af96ec37b --- /dev/null +++ b/testing/tir_expr/f16.swizzle @@ -0,0 +1,261 @@ +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(half* __restrict__ A, half* __restrict__ B, half* __restrict__ C) { + half C_reindex_warp[32]; + __shared__ half A_reindex_shared[4096]; + __shared__ half B_reindex_shared[4096]; + half A_reindex_shared_warp[16]; + half B_reindex_shared_warp[16]; + half A_reindex_shared_warp_1[16]; + half B_reindex_shared_warp_1[16]; + for (int ax1_0_3_init = 0; ax1_0_3_init < 2; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 2; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_warp[((ax1_0_3_init * 16) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 2; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_shared + (((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_shared + (((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + ((((int)threadIdx.x) & 3) * 8)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 2; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_reindex_shared + (((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_1 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_reindex_shared + (((((((int)threadIdx.y) * 1024) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_1 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + 
"cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.x) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_1 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + ((((int)threadIdx.x) & 3) * 8)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 31; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_2 = 0; ax0_ax1_ax2_fused_2_2 < 2; ++ax0_ax1_ax2_fused_2_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_2 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_2 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_2 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + (ax3_0_0 * 32)) + ((((int)threadIdx.x) & 3) * 8)) + 32))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_3 = 0; ax0_ax1_ax2_fused_2_3 < 2; ++ax0_ax1_ax2_fused_2_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_3 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (((int)threadIdx.z) * 512)) + (ax0_ax1_ax2_fused_2_3 * 256)) + ((((int)threadIdx.x) >> 2) * 32)) + (((((int)threadIdx.x) & 3) ^ (((int)threadIdx.x) >> 3)) * 8)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.x) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_3 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + (ax3_0_0 * 32)) + ((((int)threadIdx.x) & 3) * 8)) + 32))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax0_0 = 0; ax0_0 < 2; ++ax0_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_shared[((((((ax3_0_0 & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (ax0_0 * 512)) + 
((((int)threadIdx.x) & 15) * 32)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_shared[((((((ax3_0_0 & 1) * 2048) + (((int)threadIdx.y) * 1024)) + (ax0_0 * 512)) + ((((int)threadIdx.x) & 15) * 32)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 8)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 8)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 8)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 8)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_1 = 0; ax0_0_1 < 2; ++ax0_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_shared[(((((((ax3_0_0 & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_0_1 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + ((((int)threadIdx.x) & 7) * 32)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_shared[(((((((ax3_0_0 & 1) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_0_1 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + ((((int)threadIdx.x) & 7) * 32)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 8)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 8)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 8)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 8)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 2; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16" + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n" + : "=r"(((unsigned *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "=r"(((unsigned *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + (ax2_0_3 * 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + (ax2_0_3 * 8)))[1]), "r"(((unsigned *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "r"(((unsigned *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16" + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n" + : "=r"(((unsigned *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((unsigned *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[1]), "r"(((unsigned 
*)(A_reindex_shared_warp + (ax1_0_3 * 8)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 8)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 8) + 4)))[1]), "r"(((unsigned *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((unsigned *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax0_0_2 = 0; ax0_0_2 < 2; ++ax0_0_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_shared[(((((((int)threadIdx.y) * 1024) + (ax0_0_2 * 512)) + ((((int)threadIdx.x) & 15) * 32)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 2048)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_shared[(((((((int)threadIdx.y) * 1024) + (ax0_0_2 * 512)) + ((((int)threadIdx.x) & 15) * 32)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 2048)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 8)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 8)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 8)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 8)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_3 = 0; ax0_0_3 < 2; ++ax0_0_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_shared[((((((((int)threadIdx.z) * 1024) + (ax0_0_3 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + ((((int)threadIdx.x) & 7) * 32)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 2048)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_shared[((((((((int)threadIdx.z) * 1024) + (ax0_0_3 * 512)) + ((((int)threadIdx.x) >> 4) * 256)) + ((((int)threadIdx.x) & 7) * 32)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ ((((int)threadIdx.x) & 7) >> 1)) * 8)) + 2048)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 8)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 8)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 8)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 8)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 2; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 2; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16" + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n" + : "=r"(((unsigned *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "=r"(((unsigned *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 
8)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 8)))[1]), "r"(((unsigned *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "r"(((unsigned *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16" + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n" + : "=r"(((unsigned *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((unsigned *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 8)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 8) + 4)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 8) + 4)))[1]), "r"(((unsigned *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((unsigned *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1])); + } + } + } + } + for (int ax0 = 0; ax0 < 2; ++ax0) { + for (int ax1 = 0; ax1 < 2; ++ax1) { + for (int local_id = 0; local_id < 8; local_id+=2) { +*((uint *)&(&(C[((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (ax0 * 16384)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.z) * 32)) + (ax1 * 16))]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 1024) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))]) = *((uint *)&C_reindex_warp[((ax0 * 16) + (ax1 * 8)) + local_id]); +} +; + } + } +} diff --git a/testing/tir_expr/float16xfloat16_gemm.py b/testing/tir_expr/float16xfloat16_gemm.py index dca307b0ff..c9ae61170b 100644 --- a/testing/tir_expr/float16xfloat16_gemm.py +++ b/testing/tir_expr/float16xfloat16_gemm.py @@ -4,14 +4,12 @@ from bitblas.base.roller.arch import CUDA from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags from bitblas.base.utils import apply_and_build -from bitblas.ops.matmul_impl import ( - matmul_nt, - matmul_nt_propagate_b_s8_s8_s32_mma -) +from bitblas.ops.matmul_impl import matmul_nt, matmul_nt_propagate_b_s8_s8_s32_mma +import numpy as np def test_f16_f16_gemm(): - ir_module = matmul_nt(16384, 16384, 16384, "float16", "float16") + ir_module = matmul_nt(1024, 1024, 1024, "float16", "float16") func = ir_module["main"] target = tvm.target.Target("nvidia/nvidia-a100") arch = CUDA(target) @@ -35,8 +33,24 @@ def test_f16_f16_gemm(): "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) ) + numpy_a = np.random.randint(-4, 3, (1024, 1024)).astype("float16") + numpy_b = np.random.randint(-4, 3, (1024, 1024)).astype("float16") + numpy_c = np.matmul(numpy_a.astype("float16"), numpy_b.T.astype("float16")) + ctx = tvm.cuda() + tvm_a = tvm.nd.array(numpy_a, device=ctx) + tvm_b = tvm.nd.array(numpy_b, device=ctx) + tvm_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float16"), device=ctx) + print(best.code) + best.mod(tvm_a, tvm_b, tvm_c) + print(best.config) + print("numpy_c ", numpy_c) + print("tvm_c.asnumpy() ", tvm_c.asnumpy()) + + def test_i8_i8_gemm_propagate_b(): - ir_module = matmul_nt_propagate_b_s8_s8_s32_mma(16384, 16384, 16384, "int8", "int32") + ir_module = matmul_nt_propagate_b_s8_s8_s32_mma( + 16384, 16384, 16384, "int8", "int32" + ) 
func = ir_module["main"] target = tvm.target.Target("nvidia/nvidia-a100") arch = CUDA(target) @@ -61,5 +75,6 @@ def test_i8_i8_gemm_propagate_b(): ) print(best.sch.mod) + test_f16_f16_gemm() # test_i8_i8_gemm_propagate_b() diff --git a/testing/tir_expr/i8_right_swizzle b/testing/tir_expr/i8_right_swizzle new file mode 100644 index 0000000000..9b5480476b --- /dev/null +++ b/testing/tir_expr/i8_right_swizzle @@ -0,0 +1,272 @@ +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + + const int MAX_BLOCK_N = 10; + const auto baseBlockIdx = blockIdx.x + gridDim.x *blockIdx.y; + const auto totalPanel = (gridDim.x * gridDim.y +MAX_BLOCK_N * gridDim.x - 1) / (MAX_BLOCK_N * gridDim.x); + const auto totalBlock = gridDim.x * gridDim.y; + const auto panelIdx = baseBlockIdx / (MAX_BLOCK_N *gridDim.x); + const auto strideLd = panelIdx + 1 < totalPanel ?MAX_BLOCK_N : (totalBlock - panelIdx * (MAX_BLOCK_N *gridDim.x)) / gridDim.x; + const auto bx = (panelIdx & 1) ? gridDim.x -(baseBlockIdx - panelIdx * MAX_BLOCK_N * gridDim.x) /strideLd - 1 : (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) / strideLd; + const auto by = (baseBlockIdx - panelIdx * MAX_BLOCK_N *gridDim.x) % strideLd + panelIdx * MAX_BLOCK_N; + const auto bz = blockIdx.z; + const dim3 blockIdx(bx, by, bz); + __shared__ uchar buf_shmem[32768]; + int C_reindex_warp[128]; + signed char A_reindex_shared_warp[64]; + signed char B_reindex_shared_warp[64]; + signed char A_reindex_shared_warp_1[64]; + signed char B_reindex_shared_warp_1[64]; + for (int ax1_0_3_init = 0; ax1_0_3_init < 4; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 4; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_warp[((ax1_0_3_init * 32) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 4; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 4; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + 
(((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 16384)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((int)threadIdx.y) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_1 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 16384))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((int)blockIdx.z) * 16777216) + (((int)blockIdx.x) * 2097152)) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_1 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 255; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_2 = 0; ax0_ax1_ax2_fused_2_2 < 4; ++ax0_ax1_ax2_fused_2_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + ((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_2 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_2 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_3 = 0; ax0_ax1_ax2_fused_2_3 < 4; ++ax0_ax1_ax2_fused_2_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_3 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_3 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 16384)))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(buf_shmem + (((((((((ax3_0_0 + 1) & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (((int)threadIdx.z) * 2048)) + (ax0_ax1_ax2_fused_2_3 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ (((ax0_ax1_ax2_fused_2_3 & 1) * 2) + (((int)threadIdx.x) >> 4))) * 16)) + 16384))) + ); +#endif + __asm__ __volatile__( + #if 
TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + (((((((((((int)blockIdx.z) * 16777216) + (((int)blockIdx.x) * 2097152)) + (((int)threadIdx.y) * 1048576)) + (((int)threadIdx.z) * 524288)) + (ax0_ax1_ax2_fused_2_3 * 131072)) + ((((int)threadIdx.x) >> 2) * 16384)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax0_0 = 0; ax0_0 < 4; ++ax0_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax0_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.y) * 4096)) + (ax0_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_1 = 0; ax0_0_1 < 4; ++ax0_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[((((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_0_1 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 2) + ((((int)threadIdx.x) & 7) >> 2))) * 16)) + 16384)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((((ax3_0_0 & 1) * 8192) + (((int)threadIdx.z) * 4096)) + (ax0_0_1 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 2) + ((((int)threadIdx.x) & 7) >> 2))) * 16)) + 16384)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 4; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 4; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "=r"(((int 
*)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + (ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 32) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 32) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax0_0_2 = 0; ax0_0_2 < 4; ++ax0_0_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax0_0_2 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[(((((((int)threadIdx.y) * 4096) + (ax0_0_2 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 2)) * 16)) + 8192)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_3 = 0; ax0_0_3 < 4; ++ax0_0_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(((signed 
char*)buf_shmem)[((((((((int)threadIdx.z) * 4096) + (ax0_0_3 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 2) + ((((int)threadIdx.x) & 7) >> 2))) * 16)) + 24576)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(((signed char*)buf_shmem)[((((((((int)threadIdx.z) * 4096) + (ax0_0_3 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 2) + ((((int)threadIdx.x) & 7) >> 2))) * 16)) + 24576)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 4; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 4; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) 
+ (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 32) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 4; ++ax0) { + for (int ax1 = 0; ax1 < 4; ++ax1) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(C[(((((((((int)blockIdx.y) * 2097152) + (((int)threadIdx.y) * 1048576)) + (ax0 * 262144)) + (((int)blockIdx.z) * 1024)) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.z) * 64)) + (ax1 * 16))]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 16384) + ((((local_id / 4) * 8) + ((threadIdx.x % 4) * 2)) + (local_id % 2)))] = C_reindex_warp[((ax0 * 32) + (ax1 * 8)) + local_id]; +} +; + } + } +} + diff --git a/testing/tir_expr/i8_wrong_swizzle b/testing/tir_expr/i8_wrong_swizzle new file mode 100644 index 0000000000..771f83a477 --- /dev/null +++ b/testing/tir_expr/i8_wrong_swizzle @@ -0,0 +1,261 @@ +extern "C" __global__ void __launch_bounds__(128) default_function_kernel(signed char* __restrict__ A, signed char* __restrict__ B, int* __restrict__ C) { + int C_reindex_warp[32]; + __shared__ signed char A_reindex_shared[8192]; + __shared__ signed char B_reindex_shared[8192]; + signed char A_reindex_shared_warp[32]; + signed char B_reindex_shared_warp[32]; + signed char A_reindex_shared_warp_1[32]; + signed char B_reindex_shared_warp_1[32]; + for (int ax1_0_3_init = 0; ax1_0_3_init < 2; ++ax1_0_3_init) { + for (int ax2_0_3_init = 0; ax2_0_3_init < 2; ++ax2_0_3_init) { + for (int i = 0; i < 8; ++i) { +C_reindex_warp[((ax1_0_3_init * 16) + (ax2_0_3_init * 8)) + i] = 0.0;} +; + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2 = 0; ax0_ax1_ax2_fused_2 < 2; ++ax0_ax1_ax2_fused_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_shared + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 4) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_shared + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2 * 4) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_1 = 0; ax0_ax1_ax2_fused_2_1 < 2; ++ax0_ax1_ax2_fused_2_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_reindex_shared + (((((((int)threadIdx.y) * 2048) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 4) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_reindex_shared + (((((((int)threadIdx.y) * 2048) + 
(((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_1 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_1 * 4) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((int)blockIdx.x) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_1 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + ((((int)threadIdx.x) & 3) * 16)))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + + for (int ax3_0_0 = 0; ax3_0_0 < 15; ++ax3_0_0) { + __syncthreads(); + #pragma unroll + for (int ax0_ax1_ax2_fused_2_2 = 0; ax0_ax1_ax2_fused_2_2 < 2; ++ax0_ax1_ax2_fused_2_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(A_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_2 * 4) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(A_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_2 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_2 * 4) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(A + ((((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_2 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 64))), "n"(16) + ); + } + } + #pragma unroll + for (int ax0_ax1_ax2_fused_2_3 = 0; ax0_ax1_ax2_fused_2_3 < 2; ++ax0_ax1_ax2_fused_2_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)(B_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_3 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_3 * 4) + (((int)threadIdx.x) >> 3))) * 16))))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)(B_reindex_shared + ((((((((ax3_0_0 + 1) & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (((int)threadIdx.z) * 1024)) + (ax0_ax1_ax2_fused_2_3 * 512)) + ((((int)threadIdx.x) >> 2) * 64)) + (((((int)threadIdx.x) & 3) ^ ((ax0_ax1_ax2_fused_2_3 * 4) + (((int)threadIdx.x) >> 3))) * 16)))) + ); +#endif + __asm__ __volatile__( + #if TVM_ENABLE_L2_PREFETCH + "cp.async.cg.shared.global.L2::128B [%0], [%1], %2;" + #else + "cp.async.cg.shared.global [%0], [%1], %2;" + #endif + :: "r"(addr), "l"((void*)(B + ((((((((((int)blockIdx.x) * 65536) + (((int)threadIdx.y) * 32768)) + (((int)threadIdx.z) * 16384)) + (ax0_ax1_ax2_fused_2_3 * 8192)) + ((((int)threadIdx.x) >> 2) * 1024)) + (ax3_0_0 * 64)) + ((((int)threadIdx.x) & 3) * 16)) + 
64))), "n"(16) + ); + } + } +__asm__ __volatile__("cp.async.commit_group;"); + +__asm__ __volatile__("cp.async.wait_group 1;"); + + __syncthreads(); + for (int ax3_0_1 = 0; ax3_0_1 < 2; ++ax3_0_1) { + for (int ax0_0 = 0; ax0_0 < 2; ++ax0_0) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_shared[((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax0_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 1)) * 16))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_shared[((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.y) * 2048)) + (ax0_0 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 1)) * 16))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp + (ax0_0 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_1 = 0; ax0_0_1 < 2; ++ax0_0_1) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_shared[(((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_0_1 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 4) + ((((int)threadIdx.x) & 7) >> 1))) * 16))])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_shared[(((((((ax3_0_0 & 1) * 4096) + (((int)threadIdx.z) * 2048)) + (ax0_0_1 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 4) + ((((int)threadIdx.x) & 7) >> 1))) * 16))])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp + (ax0_0_1 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3 = 0; ax1_0_3 < 2; ++ax1_0_3) { + for (int ax2_0_3 = 0; ax2_0_3 < 2; ++ax2_0_3) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + 
(ax2_0_3 * 16)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + (ax2_0_3 * 16)))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[0]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[2]), "r"(((int *)(C_reindex_warp + ((ax1_0_3 * 16) + (ax2_0_3 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp + (ax1_0_3 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp + ((ax2_0_3 * 16) + 8)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_warp + (((ax1_0_3 * 16) + (ax2_0_3 * 8)) + 4)))[3])); + } + } + } + } + } +__asm__ __volatile__("cp.async.wait_group 0;"); + + __syncthreads(); + for (int ax3_0_1_1 = 0; ax3_0_1_1 < 2; ++ax3_0_1_1) { + for (int ax0_0_2 = 0; ax0_0_2 < 2; ++ax0_0_2) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(A_reindex_shared[(((((((int)threadIdx.y) * 2048) + (ax0_0_2 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 1)) * 16)) + 4096)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(A_reindex_shared[(((((((int)threadIdx.y) * 2048) + (ax0_0_2 * 1024)) + ((((int)threadIdx.x) & 15) * 64)) + ((((ax3_0_1_1 * 2) + (((int)threadIdx.x) >> 4)) ^ ((((int)threadIdx.x) & 15) >> 1)) * 16)) + 4096)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[0]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[1]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[2]), "=r"(((unsigned *)(A_reindex_shared_warp_1 + (ax0_0_2 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax0_0_3 = 0; ax0_0_3 < 2; ++ax0_0_3) { + + { + unsigned int addr; +#if TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST + addr = static_cast(__cvta_generic_to_shared((void *)((&(B_reindex_shared[((((((((int)threadIdx.z) * 2048) + (ax0_0_3 * 1024)) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 4) + ((((int)threadIdx.x) & 7) >> 1))) * 16)) + 4096)])) + 0))); +#else + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)((&(B_reindex_shared[((((((((int)threadIdx.z) * 2048) + (ax0_0_3 * 1024)) + 
((((int)threadIdx.x) >> 4) * 512)) + ((((int)threadIdx.x) & 7) * 64)) + ((((ax3_0_1_1 * 2) + ((((int)threadIdx.x) & 15) >> 3)) ^ (((((int)threadIdx.x) >> 4) * 4) + ((((int)threadIdx.x) & 7) >> 1))) * 16)) + 4096)])) + 0)) + ); +#endif + __asm__ __volatile__( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16" + "{%0, %1, %2, %3}, [%4];\n" + : "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[0]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[1]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[2]), "=r"(((unsigned *)(B_reindex_shared_warp_1 + (ax0_0_3 * 16)))[3]) + : "r"(addr) + ); + } + } + for (int ax1_0_3_1 = 0; ax1_0_3_1 < 2; ++ax1_0_3_1) { + for (int ax2_0_3_1 = 0; ax2_0_3_1 < 2; ++ax2_0_3_1) { + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "=r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + (ax2_0_3_1 * 16)))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[0]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[1]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[2]), "r"(((int *)(C_reindex_warp + ((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8))))[3])); + } + + { + __asm__ __volatile__( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32" + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "=r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3]) + : "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[0]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[1]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[2]), "r"(((unsigned *)(A_reindex_shared_warp_1 + (ax1_0_3_1 * 16)))[3]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[0]), "r"(((unsigned *)(B_reindex_shared_warp_1 + ((ax2_0_3_1 * 16) + 8)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[0]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[1]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[2]), "r"(((int *)(C_reindex_warp + (((ax1_0_3_1 * 16) + (ax2_0_3_1 * 8)) + 4)))[3])); + } + } + } + } + for (int ax0 = 0; ax0 < 2; ++ax0) { + for (int ax1 = 0; ax1 < 2; ++ax1) { + for (int local_id = 0; local_id < 8; ++local_id) { +(&(C[((((((((int)blockIdx.y) * 65536) + (((int)threadIdx.y) * 32768)) + (ax0 * 16384)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.z) * 32)) + (ax1 * 16))]))[((((((local_id % 4) / 2) * 8) + (threadIdx.x / 4)) * 1024) + ((((local_id / 4) * 8) + ((threadIdx.x % 
4) * 2)) + (local_id % 2)))] = C_reindex_warp[((ax0 * 16) + (ax1 * 8)) + local_id]; +} +; + } + } +} diff --git a/testing/tir_expr/int8xint8_gemm.py b/testing/tir_expr/int8xint8_gemm.py index f7ae7238cd..e4435eb2ab 100644 --- a/testing/tir_expr/int8xint8_gemm.py +++ b/testing/tir_expr/int8xint8_gemm.py @@ -7,10 +7,42 @@ from bitblas.base.utils import apply_and_build from bitblas.ops.matmul_impl import ( matmul_nt, - matmul_nt_propagate_b_s8_s8_s32_mma + matmul_nt_dequantize_b, + matmul_nt_dequantize_b_propagate_b, + matmul_nt_dequantize_b_propagate_a_b, + matmul_nt_propagate_b_s8_s8_s32_mma, + matmul_nt_propagate_b_s8_s8_s32_cast_s8_mma, + matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma, + matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma_cast_s8 ) +def test_i8_i8_gemm(): + ir_module = matmul_nt(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + + def test_i8_i8_gemm_correctness(): ir_module = matmul_nt(1024, 1024, 1024, "int8", "int32") func = ir_module["main"] @@ -25,7 +57,7 @@ def test_i8_i8_gemm_correctness(): policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) configs = policy.emit_config(20) - + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) print( "[FastDlight] The best latency of top 1 is {:.3f} ms".format( @@ -35,20 +67,266 @@ def test_i8_i8_gemm_correctness(): print( "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) ) - - numpy_a = np.random.randint(-128, 127, (1024, 1024)).astype("int8") - numpy_b = np.random.randint(-128, 127, (1024, 1024)).astype("int8") + + numpy_a = np.random.randint(-4, 3, (1024, 1024)).astype("int8") + numpy_b = np.random.randint(-4, 3, (1024, 1024)).astype("int8") numpy_c = np.matmul(numpy_a.astype("int32"), numpy_b.T.astype("int32")) ctx = tvm.cuda() tvm_a = tvm.nd.array(numpy_a, device=ctx) tvm_b = tvm.nd.array(numpy_b, device=ctx) tvm_c = tvm.nd.array(np.zeros((1024, 1024), dtype="int32"), device=ctx) + # print(best.sch.mod) + # print(best.code) best.mod(tvm_a, tvm_b, tvm_c) + print(best.config) + print("numpy_c ", numpy_c) + print("tvm_c.asnumpy() ", tvm_c.asnumpy()) + np.testing.assert_allclose(tvm_c.asnumpy(), numpy_c, atol=1e-5) + # print(best.code) + + +def test_i8_i8_i32_gemm_propagate_b(): + ir_module = matmul_nt_propagate_b_s8_s8_s32_mma( + 16384, 16384, 16384, "int8", "int32" + ) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + 
print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + + +def test_i8_i8_i32_cast_i8_gemm_propagate_b(): + ir_module = matmul_nt_propagate_b_s8_s8_s32_cast_s8_mma( + 16384, 16384, 16384, "int8", "int32" + ) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + + +def test_i8_i8_i32_gemm_propagate_a_propagate_b(): + ir_module = matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma( + 16384, 16384, 16384, "int8", "int32" + ) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +def test_i8_i8_i32_gemm_propagate_a_propagate_b_cast_s8(): + ir_module = matmul_nt_propagate_a_propagate_b_s8_s8_s32_mma_cast_s8( + 16384, 16384, 16384, "int8", "int32" + ) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +def test_i8_i4_gemm(): + ir_module = matmul_nt_dequantize_b(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +def test_i8_i4_propagate_b_gemm(): + ir_module = matmul_nt_dequantize_b_propagate_b(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = 
DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + # print(best.sch.mod) + print(best.code) + +def test_i8_i4_propagate_a_propagate_b_gemm(): + ir_module = matmul_nt_dequantize_b_propagate_a_b(16384, 16384, 16384, "int8", "int32") + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + print(best.config) + +def test_i8_i2_gemm(): + ir_module = matmul_nt_dequantize_b(16384, 16384, 16384, "int8", "int32", bit=2) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) print(best.code) - -def test_i8_i8_gemm_propagate_b(): - ir_module = matmul_nt_propagate_b_s8_s8_s32_mma(16384, 16384, 16384, "int8", "int32") + +def test_i8_i2_propagate_b_gemm(): + ir_module = matmul_nt_dequantize_b_propagate_b(16384, 16384, 16384, "int8", "int32", 'int8', bit=2, fast_decoding=True) + func = ir_module["main"] + target = tvm.target.Target("nvidia/nvidia-a100") + arch = CUDA(target) + policy = DefaultPolicy(func=func, arch=arch) + try: + tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) + except: + tags = None + if tags: + policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) + + configs = policy.emit_config(20) + + cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format( + cpresults[0].latency * 1e3 + ) + ) + print( + "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) + ) + +def test_i8_i2_propagate_a_propagate_b_gemm(): + ir_module = matmul_nt_dequantize_b_propagate_a_b(16384, 16384, 16384, "int8", "int32", "int8", bit=2, fast_decoding=False) func = ir_module["main"] target = tvm.target.Target("nvidia/nvidia-a100") arch = CUDA(target) @@ -71,6 +349,20 @@ def test_i8_i8_gemm_propagate_b(): print( "[FastDlight] The best latency of top 1 is {:.3f} ms".format(best.latency * 1e3) ) + 
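# dump the generated CUDA source of the best candidate so it can be inspected offline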
with open("after_memory_rewrite.cu", "+w") as f: + f.write(best.code) + + +# test_i8_i8_gemm() +# test_i8_i8_gemm_correctness() +# test_i8_i8_i32_gemm_propagate_b() +# test_i8_i8_i32_cast_i8_gemm_propagate_b() +# test_i8_i8_i32_gemm_propagate_a_propagate_b() +# test_i8_i8_i32_gemm_propagate_a_propagate_b_cast_s8() +# test_i8_i4_gemm() +# test_i8_i4_propagate_b_gemm() +# test_i8_i4_propagate_a_propagate_b_gemm() -test_i8_i8_gemm_correctness() -# test_i8_i8_gemm_propagate_b() +# test_i8_i2_gemm() +test_i8_i2_propagate_b_gemm() +# test_i8_i2_propagate_a_propagate_b_gemm() diff --git a/testing/tir_expr/test_tir_0.py b/testing/tir_expr/test_tir_0.py new file mode 100644 index 0000000000..e7cca021f1 --- /dev/null +++ b/testing/tir_expr/test_tir_0.py @@ -0,0 +1,187 @@ +import tvm +from tvm.script import ir as I +from tvm.script import tir as T +from tvm.tir.tensor_intrin.cuda import get_mma_intrin_group + +@I.ir_module +class Module: + @T.prim_func + def main(A: T.Buffer((1024, 512, 16, 32), "int8"), B: T.Buffer((1024, 512, 16, 8), "int8"), C: T.Buffer((16384, 16384), "int32")): + T.func_attr({"dequantize_info": {"B": {"decode_block": "B_decode", "fast_decoding": T.bool(False), "source_format": {"bits": 2, "format": "int"}, "target_format": "int8"}}, "dlight.tensorcore_prenormlized": T.bool(True), "smooth_a": T.bool(True), "smooth_b": T.bool(True), "tir.noalias": T.bool(True)}) + # with T.block("root"): + A_reindex_reindex_shared = T.alloc_buffer((1, 1024, 512, 16, 32), "int8", scope="shared") + B_reindex_reindex_shared = T.alloc_buffer((1, 1024, 512, 16, 32), "int8", scope="shared") + B_reindex_reindex_local = T.alloc_buffer((1, 1024, 512, 16, 32), "int8", scope="local") + B_local = T.alloc_buffer((1024, 512, 16, 8), "int8", scope="local") + B_shared = T.alloc_buffer((1024, 512, 16, 8), "int8", scope="shared") + A_reindex_reindex_shared_warp = T.alloc_buffer((1, 1024, 512, 32, 16), "int8", scope="warp") + B_reindex_reindex_shared_warp = T.alloc_buffer((1, 1024, 512, 32, 16), "int8", scope="warp") + C_reindex_shared = T.alloc_buffer((1, 1024, 1024, 16, 16), "int32", scope="shared") + C_reindex_shared_warp = T.alloc_buffer((1, 1024, 1024, 32, 8), "int32", scope="warp") + for ax0 in range(1): + for ax1_0_0_ax2_0_0_fused in T.thread_binding(64, thread="blockIdx.y"): + for ax1_0_1_ax2_0_1_fused in T.thread_binding(256, thread="blockIdx.x"): + for ax1_0_2 in T.thread_binding(2, thread="threadIdx.y"): + for ax2_0_2 in T.thread_binding(2, thread="threadIdx.z"): + for ax1_0_3_init, ax2_0_3_init in T.grid(8, 2): + with T.block("C_o_init"): + v0_o = T.axis.spatial(1, ax0) + v1_o = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + ax1_0_2 * 8 + ax1_0_3_init) + v2_o = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + ax2_0_2 * 2 + ax2_0_3_init) + T.reads() + T.writes(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8]) + with T.block("C_init_o"): + v1_i_init_o = T.axis.spatial(1, 0) + v2_i_init_o = T.axis.spatial(1, 0) + T.reads() + T.writes(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8]) + C_warp = T.match_buffer(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8], (32, 8), "int32", scope="warp", offset_factor=1) + for tx in T.thread_binding(32, thread="threadIdx.x"): + T.mma_fill("int32", 8, C_warp.data, C_warp.elem_offset) + for ax3_0_0 in T.serial(256, annotations={"software_pipeline_async_stages": [0], "software_pipeline_order": [0, 1, 2, 3], "software_pipeline_stage": [0, 0, 1, 1]}): + for ax0_ax1_ax2_ax3_ax4_fused_0 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_ax2_ax3_ax4_fused_1 in 
T.thread_binding(2, thread="threadIdx.z"): + for ax0_ax1_ax2_ax3_ax4_fused_2 in T.unroll(8, annotations={"pragma_unroll_explicit": 0}): + for ax0_ax1_ax2_ax3_ax4_fused_3 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_ax4_fused_4 in T.vectorized(16): + with T.block("A_reindex_reindex_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + (ax0_ax1_ax2_ax3_ax4_fused_0 * 8192 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4096 + ax0_ax1_ax2_ax3_ax4_fused_2 * 512 + ax0_ax1_ax2_ax3_ax4_fused_3 * 16 + ax0_ax1_ax2_ax3_ax4_fused_4) // 1024) + v2 = T.axis.spatial(512, ax3_0_0 * 2 + (ax0_ax1_ax2_ax3_ax4_fused_0 * 8192 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4096 + ax0_ax1_ax2_ax3_ax4_fused_2 * 512 + ax0_ax1_ax2_ax3_ax4_fused_3 * 16 + ax0_ax1_ax2_ax3_ax4_fused_4) % 1024 // 512) + v3 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_ax4_fused_0 * 8192 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4096 + ax0_ax1_ax2_ax3_ax4_fused_2 * 512 + ax0_ax1_ax2_ax3_ax4_fused_3 * 16 + ax0_ax1_ax2_ax3_ax4_fused_4) % 512 // 32) + v4 = T.axis.spatial(32, (ax0_ax1_ax2_ax3_ax4_fused_0 * 8192 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4096 + ax0_ax1_ax2_ax3_ax4_fused_2 * 512 + ax0_ax1_ax2_ax3_ax4_fused_3 * 16 + ax0_ax1_ax2_ax3_ax4_fused_4) % 32) + T.reads(A[v1, v2, v3, v4]) + T.writes(A_reindex_reindex_shared[v0, v1, v2, v3, v4]) + T.block_attr({"permuted_layout": 0}) + A_reindex_reindex_shared[v0, v1, v2, v3, v4] = A[v1, v2, v3, v4] + for ax0_ax1_ax2_ax3_fused_0 in T.unroll(1, annotations={"pragma_unroll_explicit": 0}): + for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(2, thread="threadIdx.z"): + for ax0_ax1_ax2_ax3_fused_2 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_ax2_ax3_fused_3 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_fused_4 in T.vectorized(16): + with T.block("B_shared"): + v0 = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + (ax0_ax1_ax2_ax3_fused_0 * 2048 + ax0_ax1_ax2_ax3_fused_1 * 1024 + ax0_ax1_ax2_ax3_fused_2 * 512 + ax0_ax1_ax2_ax3_fused_3 * 16 + ax0_ax1_ax2_ax3_fused_4) // 256) + v1 = T.axis.spatial(512, ax3_0_0 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 2048 + ax0_ax1_ax2_ax3_fused_1 * 1024 + ax0_ax1_ax2_ax3_fused_2 * 512 + ax0_ax1_ax2_ax3_fused_3 * 16 + ax0_ax1_ax2_ax3_fused_4) % 256 // 128) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_fused_0 * 2048 + ax0_ax1_ax2_ax3_fused_1 * 1024 + ax0_ax1_ax2_ax3_fused_2 * 512 + ax0_ax1_ax2_ax3_fused_3 * 16 + ax0_ax1_ax2_ax3_fused_4) % 128 // 8) + v3 = T.axis.spatial(8, (ax0_ax1_ax2_ax3_fused_0 * 2048 + ax0_ax1_ax2_ax3_fused_1 * 1024 + ax0_ax1_ax2_ax3_fused_2 * 512 + ax0_ax1_ax2_ax3_fused_3 * 16 + ax0_ax1_ax2_ax3_fused_4) % 8) + T.where((((ax0_ax1_ax2_ax3_fused_0 * 2 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) * 32 + ax0_ax1_ax2_ax3_fused_3) * 16 + ax0_ax1_ax2_ax3_fused_4 < 1024) + T.reads(B[v0, v1, v2, v3]) + T.writes(B_shared[v0, v1, v2, v3]) + B_shared[v0, v1, v2, v3] = B[v0, v1, v2, v3] + for ax0_1, ax1_ax2_ax3_ax4_0_fused_0 in T.grid(1, 2): + for ax1_ax2_ax3_ax4_0_fused_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax1_ax2_ax3_ax4_0_fused_2 in T.thread_binding(2, thread="threadIdx.z"): + for ax1_ax2_ax3_ax4_0_fused_3 in T.thread_binding(32, thread="threadIdx.x"): + for ax4_1 in range(1): + for ax0_2, ax1, ax2 in T.grid(1, 1, 1): + for ax3 in T.vectorized(4): + with T.block("B_local"): + v0 = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) // 64 + ax0_2) + v1 = 
T.axis.spatial(512, ax3_0_0 * 2 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 64 // 32 + ax1) + v2 = T.axis.spatial(16, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 32 // 2 + ax2) + v3 = T.axis.spatial(8, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 2 * 4 + ax3) + T.reads(B_shared[v0, v1, v2, v3]) + T.writes(B_local[v0, v1, v2, v3]) + B_local[v0, v1, v2, v3] = B_shared[v0, v1, v2, v3] + for ax0_2, ax1, ax2, ax3, ax4 in T.grid(1, 1, 1, 1, 16): + with T.block("B_reindex_reindex_local"): + v0 = T.axis.spatial(1, ax0_2) + v1 = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) // 64 + ax1) + v2 = T.axis.spatial(512, ax3_0_0 * 2 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 64 // 32 + ax2) + v3 = T.axis.spatial(16, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 32 // 2 + ax3) + v4 = T.axis.spatial(32, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 2 * 16 + ax4) + T.reads(B_local[v1, v2, v3, v4 // 4]) + T.writes(B_reindex_reindex_local[v0, v1, v2, v3, v4]) + B_reindex_reindex_local[v0, v1, v2, v3, v4] = T.bitwise_and(T.shift_right(B_local[v1, v2, v3, v4 // 4], T.Cast("int8", v4 % 4 * 2)), T.int8(3)) + for ax4_2 in T.vectorized(16): + with T.block("B_reindex_reindex_shared"): + v0 = T.axis.spatial(1, ax0_1) + v1 = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) // 64) + v2 = T.axis.spatial(512, ax3_0_0 * 2 + (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 64 // 32) + v3 = T.axis.spatial(16, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 32 // 2) + v4 = T.axis.spatial(32, (ax1_ax2_ax3_ax4_0_fused_0 * 128 + ax1_ax2_ax3_ax4_0_fused_1 * 64 + ax1_ax2_ax3_ax4_0_fused_2 * 32 + ax1_ax2_ax3_ax4_0_fused_3) % 2 * 16 + ax4_1 * 16 + ax4_2) + T.reads(B_reindex_reindex_local[v0, v1, v2, v3, v4]) + T.writes(B_reindex_reindex_shared[v0, v1, v2, v3, v4]) + T.block_attr({"permuted_layout": 0}) + B_reindex_reindex_shared[v0, v1, v2, v3, v4] = B_reindex_reindex_local[v0, v1, v2, v3, v4] + for ax3_0_1 in range(2): + for ax0_1, ax1, ax2, ax3_0, ax4_0 in T.grid(1, 8, 1, 1, 1): + with T.block("A_reindex_reindex_shared_warp_o"): + v0_o = T.axis.spatial(1, ax0_1) + v1_o = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + ax1_0_2 * 8 + ax1) + v2_o = T.axis.spatial(512, ax3_0_0 * 2 + ax3_0_1 + ax2) + v3_o, v4_o = T.axis.remap("SS", [ax3_0, ax4_0]) + T.reads(A_reindex_reindex_shared[v0_o, v1_o, v2_o, 0:16, 0:32]) + T.writes(A_reindex_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:16]) + T.block_attr({"permuted_layout": 0}) + warp = T.match_buffer(A_reindex_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:16], (32, 16), "int8", scope="warp", offset_factor=32) + shared = T.match_buffer(A_reindex_reindex_shared[v0_o, v1_o, 
v2_o, 0:16, 0:32], (16, 32), "int8", strides=("shared_s0", "shared_s1"), scope="shared", offset_factor=32) + for tx in T.thread_binding(32, thread="threadIdx.x"): + T.ptx_ldmatrix("int8", T.bool(False), 4, ".b16", warp.data, warp.elem_offset + 16 * tx, T.tvm_access_ptr(T.type_annotation("int8"), shared.data, shared.elem_offset, shared.strides[0] * 16, 1), tx * 16) + for ax0_1, ax1, ax2, ax3_0, ax4_0 in T.grid(1, 2, 1, 1, 1): + with T.block("B_reindex_reindex_shared_warp_o"): + v0_o = T.axis.spatial(1, ax0_1) + v1_o = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + ax2_0_2 * 2 + ax1) + v2_o = T.axis.spatial(512, ax3_0_0 * 2 + ax3_0_1 + ax2) + v3_o, v4_o = T.axis.remap("SS", [ax3_0, ax4_0]) + T.reads(B_reindex_reindex_shared[v0_o, v1_o, v2_o, 0:16, 0:32]) + T.writes(B_reindex_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:16]) + T.block_attr({"permuted_layout": 0}) + warp = T.match_buffer(B_reindex_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:16], (32, 16), "int8", scope="warp", offset_factor=32) + shared = T.match_buffer(B_reindex_reindex_shared[v0_o, v1_o, v2_o, 0:16, 0:32], (16, 32), "int8", strides=("shared_s0", "shared_s1"), scope="shared", offset_factor=32) + for tx in T.thread_binding(32, thread="threadIdx.x"): + T.ptx_ldmatrix("int8", T.bool(False), 4, ".b16", warp.data, warp.elem_offset + 16 * tx, T.tvm_access_ptr(T.type_annotation("int8"), shared.data, shared.elem_offset, shared.strides[0] * 16, 1), tx * 16) + for ax1_0_3, ax2_0_3 in T.grid(8, 2): + with T.block("C_o_update"): + v0_o = T.axis.spatial(1, ax0) + v1_o = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + ax1_0_2 * 8 + ax1_0_3) + v2_o = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + ax2_0_2 * 2 + ax2_0_3) + v3_o = T.axis.reduce(512, ax3_0_0 * 2 + ax3_0_1) + T.reads(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8], A_reindex_reindex_shared_warp[0, v1_o, v3_o, 0:32, 0:16], B_reindex_reindex_shared_warp[0, v2_o, v3_o, 0:32, 0:16]) + T.writes(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8]) + with T.block("C_o"): + v1_i_o = T.axis.spatial(1, 0) + v2_i_o = T.axis.spatial(1, 0) + v3_i_o = T.axis.reduce(1, 0) + T.reads(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8], A_reindex_reindex_shared_warp[0, v1_o, v3_o, 0:32, 0:16], B_reindex_reindex_shared_warp[0, v2_o, v3_o, 0:32, 0:16]) + T.writes(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8]) + A_1 = T.match_buffer(A_reindex_reindex_shared_warp[0, v1_o, v3_o, 0:32, 0:16], (32, 16), "int8", scope="warp", offset_factor=32) + B_1 = T.match_buffer(B_reindex_reindex_shared_warp[0, v2_o, v3_o, 0:32, 0:16], (32, 16), "int8", scope="warp", offset_factor=32) + C_1 = T.match_buffer(C_reindex_shared_warp[0, v1_o, v2_o, 0:32, 0:8], (32, 8), "int32", scope="warp", offset_factor=16) + for tx in T.thread_binding(32, thread="threadIdx.x"): + T.ptx_mma("int32", "m16n8k32", "row", "col", "int8", "int8", "int32", A_1.data, A_1.elem_offset + tx * 16, B_1.data, B_1.elem_offset + tx * 16, C_1.data, C_1.elem_offset + tx * 8, T.bool(False)) + T.ptx_mma("int32", "m16n8k32", "row", "col", "int8", "int8", "int32", A_1.data, A_1.elem_offset + tx * 16, B_1.data, B_1.elem_offset + tx * 16 + 8, C_1.data, C_1.elem_offset + tx * 8 + 4, T.bool(False)) + for ax0_1, ax1 in T.grid(8, 2): + for ax2_0, ax3_0 in T.grid(1, 1): + with T.block("C_reindex_shared_warp_o"): + v0_o = T.axis.spatial(1, 0) + v1_o = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + ax1_0_2 * 8 + ax0_1) + v2_o = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + ax2_0_2 * 2 + ax1) + v3_o, v4_o = T.axis.remap("SS", [ax2_0, ax3_0]) + 
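# mma_store below copies each warp's 32x8 accumulator fragment back into a 16x16 int32 tile of C_reindex_shared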
T.reads(C_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:8]) + T.writes(C_reindex_shared[v0_o, v1_o, v2_o, 0:16, 0:16]) + C_warp = T.match_buffer(C_reindex_shared_warp[v0_o, v1_o, v2_o, 0:32, 0:8], (32, 8), "int32", scope="warp", offset_factor=1) + C_1 = T.match_buffer(C_reindex_shared[v0_o, v1_o, v2_o, 0:16, 0:16], (16, 16), "int32", strides=("C_s0", "C_s1"), scope="shared", offset_factor=1) + for tx in T.thread_binding(32, thread="threadIdx.x"): + T.mma_store("int32", 16, 16, T.tvm_access_ptr(T.type_annotation("int32"), C_1.data, C_1.elem_offset, C_1.strides[0] * 16, 2), C_warp.data, C_warp.elem_offset, C_1.strides[0]) + for ax0_ax1_ax2_ax3_ax4_fused_0 in T.unroll(2, annotations={"pragma_unroll_explicit": 0}): + for ax0_ax1_ax2_ax3_ax4_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_ax4_fused_2 in T.vectorized(4): + with T.block("C_reindex_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(1024, ax1_0_0_ax2_0_0_fused * 16 + ax1_0_2 * 8 + ax0_1) + v2 = T.axis.spatial(1024, ax1_0_1_ax2_0_1_fused * 4 + ax2_0_2 * 2 + ax1) + v3 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_ax4_fused_0 * 128 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4 + ax0_ax1_ax2_ax3_ax4_fused_2) // 16) + v4 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_ax4_fused_0 * 128 + ax0_ax1_ax2_ax3_ax4_fused_1 * 4 + ax0_ax1_ax2_ax3_ax4_fused_2) % 16) + T.reads(C_reindex_shared[v0, v1, v2, v3, v4]) + T.writes(C[v3 + v1 * 16, v4 + v2 * 16]) + C[v3 + v1 * 16, v4 + v2 * 16] = C_reindex_shared[v0, v1, v2, v3, v4] + +mod = Module +sch = tvm.tir.Schedule(mod, debug_mask="all") +with tvm.transform.PassContext( + config={"tir.use_async_copy": True} + ): + dense_relu_0_rt_mod = tvm.build(sch.mod, target="cuda") +with open("after_memory_rewrite.cu", "+w") as f: + f.write(dense_relu_0_rt_mod.imported_modules[0].get_source()) diff --git a/testing/tir_expr/test_tir_1.py b/testing/tir_expr/test_tir_1.py new file mode 100644 index 0000000000..89efc86a43 --- /dev/null +++ b/testing/tir_expr/test_tir_1.py @@ -0,0 +1,177 @@ +import tvm +from tvm.script import ir as I +from tvm.script import tir as T +from tvm.tir.tensor_intrin.cuda import * + +# from tvm.script import tir as T +@T.prim_func +def main(input0: T.Buffer[(1024, 512, 16, 32), "int8"], input1: T.Buffer[(1024, 512, 16, 8), "int8"], output0: T.Buffer[(16384, 16384), "int8"]): + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # var definition + tx = T.env_thread("threadIdx.x") + C_s0 = T.var("int32") + C_s1 = T.var("int32") + shared_s0 = T.var("int32") + shared_s0_1 = T.var("int32") + shared_s1 = T.var("int32") + shared_s1_1 = T.var("int32") + # body + # with T.block("root") + input0_shared = T.alloc_buffer([1024, 512, 16, 32], dtype="int8", scope="shared") + mediate0_shared = T.alloc_buffer([1024, 512, 16, 32], dtype="int8", scope="shared") + mediate1_shared = T.alloc_buffer([1024, 1024, 16, 16], dtype="int32", scope="shared") + mediate1_shared_warp = T.alloc_buffer([1024, 1024, 32, 8], dtype="int32", scope="warp") + mediate0_local = T.alloc_buffer([1024, 512, 16, 32], dtype="int8", scope="local") + input1_shared = T.alloc_buffer([1024, 512, 16, 8], dtype="int8", scope="shared") + input1_shared_local = T.alloc_buffer([1024, 512, 16, 8], dtype="int8", scope="local") + input0_shared_warp = T.alloc_buffer([1024, 512, 32, 16], dtype="int8", scope="warp") + mediate0_shared_warp = T.alloc_buffer([1024, 512, 32, 16], dtype="int8", scope="warp") + for i_0 in T.thread_binding(256, thread="blockIdx.y"): + for j_0 in T.thread_binding(64, 
thread="blockIdx.x"): + for i_1 in T.thread_binding(2, thread="threadIdx.y"): + for j_1 in T.thread_binding(2, thread="threadIdx.z"): + for i_2_init in T.serial(2, annotations={"pragma_unroll_explicit":0, "thread_rasterization":10}): + for j_2_init in T.serial(8, annotations={"pragma_unroll_explicit":0}): + with T.block("mediate1_init_o"): + v_i = T.axis.spatial(1024, i_0 * 4 + i_1 * 2 + i_2_init) + v_j = T.axis.spatial(1024, j_0 * 16 + j_1 * 8 + j_2_init) + v_ii_o = T.axis.spatial(1, 0) + v_jj_o = T.axis.spatial(1, 0) + T.reads() + T.writes(mediate1_shared_warp[v_i, v_j, 0 : 32, 0 : 8]) + C_warp = T.match_buffer(mediate1_shared_warp[v_i, v_j, 0 : 32, 0 : 8], [32, 8], dtype="int32", scope="warp", offset_factor=1) + T.launch_thread(tx, 32) + T.mma_fill(8, C_warp.data, C_warp.elem_offset, dtype="int32") + for k_0 in T.serial(256, annotations={"software_pipeline_async_stages":[0], "software_pipeline_order":[0, 1, 2, 3], "software_pipeline_stage":[0, 0, 1, 1]}): + for ax0_ax1_ax2_ax3_0_fused_0 in T.unroll(2, annotations={"pragma_unroll_explicit":0}): + for ax0_ax1_ax2_ax3_0_fused_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_ax2_ax3_0_fused_2 in T.thread_binding(2, thread="threadIdx.z"): + for ax0_ax1_ax2_ax3_0_fused_3 in T.thread_binding(32, thread="threadIdx.x"): + for ax3_1 in T.vectorized(16): + with T.block("input0_shared"): + v0 = T.axis.spatial(1024, i_0 * 4 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) // 64) + v1 = T.axis.spatial(512, k_0 * 2 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 64 // 32) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 32 // 2) + v3 = T.axis.spatial(32, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 2 * 16 + ax3_1) + T.reads(input0[v0, v1, v2, v3]) + T.writes(input0_shared[v0, v1, v2, v3]) + input0_shared[v0, v1, v2, v3] = input0[v0, v1, v2, v3] + for ax0_ax1_ax2_ax3_fused_0_0_0_0 in T.serial(2): + for ax0_ax1_ax2_ax3_fused_0_0_0_1 in T.thread_binding(2, thread="threadIdx.z"): + for ax0_ax1_ax2_ax3_fused_0_0_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_ax2_ax3_fused_0_1 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_fused_1 in T.vectorized(16): + with T.block("input1_shared"): + v0 = T.axis.spatial(1024, j_0 * 16 + (ax0_ax1_ax2_ax3_fused_0_0_0_0 * 2048 + ax0_ax1_ax2_ax3_fused_0_0_0_1 * 1024 + ax0_ax1_ax2_ax3_fused_0_0_1 * 512 + ax0_ax1_ax2_ax3_fused_0_1 * 16 + ax0_ax1_ax2_ax3_fused_1) // 256) + v1 = T.axis.spatial(512, k_0 * 2 + (ax0_ax1_ax2_ax3_fused_0_0_0_0 * 2048 + ax0_ax1_ax2_ax3_fused_0_0_0_1 * 1024 + ax0_ax1_ax2_ax3_fused_0_0_1 * 512 + ax0_ax1_ax2_ax3_fused_0_1 * 16 + ax0_ax1_ax2_ax3_fused_1) % 256 // 128) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_fused_0_0_0_0 * 2048 + ax0_ax1_ax2_ax3_fused_0_0_0_1 * 1024 + ax0_ax1_ax2_ax3_fused_0_0_1 * 512 + ax0_ax1_ax2_ax3_fused_0_1 * 16 + ax0_ax1_ax2_ax3_fused_1) % 128 // 8) + v3 = T.axis.spatial(8, (ax0_ax1_ax2_ax3_fused_0_0_0_0 * 2048 + ax0_ax1_ax2_ax3_fused_0_0_0_1 * 1024 + ax0_ax1_ax2_ax3_fused_0_0_1 * 512 + ax0_ax1_ax2_ax3_fused_0_1 * 16 + ax0_ax1_ax2_ax3_fused_1) % 8) + T.reads(input1[v0, v1, v2, v3]) + T.writes(input1_shared[v0, v1, v2, v3]) + input1_shared[v0, v1, v2, v3] = input1[v0, v1, v2, v3] + 
for ax0_ax1_ax2_ax3_0_fused_0 in T.serial(8): + for ax0_ax1_ax2_ax3_0_fused_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_ax2_ax3_0_fused_2 in T.thread_binding(2, thread="threadIdx.z"): + for ax0_ax1_ax2_ax3_0_fused_3 in T.thread_binding(32, thread="threadIdx.x"): + for ax3_1 in T.serial(1): + for ax0 in T.vectorized(4): + with T.block("input1_shared_local"): + v0 = T.axis.spatial(1024, j_0 * 16 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) // 64) + v1 = T.axis.spatial(512, k_0 * 2 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 64 // 32) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 32 // 2) + v3 = T.axis.spatial(8, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 2 * 4 + ax0) + T.reads(input1_shared[v0, v1, v2, v3]) + T.writes(input1_shared_local[v0, v1, v2, v3]) + input1_shared_local[v0, v1, v2, v3] = input1_shared[v0, v1, v2, v3] + for ax0 in T.serial(16): + with T.block("mediate0_local"): + v0 = T.axis.spatial(1024, j_0 * 16 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) // 64) + v1 = T.axis.spatial(512, k_0 * 2 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 64 // 32) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 32 // 2) + v3 = T.axis.spatial(32, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 2 * 16 + ax0) + T.reads(input1_shared_local[v0, v1, v2, v3 // 4]) + T.writes(mediate0_local[v0, v1, v2, v3]) + mediate0_local[v0, v1, v2, v3] = T.bitwise_and(T.shift_right(input1_shared_local[v0, v1, v2, v3 // 4], T.Cast("int8", v3 % 4), dtype="int8"), T.int8(1), dtype="int8") + for ax3_2 in T.vectorized(16): + with T.block("mediate0_shared"): + v0 = T.axis.spatial(1024, j_0 * 16 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) // 64) + v1 = T.axis.spatial(512, k_0 * 2 + (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 64 // 32) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 32 // 2) + v3 = T.axis.spatial(32, (ax0_ax1_ax2_ax3_0_fused_0 * 128 + ax0_ax1_ax2_ax3_0_fused_1 * 64 + ax0_ax1_ax2_ax3_0_fused_2 * 32 + ax0_ax1_ax2_ax3_0_fused_3) % 2 * 16 + ax3_1 * 16 + ax3_2) + T.reads(mediate0_local[v0, v1, v2, v3]) + T.writes(mediate0_shared[v0, v1, v2, v3]) + mediate0_shared[v0, v1, v2, v3] = mediate0_local[v0, v1, v2, v3] + for k_1 in T.serial(2): + for ax0, ax1 in T.grid(2, 1): + with T.block("input0_shared_warp_o"): + v0 = T.axis.spatial(1024, i_0 * 4 + i_1 * 2 + ax0) + v1 = T.axis.spatial(512, ax1 * 512 + k_0 * 2 + k_1) + v2_o = T.axis.spatial(1, 0) + v3_o = T.axis.spatial(1, 0) + T.reads(input0_shared[v0, v1, 0 : 16, 0 : 32]) + T.writes(input0_shared_warp[v0, v1, 0 : 32, 0 : 16]) + warp = 
T.match_buffer(input0_shared_warp[v0, v1, 0 : 32, 0 : 16], [32, 16], dtype="int8", scope="warp", offset_factor=16) + shared = T.match_buffer(input0_shared[v0, v1, 0 : 16, 0 : 32], [16, 32], dtype="int8", strides=[shared_s0, shared_s1], scope="shared", offset_factor=16) + T.launch_thread(tx, 32) + T.ptx_ldmatrix(False, 4, ".b16", warp.data, warp.elem_offset + 16 * tx, T.tvm_access_ptr(T.type_annotation(dtype="int8"), shared.data, shared.elem_offset, shared_s0 * 16, 1, dtype="handle"), 16 * tx, dtype="int8") + for ax0, ax1 in T.grid(8, 1): + with T.block("mediate0_shared_warp_o"): + v0 = T.axis.spatial(1024, j_0 * 16 + j_1 * 8 + ax0) + v1 = T.axis.spatial(512, ax1 * 512 + k_0 * 2 + k_1) + v2_o = T.axis.spatial(1, 0) + v3_o = T.axis.spatial(1, 0) + T.reads(mediate0_shared[v0, v1, 0 : 16, 0 : 32]) + T.writes(mediate0_shared_warp[v0, v1, 0 : 32, 0 : 16]) + warp_1 = T.match_buffer(mediate0_shared_warp[v0, v1, 0 : 32, 0 : 16], [32, 16], dtype="int8", scope="warp", offset_factor=16) + shared_1 = T.match_buffer(mediate0_shared[v0, v1, 0 : 16, 0 : 32], [16, 32], dtype="int8", strides=[shared_s0_1, shared_s1_1], scope="shared", offset_factor=16) + T.launch_thread(tx, 32) + T.ptx_ldmatrix(False, 4, ".b16", warp_1.data, warp_1.elem_offset + 16 * tx, T.tvm_access_ptr(T.type_annotation(dtype="int8"), shared_1.data, shared_1.elem_offset, shared_s0_1 * 16, 1, dtype="handle"), 16 * tx, dtype="int8") + for i_2, j_2 in T.grid(2, 8): + with T.block("mediate1_update_o"): + v_i = T.axis.spatial(1024, i_0 * 4 + i_1 * 2 + i_2) + v_j = T.axis.spatial(1024, j_0 * 16 + j_1 * 8 + j_2) + v_ii_o = T.axis.spatial(1, 0) + v_jj_o = T.axis.spatial(1, 0) + v_k = T.axis.reduce(512, k_0 * 2 + k_1) + v_kk_o = T.axis.reduce(1, 0) + T.reads(mediate1_shared_warp[v_i, v_j, 0 : 32, 0 : 8], input0_shared_warp[v_i, v_k, 0 : 32, 0 : 16], mediate0_shared_warp[v_j, v_k, 0 : 32, 0 : 16]) + T.writes(mediate1_shared_warp[v_i, v_j, 0 : 32, 0 : 8]) + A = T.match_buffer(input0_shared_warp[v_i, v_k, 0 : 32, 0 : 16], [32, 16], dtype="int8", scope="warp", offset_factor=16) + B = T.match_buffer(mediate0_shared_warp[v_j, v_k, 0 : 32, 0 : 16], [32, 16], dtype="int8", scope="warp", offset_factor=16) + C = T.match_buffer(mediate1_shared_warp[v_i, v_j, 0 : 32, 0 : 8], [32, 8], dtype="int32", scope="warp", offset_factor=16) + T.launch_thread(tx, 32) + T.ptx_mma("m16n8k32", "row", "col", "int8", "int8", "int32", A.data, A.elem_offset + tx * 16, B.data, B.elem_offset + tx * 16, C.data, C.elem_offset + tx * 8, False, dtype="int32") + T.ptx_mma("m16n8k32", "row", "col", "int8", "int8", "int32", A.data, A.elem_offset + tx * 16, B.data, B.elem_offset + tx * 16 + T.FloorDiv(16, 2), C.data, C.elem_offset + tx * 8 + T.FloorDiv(8, 2), False, dtype="int32") + for ax0, ax1 in T.grid(2, 8): + with T.block("mediate1_shared_warp_o"): + v0 = T.axis.spatial(1024, i_0 * 4 + i_1 * 2 + ax0) + v1 = T.axis.spatial(1024, j_0 * 16 + j_1 * 8 + ax1) + v2_o = T.axis.spatial(1, 0) + v3_o = T.axis.spatial(1, 0) + T.reads(mediate1_shared_warp[v0, v1, 0 : 32, 0 : 8]) + T.writes(mediate1_shared[v0, v1, 0 : 16, 0 : 16]) + C_warp_1 = T.match_buffer(mediate1_shared_warp[v0, v1, 0 : 32, 0 : 8], [32, 8], dtype="int32", scope="warp", offset_factor=1) + C_1 = T.match_buffer(mediate1_shared[v0, v1, 0 : 16, 0 : 16], [16, 16], dtype="int32", strides=[C_s0, C_s1], scope="shared", offset_factor=1) + T.launch_thread(tx, 32) + T.mma_store(16, 16, T.tvm_access_ptr(T.type_annotation(dtype="int32"), C_1.data, C_1.elem_offset, C_s0 * 16, 2, dtype="handle"), C_warp_1.data, C_warp_1.elem_offset, C_s0, 
dtype="int32") + for ax0_ax1_ax2_ax3_fused_0 in T.unroll(2, annotations={"pragma_unroll_explicit":0}): + for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): + with T.block("mediate1_shared"): + v0 = T.axis.spatial(1024, i_0 * 4 + i_1 * 2 + ax0) + v1 = T.axis.spatial(1024, j_0 * 16 + j_1 * 8 + ax1) + v2 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 16) + v3 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 16) + T.reads(mediate1_shared[v0, v1, v2, v3]) + T.writes(output0[v0 * 16 + v2, v1 * 16 + v3]) + output0[v2 + v0 * 16, v3 + v1 * 16] = T.Cast("int8", mediate1_shared[v0, v1, v2, v3]) + +mod = main +sch = tvm.tir.Schedule(mod, debug_mask="all") +with tvm.transform.PassContext( + config={"tir.use_async_copy": True} + ): + dense_relu_0_rt_mod = tvm.build(sch.mod, target="cuda") +with open("after_memory_rewrite.cu", "+w") as f: + f.write(dense_relu_0_rt_mod.imported_modules[0].get_source()) From 9691ab98ba26d859821201c6aedbd65cf3e7ac3e Mon Sep 17 00:00:00 2001 From: LeiWang Date: Wed, 7 Feb 2024 13:30:05 -0400 Subject: [PATCH 008/286] update keep --- 3rdparty/.gitkeep | 0 benchmark/.gitkeep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 3rdparty/.gitkeep create mode 100644 benchmark/.gitkeep diff --git a/3rdparty/.gitkeep b/3rdparty/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/.gitkeep b/benchmark/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 From 261cded3a20bc57a85ba6b0c0c060d4e3b670637 Mon Sep 17 00:00:00 2001 From: LeiWang Date: Sat, 10 Feb 2024 12:22:01 -0400 Subject: [PATCH 009/286] update lop3 cpp test --- testing/cpp/.gitignore | 2 + testing/cpp/CMakeLists.txt | 10 + .../cpp/lop3_type_conversion/CMakeLists.txt | 9 + .../lop3_type_conversion/fast_decoding.hpp | 157 +++++ .../lowprecision_to_float16.cu | 238 +++++++ .../cpp/fast_decode_s1_fp16.cu | 526 +++++++++++++++ .../cpp/fast_decode_s1_fp16_n8.cu | 544 +++++++++++++++ .../type_conversion/cpp/fast_decode_s1_s8.cu | 603 +++++++++++++++++ .../cpp/fast_decode_s1_s8_n16.cu | 618 ++++++++++++++++++ .../cpp/fast_decode_s2_fp16.cu | 520 +++++++++++++++ .../cpp/fast_decode_s2_fp16_n8 | Bin 0 -> 846456 bytes .../cpp/fast_decode_s2_fp16_n8.cu | 549 ++++++++++++++++ .../type_conversion/cpp/fast_decode_s2_s8.cu | 568 ++++++++++++++++ .../type_conversion/cpp/fast_decode_s4_fp16 | Bin 0 -> 842312 bytes .../cpp/fast_decode_s4_fp16.cu | 537 +++++++++++++++ .../type_conversion/cpp/fast_decode_s4_s8.cu | 547 ++++++++++++++++ .../cpp/fast_decode_s8_fp16.cu | 463 +++++++++++++ 17 files changed, 5891 insertions(+) create mode 100644 testing/cpp/.gitignore create mode 100644 testing/cpp/CMakeLists.txt create mode 100644 testing/cpp/lop3_type_conversion/CMakeLists.txt create mode 100644 testing/cpp/lop3_type_conversion/fast_decoding.hpp create mode 100644 testing/cpp/lop3_type_conversion/lowprecision_to_float16.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s1_fp16.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s1_fp16_n8.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s1_s8.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s1_s8_n16.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s2_fp16.cu create mode 100755 testing/type_conversion/cpp/fast_decode_s2_fp16_n8 create mode 100644 
testing/type_conversion/cpp/fast_decode_s2_fp16_n8.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s2_s8.cu create mode 100755 testing/type_conversion/cpp/fast_decode_s4_fp16 create mode 100644 testing/type_conversion/cpp/fast_decode_s4_fp16.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s4_s8.cu create mode 100644 testing/type_conversion/cpp/fast_decode_s8_fp16.cu diff --git a/testing/cpp/.gitignore b/testing/cpp/.gitignore new file mode 100644 index 0000000000..f65b0cab7d --- /dev/null +++ b/testing/cpp/.gitignore @@ -0,0 +1,2 @@ +# ignore the build directory +build/ diff --git a/testing/cpp/CMakeLists.txt b/testing/cpp/CMakeLists.txt new file mode 100644 index 0000000000..de95f50cd5 --- /dev/null +++ b/testing/cpp/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +project(YourProjectTests LANGUAGES CXX CUDA) + +# Find GTest +find_package(GTest REQUIRED) + +include_directories(${GTEST_INCLUDE_DIRS}) + +add_subdirectory(lop3_type_conversion) diff --git a/testing/cpp/lop3_type_conversion/CMakeLists.txt b/testing/cpp/lop3_type_conversion/CMakeLists.txt new file mode 100644 index 0000000000..762fd1f72f --- /dev/null +++ b/testing/cpp/lop3_type_conversion/CMakeLists.txt @@ -0,0 +1,9 @@ +function (ADD_CUDA_TEST_EXECUTABLE name) + add_executable(${name} ${name}.cu) + set_target_properties(${name} PROPERTIES CUDA_ARCHITECTURES 60) + set_target_properties(${name} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(${name} gtest gtest_main) +endfunction(ADD_CUDA_TEST_EXECUTABLE) + +ADD_CUDA_TEST_EXECUTABLE(lowprecision_to_float16) diff --git a/testing/cpp/lop3_type_conversion/fast_decoding.hpp b/testing/cpp/lop3_type_conversion/fast_decoding.hpp new file mode 100644 index 0000000000..12cd9cc60d --- /dev/null +++ b/testing/cpp/lop3_type_conversion/fast_decoding.hpp @@ -0,0 +1,157 @@ +#include + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N, bool isSigned = false) +{ + int zero_point = isSigned ? 
((1 << (nbit - 1)) - 1) : 0; + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + compressed[i] = 0; + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= ((lowbit[nbit_per_byte * i + j] + zero_point) << (nbit * j)); + } + } +} + +void general_interleave_fp16(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + // For fp16 example + // i4s {e7,e6,e5,e4,e3,e2,e1,e0} + // |-8b-||-8b-||-8b-||-8b-| + // interleave {e7,e5,e3,e1,e6,e4,e2,e0} + /* + BOTTOM_MASK 0 0 0 f 0 0 0 f + i4s e7 e5 e3 e1 e6 e4 e2 e0 + selectedVal 0000 0000 0000 e1 0000 0000 0000 e0 // selectedVal = i4s & BOTTOM_MASK + h[0] 0110 0100 0 e1 0110 0100 0 e0 // selectVal | 0x6400 + */ + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + int mask = (1 << nbit) - 1; + int num_groups = (32 / nbit) / 2; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + + for (int i = 0; i < num_groups; ++i) + { + int left_shift = nbit * i; + int right_shift = nbit * (num_groups - i - 1); + new_value |= (current_value & (mask << nbit * (2 * i))) >> left_shift; + new_value |= (current_value & (mask << nbit * (2 * i + 1))) << right_shift; + if (verbose) + { + printf("put %d to %d\n", (2 * i), (nbit * (2 * i) - left_shift) / nbit); + printf("put %d to %d\n", (2 * i + 1), (nbit * (2 * i + 1) + right_shift) / nbit); + } + } + if (nbit == 2) + { + int32_t _new_value_n16 = (new_value & 0xff0000ff); + _new_value_n16 |= ((new_value & 0x0000ff00) >> 8) << 16; + _new_value_n16 |= ((new_value & 0x00ff0000) >> 16) << 8; + int32_interleaved[idx] = _new_value_n16; + } + else if (nbit == 1) + { + int32_t _new_value_n16 = (new_value & 0xf000000f); + _new_value_n16 |= ((new_value & 0x000000f0) >> 4) << 8; + _new_value_n16 |= ((new_value & 0x00000f00) >> 8) << 16; + _new_value_n16 |= ((new_value & 0x0000f000) >> 12) << 24; + _new_value_n16 |= ((new_value & 0x000f0000) >> 16) << 4; + _new_value_n16 |= ((new_value & 0x00f00000) >> 20) << 12; + _new_value_n16 |= ((new_value & 0x0f000000) >> 24) << 20; + int32_interleaved[idx] = _new_value_n16; + } + else + int32_interleaved[idx] = new_value; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +template +__device__ void decode_i4b_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x000f000f; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + // Minus 7 to scale the value to signed + static constexpr uint MEDIAN_NUM = isSigned ? 0x64076407 : 0x64006400; + uint const i4s = *reinterpret_cast(_i4s); +#pragma unroll + // decode 2 elems at one time. 
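+    // The loop below unpacks two 4-bit fields per iteration: lop3 with
+    // immLut 0xEA computes (i4s & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM, which
+    // places each nibble in the low mantissa bits of an fp16 encoding 1024 + v;
+    // sub.f16x2 then subtracts 1024 (unsigned) or 1024 + 7 (the signed
+    // zero-point) so that h[i] holds the two recovered half values.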
+ for (int i = 0; i < (N / 2); i++) + { + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[i]) + : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM)); + } +} + +template +__device__ void decode_i4s_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8) +{ + decode_i4b_to_f16(_i4s, B_local_decode, N); +} + +template +__device__ void decode_i4u_to_f16(T1 *_i4u, T2 *B_local_decode, const int N = 8) +{ + decode_i4b_to_f16(_i4u, B_local_decode, N); +} + +template +__device__ void decode_i2b_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x00030003; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + static constexpr uint MEDIAN_NUM = isSigned ? 0x64016401 : 0x64006400; + int16_t const i2s_i16 = *reinterpret_cast(_i2s); + // decode 2 elems at one time. + // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0} + // otherwise the pointer of _i2s should be moved to + int i2s = (i2s_i16 & 0x00ff); + i2s |= ((i2s_i16 & 0xff00) << 8); + +#pragma unroll + for (int i = 0; i < (N / 2); i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM)); + } +} + +template +__device__ void decode_i2s_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8) +{ + decode_i2b_to_f16(_i2s, B_local_decode, N); +} + +template +__device__ void decode_i2u_to_f16(T1 *_i2u, T2 *B_local_decode, const int N = 8) +{ + decode_i2b_to_f16(_i2u, B_local_decode, N); +} diff --git a/testing/cpp/lop3_type_conversion/lowprecision_to_float16.cu b/testing/cpp/lop3_type_conversion/lowprecision_to_float16.cu new file mode 100644 index 0000000000..934c1cd2cc --- /dev/null +++ b/testing/cpp/lop3_type_conversion/lowprecision_to_float16.cu @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include "fast_decoding.hpp" + +#define cudaCheckLastError(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +#define REGISTER_GLOBAL_DEVICE_INVOKER(kernel, function) \ + template \ + __global__ void kernel(Args... args) \ + { \ + function(args...); \ + } + +REGISTER_GLOBAL_DEVICE_INVOKER(kernelWrapper_i4s_to_f16, decode_i4s_to_f16) +REGISTER_GLOBAL_DEVICE_INVOKER(kernelWrapper_i4u_to_f16, decode_i4u_to_f16) +REGISTER_GLOBAL_DEVICE_INVOKER(kernelWrapper_i2s_to_f16, decode_i2s_to_f16) +REGISTER_GLOBAL_DEVICE_INVOKER(kernelWrapper_i2u_to_f16, decode_i2u_to_f16) + + +TEST(DecodeTest, DecodeInt4ToFloat16) +{ + constexpr int nbits = 4; + constexpr int N = 32 / nbits; + constexpr int QN = N / 8 * nbits; + constexpr bool isSigned = true; + constexpr int zero_point = isSigned ? 
((1 << (nbits - 1)) - 1) : 0; + + // create four int8_t values + int8_t in_data[N] = { + 0, + }; + // breed seed + srand(0); + + // random initializations with nbits range + for (int i = 0; i < N; i++) + { + in_data[i] = (rand() % (1 << nbits)) - zero_point; + } + + int8_t *ins = new int8_t[QN]; + general_compress(in_data, ins, nbits, N, isSigned); + + int8_t *interleaved = new int8_t[QN]; + general_interleave_fp16(ins, interleaved, nbits, QN * sizeof(int8_t), false); + half *decoded = new half[N]; + int8_t *ins_gpu; + half *decoded_gpu; + + cudaCheckLastError(cudaMalloc((void **)&ins_gpu, QN * sizeof(int8_t))); + cudaCheckLastError(cudaMalloc((void **)&decoded_gpu, N * sizeof(half))); + cudaCheckLastError(cudaMemcpy(ins_gpu, interleaved, QN * sizeof(int8_t), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaMemcpy(decoded_gpu, decoded, N * sizeof(half), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaDeviceSynchronize()); + + kernelWrapper_i4s_to_f16<<>>(ins_gpu, decoded_gpu); + cudaCheckLastError(cudaDeviceSynchronize()); + cudaCheckLastError(cudaMemcpy(decoded, decoded_gpu, N * sizeof(half), cudaMemcpyDeviceToHost)); + cudaCheckLastError(cudaFree(ins_gpu)); + cudaCheckLastError(cudaFree(decoded_gpu)); + for (int i = 0; i < N; i++) + { + EXPECT_EQ(in_data[i], int(decoded[i])); + } + free(ins); + free(interleaved); + free(decoded); +} + + +TEST(DecodeTest, DecodeUInt4ToFloat16) +{ + constexpr int nbits = 4; + constexpr int N = 32 / nbits; + constexpr int QN = N / 8 * nbits; + constexpr bool isSigned = false; + constexpr int zero_point = isSigned ? ((1 << (nbits - 1)) - 1) : 0; + + // create four int8_t values + int8_t in_data[N] = {0,}; + + // breed seed + srand(0); + + // random initializations with nbits range + for (int i = 0; i < N; i++) + { + in_data[i] = (rand() % (1 << nbits)) - zero_point; + } + + int8_t *ins = new int8_t[QN]; + general_compress(in_data, ins, nbits, N, isSigned); + int8_t *interleaved = new int8_t[QN]; + general_interleave_fp16(ins, interleaved, nbits, QN * sizeof(int8_t), false); + + half *decoded = new half[N]; + int8_t *ins_gpu; + half *decoded_gpu; + + cudaCheckLastError(cudaMalloc((void **)&ins_gpu, QN * sizeof(int8_t))); + cudaCheckLastError(cudaMalloc((void **)&decoded_gpu, N * sizeof(half))); + cudaCheckLastError(cudaMemcpy(ins_gpu, interleaved, QN * sizeof(int8_t), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaMemcpy(decoded_gpu, decoded, N * sizeof(half), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaDeviceSynchronize()); + + kernelWrapper_i4u_to_f16<<>>(ins_gpu, decoded_gpu); + + cudaCheckLastError(cudaDeviceSynchronize()); + cudaCheckLastError(cudaMemcpy(decoded, decoded_gpu, N * sizeof(half), cudaMemcpyDeviceToHost)); + cudaCheckLastError(cudaFree(ins_gpu)); + cudaCheckLastError(cudaFree(decoded_gpu)); + for (int i = 0; i < N; i++) + { + EXPECT_EQ(in_data[i], int(decoded[i])); + } + + free(ins); + free(interleaved); + free(decoded); +} + +TEST(DecodeTest, DecodeInt2ToFloat16) +{ + constexpr int nbits = 2; + constexpr int N = 32 / nbits; + constexpr int QN = N / 8 * nbits; + constexpr bool isSigned = true; + constexpr int zero_point = isSigned ? 
((1 << (nbits - 1)) - 1) : 0; + + // create four int8_t values + int8_t in_data[N] = { + 0, + }; + // breed seed + srand(0); + + // random initializations with nbits range + for (int i = 0; i < N; i++) + { + in_data[i] = (rand() % (1 << nbits)) - zero_point; + } + + int8_t *ins = new int8_t[QN]; + general_compress(in_data, ins, nbits, N, isSigned); + + int8_t *interleaved = new int8_t[QN]; + general_interleave_fp16(ins, interleaved, nbits, QN * sizeof(int8_t), false); + half *decoded = new half[N]; + int8_t *ins_gpu; + half *decoded_gpu; + + cudaCheckLastError(cudaMalloc((void **)&ins_gpu, QN * sizeof(int8_t))); + cudaCheckLastError(cudaMalloc((void **)&decoded_gpu, N * sizeof(half))); + cudaCheckLastError(cudaMemcpy(ins_gpu, interleaved, QN * sizeof(int8_t), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaMemcpy(decoded_gpu, decoded, N * sizeof(half), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaDeviceSynchronize()); + + kernelWrapper_i2s_to_f16<<>>(ins_gpu, decoded_gpu); + kernelWrapper_i2s_to_f16<<>>(ins_gpu + QN / 2, decoded_gpu + N / 2); + cudaCheckLastError(cudaDeviceSynchronize()); + cudaCheckLastError(cudaMemcpy(decoded, decoded_gpu, N * sizeof(half), cudaMemcpyDeviceToHost)); + cudaCheckLastError(cudaFree(ins_gpu)); + cudaCheckLastError(cudaFree(decoded_gpu)); + for (int i = 0; i < N; i++) + { + EXPECT_EQ(in_data[i], int(decoded[i])); + } + free(ins); + free(interleaved); + free(decoded); +} + + +TEST(DecodeTest, DecodeUInt2ToFloat16) +{ + constexpr int nbits = 2; + constexpr int N = 32 / nbits; + constexpr int QN = N / 8 * nbits; + constexpr bool isSigned = false; + constexpr int zero_point = isSigned ? ((1 << (nbits - 1)) - 1) : 0; + + // create four int8_t values + int8_t in_data[N] = {0,}; + + // breed seed + srand(0); + + // random initializations with nbits range + for (int i = 0; i < N; i++) + { + in_data[i] = (rand() % (1 << nbits)) - zero_point; + } + + int8_t *ins = new int8_t[QN]; + general_compress(in_data, ins, nbits, N, isSigned); + int8_t *interleaved = new int8_t[QN]; + general_interleave_fp16(ins, interleaved, nbits, QN * sizeof(int8_t), false); + half *decoded = new half[N]; + int8_t *ins_gpu; + half *decoded_gpu; + + cudaCheckLastError(cudaMalloc((void **)&ins_gpu, QN * sizeof(int8_t))); + cudaCheckLastError(cudaMalloc((void **)&decoded_gpu, N * sizeof(half))); + cudaCheckLastError(cudaMemcpy(ins_gpu, interleaved, QN * sizeof(int8_t), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaMemcpy(decoded_gpu, decoded, N * sizeof(half), cudaMemcpyHostToDevice)); + cudaCheckLastError(cudaDeviceSynchronize()); + + kernelWrapper_i2u_to_f16<<>>(ins_gpu, decoded_gpu); + kernelWrapper_i2u_to_f16<<>>(ins_gpu + QN / 2, decoded_gpu + N / 2); + + cudaCheckLastError(cudaDeviceSynchronize()); + cudaCheckLastError(cudaMemcpy(decoded, decoded_gpu, N * sizeof(half), cudaMemcpyDeviceToHost)); + cudaCheckLastError(cudaFree(ins_gpu)); + cudaCheckLastError(cudaFree(decoded_gpu)); + for (int i = 0; i < N; i++) + { + EXPECT_EQ(in_data[i], int(decoded[i])); + } + + free(ins); + free(interleaved); + free(decoded); +} + diff --git a/testing/type_conversion/cpp/fast_decode_s1_fp16.cu b/testing/type_conversion/cpp/fast_decode_s1_fp16.cu new file mode 100644 index 0000000000..9c80e33747 --- /dev/null +++ b/testing/type_conversion/cpp/fast_decode_s1_fp16.cu @@ -0,0 +1,526 @@ +#include +#include +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#include +__device__ half max(half a, half b) +{ + return __hgt(__half(a), __half(b)) ? 
a : b; +} +__device__ half min(half a, half b) +{ + return __hlt(__half(a), __half(b)) ? a : b; +} +#else + +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef unsigned int uint; + +#define TVM_FORCE_INLINE inline __attribute__((always_inline)) +#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__ +#define TVM_ALIGNED(x) __attribute__((aligned(x))) +#define TVM_HALF_OPERATOR(RTYPE, OP) \ + TVM_XINLINE RTYPE operator OP(half a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(half a, T b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(T a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } + +#define TVM_HALF_ASSIGNOP(AOP, OP) \ + template \ + TVM_XINLINE half operator AOP(const T &a) \ + { \ + return *this = half(float(*this) OP float(a)); \ + } \ + template \ + TVM_XINLINE half operator AOP(const volatile T &a) volatile \ + { \ + return *this = half(float(*this) OP float(a)); \ + } + +class TVM_ALIGNED(2) half +{ +public: + uint16_t half_; + + static TVM_XINLINE half Binary(uint16_t value) + { + half res; + res.half_ = value; + return res; + } + + TVM_XINLINE half() {} + + TVM_XINLINE half(const float &value) { constructor(value); } + TVM_XINLINE explicit half(const double &value) { constructor(value); } + TVM_XINLINE explicit half(const int8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const int32_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint &value) { constructor(value); } + TVM_XINLINE explicit half(const long long &value) { constructor(value); } + TVM_XINLINE explicit half(const uint64_t &value) { constructor(value); } + + TVM_XINLINE operator float() const + { + return float(half2float(half_)); + } + TVM_XINLINE operator float() const volatile + { + return float(half2float(half_)); + } + + TVM_HALF_ASSIGNOP(+=, +) + TVM_HALF_ASSIGNOP(-=, -) + TVM_HALF_ASSIGNOP(*=, *) + TVM_HALF_ASSIGNOP(/=, /) + + TVM_XINLINE half operator+() + { + return *this; + } + + TVM_XINLINE half operator-() + { + return half(-float(*this)); + } + + TVM_XINLINE half operator=(const half &a) + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) + { + return *this = half(a); + } + + TVM_XINLINE half operator=(const half &a) volatile + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) volatile + { + return *this = half(a); + } + +private: + union Bits + { + float f; + int32_t si; + uint ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const 
signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + TVM_XINLINE uint16_t float2half(const float &value) const + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + TVM_XINLINE uint16_t float2half( + const volatile float &value) const volatile + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 
0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + TVM_XINLINE float half2float(const uint16_t &value) const + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + TVM_XINLINE float half2float( + const volatile uint16_t &value) const volatile + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + TVM_XINLINE void constructor(const T &value) + { + half_ = float2half(float(value)); + } +}; + +TVM_HALF_OPERATOR(half, +) +TVM_HALF_OPERATOR(half, -) +TVM_HALF_OPERATOR(half, *) +TVM_HALF_OPERATOR(half, /) +TVM_HALF_OPERATOR(bool, >) +TVM_HALF_OPERATOR(bool, <) +TVM_HALF_OPERATOR(bool, >=) +TVM_HALF_OPERATOR(bool, <=) + +TVM_XINLINE half __float2half_rn(const float a) +{ + return half(a); +} +#endif + +// Pack two half values. +static inline __device__ __host__ unsigned +__pack_half2(const half x, const half y) +{ + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// Some fp16 math functions are not supported in cuda_fp16.h, +// so we define them here to make sure the generated CUDA code +// is valid. 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) \ + { \ + float tmp_x = __half2float(x); \ + float tmp_y = __half2float(y); \ + float result = FP32_MATH_NAME(tmp_x, tmp_y); \ + return __float2half(result); \ + } + +#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x) \ + { \ + float tmp_x = __half2float(x); \ + float result = FP32_MATH_NAME(tmp_x); \ + return __float2half(result); \ + } + +CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf) + +#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY +#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY + +#endif +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include +#endif + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif + +#ifdef _WIN32 +using uint = unsigned int; +using uchar = unsigned char; +using ushort = unsigned short; +using ushort = unsigned short; +using uint64_t = unsigned long long; +#else +#define uint unsigned int +#define uchar unsigned char +#define ushort unsigned short +#define int64_t long long +#define uint64_t unsigned long long +#endif + +template +__device__ void decode_i1s_to_f16(T1 *_i1s, T2 *B_local_decode, const int N = 32) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x00010001; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + uint const i1s = *reinterpret_cast(_i1s); +#pragma unroll + // decode 2 elems at one time. 
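+    // Worked example of the trick used in the loop below: immLut = (0xf0 & 0xcc) | 0xaa
+    // = 0xea encodes f(a, b, c) = (a & b) | c, so each lop3.b32 computes
+    // ((i1s >> i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM in a single instruction.
+    // BOTTOM_MASK = 0x00010001 keeps one source bit per 16-bit lane, and OR-ing with
+    // 0x6400 (fp16 for 1024.0, whose mantissa LSB is worth 1.0 at that exponent) builds
+    // the fp16 value 1024 + b in each lane. The sub.f16x2 then subtracts 1024.0 from
+    // both lanes, leaving exactly 0.0 or 1.0, i.e. the decoded 1-bit element.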
+ for (int i = 0; i < (N / 2); i++) + { + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[i]) + : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[i]) : "r"(h[i]), "r"(FP16_TOP_MAGIC_NUM)); + } +} + +extern "C" __global__ void main_kernel0(int8_t *__restrict__ B, half *__restrict__ B_1, const int N = 8) +{ + // print B + for (int i = 0; i < N / 2; i++) + { + printf("B[%d] = %d\n", i, (int)B[i]); + } + decode_i1s_to_f16(reinterpret_cast(B), B_1); + __syncthreads(); + for (int i = 0; i < N; i++) + { + printf("B_1[%d] = %f\n", i, float(B_1[i])); + } +} + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N) +{ + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= (lowbit[nbit_per_byte * i + j] << (nbit * j)); + } + } +} + +void general_interleave_fp16(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + // For fp16 example + // is {e7,e6,e5,e4,e3,e2,e1,e0} + // |-8b-||-8b-||-8b-||-8b-| + // interleave {e7,e5,e3,e1,e6,e4,e2,e0} + /* + BOTTOM_MASK 0 0 0 f 0 0 0 f + is e7 e5 e3 e1 e6 e4 e2 e0 + selectedVal 0000 0000 0000 e1 0000 0000 0000 e0 // selectedVal = is & BOTTOM_MASK + h[0] 0110 0100 0 e1 0110 0100 0 e0 // selectVal | 0x6400 + */ + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + int mask = (1 << nbit) - 1; + int num_groups = (32 / nbit) / 2; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + + for (int i = 0; i < num_groups; ++i) + { + int left_shift = nbit * i; + int right_shift = nbit * (num_groups - i - 1); + new_value |= (current_value & (mask << nbit * (2 * i))) >> left_shift; + new_value |= (current_value & (mask << nbit * (2 * i + 1))) << right_shift; + if (verbose) + { + printf("put %d to %d\n", (2 * i), (nbit * (2 * i) - left_shift) / nbit); + printf("put %d to %d\n", (2 * i + 1), (nbit * (2 * i + 1) + right_shift) / nbit); + } + } + + int32_interleaved[idx] = new_value; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +int main() +{ + const int nbits = 1; + // permuate should be done at int32. 
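+    // Quick illustrative check of the interleave layout for nbits = 1 (an extra sanity
+    // print, assuming a little-endian host): even-indexed source bits land in the low 16
+    // bits and odd-indexed bits in the high 16 bits of each int32, so 0x0000000A
+    // (e1 and e3 set) becomes 0x00030000.
+    {
+        int8_t example_in[4] = {0x0A, 0x00, 0x00, 0x00};
+        int8_t example_out[4] = {0, 0, 0, 0};
+        general_interleave_fp16(example_in, example_out, 1, 4 * sizeof(int8_t), false);
+        printf("interleave(0x0000000a) = 0x%08x (expect 0x00030000)\n",
+               *reinterpret_cast<unsigned int *>(example_out));
+    }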
+    const int N = 32 / nbits;
+
+    // create N random 1-bit values
+    int8_t *lowbit_data = new int8_t[N];
+    for (int i = 0; i < N; i++)
+    {
+        lowbit_data[i] = rand() % 2;
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("lowbit_data[%d] = %d\n", i, (int)lowbit_data[i]);
+    }
+    // zero-initialize: general_compress ORs bits into this buffer
+    int8_t *is = new int8_t[4]();
+    general_compress(lowbit_data, is, nbits, N);
+    int8_t *interleaved = new int8_t[4]();
+    general_interleave_fp16(is, interleaved, nbits, 4 * sizeof(int8_t), true);
+
+    half *B_local_decode = new half[N];
+    int8_t *is_gpu;
+    half *B_local_decode_gpu;
+
+    cudaMalloc((void **)&is_gpu, 4 * sizeof(int8_t));
+    cudaMalloc((void **)&B_local_decode_gpu, N * sizeof(half));
+    cudaMemcpy(is_gpu, interleaved, 4 * sizeof(int8_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_local_decode_gpu, B_local_decode, N * sizeof(half), cudaMemcpyHostToDevice);
+    // print the last error
+    cudaError_t cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+        printf("kernel launch failed with error \"%s\".\n",
+               cudaGetErrorString(cudaerr));
+    // single-thread debug launch
+    main_kernel0<<<dim3(1, 1, 1), dim3(1, 1, 1)>>>(is_gpu, B_local_decode_gpu, N);
+    // print error
+    cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+        printf("kernel launch failed with error \"%s\".\n",
+               cudaGetErrorString(cudaerr));
+    cudaMemcpy(B_local_decode, B_local_decode_gpu, N * sizeof(half), cudaMemcpyDeviceToHost);
+
+    return 0;
+}
diff --git a/testing/type_conversion/cpp/fast_decode_s1_fp16_n8.cu b/testing/type_conversion/cpp/fast_decode_s1_fp16_n8.cu
new file mode 100644
index 0000000000..0c5c8b31fb
--- /dev/null
+++ b/testing/type_conversion/cpp/fast_decode_s1_fp16_n8.cu
@@ -0,0 +1,544 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+#include <cuda_fp16.h>
+__device__ half max(half a, half b)
+{
+    return __hgt(__half(a), __half(b)) ? a : b;
+}
+__device__ half min(half a, half b)
+{
+    return __hlt(__half(a), __half(b)) ?
a : b; +} +#else + +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef unsigned int uint; + +#define TVM_FORCE_INLINE inline __attribute__((always_inline)) +#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__ +#define TVM_ALIGNED(x) __attribute__((aligned(x))) +#define TVM_HALF_OPERATOR(RTYPE, OP) \ + TVM_XINLINE RTYPE operator OP(half a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(half a, T b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(T a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } + +#define TVM_HALF_ASSIGNOP(AOP, OP) \ + template \ + TVM_XINLINE half operator AOP(const T &a) \ + { \ + return *this = half(float(*this) OP float(a)); \ + } \ + template \ + TVM_XINLINE half operator AOP(const volatile T &a) volatile \ + { \ + return *this = half(float(*this) OP float(a)); \ + } + +class TVM_ALIGNED(2) half +{ +public: + uint16_t half_; + + static TVM_XINLINE half Binary(uint16_t value) + { + half res; + res.half_ = value; + return res; + } + + TVM_XINLINE half() {} + + TVM_XINLINE half(const float &value) { constructor(value); } + TVM_XINLINE explicit half(const double &value) { constructor(value); } + TVM_XINLINE explicit half(const int8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const int32_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint &value) { constructor(value); } + TVM_XINLINE explicit half(const long long &value) { constructor(value); } + TVM_XINLINE explicit half(const uint64_t &value) { constructor(value); } + + TVM_XINLINE operator float() const + { + return float(half2float(half_)); + } + TVM_XINLINE operator float() const volatile + { + return float(half2float(half_)); + } + + TVM_HALF_ASSIGNOP(+=, +) + TVM_HALF_ASSIGNOP(-=, -) + TVM_HALF_ASSIGNOP(*=, *) + TVM_HALF_ASSIGNOP(/=, /) + + TVM_XINLINE half operator+() + { + return *this; + } + + TVM_XINLINE half operator-() + { + return half(-float(*this)); + } + + TVM_XINLINE half operator=(const half &a) + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) + { + return *this = half(a); + } + + TVM_XINLINE half operator=(const half &a) volatile + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) volatile + { + return *this = half(a); + } + +private: + union Bits + { + float f; + int32_t si; + uint ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + 
static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + TVM_XINLINE uint16_t float2half(const float &value) const + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + TVM_XINLINE uint16_t float2half( + const volatile float &value) const volatile + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 
0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + TVM_XINLINE float half2float(const uint16_t &value) const + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + TVM_XINLINE float half2float( + const volatile uint16_t &value) const volatile + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + TVM_XINLINE void constructor(const T &value) + { + half_ = float2half(float(value)); + } +}; + +TVM_HALF_OPERATOR(half, +) +TVM_HALF_OPERATOR(half, -) +TVM_HALF_OPERATOR(half, *) +TVM_HALF_OPERATOR(half, /) +TVM_HALF_OPERATOR(bool, >) +TVM_HALF_OPERATOR(bool, <) +TVM_HALF_OPERATOR(bool, >=) +TVM_HALF_OPERATOR(bool, <=) + +TVM_XINLINE half __float2half_rn(const float a) +{ + return half(a); +} +#endif + +// Pack two half values. +static inline __device__ __host__ unsigned +__pack_half2(const half x, const half y) +{ + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// Some fp16 math functions are not supported in cuda_fp16.h, +// so we define them here to make sure the generated CUDA code +// is valid. 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) \ + { \ + float tmp_x = __half2float(x); \ + float tmp_y = __half2float(y); \ + float result = FP32_MATH_NAME(tmp_x, tmp_y); \ + return __float2half(result); \ + } + +#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x) \ + { \ + float tmp_x = __half2float(x); \ + float result = FP32_MATH_NAME(tmp_x); \ + return __float2half(result); \ + } + +CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf) + +#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY +#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY + +#endif +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include +#endif + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif + +#ifdef _WIN32 +using uint = unsigned int; +using uchar = unsigned char; +using ushort = unsigned short; +using ushort = unsigned short; +using uint64_t = unsigned long long; +#else +#define uint unsigned int +#define uchar unsigned char +#define ushort unsigned short +#define int64_t long long +#define uint64_t unsigned long long +#endif + +template +__device__ void decode_i1s_to_f16_n8(T1 *_i1s, T2 *B_local_decode, const int N = 8) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x00010001; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0 + int8_t const i1s_i16 = *reinterpret_cast(_i1s); + int i1s = (i1s_i16 & 0x0f); + i1s |= ((i1s_i16 & 0xf0) << 12); +#pragma unroll + // decode 2 elems at one time. 
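+    // The 8 one-bit elements arrive pre-interleaved as {e7,e5,e3,e1,e6,e4,e2,e0} in one
+    // byte. The two statements above spread that byte so {e6,e4,e2,e0} sit at bits 0-3
+    // and {e7,e5,e3,e1} at bits 16-19; (i1s >> i) & BOTTOM_MASK (0x00010001) then picks
+    // up e_{2i} in the low half-word and e_{2i+1} in the high half-word of h[i], so one
+    // lop3 + sub.f16x2 pair decodes a pair of elements per iteration.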
+ for (int i = 0; i < (N / 2); i++) + { + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[i]) + : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[i]) : "r"(h[i]), "r"(FP16_TOP_MAGIC_NUM)); + } +} + +extern "C" __global__ void main_kernel0(int8_t *__restrict__ B, half *__restrict__ B_1, const int N = 8) +{ + // print B + for (int i = 0; i < N / 2; i++) + { + printf("B[%d] = %d\n", i, (int)B[i]); + } + decode_i1s_to_f16_n8(reinterpret_cast(B), B_1); + decode_i1s_to_f16_n8(reinterpret_cast(B + 1), B_1 + 8); + decode_i1s_to_f16_n8(reinterpret_cast(B + 2), B_1 + 16); + decode_i1s_to_f16_n8(reinterpret_cast(B + 3), B_1 + 24); + __syncthreads(); + for (int i = 0; i < N; i++) + { + printf("B_1[%d] = %f\n", i, float(B_1[i])); + } +} + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N) +{ + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= (lowbit[nbit_per_byte * i + j] << (nbit * j)); + } + } +} + +void general_interleave_fp16_n8_1b(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + // For fp16 example + // is {e7,e6,e5,e4,e3,e2,e1,e0} + // |-8b-||-8b-||-8b-||-8b-| + // interleave {e7,e5,e3,e1,e6,e4,e2,e0} + /* + BOTTOM_MASK 0 0 0 f 0 0 0 f + is e7 e5 e3 e1 e6 e4 e2 e0 + selectedVal 0000 0000 0000 e1 0000 0000 0000 e0 // selectedVal = is & BOTTOM_MASK + h[0] 0110 0100 0 e1 0110 0100 0 e0 // selectVal | 0x6400 + */ + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + int mask = (1 << nbit) - 1; + int num_groups = (32 / nbit) / 2; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + + for (int i = 0; i < num_groups; ++i) + { + int left_shift = nbit * i; + int right_shift = nbit * (num_groups - i - 1); + new_value |= (current_value & (mask << nbit * (2 * i))) >> left_shift; + new_value |= (current_value & (mask << nbit * (2 * i + 1))) << right_shift; + if (verbose) + { + printf("put %d to %d\n", (2 * i), (nbit * (2 * i) - left_shift) / nbit); + printf("put %d to %d\n", (2 * i + 1), (nbit * (2 * i + 1) + right_shift) / nbit); + } + } + // design for n16 case + // 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + // convert {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // to {e31,e29,e27,e25,e30,e28,e26,e24,e23,e21,e19,e17,e22,e20,e18,e16,e15,e13,e11,e9,e14,e12,e10,e8,e7,e5,e3,e1,e6,e4,e2,e0} + int32_t _new_value_n16 = (new_value & 0xf000000f); + _new_value_n16 |= ((new_value & 0x000000f0) >> 4) << 8; + _new_value_n16 |= ((new_value & 0x00000f00) >> 8) << 16; + _new_value_n16 |= ((new_value & 0x0000f000) >> 12) << 24; + _new_value_n16 |= ((new_value & 
0x000f0000) >> 16) << 4; + _new_value_n16 |= ((new_value & 0x00f00000) >> 20) << 12; + _new_value_n16 |= ((new_value & 0x0f000000) >> 24) << 20; + int32_interleaved[idx] = _new_value_n16; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +// nvcc -gencode arch=compute_80,code=sm_80 -lineinfo -O3 fast_decode_s1_fp16_n8.cu -o fast_decode_s1_fp16_n8; ./fast_decode_s1_fp16_n8 +int main() +{ + const int nbits = 1; + // permuate should be done at int32. + const int N = 32 / nbits; + + // create four int8_t values + int8_t *lowbit_data = new int8_t[N]; + for (int i = 0; i < N; i++) + { + lowbit_data[i] = rand() % 2; + } + for (int i = 0; i < N; i++) + { + printf("lowbit_data[%d] = %d\n", i, (int)lowbit_data[i]); + } + int8_t *is = new int8_t[4]; + general_compress(lowbit_data, is, nbits, N); + int8_t *interleaved = new int8_t[4]; + general_interleave_fp16_n8_1b(is, interleaved, nbits, 4 * sizeof(int8_t), true); + + half *B_local_decode = new half[N]; + int8_t *is_gpu; + half *B_local_decode_gpu; + + cudaMalloc((void **)&is_gpu, 4 * sizeof(int8_t)); + cudaMalloc((void **)&B_local_decode_gpu, N * sizeof(half)); + cudaMemcpy(is_gpu, interleaved, 4 * sizeof(int8_t), cudaMemcpyHostToDevice); + cudaMemcpy(B_local_decode_gpu, B_local_decode, N * sizeof(half), cudaMemcpyHostToDevice); + // print the last error + cudaError_t cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + main_kernel0<<>>(is_gpu, B_local_decode_gpu, N); + // print error + cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + cudaMemcpy(B_local_decode, B_local_decode_gpu, N * sizeof(half), cudaMemcpyDeviceToHost); + + return 0; +} diff --git a/testing/type_conversion/cpp/fast_decode_s1_s8.cu b/testing/type_conversion/cpp/fast_decode_s1_s8.cu new file mode 100644 index 0000000000..cc3feb9ae4 --- /dev/null +++ b/testing/type_conversion/cpp/fast_decode_s1_s8.cu @@ -0,0 +1,603 @@ +#include +#include +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#include +__device__ half max(half a, half b) +{ + return __hgt(__half(a), __half(b)) ? a : b; +} +__device__ half min(half a, half b) +{ + return __hlt(__half(a), __half(b)) ? 
a : b; +} +#else + +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef unsigned int uint; + +#define TVM_FORCE_INLINE inline __attribute__((always_inline)) +#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__ +#define TVM_ALIGNED(x) __attribute__((aligned(x))) +#define TVM_HALF_OPERATOR(RTYPE, OP) \ + TVM_XINLINE RTYPE operator OP(half a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(half a, T b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(T a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } + +#define TVM_HALF_ASSIGNOP(AOP, OP) \ + template \ + TVM_XINLINE half operator AOP(const T &a) \ + { \ + return *this = half(float(*this) OP float(a)); \ + } \ + template \ + TVM_XINLINE half operator AOP(const volatile T &a) volatile \ + { \ + return *this = half(float(*this) OP float(a)); \ + } + +class TVM_ALIGNED(2) half +{ +public: + uint16_t half_; + + static TVM_XINLINE half Binary(uint16_t value) + { + half res; + res.half_ = value; + return res; + } + + TVM_XINLINE half() {} + + TVM_XINLINE half(const float &value) { constructor(value); } + TVM_XINLINE explicit half(const double &value) { constructor(value); } + TVM_XINLINE explicit half(const int8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const int32_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint &value) { constructor(value); } + TVM_XINLINE explicit half(const long long &value) { constructor(value); } + TVM_XINLINE explicit half(const uint64_t &value) { constructor(value); } + + TVM_XINLINE operator float() const + { + return float(half2float(half_)); + } + TVM_XINLINE operator float() const volatile + { + return float(half2float(half_)); + } + + TVM_HALF_ASSIGNOP(+=, +) + TVM_HALF_ASSIGNOP(-=, -) + TVM_HALF_ASSIGNOP(*=, *) + TVM_HALF_ASSIGNOP(/=, /) + + TVM_XINLINE half operator+() + { + return *this; + } + + TVM_XINLINE half operator-() + { + return half(-float(*this)); + } + + TVM_XINLINE half operator=(const half &a) + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) + { + return *this = half(a); + } + + TVM_XINLINE half operator=(const half &a) volatile + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) volatile + { + return *this = half(a); + } + +private: + union Bits + { + float f; + int32_t si; + uint ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + 
static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + TVM_XINLINE uint16_t float2half(const float &value) const + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + TVM_XINLINE uint16_t float2half( + const volatile float &value) const volatile + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 
0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + TVM_XINLINE float half2float(const uint16_t &value) const + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + TVM_XINLINE float half2float( + const volatile uint16_t &value) const volatile + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + TVM_XINLINE void constructor(const T &value) + { + half_ = float2half(float(value)); + } +}; + +TVM_HALF_OPERATOR(half, +) +TVM_HALF_OPERATOR(half, -) +TVM_HALF_OPERATOR(half, *) +TVM_HALF_OPERATOR(half, /) +TVM_HALF_OPERATOR(bool, >) +TVM_HALF_OPERATOR(bool, <) +TVM_HALF_OPERATOR(bool, >=) +TVM_HALF_OPERATOR(bool, <=) + +TVM_XINLINE half __float2half_rn(const float a) +{ + return half(a); +} +#endif + +// Pack two half values. +static inline __device__ __host__ unsigned +__pack_half2(const half x, const half y) +{ + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// Some fp16 math functions are not supported in cuda_fp16.h, +// so we define them here to make sure the generated CUDA code +// is valid. 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) \ + { \ + float tmp_x = __half2float(x); \ + float tmp_y = __half2float(y); \ + float result = FP32_MATH_NAME(tmp_x, tmp_y); \ + return __float2half(result); \ + } + +#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x) \ + { \ + float tmp_x = __half2float(x); \ + float result = FP32_MATH_NAME(tmp_x); \ + return __float2half(result); \ + } + +CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf) + +#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY +#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY + +#endif +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include +#endif + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif + +#ifdef _WIN32 +using uint = unsigned int; +using uchar = unsigned char; +using ushort = unsigned short; +using ushort = unsigned short; +using uint64_t = unsigned long long; +#else +#define uint unsigned int +#define uchar unsigned char +#define ushort unsigned short +#define int64_t long long +#define uint64_t unsigned long long +#endif + +// __device__ void decode_i1s_to_i8s(int *_i1s, int *_i8s) +// { +// // convert 8 int2b_t to 8 int8b_t -> 2 int32 +// uint *i8s = reinterpret_cast(_i8s); + +// // i4s = {e7,e6,e5,e4,e3,e2,e1,e0} +// // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} +// uint const i1s = *_i1s; + +// // First, we extract the i4s and construct an intermediate fp16 number. 
+// static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 +// static constexpr uint BOTTOM_MASK = 0x01010101; // 0xf -> 0b01 select 0,1 +// static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[0]) +// : "r"(i1s), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[1]) +// : "r"(i1s >> 2), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[2]) +// : "r"(i1s >> 4), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[3]) +// : "r"(i1s >> 6), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[4]) +// : "r"(i1s >> 8), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[5]) +// : "r"(i1s >> 10), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[6]) +// : "r"(i1s >> 12), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[7]) +// : "r"(i1s >> 14), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// } + +template +__device__ void decode_i1s_to_i8s(T1 *_i1s, T2 *_i8s, const int N = 32) +{ + uint *i8s = reinterpret_cast(_i8s); + uint const i1s = *reinterpret_cast(_i1s); + + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0} + // First, we extract the i1s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 + static constexpr uint I8s_MAGIC_NUM = 0x00000000; + + for (int i = 0; i < N / 4; i++) + { + // 32 int8 -> 8 int1 + printf("i = %d ", i); + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i1s >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut)); + printf("i8s[%d] = %x\n", i, (int)i8s[i]); + + } +} + +__device__ void printBinary(int n) +{ + for (int i = sizeof(n) * 8 - 1; i >= 0; i--) + { + printf("%c", (n & (1 << i)) ? '1' : '0'); + } + printf("\n"); +} + +void printBinary_cpu(int n) +{ + for (int i = sizeof(n) * 8 - 1; i >= 0; i--) + { + printf("%c", (n & (1 << i)) ? 
'1' : '0'); + } + printf("\n"); +} + + +extern "C" __global__ void main_kernel0(int8_t *__restrict__ B, int8_t *__restrict__ B_1, const int N = 32) +{ + // print B + for (int i = 0; i < N / 8; i++) + { + printf("B[%d] = %x\n", i, (int)B[i]); + } + printf("int-B = %x\n", reinterpret_cast(B)[0]); + decode_i1s_to_i8s(B, B_1, N); + for (int i = 0; i < N; i++) + { + printf("B_1[%d] = %d\n", i, int(B_1[i])); + } +} + +void general_interleave_int8(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + // For fp16 example + // i4s {e7,e6,e5,e4,e3,e2,e1,e0} + // |-8b-||-8b-||-8b-||-8b-| + // interleave {e7,e3,e6,e2,e5,e1,e4,e0} + /* + BOTTOM_MASK 0 0 0 f 0 0 0 f + i4s e7 e3 e6 e2 e5 e1 e4 e0 + selectedVal 0000 e3 0000 e2 0000 e1 0000 e0 // selectedVal = i4s & BOTTOM_MASK + s[0] 0 e3 0 e2 0 e1 0 e0 + */ + + // |-----8b-------||-------8b----||----8b---||-----8b----| + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e11,e7,e3,e14,e10,e6,e2,e13,e9,e5,e1,e12,e8,e4,e0} + + // |-------------8b----------------||--------------8b--------------||------------8b--------------||--------8b-----------| + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e27,e23,e19,e15,e11,e7,e3,e30,e26,e22,e18,e14,e10,e6,e2,e29,e25,e21,e17,e13,e9,e5,e1,e28,e24,e20,e16,e12,e8,e4,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + constexpr int bits_stride = 8; + int elems_per_group = bits_stride / nbit; + int mask = (1 << nbit) - 1; + int num_groups = 32 / bits_stride; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + for (int i = 0; i < num_groups; ++i) + { + for (int j = 0; j < elems_per_group; ++j) + { + int offset = i * elems_per_group + j; + int shift = (offset % num_groups) * bits_stride + (offset / num_groups) * nbit; + int group_value = (current_value >> (nbit * (i * elems_per_group + j))) & mask; + new_value |= group_value << shift; + if (verbose) + printf("put %d to %d\n", offset, shift); + } + } + + int32_interleaved[idx] = new_value; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N) +{ + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= (lowbit[nbit_per_byte * i + j] << (nbit * j)); + } + } +} + +int main() +{ + const int N = 32; + int8_t *i1s = new int8_t[N]; + for (int i = 0; i < N; i++) + { + i1s[i] = rand() % 2; + } + // compressed_int8: compress 16 i1s to 2 i8s + int8_t *i8s = new int8_t[N / 8]; + for (int i = 0; i < N / 8; i++) + { + for (int j = 0; j < 8; j++) + { + i8s[i] |= (i1s[8 * i + j] << j); + } + } + for (int i = 0; i < N; i++) + { + printf("i1s[%d] = %d\n", i, (int)i1s[i]); + } + int8_t *interleaved = new int8_t[N / 8]; + general_interleave_int8(i8s, interleaved, 1, N / 8 * sizeof(int8_t)); + + printf("before interleave: "); + printBinary_cpu(reinterpret_cast(i8s)[0]); + printf("after interleave: "); + printBinary_cpu(reinterpret_cast(interleaved)[0]); + + int8_t *B_local_decode = new int8_t[N]; + int8_t *i8s_gpu; + int8_t 
*B_local_decode_gpu; + + cudaMalloc((void **)&i8s_gpu, N / 8 * sizeof(int8_t)); + cudaMalloc((void **)&B_local_decode_gpu, N * sizeof(int8_t)); + // cudaMemcpy(i8s_gpu, i8s, N / 8 * sizeof(int8_t), cudaMemcpyHostToDevice); + cudaMemcpy(i8s_gpu, interleaved, N / 8 * sizeof(int8_t), cudaMemcpyHostToDevice); + cudaMemcpy(B_local_decode_gpu, B_local_decode, N * sizeof(int8_t), cudaMemcpyHostToDevice); + // print the last error + cudaError_t cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + main_kernel0<<>>(i8s_gpu, B_local_decode_gpu, N); + // print error + cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + cudaMemcpy(B_local_decode, B_local_decode_gpu, N * sizeof(half), cudaMemcpyDeviceToHost); + + return 0; +} diff --git a/testing/type_conversion/cpp/fast_decode_s1_s8_n16.cu b/testing/type_conversion/cpp/fast_decode_s1_s8_n16.cu new file mode 100644 index 0000000000..91af5b7894 --- /dev/null +++ b/testing/type_conversion/cpp/fast_decode_s1_s8_n16.cu @@ -0,0 +1,618 @@ +#include +#include +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#include +__device__ half max(half a, half b) +{ + return __hgt(__half(a), __half(b)) ? a : b; +} +__device__ half min(half a, half b) +{ + return __hlt(__half(a), __half(b)) ? a : b; +} +#else + +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef unsigned int uint; + +#define TVM_FORCE_INLINE inline __attribute__((always_inline)) +#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__ +#define TVM_ALIGNED(x) __attribute__((aligned(x))) +#define TVM_HALF_OPERATOR(RTYPE, OP) \ + TVM_XINLINE RTYPE operator OP(half a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(half a, T b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(T a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } + +#define TVM_HALF_ASSIGNOP(AOP, OP) \ + template \ + TVM_XINLINE half operator AOP(const T &a) \ + { \ + return *this = half(float(*this) OP float(a)); \ + } \ + template \ + TVM_XINLINE half operator AOP(const volatile T &a) volatile \ + { \ + return *this = half(float(*this) OP float(a)); \ + } + +class TVM_ALIGNED(2) half +{ +public: + uint16_t half_; + + static TVM_XINLINE half Binary(uint16_t value) + { + half res; + res.half_ = value; + return res; + } + + TVM_XINLINE half() {} + + TVM_XINLINE half(const float &value) { constructor(value); } + TVM_XINLINE explicit half(const double &value) { constructor(value); } + TVM_XINLINE explicit half(const int8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const int32_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint &value) { constructor(value); } + TVM_XINLINE explicit half(const long long &value) { constructor(value); } + TVM_XINLINE explicit half(const uint64_t &value) { constructor(value); } + + TVM_XINLINE operator float() const + { + return float(half2float(half_)); + } + TVM_XINLINE operator float() const volatile + { + return float(half2float(half_)); + } + + TVM_HALF_ASSIGNOP(+=, +) + TVM_HALF_ASSIGNOP(-=, -) + TVM_HALF_ASSIGNOP(*=, *) + TVM_HALF_ASSIGNOP(/=, /) + + TVM_XINLINE half 
operator+() + { + return *this; + } + + TVM_XINLINE half operator-() + { + return half(-float(*this)); + } + + TVM_XINLINE half operator=(const half &a) + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) + { + return *this = half(a); + } + + TVM_XINLINE half operator=(const half &a) volatile + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) volatile + { + return *this = half(a); + } + +private: + union Bits + { + float f; + int32_t si; + uint ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + TVM_XINLINE uint16_t float2half(const float &value) const + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 
0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + TVM_XINLINE uint16_t float2half( + const volatile float &value) const volatile + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + TVM_XINLINE float half2float(const uint16_t &value) const + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + TVM_XINLINE float half2float( + const volatile uint16_t &value) const volatile + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + TVM_XINLINE void constructor(const T &value) + { + half_ = float2half(float(value)); + } +}; + +TVM_HALF_OPERATOR(half, +) +TVM_HALF_OPERATOR(half, -) +TVM_HALF_OPERATOR(half, *) +TVM_HALF_OPERATOR(half, /) +TVM_HALF_OPERATOR(bool, >) +TVM_HALF_OPERATOR(bool, <) +TVM_HALF_OPERATOR(bool, >=) +TVM_HALF_OPERATOR(bool, <=) + +TVM_XINLINE half __float2half_rn(const float a) +{ + return half(a); +} +#endif + +// Pack two half values. +static inline __device__ __host__ unsigned +__pack_half2(const half x, const half y) +{ + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// Some fp16 math functions are not supported in cuda_fp16.h, +// so we define them here to make sure the generated CUDA code +// is valid. 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) \ + { \ + float tmp_x = __half2float(x); \ + float tmp_y = __half2float(y); \ + float result = FP32_MATH_NAME(tmp_x, tmp_y); \ + return __float2half(result); \ + } + +#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x) \ + { \ + float tmp_x = __half2float(x); \ + float result = FP32_MATH_NAME(tmp_x); \ + return __float2half(result); \ + } + +CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf) + +#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY +#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY + +#endif +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include +#endif + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif + +#ifdef _WIN32 +using uint = unsigned int; +using uchar = unsigned char; +using ushort = unsigned short; +using ushort = unsigned short; +using uint64_t = unsigned long long; +#else +#define uint unsigned int +#define uchar unsigned char +#define ushort unsigned short +#define int64_t long long +#define uint64_t unsigned long long +#endif + +// __device__ void decode_i1s_to_i8s(int *_i1s, int *_i8s) +// { +// // convert 8 int2b_t to 8 int8b_t -> 2 int32 +// uint *i8s = reinterpret_cast(_i8s); + +// // i4s = {e7,e6,e5,e4,e3,e2,e1,e0} +// // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0} +// uint const i1s = *_i1s; + +// // First, we extract the i4s and construct an intermediate fp16 number. 
+// static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 +// static constexpr uint BOTTOM_MASK = 0x01010101; // 0xf -> 0b01 select 0,1 +// static constexpr uint I4s_TO_I8s_MAGIC_NUM = 0x00000000; // 1024 + +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[0]) +// : "r"(i1s), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[1]) +// : "r"(i1s >> 2), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[2]) +// : "r"(i1s >> 4), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[3]) +// : "r"(i1s >> 6), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[4]) +// : "r"(i1s >> 8), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[5]) +// : "r"(i1s >> 10), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[6]) +// : "r"(i1s >> 12), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +// : "=r"(i8s[7]) +// : "r"(i1s >> 14), "n"(BOTTOM_MASK), "n"(I4s_TO_I8s_MAGIC_NUM), "n"(immLut)); +// } + +__device__ void printBinary(int n) +{ + for (int i = sizeof(n) * 8 - 1; i >= 0; i--) + { + printf("%c", (n & (1 << i)) ? '1' : '0'); + } + printf("\n"); +} + + +template +__device__ void decode_i1s_to_i8s_l16(T1 *_i1s, T2 *_i8s, const int N = 16) +{ + int *i8s = reinterpret_cast(_i8s); + int16_t i1s_i16 = *reinterpret_cast(_i1s); + // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} + // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} + int i1s = (i1s_i16 & 0x0f0f); + i1s |= ((i1s_i16 & 0xf0f0) << 12); + // i1s {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // First, we extract the i1s and construct an intermediate fp16 number. + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 + static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 + static constexpr uint I8s_MAGIC_NUM = 0x00000000; + + for (int i = 0; i < N / 4; i++) + { + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(i8s[i]) + : "r"(i1s >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut)); + } +} + + +void printBinary_cpu(int n) +{ + for (int i = sizeof(n) * 8 - 1; i >= 0; i--) + { + printf("%c", (n & (1 << i)) ? 
'1' : '0'); + } + printf("\n"); +} + + +extern "C" __global__ void main_kernel0(int8_t *__restrict__ B, int8_t *__restrict__ B_1, const int N = 32) +{ + // print B + for (int i = 0; i < N / 8; i++) + { + printf("B[%d] = %x\n", i, (int)B[i]); + } + printf("int-B = %x\n", reinterpret_cast(B)[0]); + int8_t B_1_local[2]; + B_1_local[0] = B[0]; + B_1_local[1] = B[1]; + decode_i1s_to_i8s_l16(B_1_local, B_1, N/2); + B_1_local[0] = B[2]; + B_1_local[1] = B[3]; + decode_i1s_to_i8s_l16(B_1_local, B_1 + N / 2, N/2); + for (int i = 0; i < N; i++) + { + printf("B_1[%d] = %d\n", i, int(B_1[i])); + } +} + +void general_interleave_int8_n16(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + // For fp16 example + // i4s {e7,e6,e5,e4,e3,e2,e1,e0} + // |-8b-||-8b-||-8b-||-8b-| + // interleave {e7,e3,e6,e2,e5,e1,e4,e0} + /* + BOTTOM_MASK 0 0 0 f 0 0 0 f + i4s e7 e3 e6 e2 e5 e1 e4 e0 + selectedVal 0000 e3 0000 e2 0000 e1 0000 e0 // selectedVal = i4s & BOTTOM_MASK + s[0] 0 e3 0 e2 0 e1 0 e0 + */ + + // |-----8b-------||-------8b----||----8b---||-----8b----| + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e11,e7,e3,e14,e10,e6,e2,e13,e9,e5,e1,e12,e8,e4,e0} + + // |-------------8b----------------||--------------8b--------------||------------8b--------------||--------8b-----------| + // i1s {e31,e30,e29,e28,e27,e26,e25,e24,e23,e22,e21,e20,e19,e18,e17,e16,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e31,e27,e23,e19,e15,e11,e7,e3,e30,e26,e22,e18,e14,e10,e6,e2,e29,e25,e21,e17,e13,e9,e5,e1,e28,e24,e20,e16,e12,e8,e4,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + constexpr int bits_stride = 8; + int elems_per_group = bits_stride / nbit; + int mask = (1 << nbit) - 1; + int num_groups = 32 / bits_stride; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + for (int i = 0; i < num_groups; ++i) + { + for (int j = 0; j < elems_per_group; ++j) + { + int offset = i * elems_per_group + j; + int shift = (offset % num_groups) * bits_stride + (offset / num_groups) * nbit; + int group_value = (current_value >> (nbit * (i * elems_per_group + j))) & mask; + new_value |= group_value << shift; + if (verbose) + printf("put %d to %d\n", offset, shift); + } + } + printf("new_value = %x\n", new_value); + + // design for n16 case + int32_t _new_value_n16 = (new_value & 0xf0f00f0f); + _new_value_n16 |= ((new_value & 0x000000f0) >> 4) << 16; + _new_value_n16 |= ((new_value & 0x0000f000) >> 12) << 24; + _new_value_n16 |= ((new_value & 0x000f0000) >> 16) << 4; + _new_value_n16 |= ((new_value & 0x0f000000) >> 24) << 12; + + int32_interleaved[idx] = _new_value_n16; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N) +{ + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= (lowbit[nbit_per_byte * i + j] << (nbit * j)); + } + } +} + +int main() +{ + const int N = 32; + int8_t *i1s = new int8_t[N]; + for (int i = 0; i < N; i++) + { + i1s[i] = rand() % 2; + } + // compressed_int8: compress 16 i1s to 2 i8s + int8_t *i8s = new int8_t[N / 8]; + 
for (int i = 0; i < N / 8; i++) + { + for (int j = 0; j < 8; j++) + { + i8s[i] |= (i1s[8 * i + j] << j); + } + } + for (int i = 0; i < N; i++) + { + printf("i1s[%d] = %d\n", i, (int)i1s[i]); + } + int8_t *interleaved = new int8_t[N / 8]; + general_interleave_int8_n16(i8s, interleaved, 1, N / 8 * sizeof(int8_t)); + + printf("before interleave: %x ", reinterpret_cast(i8s)[0]); + printBinary_cpu(reinterpret_cast(i8s)[0]); + printf("after interleave: %x ", reinterpret_cast(interleaved)[0]); + printBinary_cpu(reinterpret_cast(interleaved)[0]); + + int8_t *B_local_decode = new int8_t[N]; + int8_t *i8s_gpu; + int8_t *B_local_decode_gpu; + + cudaMalloc((void **)&i8s_gpu, N / 8 * sizeof(int8_t)); + cudaMalloc((void **)&B_local_decode_gpu, N * sizeof(int8_t)); + // cudaMemcpy(i8s_gpu, i8s, N / 8 * sizeof(int8_t), cudaMemcpyHostToDevice); + cudaMemcpy(i8s_gpu, interleaved, N / 8 * sizeof(int8_t), cudaMemcpyHostToDevice); + cudaMemcpy(B_local_decode_gpu, B_local_decode, N * sizeof(int8_t), cudaMemcpyHostToDevice); + // print the last error + cudaError_t cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + main_kernel0<<>>(i8s_gpu, B_local_decode_gpu, N); + // print error + cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", + cudaGetErrorString(cudaerr)); + cudaMemcpy(B_local_decode, B_local_decode_gpu, N * sizeof(half), cudaMemcpyDeviceToHost); + + return 0; +} diff --git a/testing/type_conversion/cpp/fast_decode_s2_fp16.cu b/testing/type_conversion/cpp/fast_decode_s2_fp16.cu new file mode 100644 index 0000000000..681d49936d --- /dev/null +++ b/testing/type_conversion/cpp/fast_decode_s2_fp16.cu @@ -0,0 +1,520 @@ +#include +#include +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#include +__device__ half max(half a, half b) +{ + return __hgt(__half(a), __half(b)) ? a : b; +} +__device__ half min(half a, half b) +{ + return __hlt(__half(a), __half(b)) ? 
a : b; +} +#else + +typedef unsigned short uint16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef int int32_t; +typedef unsigned long long uint64_t; +typedef unsigned int uint; + +#define TVM_FORCE_INLINE inline __attribute__((always_inline)) +#define TVM_XINLINE TVM_FORCE_INLINE __device__ __host__ +#define TVM_ALIGNED(x) __attribute__((aligned(x))) +#define TVM_HALF_OPERATOR(RTYPE, OP) \ + TVM_XINLINE RTYPE operator OP(half a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(half a, T b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } \ + template \ + TVM_XINLINE RTYPE operator OP(T a, half b) \ + { \ + return RTYPE(float(a) OP float(b)); \ + } + +#define TVM_HALF_ASSIGNOP(AOP, OP) \ + template \ + TVM_XINLINE half operator AOP(const T &a) \ + { \ + return *this = half(float(*this) OP float(a)); \ + } \ + template \ + TVM_XINLINE half operator AOP(const volatile T &a) volatile \ + { \ + return *this = half(float(*this) OP float(a)); \ + } + +class TVM_ALIGNED(2) half +{ +public: + uint16_t half_; + + static TVM_XINLINE half Binary(uint16_t value) + { + half res; + res.half_ = value; + return res; + } + + TVM_XINLINE half() {} + + TVM_XINLINE half(const float &value) { constructor(value); } + TVM_XINLINE explicit half(const double &value) { constructor(value); } + TVM_XINLINE explicit half(const int8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint8_t &value) { constructor(value); } + TVM_XINLINE explicit half(const int32_t &value) { constructor(value); } + TVM_XINLINE explicit half(const uint &value) { constructor(value); } + TVM_XINLINE explicit half(const long long &value) { constructor(value); } + TVM_XINLINE explicit half(const uint64_t &value) { constructor(value); } + + TVM_XINLINE operator float() const + { + return float(half2float(half_)); + } + TVM_XINLINE operator float() const volatile + { + return float(half2float(half_)); + } + + TVM_HALF_ASSIGNOP(+=, +) + TVM_HALF_ASSIGNOP(-=, -) + TVM_HALF_ASSIGNOP(*=, *) + TVM_HALF_ASSIGNOP(/=, /) + + TVM_XINLINE half operator+() + { + return *this; + } + + TVM_XINLINE half operator-() + { + return half(-float(*this)); + } + + TVM_XINLINE half operator=(const half &a) + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) + { + return *this = half(a); + } + + TVM_XINLINE half operator=(const half &a) volatile + { + half_ = a.half_; + return a; + } + + template + TVM_XINLINE half operator=(const T &a) volatile + { + return *this = half(a); + } + +private: + union Bits + { + float f; + int32_t si; + uint ui; + }; + + static int const fp16FractionBits = 10; + static int const fp32FractionBits = 23; + static int32_t const fp32FractionMask = ~(~0u << fp32FractionBits); // == 0x7fffff + static int32_t const fp32HiddenBit = 1 << fp32FractionBits; // == 0x800000 + static int const shift = fp32FractionBits - fp16FractionBits; // == 13 + static int const shiftSign = 16; + static int32_t const expAdjust = 127 - 15; // exp32-127 = exp16-15, so exp16 = exp32 - (127-15) + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FFFFF; // max flt32 that's a flt16 normal after >> by shift + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const maxZ = 0x33000000; // max fp32 number that's still rounded to zero in fp16 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + 
static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + TVM_XINLINE uint16_t float2half(const float &value) const + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + // Same as above routine, except for addition of volatile keyword + TVM_XINLINE uint16_t float2half( + const volatile float &value) const volatile + { + Bits v; + v.f = value; + uint sign = v.si & signN; // grab sign bit + v.si ^= sign; // clear sign bit from v + sign >>= shiftSign; // logical shift sign to fp16 position + + if (v.si <= maxZ) + { + // Handle eventual zeros here to ensure + // vshift will not exceed 32 below. + v.ui = 0; + } + else if (v.si < minN) + { + // Handle denorms + uint exp32 = v.ui >> fp32FractionBits; + int32_t exp16 = exp32 - expAdjust; + // If exp16 == 0 (just into the denorm range), then significant should be shifted right 1. + // Smaller (so negative) exp16 values should result in greater right shifts. + uint vshift = 1 - exp16; + uint significand = fp32HiddenBit | (v.ui & fp32FractionMask); + v.ui = significand >> vshift; + v.ui += (v.ui & 0x3fff) != 0x1000 || (significand & 0x7ff) ? 0x1000 : 0; + } + else if (v.si <= maxN) + { + // Handle norms + v.ui += (v.ui & 0x3fff) != 0x1000 ? 
0x1000 : 0; + v.ui -= expAdjust << fp32FractionBits; + } + else if (v.si <= infN) + { + v.si = infN; + } + else if (v.si < nanN) + { + v.si = nanN; + } + + v.ui >>= shift; + return sign | (v.ui & 0x7fff); + } + + TVM_XINLINE float half2float(const uint16_t &value) const + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + TVM_XINLINE float half2float( + const volatile uint16_t &value) const volatile + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } + + template + TVM_XINLINE void constructor(const T &value) + { + half_ = float2half(float(value)); + } +}; + +TVM_HALF_OPERATOR(half, +) +TVM_HALF_OPERATOR(half, -) +TVM_HALF_OPERATOR(half, *) +TVM_HALF_OPERATOR(half, /) +TVM_HALF_OPERATOR(bool, >) +TVM_HALF_OPERATOR(bool, <) +TVM_HALF_OPERATOR(bool, >=) +TVM_HALF_OPERATOR(bool, <=) + +TVM_XINLINE half __float2half_rn(const float a) +{ + return half(a); +} +#endif + +// Pack two half values. +static inline __device__ __host__ unsigned +__pack_half2(const half x, const half y) +{ + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// Some fp16 math functions are not supported in cuda_fp16.h, +// so we define them here to make sure the generated CUDA code +// is valid. 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define CUDA_UNSUPPORTED_HALF_MATH_BINARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x, half y) \ + { \ + float tmp_x = __half2float(x); \ + float tmp_y = __half2float(y); \ + float result = FP32_MATH_NAME(tmp_x, tmp_y); \ + return __float2half(result); \ + } + +#define CUDA_UNSUPPORTED_HALF_MATH_UNARY(HALF_MATH_NAME, FP32_MATH_NAME) \ + static inline __device__ __host__ half HALF_MATH_NAME(half x) \ + { \ + float tmp_x = __half2float(x); \ + float result = FP32_MATH_NAME(tmp_x); \ + return __float2half(result); \ + } + +CUDA_UNSUPPORTED_HALF_MATH_BINARY(hpow, powf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htanh, tanhf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(htan, tanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(hatan, atanf) +CUDA_UNSUPPORTED_HALF_MATH_UNARY(herf, erf) + +#undef CUDA_UNSUPPORTED_HALF_MATH_BINARY +#undef CUDA_UNSUPPORTED_HALF_MATH_UNARY + +#endif +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610) +#include +#endif + +#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ + (__CUDACC_VER_MAJOR__ > 11)) +#define TVM_ENABLE_L2_PREFETCH 1 +#else +#define TVM_ENABLE_L2_PREFETCH 0 +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 800) +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 1 +#else +#define TVM_ENBALE_EFFICIENT_SMEM_PTR_CAST 0 +#endif + +#ifdef _WIN32 +using uint = unsigned int; +using uchar = unsigned char; +using ushort = unsigned short; +using ushort = unsigned short; +using uint64_t = unsigned long long; +#else +#define uint unsigned int +#define uchar unsigned char +#define ushort unsigned short +#define int64_t long long +#define uint64_t unsigned long long +#endif + +template +__device__ void decode_i2s_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 16) +{ + uint *h = reinterpret_cast(B_local_decode); + + static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint BOTTOM_MASK = 0x00030003; + static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400; + uint const i2s = *reinterpret_cast(_i2s); + printf("i2s = %x\n", i2s); + +#pragma unroll + // decode 2 elems at one time. 
+ for (int i = 0; i < (N / 2); i++) + { + + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[i]) + : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[i]) : "r"(h[i]), "r"(FP16_TOP_MAGIC_NUM)); + } +} + +extern "C" __global__ void main_kernel0(int8_t *__restrict__ B, half *__restrict__ B_1, const int N = 8) +{ + // print B + // print B + for (int i = 0; i < N / 2; i++) + { + printf("B[%d] = %d\n", i, (int)B[i]); + } + decode_i2s_to_f16(reinterpret_cast(B), B_1); + __syncthreads(); + for (int i = 0; i < N; i++) + { + printf("B_1[%d] = %f\n", i, float(B_1[i])); + } +} + +void general_compress(const int8_t *lowbit, int8_t *compressed, const int nbit, const int N) +{ + const int nbit_per_byte = 8 / nbit; + + for (int i = 0; i < N / nbit_per_byte; i++) + { + for (int j = 0; j < nbit_per_byte; j++) + { + compressed[i] |= (lowbit[nbit_per_byte * i + j] << (nbit * j)); + } + } +} + +void general_interleave_fp16_n8(int8_t *origin_arr, int8_t *interleaved, const int nbit, size_t size_in_bytes, bool verbose = false) +{ + + // i2s {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0} + // if 16b + // {e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0} + // Assuming size is the number of int32 elements in origin_arr + size_t size = size_in_bytes / sizeof(int32_t); + int32_t *int32_origin = (int32_t *)origin_arr; + int32_t *int32_interleaved = (int32_t *)interleaved; + + int mask = (1 << nbit) - 1; + int num_groups = (32 / nbit) / 2; + + for (int idx = 0; idx < size; ++idx) + { + int32_t current_value = int32_origin[idx]; + int32_t new_value = 0; + + for (int i = 0; i < num_groups; ++i) + { + int left_shift = nbit * i; + int right_shift = nbit * (num_groups - i - 1); + new_value |= (current_value & (mask << nbit * (2 * i))) >> left_shift; + new_value |= (current_value & (mask << nbit * (2 * i + 1))) << right_shift; + if (verbose) + { + printf("put %d to %d\n", (2 * i), (nbit * (2 * i) - left_shift) / nbit); + printf("put %d to %d\n", (2 * i + 1), (nbit * (2 * i + 1) + right_shift) / nbit); + } + } + + int32_interleaved[idx] = new_value; + } + + // Convert back to int8_t if needed + memcpy(interleaved, int32_interleaved, size * sizeof(int32_t)); +} + +int main() +{ + const int nbits = 2; + // permuate should be done at int32. 
+    const int N = 32 / nbits;
+
+    // create four int8_t values
+    int8_t *lowbit_data = new int8_t[N];
+    for (int i = 0; i < N; i++)
+    {
+        lowbit_data[i] = rand() % 4;
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("lowbit_data[%d] = %d\n", i, (int)lowbit_data[i]);
+    }
+    int8_t *is = new int8_t[4];
+    general_compress(lowbit_data, is, nbits, N);
+    int8_t *interleaved = new int8_t[4];
+    general_interleave_fp16_n8(is, interleaved, nbits, 4 * sizeof(int8_t), true);
+
+    half *B_local_decode = new half[N];
+    int8_t *is_gpu;
+    half *B_local_decode_gpu;
+
+    cudaMalloc((void **)&is_gpu, 4 * sizeof(int8_t));
+    cudaMalloc((void **)&B_local_decode_gpu, N * sizeof(half));
+    cudaMemcpy(is_gpu, interleaved, 4 * sizeof(int8_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_local_decode_gpu, B_local_decode, N * sizeof(half), cudaMemcpyHostToDevice);
+    // print the last error
+    cudaError_t cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+        printf("kernel launch failed with error \"%s\".\n",
+               cudaGetErrorString(cudaerr));
+    main_kernel0<<<1, 1>>>(is_gpu, B_local_decode_gpu, N);
+    // print error
+    cudaerr = cudaDeviceSynchronize();
+    if (cudaerr != cudaSuccess)
+        printf("kernel launch failed with error \"%s\".\n",
+               cudaGetErrorString(cudaerr));
+    cudaMemcpy(B_local_decode, B_local_decode_gpu, N * sizeof(half), cudaMemcpyDeviceToHost);
+
+    return 0;
+}
diff --git a/testing/type_conversion/cpp/fast_decode_s2_fp16_n8 b/testing/type_conversion/cpp/fast_decode_s2_fp16_n8
new file mode 100755
index 0000000000000000000000000000000000000000..c9eedddfc983b2bd8ef648fc585e2af620f62def
GIT binary patch
literal 846456
[base85-encoded binary payload of the compiled test executable omitted]
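A note on the 2-bit decode path above, illustrated by a host-only sketch that is not part of the patch: lop3.b32 with immLut = (0xf0 & 0xcc) | 0xaa (0xEA) evaluates (a & b) | c bit-wise, so each step of decode_i2s_to_f16 amounts to masking one 2-bit field per 16-bit lane out of the packed register and OR-ing in the fp16 constant 0x6400 (1024.0); the following sub.f16x2 removes the 1024 bias. The constants below mirror the kernel's BOTTOM_MASK and FP16_TOP_MAGIC_NUM; the element ordering of the real kernel additionally depends on the permutation produced by general_interleave_fp16_n8, which this sketch deliberately ignores.

// Host-side sketch of decoding one packed register, no GPU required.
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t BOTTOM_MASK = 0x00030003;        // one 2-bit field per 16-bit lane
    const uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; // fp16 bit pattern of 1024.0 in both lanes

    // 16 packed 2-bit values; every byte 0xE4 = 0b11100100 encodes 0,1,2,3
    // reading from the least-significant pair upward.
    const uint32_t i2s = 0xE4E4E4E4u;

    for (int i = 0; i < 8; i++)
    {
        // Equivalent of lop3.b32 with immLut = (0xf0 & 0xcc) | 0xaa, i.e. (a & b) | c.
        uint32_t h = ((i2s >> (2 * i)) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM;
        // Each lane now holds the fp16 encoding of (1024 + e). Because e < 4 sits in
        // the low mantissa bits, subtracting the 0x6400 pattern recovers e; the kernel
        // performs the same bias removal in fp16 arithmetic via sub.f16x2.
        printf("h[%d]: lo=%u hi=%u\n", i, (h & 0xFFFFu) - 0x6400u, (h >> 16) - 0x6400u);
    }
    return 0;
}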
zINx85^Zj4)JJ*iRX^HWaLFW#F2l~U`Oc4I2M0|h=Oc4I2r}zLfm>~SkHR1zIVS?~C zt;7eI!vx`Pnu-rFi3!5roFP8IEG7tlbCevAZ#DeQRyE9nofHI$xD^(ZVPS{{7gAmk zKeG#dW)=GxZeD?Qdg)y5e(>&Spf$OS@krAEd=z?o_A!I_!*)-9cc23~p`MHo_H>sn z<62q3)v|yqWC1Odk7*_mniK{uASZPuXV}iezq_H>VY3zA^()L7PMcIOA<)Z}y%M)wc^c460rC!X zao>zNpM0PFBux`Oq_{(IkP%l^>1Tm=S8zDYV*vpQwF)r<7wz!F;v}lTI*0!AW0Y?) zhA);WdU;+7a{P2e&b=`Qmy^&^b;_yvk>E@!$F1#B2b`v)=}qoLf!e@bOhOOH;x!DH zEd;1=A|eSiKN^EX@PVoXmP}v*x**U(5h$;bz{ywy9;`~BuORTNAh7;XAs`$utcilN z`H>hToUx>I;0~_D5qJB2wz9D>gd%NPKRPhVrUhdS+f%ku%p*Qg0BVBGPr>4afoUaZ zM4m?a3s@R0tms-omWUBvKl~m~zwvjn{5zcf@&LFwSwL`Ha#8RVGKSDbI7{KT(5!}Y zb$99x{H4j5K8KE{jY)O|F}O#2ys|Jj2A5gyO}KAK+~E>8OW@)aSrgBr zqH!r+op3!8x242A_z2Z`lp_lFifG&tiJL2Ne;qICyF}n-M&TX+&{SWZ=Yj5Vd?xyf zBc7hAkBa2SQ;x^|4%# zt?W{K6H4=wKX+mb7q++GgmtRp@KfX&4?awuu4_NG988RajcwFUhViwSz9KreAv~V> z_8Yo?m5bW{1&X=}o^!TH@1XQ|C|agBQo0c+%k&CLw~wUXpft@LB>Y)QQ~xK^4^sLA zN)xX~*75D;;Kx?B7o*>zK#%c-zV=y@8c|!h7`%qI;&WkeF-fj4xS|w)oZ;(J3a782 zFUz6FiCc;SH{#|PT+@OKG!t=#Z^&}t(x0?O6PFYQr%i?<{%m1A$SH9>HA7Ddjwbhb z2S<}ZBjly>12^_V#uB%7NH4%~4P2bUEZqcHXd{?a{#^eVT?+g2k*J5>p3Emu?>77f z4Sqi@Fu7R&KFFuCU0S_NKy3d02)xFPIN9r$eck~259W;cFClb#3=gh_KgTijpnc~L zAmVR=j4sVy2UwgRS+7FYW#{1J8uE{!{Pr!MQzZbJ@;>2wikNSgmQ#+eg2pZ!`UCWj zZ<$7W#+k=B?qH+7j58fLoY}24Dg#LIwN&7o0~}}gi?WsOp#9}9iAZh)!R|I8lnR93 z&LPG+FcDWUz4MQRvll+hRBJ8$48XxyzQEbYI1MC@eZ|*=GY>crz}KHe1kecCmuz~)T9$`1T;OfC zz!}9jqa=>K%`&3193lKtXm$tT>=w%GEpU1=PCLed3|j9dobd{0*}rX6v-F)CSq5nW zr#0i$l{oem|0SFZ;6MhiBPWvsTi;<|ElLP>N@1MO0A(5M!P2PBzilkxNp4Ru4rK7cw}dkbILZjJ09sautt@AVz-csyaBi14_G^|BPF@7Ze(+P^ zG!!_Q0%tDMxsq`pgY&i%&OscD5O%X*CZT=L+8|)wnpk@7V z3KYEM3Y?1>XRySv_h;VLjyB|G|7b%5=lC|RPcq|N!8nk?jk}1>WQDWrNDHc2`V1g~ z&Psu^r!~p?7{0L#>M)&_z<~_<;H#2>hU}8XPTNumr$XZ7?D!AS*)@uAa+)X{RUen2 z^C9Cr!Z?t@$W4SZ2RO-kIo1H}GI1bMUJD4Zh!p%U04CV-&evES%-&h7`GR|ATfef1CtCGP$WM9T51Ut25 zoHY_B=XAyy!Zun$R7yul6@0#V6e*Oy!${cMso@ykB( zwry0O4I>P>*~b%(bnf#7P6gv!CV5N0h3S+koMo%eU^}=1h>*c>f%5|69Kbi0!Rbt= zEpQ-%6X?5?46>13lo0H65953&adP&&N4)76FTxI1DI8UwUoj4aer{x(0OLRg-!pGd z14r3GDG(LTB7t)ihjzi#l-v^G4EXLxm#*I`2xHoU1=0Iv&Q!>7;N}eMSp9)0ob^i~|`A`+{&Xfum$_d>sfZ zq2?OQ$r3mN80R{PWADs3-wrl#>?;Z9n#&pIAde0^bzq!xC2#2`xzC!daF+3?O-sKQ zh@i7X;51{LAHX@w;O-5?TRw1L2Sv!KWH24sMG3)9$5TiKZ%Uk;J#Q1vk=qC-r?JA3 znRD_4ogIubg>lFqFmDTiqh#{4 z?HXAw@}Dj7@$qQY4yaK20gfM{QMS{ z!FuFWc5s4*adN((UqZ5eQR3vB$L-Y5I5|OuvuymiC?uG(9Egy?H)j&gIL3ht>N9V- zz)>=A15x#jr2;3&bUH{J`wQF__6{^~>?tOke1S8BaT-Y8(w}{UWIa#e2m)IAXDe9- z$pR;jaW+GsEQ6~*BAnj9fgL=9oJs~?VC1D_a1P@?@eg_y~@D_lE{kIcWmtp$w9BUCCQ| zfA((~3TN5=22`{37l8=>`T2f$o%aVX{8mFYRw6Y|%vTG%! z=3_}dWoe0NT4_nz4SXochcqAfzSmlNpEIX>Kfhmp$jsvPUTd%S*=L_UXC@|{HD4-? 
zdDc3rGZ-3pkOdOu06zv8U#uEuzic|ujI5igWYz;*%EL12Y?0v}g?dS6NvVCT8Oj>i z@CS2-jH{^uPry%fL9S#z1vm>pPi_zqiR-VvjT zh^859e#7x#8w}RGtu@++3m`;Ov=2j_{pZnCoXnmstwY{R*0NHbqFvW$d%DT0Z%YR0l4%V5U9 zcW8v4^|QP3oE_8EKSc9qQ!?G7H0GLBq!}w3tPC5M{LR3V*e9T?RV@1!lZDMV{Wi(2 z(53VR^J%Ez#HXUU^AalcrqY<%TA?T}$wTnb!Ii+;fM?jegc`ATE$ z_=Gg)vc}r8m^HD^SzamCoMl(C3}#etspWm`t~_r*tGlB847xhdh73H0`NB6@Ch)9% zNHmxh!=Z4MnkuqIEc=vYxPD&8rBZY$sS);F3N@S;ll0#hMh=mFJuv*stvPj=#I-dBzk4W`AJkN#$0Cy^bvhzxv(WiaC&mbKQUWYr7%mOu?B z8gPNe@H(ACnj)n!&*R~3%3&v-1_>tsuK*m|T;LeKxTnz!D4DgFFW678%<3gFObfNL zkCjrtu%@Z%%Cng-Jl8@a(<1TjU0pA-_gS_DlbOw!$Fe=WZ3E5qA36q}#3dJ9{n46+ z_d09fgu(9sy(}6%Eha(6JyCxmTh6k7Dw%oaO`3rr$S^JPfa_^d4qQ)*Aw0a!N@G3C z@0*;ug*28anpo$oppG&KfkhA53WmEQ?jm(5OG3R0%U z0Kl;o1Fx{GoswAxE6Kn|S!V5CpbXSEmb_BxdDi@avw)rG#%thJ&}b*#fKcCL{RUT1 zv^0h_pD2ymU;PxaHRgS!QEyhgESdqVd6qS>A(hYh7qw;5ctR}H8hyg=!9@`LjAYGkI0M*& zwygO88tuU&5b8Ip-WJW)1Uf{^l*X)N-|llX=JljeuWabLjylU(L+3SLMRQkBKNo6^ zo(jj_XAiCsO)bCqcAL_e*<9z#p6tOIkUT_Kb{tne?ZHacT%t5qTh=_u8f%i)=ojE0 zLx`~r=CSR=DZ?Jr@Jpf2&}a{OLAY!Z-9N|cAkkdUn#D?EM)^GdHOtnR=f!Al63s=d zd5Sfd2i;ins@90fVc!?;u?I~=GpYx9eXG)#!#^Y6`apwuFdd}!U?Z-1YKVa^n~>%r zrLk)Gm^NXJHCStwb&(sUbX|rAAF$>>IKkKh&tj_cai-mm=VA!8rjKZj-%NErR2p*- zuL=)18uO?5_Mog5q8Y|@o?s0;Sj%s|(Fe1*5$Pc8dmTb}y&cysjO}LD+@dsQ6(7@Y zdawthKxz+W0oNX+vgQJ%v7+41Gpw=FwMHM)d!$YxYktOg$R2dyUHJ{r=y~w_zwH9n z<)UfB9=xwKroo!uZ?-k&M=_dTaRov@%g!R-{>2)2P|N$|*;=DLco;%>@Ndz~U`R~l;@_wzh!tR3@|M)z}$)LD>0b-u?5 z%pUl8?(~2L1#o?w0hwHTlnr!Ugbm|aHe1Qeb6IxeCfho5AcVTqWYOHf!+VG|utDBW zd`C3egx0zqoX8bf0?WE8nc3$dj7gSYy4V zHOtauHf+O11T9^_Ep5iB&rbB>ma1>GTk;HtP;a(oh^CV3yrwkfY<^3mSTxuxTm?n! zmD{>gWcRadILqKf_gAQ;cDj_lNdEFRPl!uIGl(^Qr7_!csqX`9!$%--yx>_qE(TJO zue8y?Wi0y(s{mb#tlAk=D#9|WNMzVT3T(}W8cyWgMDDF)4R%d4*@>H=;Yo*C(HTO0 zaQ`Wqt&Q|Cs27yREZ{c<4tH1c$UO0u9+#R&?L{_|Wp}CJ`EKLk%@G;K#p^ac*8w#K ze*mjIhBt*ZS165n$wF#rAT*c|B_PF#j{%DlviJuEvFs4m0(Rohk4V;(W!B9iLq+vg zd!dxdVa*EGz=_qo>aV=PaUvcfOWnmehIJcV{eFfF z%u~bjP3KQtT&7Fu3uXz_aAKKgcCyB&G-euWzWoO~(FdeBQ4Uy~kk8=+=CbTRSVh>0 zPJFDMVVRXAGRKLVq|_YNe8?I&(VF*lH$bE3#Mg5iCz^<60&AX78r(l7$A9l;Ys?o( zqh7wO$I=ZO;`nC&Eoy9P|2SJ~#Oko`HV9!uL^S{4Iu|I7DcAEmpuvPl0m(xI-XFO7 zJXWAJYktOZ!yYVU56)wawRbja#548ztt2V+^JzLt|7IC%xR=ku6ly+rP%>QZ`;{A`P%qRExw&mTkjo#b(TB+0!huJ`kC+Nc@OZ6v@IKMBWzBVJc)neHAN~!|2paYs zs#Nu0Mw-ZeJ4MH;5o;B7WnM6i4D1UTro~d=dRiRBnySv>z%Q)%QfaI=dFe}FjWt1Q z?Dr`lgcGY+^D=AT#1rhq63cGM(;Y&s86%pvS#zJ#n1At>V3cSuC(hE{$jj)cMe9CF zjbT|wH9X&Jmicrk{r2XHSv)6>VuggJfHlXlP_Ye9vZkgB+wd$%ZNn$P)u;OcJz29x zX{?jHu#~gLx>YoCaMc?2FDZ5ABsm^s8JyV4vg;t@8kiO5LaX2DzEL!BTxyWgm;n<0C ze!1NT8cc|RAhi>df$P^;)40wurLk)G;5K26)kZWfCqh!{ZkCm@3{J@V_Y1FeoY?U? 
zYN(f4%|z3WHMDNi2E_ai-|-$P8qA5;p>UPjgar>yv|(8*HA3G_FVHk^txM?@b0E}k zqCzw#YxZJoWG8;$IkCylPP79FCjbuy99uE)<6m@q-%~PcATWuU$s(t3>H zZD-AR*1(A=thp8%)WD2*2eQ~2)^;pjkj-M*%}Qn#uxwAdZJ_y22=yE6uZkwhn&xVF zzP7whyet}hKdQa1NBxQHc9w0&Lds?|uOI_MkYQRhyyi@cw!qcLwgLlKGh1n_?|E9B zx`s5?BGE`!{HaXryRdknt4mlmie)h4MV7q>nLa@Zq17J^n=2Y0Yq~0p`4eA0@cvy2cYv93J)_ees z_MioXTGLB3`;XIMy;5n+syS5WK1XA2pW)1)^F;G8YmQ;_gSN?T+8n>1=C{}Yx2^wXL(e_+LC54!Q2 zIj?Gsz9)A-gz#XtXe#+0$tOxmjYjRiv8(Of&f=17VFogQf_Da!s zS<^~s%-XevuIJ6W?pY0O{w z@|;Jv5W?p7)0{J+IdHXY3#?|%<4R+7<{4zL#(G_A^d4^$PHK4YE^9)pfd`HJ5@TsQ zyC2UG2(_j{H2bchSL`j7#yt5f^)uShn5i+E0iyYqHM?>7U=Oa~ep0nYd$4;dd(c`m z9eHf?mBwr`i|TA^%O1QBk}Zd2KjYlg9<*T1IHj>JQsL)+8y7`LVp;on~vyqc1v|4x+i1H9K+HU=LRFyq>Ey`nAPE z2+`C*oMsqXJJ!6TH0G0BXAm@)2Ty_29xMi~rfQ(mQJQZ>N@HEcR}>Fxte#q<=XJ5v zxqvkptbqqtaGh$re}V#763&6lwIp08vY(DnsisP1_I;fW(ZeF+C1KqQ_PDISaMqxu zA6c^vR~xqB6ua|_BP92S2A<%Br3K2GIpXbaP34HaQ%U~fAQamx z;Z$2=zDyeRL5nYO;=qPc*OKNdT&dWGX?$&cTWj=dirXQC2hWS<`@>}SD@tQlJwqPc z1r26H8c6NIK;UXS6Ijlg5lUlCc%L+0)>w5@lt%A;E|fYiv*udXz=J_tXLX9*kLPU& z)&IF+{eY7I{XEQ?Gq_Um)xl!TqmIUWfHca3H$-!sUygmj8tj*nS#y=v=>NIoN(kY> zeWKahk$iheY0S}lJ^txZ_Tc#Qjt3oq>;JiA2W#$C8f)p(Gk)Sci*MDrSJ*0Ba2 z1X%Nd*2t&t!@e`mu?N?SW(sRwP#QCvJ-81V%!7>}^*o3Ju0FpXNaTL*RvN1c>-6PQHEVv-|P%flzBoMYDl5$8a@c4?6Mf<|jpiR}dx? zvXZEKU6~?#fn}ev3^v@(rP6gNy*PdyWg8Mj6J||BY0PU`v$qY~FcYM$6IwPUM zR44?gJ(vqz-3$r5&6<8nV|~F(TpQL{me%MyC&Q%9bF8^k_2YSqHJdMTJU9|@`e`qk zR}Rti*^jFydvMlt8e5s8F_*+>4q%p{pT}79F>ByKbM~OC*64R`9)%DdED+7TtSMC* z^ITp^542_vvOsDNcz-IE2eMhyM`^5fr8Kr#tg)JEjXplIKMh>Tnu}FGo)7sH>JN>c z2cJLdcyJ2S1^pzm=2u)l*@GK-6O-s@%*ipDRiZimJNfnzYv94Td65cc(k z5FV6?W;<)1R2s7#doTtX%!Ae-wFkUERUWKhO>d>K&SDMyxd+@=*cDb9?Sbr11C^|~ zQ1#<^j^9JsaiQbEYY=M9W=t1&@HA`o;DXB@T*Sw8vZFDF#%QLBW(;dSU=2K2%=c3> zv_`)}aVdoGV4!IFvgR?RG0)<$9l3x#I5^4ifcK~B?TOJ>(B-6;(pXpXyncr@R+ZN1 zk7>yMG%$!YEmS|Ab6GPO8a)rjK&UlyFkRrmSHICQ{RtOo_FycZ3rUW~>=L6HDVh&i z^KaI`gL`?ttuggH;10sR!_Tk>9YymhYaUe^^GBX<uTGmW; zG-hIqrn_jSv4-{=yiNM*MH*X3YxGB`*FVi3#EB-%nn#qz-1i1)PPJeUrh(KRY`_sz zLkyI&<|d`FMzCf9Ypg+9qx+czA^Is{&3URH&%-?5hC!p}K`RKgCPy?f{xv&+71Y=vznunCe zZ1);zCPIUGPykYUFcY{c7g)fW8|i zP-YjfQbhAF)_jAV0DJKL1RC3Gj>cRNquC34(a&A1d4o0Z;3d9q+E;62P+{Lg5W<6Z zM3c#y2b9Kqit8jm0}lcqwFia3)shwH#G31s#`=oa-X%$-vEsExuW#v6=WNzAQT=$H z<#{j)8a)qImD&ZYBd`kn)bFQbO8XgE54cW>qcO{3G)qMDIcr{J4LrD(>umMvd7!Qf zJs^Y!kBepwYyPP;<`b-$1Pwe$2B|&B0v(3}sCt zb`R8#rybWh)ZFo4Dui0|Ijn*Qy;$?5(wI;2n+bCqjX5wzGg&n0ta+I=@ZeS++d!?6 z0%6~U5W<7LqG`dJ`;^9f_A#1o&7gq?zmzy0v<9v{II)k8&oxS8y~XRnDi6()Zl!c`y<}eZ&1V(ad4ZdZjVX{ENoc-qDyHNu$<-p`w|= znip9E558y3&SrWZsO!T1iR?j|Xi8XfkJ6ZL@%3j4H1J?4NDL8%{f>TRcdjmvx3H|e zl3CaAG7@B&^)Hd(ZoBL!ta(twhU-~#1e*@F;XKwHJ&%WndkHr{CN-4oev!qoY^9Q! 
zdwC^!OJsa6;jbs{!(&}7nuGOpl)|im6H~ax->N8?Fa6KE|W=*QnSOZzZ-v=MIdW*)@lI%|dV_3Ein-q2;o@I}m>r9AqsHKbO z7G2c3L5A0zHH($ToWSo@Tp=2~e%kbec2CJpVNmGGVA+!_gA+Y@J=xJz&k40QJOedK ztrCsJn%+udUdEcq(7=fSAZ1QO0LL~LI8#Taw^TCgB%kU5mRT2w%$XBeQtBvcc4Lpj zPHbmQL!zD&+b6hdSZ9mu6PC?WGIJ>JOx_e3&xw~H)CPWm5y6R%SyRj!II)7S6t{~; z(6H|gT@PkNL^hFSH!7LAh-K$M1~XEC>uJ#+xc*B_!jX-vy0N0ckv7?hXzjk_L$?u zWx&#hGRn^gjzFKG*|pe zb8ouRm>=*Gi$9mGturms$l_%ch~^@$GmJIx;8oTP&>Hf?)gHEDp&YEf^;P&GE+Iea30*F?_)_2+y( zAI~1di{{`j#hPi*z=In>Vu*l?@OxQ-X*6KVii5^JPuFqOtwmh>oa4le8jefdI2}jb zYR)5d7oM>5d#X4tbw@gJqmtuNw~daYZUyI;em5O=`YqwO)Gg6*)Gg#ZQukUX?#bu4 z)cwz&_IQjuj!WH_bsTkroL}npcH)Mg<5Kt3F*{EnmGem51y030r3dZ&9zVyW?r0}&q;g#9rs_EACUbtN zyX}C}Z#>7PZdk`rw}D>esQDal;+|TLOWlV3_IQjMj!WISI*z*4i0kuckP|nmI4*TP zI*z)PoJabt-e>3cRB&AC7CUjHgyT}TgN~zaA?KHVchx)n=5t)?R_HkD=5ZdWo8`nk zL5@q^6Lt1@3_r)E?t40px~ZIB>gGFfBbnn;*VJ*;jpsa4cf+rCeoq5lF6ry<1Sf9P za$M^AbsTkT5Z6Bc@*k()YK}|Y89I)-Rh&oa<~ngtCC8=TcqeXDa9rvx`o+%gE8+ao zZ-Emx3OO!yeL9Z1`J6}U*6elq&EvS#EqCHZkmFLfn~tMjKj)Xa2lv?HF;Y1$bt`oo zb(1-d)E(f&J@Fiux=o$9(LgWf)b)4e&vt%aE#lhe2c5W4!*Qv5nU14wHRqAK-_|<) zR&iYFKI_DdN{&mvH|jY0t>FC9?;pGE@fal>m%4B1IO-O19;th$6ZhnETNxssz-v(L z^IyB{@ffunm%8ukIO^6QuGg!3ow%o(<5KqmCvH@6TJD?_ zo>Y!Y-BwQANancI-T0lI-xtsMrS3#0ZZzQaufG0v(Q(wRMO^z_x5MeThT~HAH79OV zb6o27)p7J&#rdUff)h6?IWGM!*>2|#RB#^Yca#(NlyF??rZ{n$gt7`J7+s zPIBT#9>=9_Hyua6LCzy}4{o!^b~y8jZ}_H-P?2={U&pMse85)H{v-ib(d|m z^9LH}HbS}7eb9+}YB?@-FLUBX4acSKx8FGZRwJ&@qi3DCQN?kod!vq{-%8FSb^q96 zkH=HNajE;36E{jYE_LtJar9fr`K90HPTa`nxYYfu#?Bwe<2+J#tP}SHIWBdtaN>ra z<5KsB%}&3moL}lrcH%}d$E9wDj-%gr&LedjzP88XX`ove>h3*_j!Ruj$I)*!=aIVg8|?9T=)R6VzWn#w z)cuW0j!VC{Y99SoaDM6cEGKT1a9rws^tqisP{?_t?mwNlC!gceZ!0HmfWN`=(hnkiL}pW*4pDS zYB?@-Kh$y5twCI`S0kLbr<&tZ_d+LbRB>GDe!0fZ@2lke((mI=+^FEV)J@lM)GgsW z((m5YPQQg5m%7uOxRK9ssmp(}gN{4;&Ex!1_pi_F@fbmlOWpT%9CiJiN9x||#677T zm%1&SxRK0psk^?~&hLxo{8D$E6E_-)$$zQaQO8lY7IE$K?p02|H5`|^Q=Pa`&2g!l zrQ_(git|g|6QA1SF)BGOb>G!-)UDt=((m0)+*87Fse8T?Hwrl}b=Q4j=lA7veyRH} zCvN0%TQN?kod!3G>ZYAfDe)lbN`mNx& z)P3EF8zmfGujJZd7nw>i)3E>9>UQOTUwyxKYS)shgqW=r^D9 zNZp2o_INyb9GALtowyO?xYWH}$I-8!^Gn@xow$+8ajCofBRhW}ne#~9hn%=4p5szC z#fcjY50n2=ck72vzqN>KpUa)NQNwYm+fB#OZ#CzUy8Ei^@p!5@E_Gjb;zlLMrEVV` zN52)EU;1t0#ElY;OWlPF?EHa3&Lee4I&n`v$E9v-CvN0%TGu*)U82W zUl;P7xTl)qQull(Zd7qx>hd2NR@cEw&M*BwI^U^V!Evd3m5!rs3Fnc1cmCVyw~*sf zcZw4?@;NSbgF23W^EkiMJ^HRwH^_0R`<9NQuAlQr-8-GQCzaz;x49EHk~uDQKbvRg z_r-I5sXNw*8x3Q~f2n(=j-zfZ;@amQ-*Nh_;keX&!HFBy9GAL1bsYUxaek@$*W31Z zj7pA6-S>4Ibt^cJ^n0%p_mps4>R#Z)jY5t~-SuzT`F;7EU+O;Q#Em?TOWlq-j=Dk4 zBXxJrb^7&lToc*yZN--&x_IWBcg zCvMbmTp1FGavtgTmp7e$D>yE7XE<@AgyT{-SI5zB zA?KHV?!=8`j!WHc zI*xwhIgivm_=Y_mPs4-cztpXC;zljUrS1S7N53_QYoD7saif~!Qg`L+cK$#W=aIUF zPTW(;ajDzZi5nFhm%880bowpf{L*jKi5rC+m%2e6N5A=;N9rDZ%^r^@kKaX$2)OPJjbPOCns(+j3)o3ZtW{hzqN>K zpQkx-qlV*BH(STiZ#CzUx~FH@hxR5`K8~vPTZ*AxYQk_U@N*uiTj9h#sT`NOSx($Y=D5^7QQ`C(&-tb9drsVFc!2zuy7@Yeerple zKATS5Q^RqoyJ3nwPNSORQg?!mqu(mdFLnJ++^FQZ)cxgoJ5Qj3^GLrloVcfi<5D-* zi5rC+m%8yfj=K4rU+OM;&i28`B1f=V3vBDN$gN&t@;PR!UM7*Y^)Y&9V{spSpl(Km<{Ao zUa(=~MdG5?2x7dM>y_FfHW4iRO%274VSps-51H~ep4gqOi8b0j9LshWHbq#fwrMcf z1v!hAa-jj8biQCdD-8?|73rs z6XWTPhIw+gxNFh%I6^ut~9DqaShf$2wg89vuWW1Qz~Qj*ji^M*_n}OURVJ z)x@@8)+l$ep9#BrIJ4w$l-NTTP+l|aV&4*WzOdx)AY#`r>l^K2pB8qUu;g!BVyA<} z{B0N!JKoX4W`b4ow_zaIQOtUdj*4ab3Y#P>`MZ+X3}&r)l-Krm&oE9%{$3%nFtII} z4Ma1A4I3X37qya!apvy;V(U!yx5CxiDSXTtg^M6l{-zSUf>|TW#qJe0Ls;_n;2mJ6 zFl%;jv73c$CM@~8l-Oa+`T{P`RtdY~J{|-4TTX0Ou(bZV^3E4_4p`-H9Td_TklDZ{I&Qo^v>+~O4JXF&cM-9Zz+(PZ zxsS0V+j_6YUvHs2$ zh}U0{WfR+<*+A4@A08ntYQ0R1mN7!^>$=_Z5z@{*3)!!HEZxeyY-#u^eA)H6-kz^V}V3T6Q#y9t{ 
zzuyq!_*+cu=U{364vy_@r6ijQnesPC>}$*#(_HLaVTTG!{(6WV!>rjfBR21I!gdsv z{Qd4Wu$j#I4&D^YJ}T^SY|1bO@^>z=t-!+H8k%SJe7INGHDHy$g~T36BGxn4mA8km zQ-mdd1H`Uk)(R7AuMhVLME+hRvZlmNWi~Kcv0MqyV8Oa6`~Hi=pDJJ;M`-Q{)&pbxT`ds# zYl*CY*e=WlcHJQ?_P@kMtq#OE{#wK)f`z|%uC=eZB*%J5FR9h~4U;F$4~F*Zv;H zt_{os#5n#=BlaDzG=Fc89q(pXV#cYd2N4h5O$)l-wI+kGV3v2dG`pr zKv?p32(h!6wPv`kZ>0i}zbi!6f!HEu1D-pzzx{}dTJIC%_f|4 zC1lFqYGT_kYm~d#r-a>&4IEhVH%jcG^C+)b;9?7eoi8l;JBZjd%=$XH)`whS#|caR zwk38tSj^uFy0+NI_|>1--%PN|--g~`M=|SJ)GwBOS=c0D$={X4W-x2b_=k8#sl&z| z?A(x${Qde>E)gcSC9?q@FYnJjA}(ruMvUX{0AlOTWq&JN=llXmRs@;yHJ|t{2Vaea6#13QDx2u0Fn>RMl-#lWQG3#mT8bfPg#|TUQ`iR}#l=52H-D3UiArSd{B5q~S;US5OV>A7-W9?Q1FQT^CN_szPx;{3yl)GeE-d-Gt0&kLW~~BZ zZGW2xME*_|**s#8oK0g0_!Nu%FLrHU9wWx_x0u+^!NT8U*D=0dl1+t7`5PqmHD+-i zD|Wm&!VVRd{Phq!hFLS)#da39qp;-fcUfRFnf2kDCSuNui-bLnO&P{O{>~+~6SE6pb`4nNZy~V<3}QXoT(5EJr}KQ6A}skEAa)hA7QY^{{k=~h@^_ucni4ye*}!|W zPV)TiOkC7jOpN32woI@i!NT7PV(oMOElG9+n=<8ZC9&O^HIC-QvXg{eB`o} z%@bW>*@uLkEG+rkjo7^jGzMSPmA9X;1BE4j4Puvog}+sc{!v!1rD zyeYyS!Uhg3`8$)?q0CwZl-KrmwLs+WnV0#n6cF2m*}%%)!eakR9KOed7{_0W*hH}K zH`O)X?LMOgB8)6HNPFl!ESu?vJffDIh-lD{*EEo0U< z+I0@SEbJ0t$=~6`4g^cKgA_ zVfSJKr~IuTb|bT%ZMm_%UHUDLVS%vZ?+{{VF>5uY@!I~D3Pk=sCM7x$Tf}T&0`0kY zedtGA)Ov^*$KRtjfenEre_iiitlq|BXbG9}x0={C%o?U^yw^+N-Ppi^C4ZyD9*U#9 zW|+<)d)@H~J6~AxcM!2_nDtF?&9eky#|caRwk38tSj^vi*ZWR~r}20*!76_nZUj4u zS&z{_b_`zWElF7NcO|hI%v$xdSGN7#gPj`^lE3dti7>G(nGI}GEbjjj7qw;(n5!+Z8Nb-(2iDGKM)|mA`q!He=Sa$mQ?hsXT@;!jiu}Vt1dR zyw(YqzdZyZe|LYwvij~|7cv_dLdTfrZwun^cRGo2{9Q!sB(U%|=$a2<8N*g=;FP~5 z#13ZGXzOA>kixTsC4X~?O=s4ekrg|4`bgeU!jiwOiA`YEH`8@2o|L>@g(ZJ~xdH5s z)9i1hYYgX0-UP79-zs9~GwU&2V>tRE8?_l5II!gJ1Y##LYt8EvJAY>jME-6OSvIl# znGMXe-@kZ-I9%VhxcoJVZ4Vay=2Kq#Jl;cK*ysY8^0)RMU{9SQ)@XZ6Ec=KQjuV#r zeUI49%$g^9#Igg0-G~hw@{+&fiJil&?_kGR_GV$H3rqfH5jzen=5Mv@J2t@wYy}*{tBlgHi8be@+VzK|lu8ocX zG0yxgCiZi%|TOrA&LZ=>9Aj-$G&!{6(zCbg@~&P7#*; z4G_DES*t1aX8U`eK;&<#$eI#6mD#{bd;WGN4)4Da7dgX2F%Q{_YTVvasZDH)8jmpfUJPyN<;wVFwCJ z{u;zC0SkX?UF>_prh!%d)?5d6GP9nwF7_p14`Blbmi(Pb>`-Q{0(w8d_II^F@e+Zi*Ecv_1 z0=s}&v)@xo3Nmi+BRY+q*0ZmxamP+@a~C4WzJ2AjsL zFX}pndJ5Z0Sn_u*vB!_Gzj-d!FYI1y;FP}=#BOBPv&eNXzO}Fmge8B65Ic)mtMwJJ z{+0?v{+@iE4`Byli7q#{iRukKX zS)<&=t`v4RHgI66T9nvBM=7sa;9}snE0{G57kiPg8N!mk2h+h$Vb-kg z9_vG*u+4-ef0q(Fj9K65v{?3!=Xk>Gzy=OuAb-n=?FttDmbg6IDeN4u%HKRzh-0lSddzz`ZQ``d!Js5Olk$KOT7P6CVh z8+7&dge2RF4V?0~gxJB%8W!E-x98b#VP^?T{^k;!&a8Rj@>sTyu%m<}e_Ioqz^reV zi@jdhuELVPzg!J=#~dYyw#2Zxylgnf0`FT`x{V`M7Py1`aIwJAv4V%>M2E z`fP#7-x`r+6WgEJz`+}|zmE`ydu+rw{+h(L2Md2|=vrudwosCFflT>Z+X?Kc24amx zE_RNval(?n?-9G1S+j#{{(iKcd)tT&9P*OC$74lL$x zp6m7eZNd%%tNcwSHiub{LHEq<-h#rW3rqg)>IgQ4S*rt$!CoJl2t@w271=yuj~t>g z1U!nx{ujG8Fg{`&e~XFz94!28NR1ssb4fN8GUacO*w>gfwz=+YpPtN~4HcIB^$KVHW0W@SbYA0xTuv+jN|XN zE5VKg3xD%_#`^owS|0BaY|50smBe;u)+kSjW#1Ndm9XURXkwF?HJiHpeO1`W!jiw; zh~4`ejlt)0y+`%3umgo9e+^=nfQ7%cu06q95gu(W?%KVHMVr3g#@ZfXy90kh_JE+0M;_5e0;$V>jtAhwKI-zL|2@qw^Q zge8B66FU$r{4I1HHqF(UJl?*-lE0^~02{}wRd5Mdd;VT7 z5c&J0$TkqW^#F|_5Kz}b?0>Oq1M>hej=$50eFrT3O{Ki{81f|9LdcZA`NTfXtYNs= z>x7*sEcx4s*uKn~^}g7=->l}|a)c#+PqYJ@#;h;bb=~}0*jB=lziWv-zMuWAbmd(a z=Dd5cfm8lg5WA6CkKsDTZ%Ez+!jiv3h@Hi(HHh-s{+0?v{>~Cv2V#qu4FnX6{V#FY zV-Vx`d$cXs5Loz|=j!bW8E;F-l)u%)wqf>f`-fJ-?#2cVEcqKH_Rv1cYi?*0>+kVN z?8AIv$=^Z5u3^@f>yKrB5_X)hXJB#vmpFWX3^9(s1Bk7wXMf{eW4K?E z6+x!_O(k{(vqrINoy-w7Ls+VI@N%$Im^HII$9mRP*k;0#ze|Z7#;mWCEAN%U?!X2P zV<3OaiR}s&{wBNFHp0#UtNhI)wi&aY;%j2ZU_ld*Y#M_-e`gCs{^p7-o7n!$26m}q5jGwnE^756 z#_`uAwmn$7zIBTA_gYET1v2GtZ3@^^{~^}cmLALA@hKnJIAO`(_lVuhtl88x&w2~H z5gRzAjOlEza|CRSR zHf0zC`8$``R$$?8h3mXnEn`>%R{2{H zB7d(cXMdX#JC)hMrVF&cor#NDmk{IlyUho7Bv|-c=z4$td`Wf$n=<8ZC9&O^HOeWk 
zz1KKZ#)Vf2Oa6`~Hi=oYgDdX=VJ8bq{&pjF?;aY1FW}0%P1u3LlD`JAOTfb4c-MX9 zb;72BRsPnr0Xvyl&rvgWydMjD2pc%CkpaoH=fv?KND-Lb--v-x}8#nhIM6R{85Ewj;A1%XJR@ zRm$_Budw9r>5IU|F>5uYG1&fIE)eXP862>?L=%}X3g(f#Ig}#bA%;-PqYS`#;k9p zi+xPkR>G3MYl%I+oBgeD`8!0|z1YAhe=CUH$gHQW6+4DZVHXHX{th8_7PD4UVr_p* z1tNbh7Fh>kiY5V32R^>uQw?+ZIlSn{_mvD3j~{#Lb*_3Ra4 zGr=l<8!iMpidj#&>)eb8n|KrCblKBfn3F6|4Urd zx{Dac-vPwd{mB0MU1R7e$%-IT{-zSUf>~p&>)PE}*bHIG--8!`ox-fy!F6t4B5X5Z z$={{K4rA8W$;CDob_X_a7z6oRPHb1O@HgLeKehJ~o)2@tDu45cZN{u;QBv%&xJlSC z!jiu}Vt4PNycXY2wf*fO5c!)fvU(HjLS_S_DKA)j{+GC@)rA;1RR zl58tBaLV5jVh1y83~;@7aD}k5gr#b^#HKTAp76%@c4Q*^FiKeRw>7Z|%=$*V=H%}m zGuu^I^7ogPV0Y{ke_iZ;VH3bAf2)X{&#WiQ#r`VnW^CZVlD`v(oye@!fmqw$*#eQj zJ4BXEY=340ov1hV_YvZv*0t20x(vzJr>^zJ6%}vH;dSDVCnki8t@|Rfqnw2lfrb4Ft4HEkrvxed7?N(uj3QPWah#kYMIm4B=tFRq~ zC4aw52Aj#OZ>8(JxJ=mN*py)mB9iYGjWXp+bG#|uPtbA3`wWY@|TX`8G zMTTT!_Zy2mb&I}%e0`6;oU&vM)d<(!HDvXa$cvPr;IrZQ@+M;l{wewEr>pVH(y0hP zu{2Z~Xo}?^uIK|?{LMaGrueZ?^s$jR_7ne1K{sJfl`Fd@=mRu6q$t@q3EczBy_oYp%FJJQi;g2IE*e7{;lrNdkNIl zMgD{Jp+@z0Fv9EW#-BRf7+zbCJFboXW6SW>Z-9TV`)1qeM!?^!E2SLq>q=+Q4@hyO z8s%A>f5ZMM{B1WQuGZg&6TBmMJ4SsJ`-oe2R5Q+;%dg7=SnW2(3gxobGI5;>Ycz5ug)!s|va>9SswLjB{p?J%O;;_>5#7ueK z7tbR|k{!J}v4QmKknr?3YNyUTA}*>*GVEYm3UOz+}^v%QP^pPht9gU?RE zzx~fn5BfI;{W~)K-)DuZv#aY8GW}bErMtbs(pvA$llmv5`@auPJmL9sOVGQxI{Z!X zvQYH3G@5EOch5PO=59_}x_?V3oRgLq!Uz&W;r?l@yoob|BHf8rmSfxRef*Zv`I1&0#;Izbu*DnMgde?l2By zI5TZz-MgP8P@_#mI|)3(qw4yx-t#z~=Y$_m8rHWb&_CitnSN zxqA5DS(va-;rISynUTc(=jTK+4<&|5*TrYu(A8UZE*+_E4NulD!IwJ4?KhU-*kVk- zVwj0_r!XBbhZ61i8}6ACT;DT9O#S_*8yj(g45ZKRD1M{japG;IxwsQ=FwqxA-2`+$ zR?FYeL6*Oyb z7+Erv9zLEnkcJkJgS0-rZemj&b(`WkI%f5E;xwHSyq!+_0ip2eexb;dLqp-i_!C4} zFk5bm;}h!`GDUJ*m2dEtpP*Wi(dk*`2fgK8Fqlv9UAb|eCgPvE;_W!t<*WECZHixI zMuwz^A`>#Q%75^d(JB%Aq~Xw^LnEeSh4Me_a&x-U*%8s0u8Qv^>5D%(%CuGKd z7VNMES$jvjZ_SB#bHnH5gfA@`94Vyi;X@kU60H6i@RknW(q9^jCLGC*bPa{OWrq`s z(z7DHCH3cODBo89Cn5fiV22vy?GtUeF*kBnZn#Nq_#$sP-sWl4JqBTBR<~W=r|=$Q zTZc!!xvDWp(fg>mwGAJ>JD%hr&Cu!v6_X*ChqR-v>*Mt)1DX#Jd=+ z>}wS)IP+qBTzq!;2SC}?`%^%rXL%PN&WQ}k@I(1sX2I7)gsSV?2mLj{f(Ckq#2YgG z-)0u9qi0w%viygF1;-^7^3-+B^nYEjou2#u$Sl~x4=^J?%PRPT9%qtCM}y&QeWD%r zXGbQ`pQ0*mRo@I>(n#N@mK{DFirj>lx9kM2lA&(*Bo^>=Y-_q$pcY2fE~l zG=quyT~@(f%8ijGg}=u_i1xCnsk)S){}g)q9Op<@!A@a*M>p7$>#-x(~}NRRvtnFTfUAn4zMrMTcTeylp`@>_P~ zMw-~|LpNaHE$fHlI|t=^-_VG+>~=iO4xf<@xi9}JeO<#H+#cgj@o(pGucvW%7azyC+at*r2ucf^vkG?5@3Q>+v!r7j zavX0u*87A0Gg)#Zv;3cD6>Ju8pE_1S|F_g4AL&0E!rOBq31{#Z#fmSeAACyf6LKT; zH^#hWy%EUnHZZaHx2$ljck%k{@UCD<0}VX8`XCNUmY9Z`NBQ7Xe ze>hlhlAg%`+tUF3+cOKeHCpAWe^1ZwF5aGnOJcz)0yF(}nFWXF5r&G6RWH;yuA4M= z#xVApP_)ZeAwD?XvJbE*)66QmWQ1PdL;9$A%PQ1wi}&Y5F3bsEP5d6CR!7ExQ=AGbiFXbub|=xCAQ^u64$rc#_*~llSQ-@H{7y7%W{CABv7kL~QJX zIgt=Q7>)-?6d8;Mt>{5NJV>Djx8Om0dJx2ebb8Pg54zHWYw#e09<;-Q5IykWL0?oj zHx%vFmjca#rN=KUo`jZY9^H`JDl5DRiysb57OjHou-nTFe27$}+Z}DC_5luj`C8XEUr|}%iC^i!jKhEJ$`LSU5n?BKmADzYRo=8vr zeRKZ(hG4j+PqfWfxsfJ0;j^>Dor`XA>%-e~qs`9bMq1~DTVv@hO8L8dGuGL4SZ8y> z%|qeqi%ZlvgV>z}yZzyPY6zCX`nPdbg(8<`hcBtis-Z)bo0uET$D!Jbl@Vv%L>wO6 zPs%%xfX03Z;(C}Jxi&j`3y#mp3yVf)cRMn6BwC`ML$f0fC((mD@SqhvxD^jl=s^}9 zw5JE%@gSWZbjE|O^x#T7$e;(Qco3ooaI-LkO6TT8^ZFw2Xju7qLvdU%eAHW(0Mp6s zi!t`_F*zvHH#-Mq?@1h#A!utt5?$K1zEC1IA}Z zE@tC@VB^!i%8qnT%MRa=8*YMg`U=Q589%hu${@mrYt-*c$-3a_HM5{gR=LRNbhQ}mjhqLGicg-%Xj?XRMkQ2_uR_+rz!V%-4eF<^& zkj}*i@Hn?y_1NN^NE~*2Inlm2`>Mx|$cbd|gCTf;)3ADMe>}ixSUt8k9^f>r9(yw$ z;54ir+YJwJ8di@@#{-;()nl)~1DuA{V=uu2oQBn7O+4sZy6)WU=*>6_tH&CpXD%#` zWZsaKo)uo}Ez3MXrymx`8!@iT@H#ot?|B~T5Msn3Q1}EyD0ihyPJrqK8PQPgb`ty-y9sk;MT_udCU_2QZ_4;Bh*9Wf&`3 z#hIE#6TAn4w6xeqyj9&S5wx$rf&~+ahXJ%mnL=j>z5flFE63`$(3(v9JpZx%&q}&u 
z`#>bFwGBnP{}Bo|gu49|!u}m=(wC65G${Z|@+2P~49X`h{stI0ko`cIK`KosL7y3j9nTKq zN{LNtAtr-!d~+hhF%)_^jze{Jqq!mx_sZ}be`OTQ2?YME>myXr{hGK@5rZ_*NfDv!NFT}(3O7ruYlUDa(lRYnzNHBJ zi-+;ClI?X(l=KoU{bK_U0U~etNa}(27c|xk9_yPj){MFh0(#46>SH8$@z_|mkb}pH zo4~$vWwc!5QInzq{F_kJ8~++bo$xQ8 z1Xp6#aZ3DeFBrmSC~1fA2>}jGiHYGAI>(gnHoNryh*$m}@$ml|L=Z?4h5(!|(d_SE$-g1uw+9N<`kGK33_CH9P zjsTrL-tzqt=!yWHKi=~1DNy=Zh77Cx0Zcu-4vd^f%NH4s#SNFqgKEE!72bbKr2F?g zBVWRALXm6gx~`|IEKa>A5x@zDJ;d*Imm`RsA}&>3nGpbQ;#hRqx}R&=q-u!^7K_6p9JA|7^z8aKU!58?&FQ@IG! zX@|RU+v}!)kMzI>pkD^ACrKDAT~`w6=Xz`p@hsW}w~(qs-LPgC(d?TUvs&-My~CX7 z&B*j;PKWh)jnt&{S9`Hui{D_e9(kNCy#|vuRNAN(YdpeYjeoIN<6kV+_!o;c{^fa! z)opEd=_)zqM=*@+h$~e4-=VMmJGA`2L+>D@a@#|rJ9jzVaKjD1@RkK|w>dW&!kNLB znd0%_@S9D!(Y`d}iVN@@i!~PSfi&TY2jMvuYb@SFX}%R_<2e>ix)pcBbBrB} z_c)qu#U1cGofq#6nry|b(I*z`kyxzP;<+r=e7^mCoo?CWf3DM;*8GokdI<+(*6Fu} za@OgW6k)H^<%)2v(_;|)uXXw^CAHV-TSV%teScf0>Ac1@C8I7?KyMlCBJ?WUOr?8N zFQq3}b4NuZG_j8Z$BG7M;yXoTX3JV13U^7XTZ3Q-uR7?mc2y33QTL(J`LwQ5bUaWw zt1~e&oQacItJddIy*}R%vp!>f$oh;si*;C^WoJT1th^tu&$(Eiv54#S8T)QopJ{_4 z>+^AYea0pQ>oYbfSf8;;!TRiMQaC04Q~_3?YLujX3y1JYZ4P05v}1VpiDSxdJ6`>N z#OM7Vak}q7<^G;$yo#&N^!jYq8AQLQ%}lgF&ef0vus#>a`h2|vus#>i>4JTt1h76A z$ogzb0PAystj`G&!1`Pu>+@k=p9`=)7tkr=E&rL<=W49aWVpRP_vUphW`UOd0^g7A zjytE=%e;(vN9*(S>i@Ammm=s|pC9Icvp(O;&z<%8Hp$|w&q0;JUZ2xd2E9I~NCx-% zEaRpHIuq|CJcIu)%~_y-UqzdbM+Rb5&cN7db>wBcr(_DI;W8toKZgd2W!uyGM zSici%Ex%9ISeNoTy+CTL+ks`9EpAl zX-T+GkGF|fyAyHx{Z#(>Yu#WT((u2jU;3N+Ev0^4sc%>6OGKUIu08N?>VZlWluSK3AzPQ|j+uqYi$pdsEaYZuL(5Hn#7MO1+m=jdian^)OMlcdK9j zoBBswHp#8Gl=^O^ZYSz=w|eN`)Kiptq*A9SbsgT(%83rgjeuW@a7F2eMGlvWYq3h? zL?`gkyMQx!%kf998z)Y{@hii7sExSy(P-{DeNJ3G{)ZQA3;wKo9hoqR`JDeoe=7-v3Y)8lvofaI2m6KM3L#1?_*XCyQF}^J7JK=SJGm(3)>)k#JuWHLaLkrZNy?@UP|BlPb30Hpuq57}>dT@28zs~&J>F;uW z9@F3XlEvvSUS-hzHLRe1o&IY1`G55%qnG}+Nq=}hhjw^h*S!x(q(vz5ND?*oku=Ba zyzEZcKjZGeyr%u|Hs~&$4OP3i{CRa$f(2~qv*IY z#)%o?HtotdE_0+?x}{r20mTI&uHXjI?v|(^Iz3lUj%`ScCb_xh=H55!&7#?iaS5{s z1B%GvJ}RO{yBS#&85V{4eV?c5obH(+`I$f7_r3S?xqLqK)TvWdr0S1;?vtw-wb+(1WYzx)4g)f*Ebj`lO$}BLwHS{W@nXKp#H0Q84VlB^9 zLv8wk3ujtkIYM89=RuRR!@UcIY<2oBtSgKI;zh3GKoj(hd}{}r-~)Jo3(WI!dHzB1 z`LFUkzWDrGdENT~|I&WEv9I73;AO=Xg?d@~u| zCwh+%cm%*=-dMAren*F;pX?*xY?H2s++$@QZF*wBdaNNsc7VSgFX(e|Jr?S7$$I?O zwh*kx!?p!~J#w}Me?8`!7K-a3{guuJ`)8uANSnY{SN=ST|Kt4=bqm~&miF5})zlia ze}?Lpi~DCY@_K=OA3waS&n5e36&V7p0e)!L5=H#*JKKVfAO6F%Fc3fdqy2LeRU+OY zvVm06D_2&%`>|ev7k25mgbyz3)kNuj`{#K5VsX7j>2qq4MaHc)Z*Aoy#d-?L=jfac5kCf60d z`XMx(`%>Yh_2zFH(1n^P;=_sPW4zkKvC(p#9-PO=yu<3Y4QJflsVZb;`6qk$4u4Oi zXP3shh7ISX{43l{#x)BD@*9Q47;X~<8NGLU7oq%7o*t>P`W$!8@T&aMw`F=MAl>go z!KbODKi=?!Hvx*Xq8%o9C|eH#h^*B&4**|37_Q1$|rDVe*3i@{jwcltI;F}yWNIHU4>9nV#qYw_F# z(A#b4OLzujzJkoDMxMdbXV-8p&<50YTlzGf>r8!!?Y1=Xq_gWyeQ52r^ufXhA#WRP z@EpqC0yG(X3tK?Xk(lRHCcNh6Bus@b#~V&aA6R&j$qeWNvXCs0lk_f-LTS+4o}5qV z#j%zyE=^sb>>cvf?TmS$_~=jLEj`??dfe82MjaWfC>b}HjCwLyQZguQ=VkBghG;yXh* zR4Y*w_V070`f|B5Zra4j?rzTiEGKl()n(Dv_d=}>h`2;G?h@&*sK&uR-Je%pn$sHX z0f_c3e8zcP732K&Bkntaba^SSJy4w&cDFfO-$pwQ>9W(l76rCvM|0rxCUcW^a&R21 zS{i#9QD4N%P-7oe=aMRcLfy+?w+Xq#7AowE_DtLjr>=^42-&=F1f?t6{>u(ZZxPXY zTg?Z?=k9nDmO=#L`E!@u4`N8qO;n|Nd_B25>Qv}kz2)4s zDbSPKP#vAQL==o-?pvaC)xJUXREs0K6uc|kRuG}Y%`{H@I+V_)2`W4C^#fajKs9KFI^s>%| z4iMd|Q1*Q}o84!OQdYXhC{Y~oTL7brN`P&kL{U*IYBHQZR*8bjcTt&eiC;xZ=BU~< zS`=UAA>Ob(^e=ZvZEuVy-XwBUD#^>-5cB3};z|-I&1}zHLIS0k?V0mQpfs~R6C;7r z%=XM#Bv6{!o;igCN;BIt$C5y4W_zZV1WGg8Gtl>Q&}iyFmtrhRGutzJIW0TTXc9`Z zaV~l@pP@HHCxwT24(eC#E~8B`{Wli!QD%CNewIM~UwIV0nFZ*}1n}NpkLeTDV?s}9 z|K$OQqfHU$G4Xz>uczb#bW>4{$y8sBp3+-I04~vElIo6&vc;h<$Gt`c*b_yMNjIla zli6hSn0odX`BsvvYU&81_wb(T$Xs2Pzv)MP 
ztn8s&Ma+J2?%HDZ_zNhCMrW>(cSsp`gsyeV#3Mxcqu9M>7RS`Gu zZ}yWuhHbM@e7CW5J$ghUd$^OuuTxE zP@Zp+n};N>CIJ>`&s;_VEYO}o)7t?Hv}evG0TyV_I3&OV?U^wozyj?V?yU}3pgnUG z39vwW<}ebf*BH-@2Me_K*K2HyRy$&%u@jy%@56?|{-cTs zP&xZJ;Z>kFN8JUo9nb<3y}2>WJxl~qdoo8?(nY{np1M}mq7vS4-ct#M$MLzuo6HW{ z;-{j~EqZhPT2RfqNzco1nRJm!4oTJaZ$cFBQqObRv7bWV1EV+R_tg>Wgoc+xcME?f z;$B^yzxfw>cGVS#m?D=xvz`FdbU-x!lmArx`D`B?5jTp7LPaf?)3O3Crj+j{?dtfK zks(kR4TDwI_x!}9*Z2I81nPLbg$H1p2|%e*RnH#=-0y`x+6iR6_kmI(N+DIZ%6xaH z_SUm;W4?NYvNkFy_B6#Vomx1G5_Z28^_fJOsTG5aNwrfVXpu}8yMhs^c|v>%ofJn! z$bC72u-APifA!D#gg236BvIS!rD{hc+RFU9|8;T5D_xxNEY8x@(J|NY7YABc%)J77xPSjU7wCGd-d%rDn_DRKHP$9TkaQYY zGC-9-z;t$sfKz=8cmfZh$An!_Z7d-m26(XqEW`9FcLrxbtS087Lg3CJyJ<|qjnTj6 z7z0#FKH*KAA{W9h#oe<}o*FyFN5P!~;7R-2-1A1i{-zFzxfk+{^J)_AIk11ut6zN1 znDS1uUj~kM@$j|(}K=^@6UV$HY%V_~Wu=>3ce&DZOG5o+Ie`5H7 zTi><(z$?fY%n!7>hpaMZ-&dm9_prm`hJsN-urTvwjj`8wQV>oocWj)`_k6*gR}G!1 zW8BW%E#+vjB6Z3&Xs-&ZVqY@NE>#x zK0j00xuA4DSm#gsM)tll94TUIug7IC3yYH-xaQKrnwNL=4ZvqoUdg7|U0o~qw6|a1 z$RBI%^i`O9iuBgo@2_&#>l=Bm$xo0kYGUMzxnAGM*(U!a@_)e`kYB6$Q%wG$7&GWvC=!SZ)Dn3+jjv%f?sxE&)M0tSnoZH z$5xeljeR3eq5$c#aCgsdO6N1??!b`lE|--DY;^2)!N3O7iFRgSwj>OE{KbW@=t(g3^lGOe&mDy&Lg`My+$ z!M1Jv18Y?HGgG1K+VC;M`PjZki#+Kix+=B7Uz!GcXs|QV&=blY1d@(4yeyzf4B7@xebz1~Uldy`+430Vf1(p!rOIfrA%#4ID%PHR_B8!NqeK7XSTn z1*C*1&!NtQ$ZVr1(p5Q6(?B1yPqA432jAV<7xf;JX@bxTeYY(dUMe4Ax%g&#qW1-NWNW1l!Li)zSWGQTPvA2Nh@Z;V zwx-Z`7xNKpnwV}Mq=Ih&Rt08UzO^skGJxgdi@4Y{G%$uxRy6GX&bJmn@cF;B1IuK8 zT!4do!0)`aUL3*ff1=e2P^D-*=2YHqn$>$^&_mL zE-DL`!1UBETs(K7E-njLva}d4Yrl0(7ky`EpW{6yYsyZPBfhF(vp|rpa0gen*Q4c~ zf(y=Ks>7?24fhL*2|du^l&_99^vv5M>b_}LxjI)(fBR1;twr~mcJyO=0sR$>ww_Ldv|m} z_|fI>*)9wC=(JwLLJDjLEeOn%b~!g&&J>IVwpO~Eyl-ibpOfkW2HW-&FtC+F4}oD{ z;|v|ZN)Nq@-2R>weT9mXSO7;+MFwDyEr zx3ZJGoSZf-+c`M$JT`vIo;3<9PI*URm0Z6AeI)EC06)%|#Fg(Rb2kYd8rt#~X_yB7 zgQZ|e?z@h^N>U^m@}ca0^g~qzzY4h$Z`oBYil@7Vq>j)tnxinNpO^ZZL-*|hB^$yZ z!m2|TbT}=K+i(1*lvy3RZ-p%r7tmtH9Jf7aC$*29zumH)-N`H0N>kF4R`?`kh3t;f z@g%)j2tdyXP`-?Zp~;bj95fxv-<}f+jAM~LJgt2Id3L}=;rl`Y-FLH`A5Sv@Y&2Yr zgtABS0Uht9{&?@y@t!upOKlX~QAE>+`;B)xEq~R<*`c3?viHj%KMmdY7Pv`#3205N zM)in`3Z-KCHTT)RwUtz7Ge2qP@xjvS{)dazf2`FxjJy2moXC(V7fSU`0J2lTX;HZu zSgR+~7e%{>`-RM#!U0pSpm2_Yf6)d}b&?XDaA=&CC+ynyXq(^kcP(?ZGg7`)ChFU4NLglu2BEGx17H}%`Xm}H zdg%bYeC(9>Mg+{xx0RG&h30V=IW14xPc79?`BCTnU5)KoH7iE>vT=En6uP^;X|Q0v)pp5EK*Ap~evl~j=;&Ve4WxlfKP}(h@EpB-&hBC#DixU&vB1CHD@X(~#YX~ckrkor?b`j-G83z%fm|}c zkx4JbARzPkwNUnp+CcZ*%EXmd)iO2R@!aGgAR%hMQVHwRToJU1 z1N|m~KCs`wW20z-#Tc$Fz=UgAG}PkQg=jPW9i;f|Ju)7!8uMoQBe*BjD%vrH=lRk5 zqPb*o$CB0Wc7iKb=YajNrr}ej@Kb{4F6q;viWz^kL1eR~@+U0y;%%cE{~Q!S7;`?*nUZWK45b<{#lB1Yc&LZVKsSXy^H<6Vw1G{ zyV8Dh7b=V=8+-j?xmP#twp=^eyZ!!ayD#6-uYAP&jhSy)VA-uqRiyC|zbA6YIOUrQ zvipPiE|O;+*@iq5UJ9gkw7jQ~(Xvg*F-z8Q9TZ5k@nJi2{yM%$?gE4MTDI}^>v(n= zTWu+s-+Q7`4;*d)tM@g%QC$qNJZ!kO>vh z_<(7xy>LDm{kS<&jY~}9x21&au)+>H*Kg_e09pob!A%OnX zEtD;1YegF8+hR(DmA_Ev`Dj3E^R(CxOurShBIO$kGsrF34kLMV8vkSp$u*7HT~p|z zt&%nPz3u^B1C|Q6JWlzig@?&3fxCTm|M{Krq4QG;_1heEd@tJgtK#=oLY)-8D4i;< z`R~#8bs!Vk&rx6)I{h&F!Hp9p&28!7*&;Apv$341`x>7EeQ7e1e& z<)Yveaf7ly;SCk8A+w~+LG(or(5Iylx#1S} zknWkTnAu9oXV_+sn(uTJPA8)u)DM>u&|nIJY>dT)Edo~$Dy|SdsI;kj@H_T1iwm0> zOF!%+=TX$)GqZNDaVakx+P}YbG$fn!Nz<41!hzab{gPq1uAp69v;>=UapA8rYsJ1? 
zNoSo##YYAauMohLL6*$RT>8(vn7P$yxxW91epqaU*Y@;mD`cgaB7C2sr3T@&QDn!v zp5YwtEBq1QEX8lb#ZKz}5v)eSn^x&G?q~Jja!vR$oKTg>CAyICKN}zYY`lFPdd}|* z4`tQ7#_v$9Y+80yq;Np9V_51ae)mX;@3$D1n_6dq zZNq<_-|sn2@noI@_rp6|`i>2k%X(Xy75hfx1OVyF6k;Etx zb&@!MM7<=wKw=DiYT_aNXu>@l?P58f)Yl=B#MV`snp4qu;2lFZBcHqyuTGIyxR&oB zddLgB5Q3Ha4`=pbFxp~`vr!QHB2PJo*}I6@i^%M?$49U9y(AbvL@{p&@2g`CQemx5 z-}ds<9DE-PNnOqF(9{L|R;142cUY>9-#t?MN+r?b*Gj*pQooEJ4$Syw|BTe!lH#e^ zr4L_EG<2lUM;Rhbr!I}5rmv~-q^dN9MhdF=^7d3xHJU;*?d&>D9Z9NQQwNb^K0BJ^ zDgBGhOy+#L!KF)*O=X*{{LTN{b&tNLMAtn6RRW^O)VkDlOA{HfQs1)+zi)CjF~joD zT@h>fVB6?zId!%eZo_5*q1ocQ#r0yjehJJINm1F)@K<>ZNt1E^WBi3R?|8>0a#!K?)5UKb7aGh_@2_ZiXK2jb zzc&AMzpXh_67Ku%x_lcvQrzu&C)I{p_oRV%?xu=F!_H9lYylTh9QZC{hzW=55`KrI zMvE0%%c2T=dE}3j)?%U;D6QdrBrnE1Y}Hor&KvQfS~B8R)aJLX9a!`nTl4`fn#B4C zHzq3w6&q!X-9RzhR5n2h|b(6Vi(@+Vs?l2tKp&nL)*#vBmbKm=~>- zZk|7=(7)S4A2J_aa)fm9m_dbZvxQXI3TLs>$<|8&lZ03WJMP2Ecg{U5_=NSzuX~%T zUhXG<6Hw8moioKC73#BGwkz*sCdsu3iHF_602YFaB zg|gxv2j8OQscHNUNgWqqk@Bs4#5PXT5qC#^HI$jOB_dZE;ja7ekEtpC81okmD7`NK zQ(IcN%rf(_md;9CK@ksPK+%u$x7nhinJUH1LeayAwRt+TZOy>WoMY>V1qaPmBy!hR z=2sSrsVCl#^P{v_dN!S`Oyq8_%Kx^s$S%0s`Olzmv8+kb$_R>7=e+?%KF+^pihPyM z)zCQ>=-p83m3^FwZ3x}#`pK#cn|Dw)_?BrC&5iuge*4W+lDVJCqIxHY1HC`VGSB;U zesBLh%lbMyBY%nc{IA4a9U2o(g95~zFXJa|E}gId!qY6C78nHZ36ghIHoWgiqb)m{ zQpiyDuhsq*6>8@5PV_eN-+#-_Z_^YH!&Jw8Q2Vkhp8JjT50!vJg8Q!H=}S7pR8%&+ zKJRDwi~Z`-WAP*X3s|CPU%c3ZW`^jmQMAIQWAxWQ3avEOL!)M-J{v38L z81LQpzJ9dr(y4JbS!R4a3sN=r?R?Lh13LSVcJ>eP+;44X4-h=;c$#1A=XdhI^K1J3 zEc5$0zeq2*f0k(xt2XmJ?;d$BV(~%tZu-7h-ss1*d^jd{<+wQ_T&{t&Qzyu@DgqC^ z8YO~x*DoAHo66J6-!~uC4YyO3?L>zT)3y&eeAo1)YSB(g)z%{+#`$9coXk&61Sw)& z{wspdKw&X=Vtve=O8S=iOD+cErkY~z<#oEwW7LQM)sfQ|@iBJRhn3nUu3>U_?5xbc z3qZC+=ah%rE}sUxn&M5IV?7N98Z~1Ye_?gPn}VLx^L`9p}kLxZcF4wbR|8p9y}l^I?C?lnBASvNp&Im$MG7Pzy{Mn zed?SVd`HvdIoN2%#~|#b@s?S0?@`uM?*uRg;->z|w+s8KhcZvM%8n?nezH0xzEq}Yl8 z6OCOk9~yZMNjX9b-4|7qmB6>}9T7uiQJV9{O%cxuV$18>g%_|3U!r#5zQ38e+rha- ze;8;YqFJ^dnc@y4@CPh++Hd6q9V8$WFS~A4E{B++T*a}UtkW zXPFM9F7aXXa$5sGZwjO5*#h`$FuVE;+XS?L@$z?!t#P2Oaj2=GYbH3x@2-JQ{L@S< zh0hHcg-_h?T*?8!a&_3{D)Q4y;89Sdtk>hz(%ZK9-?3x+iRP88*8^k-HVO9M-KJKs zUN_qU!FqkoHW92>v#sH;SCgqR5FWMnW_;E^O#qDhP($7CjIEQ=1J-`X1fdi}^22-YiSn+VqH4qL-tuNzE_V7+W_1NfW7atVQY zpns)+3@w~WC40X4*-%YN4|1GCrmu{73-6Vs6e?#W2~lYg;^w*CAL`F>%qxY1C704L zyQgscx8jRmDI8quEB#ZXaGo{J`Tto8r#{v=p)yq^^!Qmq@xZW}0DU9pIb_G!=4@N) zuz1VuL_vCR4~}-BgDSpUB?H^rjN7Mw0{woB<0gQqlrI&y|B!^#Vq);L`L@)53_@)s zb7Lk3J!kiM6ITH4X_W*ffcgJ=9DXWe4zHfsBa~XqeJ;j?g&tcS3BM`tBQyUd`O)x) z7-TpzpOFVF>FUEP;5^ODD|HuoY>~_jHEd}o{ERd6LwOf|DZFatA}zJ`BQdn{Dwd$9 zojN<-7_)SrF?~&8WENvyaOqeze{OSGsy@~-S0_bvIiIN;WXCXYp17;(6+nm!95_9_ zd>2<+EYH&ISjEoz9VQ8A&sV`gh!AON!6m!$4x;z*yD~Rg{wDb2ysh0K&m-8t1XKOE|Og*({!H`8RU-=7e7JIP3 zE`x_x3!cm=?;GDVr;s^35#T}qi8d(w=gv~1u+rwVrz8f zR^1-q<nj1iHd)pD)yir^DnZb5%*@s3L!*}nm#r@4wZ)U$sm z?v4qy-X_B-cGh_CUmd#p0`h%7m2iK@5aT2k71r<{>9&$PXwJK^I{8z{Xo4GCAie{>Xn}oNA zPRTT;MCd9rE#NSJR$|Oi>IXV268y-_io`xLvjX0JiEJ`z!i(HjBQu|uLE&dWB5U!^ znOV@$a+v!sVTD0G3d7946gHSlP(&xv&O_+2b%y_Q?zvwLI zHjq{f$04cxWj^F|$6toc6Msch?2KU6ieK4Pp$E&;7bUT7mw~tL4YE+A#Ukzm*zrwp zA$*Mab1_E5-N&VP%IZ!7#n{{8#Ra**0#pw~E>N7=POwvwLfGAVcb3lh zFwvcD>v;+4s^@Q|{7lR539lCcM-|>VXmCFotdj7WR#+hGlF0p8(Uh1fe@2us!8fzd zk=qRtjudf*w&uUhxm&RF{UC4Tf$pO8GMtje@WvYg_kEr1cuQUC z1jxIwKNlJhK70=I%&Id!5fTaVa2Nr~Qa^y8N*xxVm6Orfbr<#AyTJQKb-qp?3Ub|K z{*hcAxTwf)h6-S+hTS8^XSncDH)%M7r&)0D-wI=?R5~BxS>X@70aj zF?pg=>4sppR63Gfs7Ikfgb)v}7J^%ebfMAL)j*5r<%D7iVQ-pk8h+L6&6o@1Q>DAKC?C=mN;vdB{VZ~s)xzB8no|#f8BEombgJ5Q zTNZD77(SW-GfD3{?HeG_WZ*U7mzb}}%$H;^XdE&}5@d}fTf)hWr;lj(ov7L9(eRe2 
z!CYc<&fSKVwm9L9QR4_CN+3@^q}(UbnV;$Ggu5IQ**@=Oh>|fC*kfAobfcp7Rtx+t0}awnG~)9x|%9wf5thYpCUWq?dOeWnxBF1RmEMa zl6NMM6%YL`U@k1c<{I1yAmA}qG2dl^Hu>W|Qk%6To{tCp+ z348{r7mi#a1Vkl{QohgKB8%tn#EPKM<+KjrP)eHc zZmA}YrZ}?F(kATYXmJuo{4spm5EhQ$aEGTZ`kc+Cj{Ce#8XW46%)!-FwBfbT-NyoD z+~YKSrc6qH-z#P52rc8Lb^dhnhE)jZldh2%Rs(~OZo0xwAm^v(mnrQwS1HZE$&mkI zX6>V3);e_7s_cb$tvXoq9hzAOYx{4Dd#7oHQ2eWTl#h6qQ-KTdtIk$a!;dMdms|-z z`f5VxW9|{BijSr9rG1T8D#e5QGcNagaXz&o>Ft+r_l>)Ir|JQHT9ySI+?Az-{W4v_ zv)T#9Lf4K@T+=uWy`j60UI%lo=8yrvaoMT`VcaI6#l6a=&*;Z3ghLwAOk zql0EHx`Lm=D{XPgEtTRX-eq{X@0}@PqD{&Lm}y1(;^hgsli(7aDZI2J5{=_3(^Uo0 zh+L3p7>5GeFG-ff>r@P+ib2m$7lv>}t$*qD4vDx;;2b=oxqC=t&P47!>shTXl>HsR zL;%7Cca#Jm9D;LL6M*m!spOX#}Ps&|QRQ zY)G9spt5)tds#gp;rDN!VhlR1tidJse~Va+K)A%9Yn9#zXCTrLtqI+8Enw(Qt;wH1 z4e$f>@=Kn?>;B_EW&maBkyxd=@5|+_L#fYD>kC}Zot9aZ<;3Vl0Yo@HEc&qg7qxm{ zelPwOp5=}5hdDmR6dvaZ0N%vml)(JR51;!Ix0$M_fhqVki=(`QR4}DP)XsY-_f=xS zYdJ2W-Mu(6bB!Ly;mt|B9vESEynO>~ANmScM;SSM?`6n~G2$ywiwjzghF3c?pOebr zl>o1e#4AWu*r4a;PI@8X9WLDb!~F9!**F&XJq<;u1S(?QA-u#NU=33vTUNseWWP0OcWZA}uCzUk8+XL61aQT+WV&c_%MINT#$|B6nCj>3BnL zCLQ-q#64gff><7}i9@PMVk!whx;Hb41R&j;fe~;EAl;ien*<=;n<1RSEr4`yhEwPk zK)N^c6%v4SZ{~0kfOKzWB#CCc1156QfaqQ#E#e%|a~jk2j>|cCp6E;QcP2gbBWj*A z)dF^VD0>|5otC-vKz3ZVwN7hW)a?l3M9*jJxGoc33{et$0S|%SLq95_&Z(BuLXbsX zD|p-2aB3O_y#G;cLM-a*-PcZMrAq?d(S~S!=$A>rT7`_f^au4Wb+m7Jh^8E@)MZ(+{DBUoi8%@3uePSyvTs^e|kfDC3yE!GLv3yZT?Gt7Uc$X zq~6be>~hnRO>6>9;8ekZ?)2uL_Cdd+hTOTF#*&(XU^Q zhF=U8Y`wjuuh846S=rq#*931wyJ%aWBbk*Y;9eQ8Ox>LH_9S<=>vj7di|ci>tQXtn zuyY;~nHGX9bufhrAA*(Us=ocWnd1i%&nu#=h>HEWm8qHj%GL!dE9-~Oo?X}X^3`*8 zN+^Unw=*ch;+y`7xD3GFnEwMun660cO{{a_X1o8`xv$--VDWD%S(7zSm-T$huRnj@ ziDmc&mNOqo8k~>-0O%vS&J(KV&pQDEbzEJ{?TLk=mvid@+x?lT@fEw?urlBe@d>;p zVq>GT7oGNvT=L$Cw|^*yyj?=1IFPM1tT3_ed)R9q+FhG8C27G~T;YlXEpBr#Q+y8Oi0$K&2 z4jf=To+|O}B9K;3rqW*rE~Lc~`t@A`aB&xhryFEEi@DwFuq@6Ax5uTrvMa?WsuPB1 zNrrN3892-lq1BZmV6c?nrH`ahnzs|-^K5J15KW>l5^qwXh^0mgh9oDwsmLL)=Y-o? zIJ5*G!)=VaTj38#46Q2kQoZXfgW!b8oESoZyER;{OBe)m+4=}Kh76bEMi>N9lu*Zu z+sy{K#BR ztBHFTq2G$Dp?KFhHIyjhZ=L*Adi@;9u9wHA(1VMk_a2^f=T(6u0J#fM{hl|*@JvQY ztKND2r$nMFUe1&NC_|KIG8PuXIdxHQE)GNK`6YxyU4a)W2Rl+vlE(-(j+}!Eif%D{0Rsq|6xh5LFymp;oI{Ie&#J>|~+ zfwB))IPGsE@$L}53wQR*cCx@H#@sL$d}D|e^}ddRH13DEB$z5i{(*2`aJjw({5KpY zxA>t=O*2lKDbIvUdCS+)1wsIydJ{xLtavyG`=m~A=njqVt~)gCcL&$+NxIY8#FRyW zb~9Kc!4Zicey~m?RHzEn?>k>4R5#SdboP|G708?cjR^`vx2_P!k&^GUd8c48i*ne% z9S**e)o1et5i~j27K@R*k%h;RN3?Gg`S}arpBN<#evK392}~syFm~F@k#I+nQ!Zv% z1gdVV{BhhrI|jK$!mpTfJls+0wR!7ik+`1zYSJ4ex2*jOPXPo;qQU5`E0p~WkMYLq zE7LuN?{SzJe8whylR?GYb%tSUv|dmlRuyM4fOiYR6_L|Pb&gQK$rs6m)pVWH2cmg{ zGxJlyt&CW_(<2>4o|{3 z5`Sgp@>mnf{)3^>zWSV<1iAuV2>k5|xg2)v( z{b~x^aoCgp4ld6MViE)PCyRtRG}a~iEo%gw4%Rg8-GIMboS+EWft}nG%02`x!!PUa zXBOiIeBn1d2p5a%4E~Th%44~2zzb;4%)q&~CZ>^qDbSv|kc9HNk|baXv}X_^8gJo^ zBwz}(XHFoYe6GGcpWEWlZ6Ep59NK7o$sS!iayFZhPyGtVqqC)NCO->%DYN3o~^x{S-L|M?H(O> ziN(5i>RRIo96;6*${m=r1-yJ3S{UL899Q8E&c=f#M3LhT@f1EP=^m5>34eKD3oqd+ zd_7qDfTRmYW{*(ot%^$h&MGSHG>RNiY>vj{qX~;f3&-fyo;=}+jN-wZG;+aT-?Iv? 
zIHx=ac7iNC$GBYa`wA;%f9r`vq2DN|llvsOLPcXQ&aGY~YTzAYUpqyvrdo3O=a=gw zw~7bO1R#R^7}1pI3D5}fj-b0t{NZb>^8YcOKDNGR&Uw7x)H1XW@}B1QrZR*HCSz;< zHqGcuU5%qCGWzl}Z3!^JHCkedX(%-|XyhDQB0PUh)S5bcG1Ph!lj+kd z!fUAHotLhUdC4Z3v~tGc-vwT?cYy+Tfj5oNi$jATDeFOWYg9ZYDqOIjUm^bki-C>K zLLG?s6%eshh#I=TNbg6(?^}{sH6bEKbY|`ZD{SdfN$S>K_vKh_1hRz&VP8&HBai*S z&+c}s5}H(#myIauCfsoTq;rLFG?yC&mQ*A6xrV9FU5gevq3W(4lb(p!2d?44@OGL6 z0^W0{*5}KmihFgFrGJI{QX)dL_w1y1E_iiW=2pVK!Dns*v0emXU6#23HAj3gc7)oOAm~f9zxA?sbG_(YL7iMtp)xP8B;xf|e$y#<| zcb3VYqcGk#2fOp$zJ zCT?xCd?1a6-;Ej-03%WsqUv3gbfShu5e;)iTp?shQB)3_Np85R(AyubLs@aat*|X7 zsNBB-_$KtNyb`mn9>~{3?$`#A-?`=a_&C&DuUGk<4Y565~}0tl$SFoiEThK zHx;PdabRjzvSDrJx}fn*x8|beUd zIA1_4cug=z3y7sU|BI+1mO8oLsFAy#0_Kpoqqas*iTfB+0+E0Ib_%nLQu7G-4(8sN zzm5zsD!4J`P37{wF@LdUbf@r~aXlGp@-bTl@xoNCa;hx>ztNO9%C><{=#ARONLvN6 z&Qz%|RcJ2MDz-~+;6cFajc{U$v-=P|=}m z88jvB(|az3N3-mrS1@{AtYuv#+i9qJ9uXTR{JL=`x8(#*BnD?FJD*pihA?t*KXM_z z_|M=M{~7$^Kcinoxh?j~_{+eIUkuC;1G#=x{-u9L`Wv{tgKS%dN=qmk27knGR>EC} zNRL66)00F=BcU&?WumynqySyNbl$uj)uprjLI3;rs$AaN~C@&5Vy{+;~og8#Mo{^i%& zd>z`qF`@zAm$-W|T;C*evnrW~kjAwngEg8Z+Il9gB^j*KELkTL*OCm@YL={(iEBv) z>orSRMH<(V4AyLxteJ^xNe1gSOV-WAwWPa{>7SXnmWmHxm5(9LP;5f(7?GtW@%NlP zRriGeQmA!(1$Smb3MJij$1Z(rsF{o@K=E4yz~n)uf0L%P&!4 z`6Ws$zeI`UR|Ah8LiKnH{w3ry5(zKPy}Tsroxxcz49*f$%K;UCNtSIW=-Pvl*YGCU zaLIH?$UwczrQhj0D8=|bA-b7NfBFWVxl%n7&k_2V=gWA8ntYMCN4ilGJlDis+%ep) zd*yVV>(FXU=EmxFLHd7by+nJJ(vJ^&aSa*8eME^mGK%|%3{Hu!755Pt zW5|%zER7F~7j!WaGUsZVZFD0@Ea|d?#NOH9NmbVeKM+rTym$I*dX|2Rw_>kXf2JIz zMf=JceO7xt)|mq?LGuD;MZH0rZ zLi}G%1wBQ@`atMU-Tua2Z;OeC4TUDQurK7lMR9L3X91FE=<~q*fbs0k{&2eenr|=K ztj{ImIm>)L7|-#xfNlsJ!SYi9LhH|F11C2xc|zk_=Dr@t?x%lp!qE3zB|xN8z2^Uc(YcvIoZ{3ehR zJxzHi`c0w7o_@Ho3`_D8$0)VZ zdCTF@^^sUZmF-MMdBZE?H$y?#XLm`Lt-U$yJSaf1iQhdaL6lD}cFKjo<@qAVn>&I_^@PSJ?vv+nF#m}+-x;9^Rsmeid z&)l!ne0pDo*z0&eM*aK9ZEvSAUr^+Zc-JB}*iHP$(*#ltXG;`2y>txnX^Id%1)4J^ zd(2a*y2a3&f+Gt=Ca;@~D#fxAYopg1%y&N3{(E ze09uKSpvQyYMc^&#VTZk9@|b#*M4405bUZ7yj8Sz!rL5k4~~RU3s@yEFFS(|YaGau zD3HaFudOnao@0c*3!=MX=S#@f^hT9i`p%t-`pt%idz5I;_^kOUNYjLei~5&0wb@$a zVT9If&T}-mHX`Gbpf?f~*4s}Mx$vibVCs6Yk-H}09VmYZ zO;jVi(@3Zh9(=foYJ`Vf_e3?q8%08m@bFZ45qITw7)R5@2=7)`&$me9MLTwsHjKGlLZ_v9hPr5Es)VebB(EVmh}co=rJZsG70fz1eR!tmtb&Fq<)UJUoj zYa6fiMd-bl3jm*(g0lnT7PgS7j?Nu7;bk%-?!Mv%sU+U3sZW&1L*Gb#jB}%mxHrb~x1pg48*WPB;NabRJ%3ZlSYc1H1R9F# z9o#>+O!RJzK_$vz(}fOh6mIOCr1!^KgYe&A}I`%c8)^mV~63Obc zlPy%h?mv)M!p@&Ad~D+;=74vDzOILV}Y#IQ<0|;%`Y6QFy&XY?XbLknj%2;4O}S zTqTH}2qML2O=w#P1Yr>ZKVrj&`JbNZF;~@>Xz4ALNKUGqZBb|t5<41x%^I6T!<(W8 zJXCnCkdZrPo${?jF`=5}Dx3}*Kz4v!%T<#%lpQUc0o)^`d!h&xFlq-W{0VH&S4Dm- zlHWjnEZ9EkoAaztP)C$1pl$@nJE#Pr$7)9vv{0`@+Pa$UVzD`|F-H_42>Io+qEqu{< zbXq<=Ff}9D(2==P#QyxfG~tbf?b(sJfCOyMj!c{cY|oBN6Nx%WoJs<=XGi8Z60kix zGG8VE+p{BcD2bRPpn1lD$~%$4O@r;(0h@k76#3g6oHJveA{$buv7fO%D3Tk7=dZ)M zW$|8sxwUTDMD3io_mo9b@bks#7^IfQ!j76>4j1f;`n0jAUJ176`dQl?zwT_%knPqh1SGGUB zhz#j@bOFG}?hpY{RJu_5d`&7^ zY8;a_lp!o5Ui2Xhu2!(7G$HEPnyT+5G`}H`6o(r#FMxREdEWl4%_@{1?n9u+xn?YH zL)p0^Uu?LpGQGi=f9M$KuKINo2MMA|&o}6_yAF!W+%CkI0q95dFSMiKMSl0(^%&Ku zd73D*Y9HCz>}R9Q$_b=~C6Vw)5o5c?1h6knNj;&|qD|xtiQNA9S~$({o#l{a=Y?9A z0gcGhyyN5USMytrV{JG{nxRtIUEN7m40`D%Wu6?i5J{QuzF^oBJj5HiGk7DemUBLl zL&J(^_|;g2khn@LY$bsoV)P?OTtEVmcXuXEqD~S`BoKLbXYfWm1(A1m<~S1Y(z-KW zCJ~dwp(L#P0tb_NxO?U%HZDqGq(^vN`v+GV_IC?wap!`>n z>Vha^GCsW^{06u~cw*=<7jnm{H3;Rps&Fyi;Ie$M#k;1 z97?Rciy4PPtdh#Tg>8VbeLr2qz-0{DkbbJ_AZiiQ^T!=6B+w1DvkPlsf7;jUd)~1P z^%j0lD~7ag)%GopUSGJFN5SI(eZ6yHe)>H{v+dmE>0-B?#8;|FC#F}jRg%-;WX(?G zYL}o^n`n3&+X}@Opyox&+LI`u!krntxP-+T%Bml`+>D|ONc=CrHw|P!xY9-jgopwe z5F!d>K!_-i0U@I3F9Q<)lqI5>Q0vnJGeqvuuf!k$lz3S$DUqIQ#VAl*$dHHAGj9~5 
zN#oVMC@nhfMX*A@BK;xJ$4yU4#r#bG*5`nZ^3 z^Ty5);iT-ZBLBL8H`gV4CNJQf@ULfo#c*SNe(!XBF5&l1AcJlO{NA%{1)tx0ynXNU zdq>#@0)Fp4wt~;^-F1{USie#@>lcP#wpiNzB~DISpFu`Z`Pd{rDC-Z&-J6`f7T zIai`Hhpjoeso9wIl6w|vSH82tIE zl^c_X+?ZeF#{42T=C`;EvIN&FYLMmBWsoc*Hx|KGlfOEWZSaMaca;=3_+oIr!54$` z4ZawhZ}0_zw}xe*Y!P1s-7@&%V(`VxQhX7}_Iy2aGMu&O6s3ejNUMmX34#cRHm7e= zJo0_A1&`Dnsd%KH|J408PyKNkS=dlA04wpwsj{%mUoyz&JS*CImuGo@rj>=Q*7xtF zj!RluSck1}n62=8Q$cY_1|Tr>r{C{@XCwY_1AH=1>U2e?&~f?OEPumo@U$fnTnvRh z+S&<$QkbJkz5V!s;y=*UxFAMsAg%*{KCg5x=g3^vo-A{jszOF#naIxNILcU%zr9~s zC30pQEoIX$dPxLGt@qNEQW|HNPvhW<8z?@6y|o(}M{znz-njPCVxQ0o1WbAAHhzbs zX7Edh7=9~Kr}ImM9e#*SQ>1N0h&dWO$gN@~Zvl7Qepr2EC;GDcuaTof@Lf71>h>Lrg&^31|)(Y{! z*?6a*tknJRN8m4+eSbUGjt%*zx(!v6(tV=pE#Rpxs2cQE44#6j8I>HpZ6HzebUb&F zIs4}bIuoMi6&;|Q6mq45Z~;w;%!bs^)Q$XBq$cn?EEVS0Cu&e(XW4F0G({{Pp z^yMem_s}=$f9@>X0Q8MMO67CBn9;hoSu&7~-*=P)I4rW1c+ zcC1TxKivmAwa%Bn{KGe<{5daIOZ<}8610##Lvg~vI&U!ipbO51($R~L(a-WJ9ZvtV ztKe&F)1q=eV%rCl4u}Ct2gCrS17d*E0Wm=77}YGjR<*Pl7*KrT^~+jIr{KxR<;yQ6 z_oD$>PY%xd1zCPAziZ!>yoP7ga$vHI4qV38mVT#iX8=$-P&T*%;rV)=36ix4PLP0K zTKPUo$;?q~!To4TO;XA?E5g9uVG5KEl+P5!8AtOBrSoPiH(Nk2eF*(;B5#{eQ-GpX z@R?7S%D^YEN}Cwf?TGn$^EbR=@JTJl!*1!Ggu7P3Yjh#jQb1jJjm3jvsBP}o<%f=EvmX` ze^Z|^BEXD7ST_r7f5lAFX?Vft_lT7&MEEQDN2Ij@*CHWjrZcj#!dC>MQIuFt&lr;hY0=D0A<*%&6wc^wz)a1WyR-ms8#%~FPzO! z{xIo5Zce>r_F3Otfqk&`o{VuTSv6Fkxu!b#+tp-=u%_wqNcIf*n@?;R%3i}JIN@Gj zBcU{|tc$t(sY});F^qE8M>qnKwLjEC(^{T^COtgTti8ge?Q)jVGllP1SyX#Q9=6&9~J#0>vsi^H1Z|**n9G6 zIW9PmabUSrsx&~}kCZ-!vQt@Eh?brw%(#H#nWq>}DZh|&W-+cQ<@FDJzZ?JJ+NQ6Wc zUj=Cd*hIXbi^{($-sQXR)B9QsKROUD6iV+YK-{{Yl%i5~emlpesN|`i(i*>$=mDB@ znj)gqSCMzVBS$q|eN{bK7(W^WgYy1UrY@A-F5O^EzXT6DEuY~ECH-4G3RINO`#D2Q z|0jLFa3a2K`ueJ1l2@=Q331WUy7bP23T5<0_oonbk%p^kD?;~wDDBZ>>``!U*qcA! zejnvTray-DIu!J=Lupc=!|9tWm%fGg@|8~1(g~f|Q0q~YmdVA{%6}{XKH?`So~s$4 z#^%G@VL!NFcp1}L&F7gR+H5hrK7_UaT|WR2*FbrkavVM6)0$~(*B0N+a}f^{o`*5ye4wj)=x%v z6F){D=)QkRfHwcp#-WHW(o^uIHA93jYv}~yT)bu=^DLAY7Opp1Wg>C4wuPn$iUwj^ zSQqX>$AB!~x=jPqorSHfp_fieaIL zVIkU$XN$GBA4dzYUWy$=!%U4c5w8Mo6OXMX;o|b`gM5U+3GV>CHN%&%=bQ8N6%4cI z4Ikg2*oyMvL>w@=7`}YSmWh_^HaoqnR!H^Ps-!KuPcGw()u=8obQ+yLj$D-k$UW#l z_4$-uF6o{JfnENN-B*k_S+ry|XV&k0hs>|}4w=P*^NqmO8frdb>GnDnoC9Xrw|OHQ zJ=OlRjMvmRf?58s-zPPboJ9h`b4Lb#hWHfh$Y7^)HkaIv4DuQADcF%YgfLs^d9s6+vI*drn==U;)nBt}lQ_G=PS)>19-32_wn#C`nsre-|HNS+W=2sbK+{LJHnhL3;jv*Y2&-X|h z!F##C`tQ=SO43&vDVKgYePO&|1%5dyOii5m!mk(pOJDkQ65MABo4b&|{w<14P?P1NMLjmk|FIO%o^l zvDNOv=19~|c-4X0iQU*qqt{4toq>8UUqus4yl~;JtKI54M#77Y2h#A$gy3dXKM|~f z`(!b1HR>i?MBU`j&2bnwvJ!_f^MzOSa5m>-T!Pr0CGXU}HGwLM>U+iAqw}*zK!D5H z7SBzEM~vemj_6Q!Pni#IU%*Hc5g`7I2!1G znK&BU2TdG}^GT=*FA^*hN8>CKs=|8;302`emV~PC){;;a-a|;Bh|&%|m>?>=nY~0^ zSQJs3&{F_WM?{hL0*-jy>MbM!Mc!ZUtw=%L30=wyk^Qrf5CL*TiY!(ww5jlVpsFC? z>MN3{8ZSDd=jAGT@VJn0_l%>|g?%r-v* z73bwfbQS|`1ZpVqdZ}oP9rJ0Da|#_!v6fA@6gw$;bxy8hCsIEHbRylK!V@W!jMTJ_vaV6Kff9~ z9T5|;(-AT4tAZ{yZv(6zqhh~|pAO9U?!XKP%KlYumJDgocIFD66S?t~iH7G>DQ#Q; zIAN;Eeh;;tWgcYDhgwfC53=t=tzR||viDKkG!L@>x!;?IG5WBJOF!QPA(-v4foJ>8 z^nLv_+0+?HCYKonjd?k-%#Tl{wwaYW%7a?Jbg!Z}hmIKM1? 
zVyr8wvItiG6{uAEmNofD-ZvQN8)OL|1tp<1`P*{znZK(lO9+locA{1zHU?23{(BM~ zhZG)B#4{h*;$mU_Mi3BOAXV+^E#0wg_-^Qq|3^yYM*gIBw{*u0p%b7tlHfu5V8J6LH$ccABH=nnMU4Bdfz*U%lvPYm6G`RzcuV}YeN{>Bi5-VhxqpWZm1 zZu|7cBHsJQxgw!nGB9!YpFwZjx`&}RxYzb0iz6(7H=F>|BUDYev9GxjR@8_>68z%p z%Nn{Q)fLbrLcknl8|-=1r%wu&J_)ZXY-dNZnap`47jL<~fXFT*Fa~h`)#g?n zmpfR?oatq$1M(NzM{mw_&@N)EKgq79@E0g7mn4EwoBb)Yjv;iQggET0TL$H4bOU`FeNqt@m`6U!DQCbSM{&`@=&jx0Q z%5lFcGD{^H>03pKBi2B06wG6f7d&As$5_YWlJ z>BD(I8f7fx!S;c~Je4zi1ZCjl-&kr;)MGL!C3+4CnPHJ1vO0#}KhQiLe*av0j#8K5 z_jCNhxL`g-{vz^2QUSjo8H{M4DZd{X4E+89=JAH#-_JbW@cYF)Ug7rtbNhH5TT#y; z_`c!y|NmwmFWoBYMfE%6B>vz3?w9WL|BL%&nL5dVV*20x0>eu2u91(ibp;pvkJ~Tk zyAY{CwvC8U=RQ_(q%g29gzM3A``Bw5^HdPFfLbz#(L+wmy^{Qz`c) zvbjvC=b%JwU8q&egvFq@CczaGovx0`W!y;TnD8U9nC5YF#BYub?Ib zIw*9Zo)n@fCLYj~jK59E_)GX5mO72!JtR`uSBovRu1|2Kpp`>uCFz}3@`ju59r9*~ zwC^1pykP}n+&g%am#>u6fxwpW^G2lJac@XToi>>X-u#g_L3Z4`fLl4o`6PMu19{aj zskS~r_}A>dsK>-}H*FWG7nlOQqj=Mr1*o$#If@hRmIX0cd3es@rNa8I${#x4oR>cs#(P#c z2Ae6AJ(|*33QtLT*KwJ^x;n}<7Yz;N8}M^KgXJmwti@{U9m;-@LhyGXsg%E4V8);? z-wE!m!m5d2OQi8x+I)AczJ;MPdIc7#IC8{Z0_~0l_24QRo)Nq+X#|+vSHUT(t%5J7 z6H`P*{U^SVA}Z=X(R!%$bmrZT(TEb>58YEu9!hHa3WVs)bbUHhCzI=VdWOuSI=T|Y z57BqpqHLK$5JrPIFfd~c?}XQ{AqvENNn*Gt%Y+q&`&AQq41X~NqDqNzo;4bTxdTf_ zjDVITkdubDMvSZ1B7bJ9x|%lTi(+T3R(x<{!kevIZYPX?*j}+gRU<=*g*qWu`*zYB zf&XcQ?h*th3C}xa{g2_U!8l7d_oO5Ya^cThs!-PYBfZ$epjPlZ8n*;7vw^SYg}Ax` zLp;L``YJ&+&nw?1Qj51z;dk*eIr{B{Y~E7UF3~q_J%;K6wl$c6fR`0h-A{GZ@>(a8fhkfHCgg*pt z4zOVB)qnA7Fn}i3mg*T`qSTuHZ@W!;JOl-w25#%yFjTG|M0$3Jg50ZdN8 z<07F7kQW^Ap-H{M2$x75-BurHho^--Z6(rFX=@P26eoVLhZ$S#mQ`PdsH2joKtfd9 z^N|qK`k6?GX&rV^hA~}M4ZA4Am@cb^U6f%=msP_q>J`&E?4k@~x~v)t-OKQ_DvIf{ z>Rbs=3%zed>3Cc#h%Mik2~>2~VO~0lZsjhA5P>e@y)RNWoNFPavYwT3W$l7 zh#Pkcp3z!{sqKMlfHCGgd>!roX(IBQeR>inz(%z`nk2TuI(Dief) zbG}~(YJ^*qkC&@@Mg)JT&Q2mmw0sX^T3K}PUUfdRALAXWdrzGfd*Qw)EnaHO83&RQ zXZR&Kj6+gA92_b)_Vz2_-AF*E9PjLZ#Wqzla%HX=y$I%K918zL7R!|gm4chI2zW~Y zpu%1R(-Z*DDgapjMOXy@JfQ%E0Kh~Az(Wd91VE7h?ofc?0Kim*9J2`sJyW7qV|^ta z5e|;$ltjOQKLAt@jAsK)T0{n}U>&l5YmI`C_wa$h6FJy2}&ccdh68SJ_UTn4)? 
z0+&G`MBp+W9e~*9e9mgPuQ3JmUTY8~yp&wrACdxkh@h0F5ie!M>jB1vfasn-iUUKD zSt8_NAI>kZukjs(bQK|QDxN_>_xf!iZvvjNz495{XeL4)B6R-($OFkhKvxKHBk)|v zKmpOKMaVk_&qcUk973W9c?UZ}9b5C!m9Z+sU|Z4Y7|2k4?Z{ zdJx3=A9beyhv4$Uwr>XsFNHV<3GPA<2q9#;8v*i1{JD@Y#OeO53b+mrO`aP$eVO$f z0vy<}3x+fy5`q#aa2}@s{vDqZQV=9b4iN7au|a~MK1)XrLZW<`)aJlKB`eF zy7~p>eC}bhuaYdr;T})V@K-<@8Q8sOrNh7;{LE%x%Z%__cuylfT5>Hhhpc}jXvk_s zSG7whNg-|il}-O}qj-3~On7)NL>z6fCmi;-xQ;28Yv7CaVzg^m=L zSuaR|3n3**pu30*y2l1JlqF!64U=)#;8$gU|0Dz4n*lz^h9||_)9UGg_hpf55O41) zsN$e~yTBm$U8qF?eM;SBfw}V%kkR)+i^-8d(E68g92BZWip5u-Q zN~8IH1Rq6MGCu+*BCr8a$q1xi!;j%(p5_P?BhIZ6zDAGK_LyTbt=_0 zIK@vxE0768D{K!iwBiOl91wtyR0{p6U1$<8kPdiaI^YrMfPK;dckxKIoQ2V9f8Pe$ zT@s5qTeq9+^jInw`SB&lWBVh#Sk>b(%;D|CK9@K*O00{olt=jfqi0cytAY~za|kqN z30r>x4{#KmC2ai>JQUzT*!u7AP>2U%>%YQ75gvrC*W+P09>~^J2wyZEerpbV{RLR4 zu(YE3LMz7;oAWG0s2mT3N>dXm6#+uqI8ob|$50l9vRF?R*2^(3K?=nw)?%;{!T2}` zWi&4#W{?sze=bvpLJrO(lnq)6T^Q5lJska93j7B^(Q)Ld5t3EqF6qdr$nTde`1i^D z;1>Kkm6Dl)Z6BB^*!F>$f^8p|DKM+pg-8K=nK>00PX~M@9Z;-Tsh`tL@7Cfrm7+wT z*azzjLk>=W{8kp_g~gY%Cpc{tfl9`!4{*UZo+slO!J1(Kst`2elE_M^Xs`fP3Z{K{ zMoF+JO&4T?3ss_PusHFM#O5m8AzxX9kZk^F2~kdqQDX^UyAbdY&&p!N=?ILR>%j&h zsD|7p8mY>QTwf4~TnUmed5=1o`>x1!RUotoo^=66yewJ@!S1~z{3yY0rQj_xeV61t z&c%)r?64sR5oxg@2Z9|i1HlfMfndkO5oHma=}@1u8t(T=0c~gvTGJZD*qW-=l@C?BY&xl@u zXPg(94qCU;AH~Hhct)JJ=@9JJ_@m_@synK%zj`_ZI|K)r4w|~7>ifh6U_7vbXLVpj z@k4*)y!3Ki2p%NLbw`R^2Wd}i9Z?8z-pCCQ>&~hq6}e7?Ix`nhmvB)qZqh;E%5B{3 zL$S`@)+3kbJgdU^QBFeyC)h}_zejdH@>ZQ8xWHUu*kF@SV&5JkIVn*PTX>Q$H|=3S2#+q`INqT%*LuBBcGr%d4?pYdMP(TmjwLpr$ee+&i#I;L#kWO z{eGuIs$0(eey2mKTh9G{r$ee+&i#I;L#kWO{eGuIs$0(eey2mKTQ2+klu_arkm{CK z_Y$d2;mBj-b^yFrQLWE2qU^h?EO17wS2?tVW+ z#d9e-_nUUj3>LM72XYs{G_65YiR z=&)fqw&f!w&|xiH?8t(QSuywINC1r$_9PfN|;+JGLrkzF9eYwZZ8SaD9mw=Ltt|e39mUqi9nb` zZ~zE%KNewb04_FAm5@!WdSl!cy$zdes68C>*$v4M zM-ziT6fPQHhD`8JM=l;uGuP)tKGrs+xyJZvd71`k4x1ssjMw0ENONVv=^OC?X|7Dz z{G)h)G*>1({ttM7G*>1J{w_Q~nky6TJ{J#==E}(0jrv(wl|UDyxhp8mRU}JuPc){N z=0JGRwByjU25r9=9EIRYIHYqq59!cX2>pLc%!fh}hjcDehje~)gmy>=rag>{9MLht zE1Bfc93%WDlj?Ym0f2oxhaNmRo}=RLI)`(re$E3l%|8Y)r8Wyj5bbu&;i>0wRH!?g zju@iO`r_Jk4$g{b0;oh z7P_EL8#TY~v_~C9YF?FiW-pr;JI`ouKeX3A@AFGJ@AD#d)H(0fX{2WC1y%jC&;O_< z#n}9!{LIMm*!rVYulS*0Xh3V;HRZ<8~D|T@fpz zr-~C4xgPoZrq2&Ar=4G-AYa(=vV%n(FIU}D>2#6@Fnz^)oqH-ri+3(|+y+P%)|5(T z$iizNKMCI9L;DcYHC%Jvy3f^i={+hqrd;+6S!fkPr84|yne_>B;nrJ3|K_c)yWnlJ zLm8ux6%3J?2x-hM(gn(}}Rj;-hAuOd7#K&RaP4SvhMg)n$zfWEjbypS-$7lm_t&#Rj~Kx3pSQ zZ>h}1b%i(>R{^$;y9%$Y0BM0UXmgo$WrY!b5;F(8tDk~PQ|+#XIqh*gz({J$z8(c2 zLNdh#Mo9*&`bq>gmIUrA8;j^~>#HwB0@2^rSC2)afC#K(&f#`4R ztAB(<2@^j=!en9~60*BGAW=2fSNB%CtJV4_9dX|HQtLG%oP#AJc22pF+Im$5`YU@U zu?99`S(Zkwlr_NUxpJUbTthz2|7S4&)wv7C$*={fK^S-p1yzIIpCsZWUzWJT3K`%? 
zRCdf)HurBdtq@v%$ZR&zPtlN)#)`r`c; zW^`73H$*H&&(LA^^+U>MSo}CY&EtwstMmuWi{ut9jUO;xitvQbxpxG?aCuB=xY^Vl z#e&$D^6q%|^coTcgyXkAf91d*&WC|iA4;JDR|S<>@UDbb4A*9%?AMD9yzgMpd7OAE z;Jw*KJr6AeqQ}m#q5ai?JEQ)efcs!v>L1%WJn7(Dq0ibDI|T8kP}@CActALI3bwZJ z++YDskq8Yf5Lg3| z6L3{CKbKGGwoU<&b*6y0NK-(IT7&Kb$S&J$T9mM2L^R!rt0^%l0@2YBkVR3i3L+O2 z-ljnGOi_3z;~5I?i-G8Q6g;cW#WM&UcK%-^3a<~(1-Q!9jE)wCcPO5r@HXLqjwrl1 z3Q-M(_cb$mCWX$b{`3q2289>lP;G@5WU0BE9JutO_Slc-D+Tu<451y{dLu_2ZN?3v z0Al#;S?DWSOe+NYWD?vDq^D6_+9D^8u|(?AjW%6$_V@ zNJnU&&@VOPXXZao28V61_glIZ;4zwDtJD(aGkvEvi1)@z+uq6bhYi&!6hMWT{1BQ6C&he(i^H*9Y%{oJ9Uxf05J5jx z>)&fN1>w5{DWT|%Emd1$Jy0Ege~RjSi2CxzuCH6-ccCmfS|6R33#Uod0Ok5Yww_=C zd;@53jtLK0KG^VT>YAzm_~N^hCIhJ{`M`#Ft#uXvB&}HqJwSf;3aOhf7f?ei{kMR zwYK66-dWM1OL4mDMoA+;kBT|JB4tDpCI0O9MA37x~*voJA_CT2bJt9S@2(G@@ zT#hb%#NzQ;H5e}w9?R|l-0J3!&d4^cs=U~4A(9w}<|owtTI_ltt^d-**(*d;34K!N zl6%}M2#jkaA2HdK_A<=?9?L2SGWBQ_;(;nyWNoG=+xvvhVEyd= z;%j)-R>Us@1!>@zn2j70fP#IwJf$6R#!U;LKe(c^B{my(r64CywI2gDRWbu!wO{O7 zylK8UR>hxXu>X4NYVx@+Z|1>t_XVPTB5zJMn(euZ0)^#ZOuRoH3AJj1+>BkdRj zg(tv!C*TQv3h-VD_$hk+o&smT;ylvtmIIfC-QRd$9nKonIK+2=Y`XdP6S%bJ9UV>y z%KN0ipV8qkwma~@=0^VJRGmwGZuTLRmy7|ACfWNB42PEMC(%}at;D6BJoP2iqx9IQ#67Xsr zzJCIKkq+;dfREPU6qvnF7T|CExYlr)4+C^KGCKM9g3-DCRdC7wQERy1{RJHk$;By$ z=M~*@6kN)=w>4bKneD=Hu8*tGJ-=sKyKu{G;+gaif}SFYYe*8m09Ax1Tu36MWbF5U z=)z()s;|j*?b56-L6%76v+rk);8D&0LJu-$d-l)Lp1YJJ@|2maaT%#FssBEh{Jo)F zht7o5(`O4dAo#4M{z*AjAbVBI2ITZ%t6tQ*E3{!}a3J)^g5dWY;-=^e--mYRR>ggh zi*vAnE!elT__?aLfE@Kca#pUd_>Ca0&#HRH7dgK-RL)>GU-3&-i=GPRJ_a4f=C#*Af`D%2MK6@7O@$3t&E)E^Bh)x;|_vYNV3f1th4^Ne~+ZGGIx(LkCq&O{;kJ+2+-eA0vQa+*8R9-Xv7Z=(>>MxJZF0os(O*bVRtZ4& zU^W}KoTrA`5Kxwk>mSunACkon|E&f&OG^2ddN`PG>eqn^GFs%hs)GM79Z-nY4wov#oq(~Sn>1V?dq3-DeCKN z8Q(tI*V6Kh0HAD06=)`Cjp}g;D%uF0!dXWY0^;;DAu9w3j7LvZD!3fR<8jH#@|wEb z!K`yt);`jJ->OpkNdx9eR@HnO4dV1Op;RP+uv)vVe5n7aRw}1m`~;P?i_G0Ms?;8Q zs6Rn#>k4M&xpFDB0Bvn>O2tEcFD)xu9Oo8Y$;wGI$H|I^`VG)8oDq-){6l3u06hto z;G6!^r6(kt>e6j2@fN8<4Mzb1*J%J$kQKi|0O}K}zVQhH;JcO`{Wq{ z=4m+!2)JGY$T&_Dfa>j1=D1J;NK3~EK(#|A-$?@C*G*j(@{>W@C$xFQ^EJR-1negO zH6BvN4ro&j1>?!K_=f^e?Rb`eR|KG{6|%5g0ICeZ%ipyeck+|_v>f*m(4YbCA>al9 zs9FjLxKabim`>C_;ThvN_is5fPuej80PvZ*FNro|4yPFm23EXl7B2Cal=DlI3?BCl zH0{R%!N!z4we+Jd&?zRtCVnCWr4m@Z_v7%o@A1se>d$Pa{>;9+&w)uVFv_8U!fFA} z%HDy;NzVkV{Q&WSw~DHi;8&Emumxwk3HvhKHg%tH$lx@3z6&CityA%|pa>fEFap3X z8RUNJEse;q92sC%9nVUbEsqLhg*+iq$%N6*=P?PM?tp7lBJ8PU0{^&!yM#d?t9}Tn zt=?3oNQ*CO?!wNj zogg64tQ>g=TQTdmq9VkY$}N5s{%EFKkVZbv(xn^nmu}pD=|{bLtX;aPyRS!MkF7m+ zjl>HO+b1w5jd^_`$Bb06P#0j@n0FZg`9folVUAn6?FWF8gKlGvTe5)SkxB}hgd8K4 z3^bZaN%KMGz%_{j%zOA&s$0Tde16({XzX5joAw^qZq}v zC&ZVkTYRu%oXbDu{1f0F?gAdD!*PvkJ&2`@{j+>w2C#Rlf*Wfqc}8k_Y#RtX(R3D? 