diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy index dfe23f494..7d814ca9c 100644 --- a/.jenkins/extended.groovy +++ b/.jenkins/extended.groovy @@ -54,7 +54,7 @@ ci: { def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906','gfx908']]), "rocm-docker":([ubuntu18:['gfx908']])] - // jobNameList = auxiliary.appendJobNameList(jobNameList) + jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { diff --git a/CHANGELOG.md b/CHANGELOG.md index fbe012dfc..353c37631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,17 @@ # Change Log for Tensile -## [(Unreleased) Tensile 4.28.0 for ROCm 4.3.0] + +## [Tensile 4.28.0 for ROCm 4.3.0] +### Added +- TensileRetuneLibrary for updating existing library logic files +- Support GFX1030 +- Support NHWC + ### Fixed - TensileCreateLibrary crash with relative output and --merge-files +### Changed +- Change cmake_minimum_required to VERSION 3.13 + ## [Tensile 4.27.0 for ROCm 4.2.0] ### Added - Benchmarking and library support for CU efficiency vs. overall speed diff --git a/HostLibraryTests/CMakeLists.txt b/HostLibraryTests/CMakeLists.txt index 0152ef259..7711c646e 100644 --- a/HostLibraryTests/CMakeLists.txt +++ b/HostLibraryTests/CMakeLists.txt @@ -19,7 +19,7 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
################################################################################ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.13) project(TensileHostLibraryTest) @@ -52,7 +52,7 @@ if(TENSILE_STATIC_ONLY) endif() if(NOT Tensile_FOUND) - find_package(Tensile 4.27.0 EXACT REQUIRED ${TENSILE_COMPONENTS} PATHS "${CMAKE_CURRENT_SOURCE_DIR}/../Tensile") + find_package(Tensile 4.28.0 EXACT REQUIRED ${TENSILE_COMPONENTS} PATHS "${CMAKE_CURRENT_SOURCE_DIR}/../Tensile") endif() if(NOT TENSILE_DISABLE_CTEST) @@ -79,22 +79,19 @@ if(TENSILE_USE_HIP) find_package(HIP REQUIRED CONFIG PATHS $ENV{ROCM_PATH} /opt/rocm) endif() -if(TENSILE_USE_OPENMP) - #set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_OPENMP") - find_package(OpenMP QUIET) - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - else() - if(EXISTS /etc/redhat-release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - set(OPENMP_LIBRARY /usr/lib/x86_64-linux-gnu/libomp.so) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENMP_LIBRARY}") - endif() - endif() +if(TENSILE_USE_OPENMP AND NOT TARGET custom_openmp_cxx) + + # Workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/21787 + # ensures we link to HIP's libomp and get an rpath to it. 
+ add_library(custom_openmp_cxx INTERFACE) + + if(TENSILE_USE_HIP) + target_compile_options(custom_openmp_cxx INTERFACE "-fopenmp") + target_link_options(custom_openmp_cxx INTERFACE "-fopenmp") + else () + find_package(OpenMP REQUIRED) + target_link_libraries(custom_openmp_cxx INTERFACE OpenMP::OpenMP_CXX) + endif () endif() add_subdirectory(configs) @@ -119,6 +116,7 @@ set(test_sources ${test_sources} ContractionProblem_test.cpp ContractionSelectionLibrary_test.cpp ContractionFitness_test.cpp + MultipleSolutionsPerSize_test.cpp DataTypes_test.cpp EmbeddedData_test.cpp KernelArguments_test.cpp @@ -140,6 +138,7 @@ if(TENSILE_USE_LLVM) set(test_sources ${test_sources} ContractionLibraryLoading_test.cpp ContractionFitness_test.cpp + MultipleSolutionsPerSize_test.cpp llvm/ArithmeticUnitPredicate_test.cpp llvm/CUEfficiencyPredicate_test.cpp llvm/DeterministicModePredicate_test.cpp @@ -199,5 +198,5 @@ if(TENSILE_USE_HIP) endif() if(TENSILE_USE_OPENMP) - target_link_libraries(TensileTests PRIVATE "${OpenMP_EXE_LINKER_FLAGS}") + target_link_libraries(TensileTests PRIVATE custom_openmp_cxx) endif() diff --git a/HostLibraryTests/CachingLibrary_test.cpp b/HostLibraryTests/CachingLibrary_test.cpp index 88ed5c694..796ad962b 100644 --- a/HostLibraryTests/CachingLibrary_test.cpp +++ b/HostLibraryTests/CachingLibrary_test.cpp @@ -260,6 +260,8 @@ TEST(Hashing, Tuple2) TwoInts tup; size_t h = std::hash()(tup); + if(h) // Use the code to quiet the compiler. + return; } TEST(CachingLibrary, Simple) diff --git a/HostLibraryTests/MultipleSolutionsPerSize_test.cpp b/HostLibraryTests/MultipleSolutionsPerSize_test.cpp new file mode 100644 index 000000000..f4bb4d55c --- /dev/null +++ b/HostLibraryTests/MultipleSolutionsPerSize_test.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 2019-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef _OPENMP +#include +#endif + +TEST(MultipleSolutionsPerSize, ArithmeticUnit) +{ + using namespace Tensile; + + auto SolutionMFMA = std::make_shared(); + auto SolutionVALU = std::make_shared(); + + SolutionMFMA->problemPredicate + = std::make_shared(ArithmeticUnit::MFMA); + SolutionVALU->problemPredicate + = std::make_shared(ArithmeticUnit::VALU); + + SolutionMFMA->index = 0; + SolutionVALU->index = 1; + + SolutionMap map({{0, SolutionMFMA}, {1, SolutionVALU}}); + + auto LibraryMFMA = std::make_shared(SolutionMFMA); + auto LibraryVALU = std::make_shared(SolutionVALU); + + AMDGPU gpu; + + auto Problem_Size3 = ContractionProblem::GEMM(false, false, 3, 3, 3, 3, 3, 3, 1.2, false, 1); + auto Problem_Size5 = ContractionProblem::GEMM(false, false, 5, 5, 5, 5, 5, 5, 1.2, false, 1); + auto Problem_Size7 = ContractionProblem::GEMM(false, false, 7, 7, 7, 7, 7, 7, 1.2, false, 1); + auto Problem_Size9 = ContractionProblem::GEMM(false, false, 9, 9, 9, 9, 9, 9, 1.2, false, 1); + + using Key = std::array; + using Table + = Matching::DistanceMatchingTable>, + std::shared_ptr, + Matching::EuclideanDistance>; + using Properties = std::vector>>; + + Properties properties; + + { + auto freeSizeA = std::make_shared(); + freeSizeA->index = 0; + properties.push_back(freeSizeA); + auto freeSizeB = std::make_shared(); + freeSizeB->index = 0; + properties.push_back(freeSizeB); + auto batchSize = std::make_shared(); + batchSize->index = 0; + properties.push_back(batchSize); + auto boundSize = std::make_shared(); + boundSize->index = 0; + properties.push_back(boundSize); + } + + std::shared_ptr matchingTable = std::make_shared
(properties); + + using Entry + = Matching::MatchingTableEntry>>; + + std::vector table; + + { + Entry map0{{4, 4, 1, 4}, LibraryMFMA, 2.0}; + table.push_back(map0); + Entry map1{{4, 4, 1, 4}, LibraryVALU, 1.0}; + table.push_back(map1); + Entry map2{{8, 8, 1, 8}, LibraryVALU, 2.0}; + table.push_back(map2); + Entry map3{{8, 8, 1, 8}, LibraryMFMA, 1.0}; + table.push_back(map3); + } + + matchingTable->table = table; + + ProblemMatchingLibrary lib; + + lib.table = matchingTable; + + auto theSolution0 = lib.findBestSolution(Problem_Size3, gpu); + EXPECT_EQ(theSolution0, SolutionMFMA); + auto theSolution1 = lib.findBestSolution(Problem_Size5, gpu); + EXPECT_EQ(theSolution1, SolutionMFMA); + + auto theSolution2 = lib.findBestSolution(Problem_Size7, gpu); + EXPECT_EQ(theSolution2, SolutionVALU); + auto theSolution3 = lib.findBestSolution(Problem_Size9, gpu); + EXPECT_EQ(theSolution3, SolutionVALU); +} diff --git a/HostLibraryTests/TestData_test.cpp b/HostLibraryTests/TestData_test.cpp index ca55e37f3..304da6068 100644 --- a/HostLibraryTests/TestData_test.cpp +++ b/HostLibraryTests/TestData_test.cpp @@ -34,6 +34,7 @@ TEST(TestData, Simple) EXPECT_TRUE(static_cast(data)); +#if defined(TENSILE_MSGPACK) || defined(TENSILE_LLVM) auto is_regular_file = static_cast(boost::filesystem::is_regular_file); @@ -55,6 +56,7 @@ TEST(TestData, Simple) std::cout << file << std::endl; EXPECT_PRED1(is_regular_file, file); } +#endif if(TestData::Env("TENSILE_NEVER_SET_THIS_AKDJFLKDSJ")) FAIL() << "TestData object constructed with unset environment variable " diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsLite.dat.gz b/HostLibraryTests/configs/SolutionLibraries/KernelsLite.dat.gz index 639348486..bb36540c7 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsLite.dat.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsLite.dat.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsLite.yaml.gz 
b/HostLibraryTests/configs/SolutionLibraries/KernelsLite.yaml.gz index 2f0429fd2..f691bf8c9 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsLite.yaml.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsLite.yaml.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.dat.gz b/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.dat.gz index f8acfdbe9..3d664c8dc 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.dat.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.dat.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.yaml.gz b/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.yaml.gz index 85895f6cf..dfed30b69 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.yaml.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsLiteMixed.yaml.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.dat.gz b/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.dat.gz index 17b3bfac7..57639e871 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.dat.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.dat.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.yaml.gz b/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.yaml.gz index e295ffd8f..30d788ea0 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.yaml.gz and b/HostLibraryTests/configs/SolutionLibraries/KernelsTileLite.yaml.gz differ diff --git a/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.dat.gz b/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.dat.gz index b9f2538e8..576be6a13 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.dat.gz and b/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.dat.gz differ diff --git 
a/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.yaml.gz b/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.yaml.gz index 552d6951d..5d5d78fa0 100644 Binary files a/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.yaml.gz and b/HostLibraryTests/configs/SolutionLibraries/rocBLAS_Full.yaml.gz differ diff --git a/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bjlk_SB.yaml b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 000000000..03fa17623 --- /dev/null +++ b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,947 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + 
AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + 
NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_TT4_4_WG8_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + 
StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + 
GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + 
Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_TT8_4_WG8_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: 
{0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + 
NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_TT4_4_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + 
StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + 
GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + 
Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_TT8_8_WG16_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [96, 128, 1, 128, 96, 96, 96, 128] + - [0, 625.0] + - - [192, 256, 1, 256, 192, 192, 192, 256] + - [2, 3099.0] + - - [384, 512, 1, 512, 384, 384, 384, 512] + - [1, 7594.0] + - - [768, 1024, 1, 
1024, 768, 768, 768, 1024] + - [3, 11640.0] +- null diff --git a/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bljk_SB.yaml b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 000000000..5d9b2e10b --- /dev/null +++ b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,947 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + 
DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + 
OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_TT8_8_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + 
UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + 
LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + 
NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_TT4_4_WG8_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + 
DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + 
OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_TT4_4_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 
0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 
0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: 
[] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SE_TT8_4_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [96, 128, 1, 128, 96, 96, 96, 128] + - [1, 584.0] + - - [192, 256, 1, 256, 192, 192, 192, 256] + - [2, 2982.0] + - - [384, 512, 1, 512, 384, 384, 384, 512] + - [3, 7156.0] + - - [768, 1024, 1, 1024, 768, 768, 768, 1024] + - [0, 10760.0] +- null diff --git a/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bjlk_SB.yaml b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 000000000..40ff65bbe --- /dev/null +++ 
b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,726 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + 
GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: 
-1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SE_TT4_8_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + 
WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + 
LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + 
TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x16_SE_TT4_4_WG8_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + 
GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + 
PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SE_TT8_8_WG16_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 
+ WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [96, 128, 1, 128, 96, 96, 128, 128] + - [1, 606.0] + - - [192, 256, 1, 256, 192, 192, 256, 256] + - [1, 2979.0] + - - [384, 512, 1, 512, 384, 384, 512, 512] + - [0, 7159.0] + - - [768, 1024, 1, 1024, 768, 768, 1024, 1024] + - [2, 11133.0] +- null diff --git a/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bljk_SB.yaml b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 000000000..c00468794 --- /dev/null +++ b/HostLibraryTests/configs/lite_configs/navi21_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,947 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + 
AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 
64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + 
ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_TT4_8_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + 
GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + 
ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_TT4_4_WG8_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + 
AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + 
MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + 
ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_TT4_4_WG16_8_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + 
GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + 
AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_TT8_8_WG16_16_1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 
3, 0, 1] +- - - [96, 128, 1, 128, 96, 96, 128, 128] + - [1, 568.0] + - - [192, 256, 1, 256, 192, 192, 256, 256] + - [2, 2757.0] + - - [384, 512, 1, 512, 384, 384, 512, 512] + - [0, 6759.0] + - - [768, 1024, 1, 1024, 768, 768, 1024, 1024] + - [3, 10013.0] +- null diff --git a/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bjlk_SB.yaml b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 000000000..d98146924 --- /dev/null +++ b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,278 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + 
AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + 
NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + 
SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 625.0] +- null diff --git a/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bljk_SB.yaml b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 000000000..a7d49fb9c --- /dev/null +++ b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,278 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: 
false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + 
LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + 
TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 503.0] +- null diff --git a/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bjlk_SB.yaml b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 000000000..ae110c6f9 --- /dev/null +++ b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,278 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + 
IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + 
InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + 
IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 505.0] +- null diff --git a/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bljk_SB.yaml b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 000000000..0186ecb5b --- /dev/null +++ 
b/HostLibraryTests/configs/lite_configs_mixed/navi21_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,278 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 
+ GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + 
PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: 
[16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 418.0] +- null diff --git a/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bjlk_SB.yaml b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 000000000..2af3c446b --- /dev/null +++ b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,725 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 
1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 959.0, '128': 687.0, '16192': 1012.0, '2048': 987.0, '256': 821.0, + '32': 348.0, '4096': 1000.0, '512': 908.0, '64': 518.0, '8192': 1008.0, '96': 623.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 
+ MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: 
Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + 
GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 660.0, '128': 515.0, '16192': 686.0, '2048': 674.0, '256': 588.0, + '32': 277.0, '4096': 680.0, '512': 635.0, '64': 403.0, '8192': 684.0, '96': 473.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + 
PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_TT8_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + 
_VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 659.0, '128': 512.0, '16192': 686.0, '2048': 673.0, '256': 587.0, + '32': 273.0, '4096': 680.0, '512': 633.0, '64': 400.0, '8192': 684.0, '96': 471.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + 
LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true 
+ TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_TT4_8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 674.0] +- null +- TileSelectionIndices: [0, 1, 2] diff --git a/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bljk_SB.yaml b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 000000000..03f0f51f8 --- /dev/null +++ b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,725 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + 
Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + 
GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 888.0, '128': 652.0, '16192': 934.0, '2048': 912.0, '256': 769.0, + '32': 337.0, '4096': 923.0, '512': 846.0, '64': 497.0, '8192': 930.0, '96': 595.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + 
AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 
0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 626.0, '128': 494.0, '16192': 649.0, '2048': 638.0, '256': 563.0, + '32': 271.0, '4096': 644.0, '512': 604.0, '64': 393.0, '8192': 647.0, '96': 457.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 
+ LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + 
TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_TT8_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: 
true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 595.0, '128': 473.0, '16192': 616.0, '2048': 606.0, '256': 534.0, + '32': 265.0, '4096': 611.0, '512': 573.0, '64': 377.0, '8192': 614.0, '96': 439.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: 
[0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_TT4_8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + 
WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 637.0] +- null +- TileSelectionIndices: [0, 1, 2] diff --git a/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bjlk_SB.yaml b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 000000000..c605f0fab --- /dev/null +++ b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,725 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: 
{0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 889.0, '128': 655.0, '16192': 933.0, '2048': 912.0, '256': 770.0, + '32': 345.0, '4096': 924.0, '512': 846.0, '64': 501.0, '8192': 930.0, '96': 593.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + 
LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + 
ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + 
GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 626.0, '128': 493.0, '16192': 649.0, '2048': 638.0, '256': 561.0, + '32': 269.0, '4096': 644.0, '512': 602.0, '64': 388.0, '8192': 647.0, '96': 456.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + 
PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SE_TT4_8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + 
WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 0] + Ideals: {'1024': 595.0, '128': 475.0, '16192': 616.0, '2048': 606.0, '256': 537.0, + '32': 269.0, '4096': 611.0, '512': 575.0, '64': 382.0, '8192': 614.0, '96': 443.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + 
LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + 
OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SE_TT8_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 640.0] +- null +- TileSelectionIndices: [0, 1, 2] diff --git a/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bljk_SB.yaml b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 000000000..54c8a9020 --- /dev/null +++ b/HostLibraryTests/configs/tile_aware_selection/navi21_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,725 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi21 +- gfx1030 +- [Device 73a2] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + 
ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + 
GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 827.0, '128': 621.0, '16192': 866.0, '2048': 848.0, '256': 724.0, + '32': 333.0, '4096': 857.0, '512': 788.0, '64': 479.0, '8192': 844.0, '96': 566.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: 
true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_TT4_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 
+ _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 568.0, '128': 456.0, '16192': 586.0, '2048': 578.0, '256': 514.0, + '32': 264.0, '4096': 582.0, '512': 549.0, '64': 372.0, '8192': 585.0, '96': 428.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: 
true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + 
Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_TT8_4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + 
GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 0] + Ideals: {'1024': 567.0, '128': 453.0, '16192': 586.0, '2048': 577.0, '256': 513.0, + '32': 259.0, '4096': 582.0, '512': 548.0, '64': 368.0, '8192': 585.0, '96': 426.0} + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 
2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: true + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_TT4_8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + 
Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 609.0] +- null +- TileSelectionIndices: [0, 1, 2] diff --git a/HostLibraryTests/hip/RunGEMMKernel_test.cpp b/HostLibraryTests/hip/RunGEMMKernel_test.cpp index e4162937f..b772f362b 100644 --- a/HostLibraryTests/hip/RunGEMMKernel_test.cpp +++ b/HostLibraryTests/hip/RunGEMMKernel_test.cpp @@ -785,7 +785,7 @@ std::vector TestProblems() std::vector>, std::shared_ptr, bool>> - TestLibraries() + TestLibraries_Impl() { bool debug = Debug::Instance().printKernelArguments(); @@ -847,13 +847,15 @@ std::vector>, { auto library = LoadLibraryFile(envDir.file("TensileLibrary").native()); auto adapter = std::make_shared(debug, "TENSILE_TEST_LIBRARY"); + auto device = std::dynamic_pointer_cast(Tensile::hip::GetCurrentDevice()); + auto arch = device->processor; - for(auto file : envDir.glob("*.co")) + for(auto file : envDir.glob(concatenate("*-", arch, ".co"))) { adapter->loadCodeObjectFile(file.native()); } - for(auto file : envDir.glob("*.hsaco")) + for(auto file : envDir.glob(concatenate("*-", arch, ".hsaco"))) { try { @@ -870,6 +872,16 @@ std::vector>, return rv; } +// Prevent the libraries from being loaded twice. 
+std::vector>, + std::shared_ptr, + bool>> + TestLibraries() +{ + static auto rv = TestLibraries_Impl(); + return rv; +} + std::vector TestMemoryAlignments() { return std::vector{MemoryPageAlignment::BEGIN, MemoryPageAlignment::END}; diff --git a/HostLibraryTests/llvm/LLVMYAMLContraction_test.cpp b/HostLibraryTests/llvm/LLVMYAMLContraction_test.cpp index f21d49b76..adcc7ac22 100644 --- a/HostLibraryTests/llvm/LLVMYAMLContraction_test.cpp +++ b/HostLibraryTests/llvm/LLVMYAMLContraction_test.cpp @@ -79,7 +79,7 @@ TEST(LLVMYAMLContractionTest, Predicate) std::string mydoc = "type: And\n" "value: [{type: TruePred}, \n" " {type: Not, value: {type: FalsePred}},\n" - " {type: FreeSizeAMultiple, index: 0, value: 2}]"; + " {type: Free0SizeMultiple, index: 0, value: 2}]"; llvm::yaml::Input yin(mydoc); std::shared_ptr> p; @@ -136,7 +136,7 @@ TEST(LLVMYAMLContractionTest, ContractionLibrary) " library:\n" " type: Problem\n" " rows:\n" - " - predicate: { type: FreeSizeAMultiple, " + " - predicate: { type: Free0SizeMultiple, " "index: 0, value: 2 }\n" " library: { type: Single, index: 0 }\n" ""; diff --git a/HostLibraryTests/testlib/CMakeLists.txt b/HostLibraryTests/testlib/CMakeLists.txt index a31f0ec73..c3617da98 100644 --- a/HostLibraryTests/testlib/CMakeLists.txt +++ b/HostLibraryTests/testlib/CMakeLists.txt @@ -26,4 +26,7 @@ add_library(TensileTestLib STATIC ${testlib_sources}) target_include_directories(TensileTestLib PUBLIC include) target_link_libraries(TensileTestLib PUBLIC Boost::filesystem TensileHost) +if(TENSILE_USE_OPENMP) + target_link_libraries(TensileTestLib PRIVATE custom_openmp_cxx) +endif() diff --git a/HostLibraryTests/testlib/source/TestData.cpp b/HostLibraryTests/testlib/source/TestData.cpp index 7393c2997..6674dbc4d 100644 --- a/HostLibraryTests/testlib/source/TestData.cpp +++ b/HostLibraryTests/testlib/source/TestData.cpp @@ -124,6 +124,12 @@ boost::filesystem::path TestData::ProgramLocation() TestData::TestData() : 
m_dataDir(ProgramLocation().parent_path() / "data") { + if(!boost::filesystem::is_directory(m_dataDir)) + { + auto newValue = ProgramLocation().parent_path().parent_path() / "data"; + if(boost::filesystem::is_directory(newValue)) + m_dataDir = newValue; + } } TestData::TestData(std::string const& dataDir) diff --git a/Tensile/AsmUtils.py b/Tensile/AsmUtils.py index 63a1a808d..f07ef8661 100644 --- a/Tensile/AsmUtils.py +++ b/Tensile/AsmUtils.py @@ -132,7 +132,7 @@ def vectorStaticDivideAndRemainder(qReg, rReg, dReg, divisor, tmpVgpr, tmpSgpr, if doRemainder: kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(divisor), rComment) kStr += inst("v_mul_lo_u32", vgpr(tmpVgpr), vgpr(qReg), sgpr(tmpSgpr), rComment) - kStr += inst("_v_sub_co_u32", vgpr(rReg), "vcc", vgpr(dReg), vgpr(tmpVgpr), rComment) + kStr += inst("_v_sub_u32", vgpr(rReg), vgpr(dReg), vgpr(tmpVgpr), rComment) return kStr def vectorStaticDivide(qReg, dReg, divisor, tmpVgpr, tmpSgpr, comment=""): @@ -172,7 +172,7 @@ def vectorStaticRemainder(qReg, rReg, dReg, divisor, tmpVgpr, tmpSgpr, comment=" kStr += inst("v_mov_b32", vgpr(qReg), vgpr(tmpVgpr), comment) kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(divisor), comment) kStr += inst("v_mul_lo_u32", vgpr(tmpVgpr), vgpr(qReg), sgpr(tmpSgpr), comment) - kStr += inst("_v_sub_co_u32", vgpr(rReg), "vcc", vgpr(dReg), vgpr(tmpVgpr), comment) + kStr += inst("_v_sub_u32", vgpr(rReg), vgpr(dReg), vgpr(tmpVgpr), comment) return kStr # only used for loop unroll and GlobalSplitU diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index a610380a9..a509d2420 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -1,5 +1,5 @@ ################################################################################ -# Copyright 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -551,12 +551,11 @@ def writeBenchmarkFiles(stepBaseDir, solutions, problemSizes, stepName, filesToC globalParameters["WorkingPath"], globalParameters["CxxCompiler"], [problemType], solutions, kernels, kernelHelperOjbs, \ solutionWriter, kernelWriterSource, kernelWriterAssembly, errorTolerant=True ) - newLibraryFilename = "TensileLibrary.yaml" if globalParameters["LibraryFormat"] == "yaml" else "TensileLibrary.dat" newLibraryDir = ensurePath(os.path.join(globalParameters["WorkingPath"], 'library')) - newLibraryFile = os.path.join(newLibraryDir, newLibraryFilename) + newLibraryFile = os.path.join(newLibraryDir, "TensileLibrary") newLibrary = SolutionLibrary.MasterSolutionLibrary.BenchmarkingLibrary(solutions) newLibrary.applyNaming(kernelMinNaming) - LibraryIO.configWriter(globalParameters["LibraryFormat"]).write(newLibraryFile, Utils.state(newLibrary)) + LibraryIO.write(newLibraryFile, Utils.state(newLibrary), globalParameters["LibraryFormat"]) codeObjectFiles = [os.path.relpath(f, globalParameters["WorkingPath"]) for f in codeObjectFiles] diff --git a/Tensile/ClientExecutable.py b/Tensile/ClientExecutable.py index 9e187261e..f050dec6f 100644 --- a/Tensile/ClientExecutable.py +++ b/Tensile/ClientExecutable.py @@ -59,6 +59,8 @@ def clientExecutableEnvironment(builddir=None): options = {'CMAKE_BUILD_TYPE': globalParameters["CMakeBuildType"], 'TENSILE_NEW_CLIENT': 'ON', + 'TENSILE_USE_MSGPACK': 'ON', + 'TENSILE_USE_LLVM': 'ON', 'Tensile_LIBRARY_FORMAT': globalParameters["LibraryFormat"], 'CMAKE_CXX_COMPILER': os.path.join(globalParameters["ROCmBinPath"], globalParameters['CxxCompiler'])} diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py index 9987cd6d3..172985bb3 100644 --- a/Tensile/ClientWriter.py +++ b/Tensile/ClientWriter.py @@ -130,7 +130,7 @@ def main( config ): for logicFileName in 
logicFiles: (scheduleName, deviceNames, problemType, solutionsForType, \ indexOrder, exactLogic, rangeLogic, newLibrary, architectureName) \ - = LibraryIO.readLibraryLogicForSchedule(logicFileName) + = LibraryIO.parseLibraryLogicFile(logicFileName) if problemType["DataType"].isHalf(): enableHalf = True functions.append((scheduleName, problemType)) @@ -670,7 +670,11 @@ def param(key, value): if globalParameters["PrintTensorD"]: param("print-tensor-d", 1) if globalParameters["PrintTensorRef"]: - param("print-tensor-ref", 1) + param("print-tensor-ref", 1) + if globalParameters["DumpTensors"]: + param("dump-tensors", 1) + if globalParameters["ExitOnFails"] > 1: + param("exit-on-error", 1) param("bounds-check", boundsCheckName(int(globalParameters["BoundsCheck"]))) param("print-valids", globalParameters["ValidationPrintValids"]) @@ -695,6 +699,10 @@ def param(key, value): param("log-level", ClientLogLevel(globalParameters["ClientLogLevel"]).name) param("max-workspace-size", globalParameters["MaxWorkspaceSize"]) param("granularity-threshold", globalParameters["GranularityThreshold"]) + param("pristine-on-gpu", globalParameters["PristineOnGPU"]) + + param("library-update-file", globalParameters["LibraryUpdateFile"]) + param("library-update-comment", globalParameters["LibraryUpdateComment"]) def writeClientConfig(forBenchmark, solutions, problemSizes, stepName, stepBaseDir, newLibrary, codeObjectFiles, tileAwareSelection, configBase = "ClientParameters", libraryFile = None): @@ -734,7 +742,7 @@ def CreateBenchmarkClientParametersForSizes(libraryRootPath, problemSizes, dataF metaDataFilePath = os.path.join(libraryPath, "metadata.yaml") if not os.path.exists(metaDataFilePath): printExit ("meta data file %s does not exist" % metaDataFilePath) - metaData = LibraryIO.readConfig(metaDataFilePath) + metaData = LibraryIO.readYAML(metaDataFilePath) problemTypeDict = metaData["ProblemType"] problemType = ContractionsProblemType.FromOriginalState(problemTypeDict) diff --git 
a/Tensile/Code.py b/Tensile/Code.py index c4d6939cf..a637dda84 100644 --- a/Tensile/Code.py +++ b/Tensile/Code.py @@ -310,7 +310,7 @@ def __init__(self, version,lgkmcnt=-1,vmcnt=-1,comment=""): self.version = version self.lgkmcnt = lgkmcnt self.vmcnt = vmcnt - self.comment = comment + self.comment = "lgkmcnt={} vmcnt={}".format(lgkmcnt, vmcnt) + comment # let this derived class play nicely with Module.prettyPrint() self.__dict__.update(self.instructions().__dict__) @@ -387,7 +387,7 @@ def __str__(self): numOfRowsperMfma = 1 numOfRowInsts = self.kernel["ThreadTile0"]/numOfRowsperMfma #numOfColInsts = kernel["ThreadTile1"]/kernel["MatrixInstN"] - numOfDstRgs = (self.kernel["MatrixInstN"] * self.kernel["MatrixInstM"] * self.kernel["MatrixInstB"] // globalParameters["WavefrontWidth"]) + numOfDstRgs = (self.kernel["MatrixInstN"] * self.kernel["MatrixInstM"] * self.kernel["MatrixInstB"] // self.kernel["WavefrontSize"]) if self.kernel["ProblemType"]["DataType"].isSingle(): for iui in range(0, self.innerUnroll): cStr = "a[(%u+%u*%u)*%u):((((%u+%u*%u)*%u)+%u)-1)]" % (self.aIdx,self.bIdx,numOfRowInsts,numOfDstRgs,self.aIdx,numOfDstRgs,self.bIdx,numOfRowInsts,numOfDstRgs) @@ -658,7 +658,7 @@ def __str__(self): % ("vgprValuB_X%u_I%u"%(self.PLR,iui), self.bIdx) #if a==0 and b==0: # kStr += dump(aStr) - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) ##if macIdx == self.kernel["PerformanceWaitLocation"]: ## kStr += "s_waitcnt lgkmcnt(%u) // extra wait for performance%s" \ ## % (self.kernel["PerformanceWaitCount"], self.endLine) @@ -737,7 +737,8 @@ class SrdUpperFields10XX(BitfieldStructure): ("index_stride", ctypes.c_uint, 2), ("add_tid_enable", ctypes.c_uint, 1), ("resource_level", ctypes.c_uint, 1), - ("_unusedB", ctypes.c_uint, 3), + ("_unusedB", ctypes.c_uint, 1), + ("LLC_noalloc", ctypes.c_uint, 2), ("oob_select", ctypes.c_uint, 2), ("type", ctypes.c_uint, 2)] diff --git 
a/Tensile/Common.py b/Tensile/Common.py index 81e4696d6..fc32f6425 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -23,7 +23,6 @@ from . import Parallel from collections import OrderedDict from copy import deepcopy -from subprocess import Popen, PIPE import math @@ -96,7 +95,7 @@ globalParameters["GenerateSourcesAndExit"] = False # Exit after kernel source generation. globalParameters["ShowProgressBar"] = True # if False and library client already built, then building library client will be skipped when tensile is re-run globalParameters["WavefrontWidth"] = 64 # if False and library client already built, then building library client will be skipped when tensile is re-run -globalParameters["ExitOnFails"] = 1 # Exit if failures detected. +globalParameters["ExitOnFails"] = 1 # 1: Exit after benchmark run if failures detected. 2: Exit during benchmark run. globalParameters["CpuThreads"] = -1 # How many CPU threads to use for kernel generation. 0=no threading, -1 == nproc, N=min(nproc,N). TODO - 0 sometimes fails with a kernel name error? 0 does not check error codes correctly # FROM MERGE #globalParameters["CpuThreads"] = -4 # How many CPU threads to use for kernel generation. 0=no threading, <0 == nproc*abs(CpuThreads), N=min(nproc,N) @@ -181,12 +180,11 @@ globalParameters["PrintIndexAssignments"] = 0 # Print the tensor index assignment info globalParameters["PrintWinnersOnly"] = False # Only print the solutions which become the fastest globalParameters["PrintCodeCommands"] = False # print the commands used to generate the code objects (asm,link,hip-clang, etc) +globalParameters["DumpTensors"] = False # If True, dump tensors to binary files instead of printing them. # TODO - remove this when NewClient is mainstream globalParameters["OldClientSourceTmp"] = True # Use an intermediate sourceTmp dir to detect file changes and minimize rebuilds on old client -# PrintMaxCols applies to dimensions where multiple cols are printed per line. 
-# PrintMaxRows applies to dimensions where one row is printed per line # If PrintMax* is greater than the dimension, the middle elements will be repaced with "..." @@ -200,17 +198,20 @@ globalParameters["MaxDepthU"] = 256 # max DepthU value to allow globalParameters["ShortNames"] = False # on windows kernel names can get too long; =True will convert solution/kernel names to serial ids globalParameters["MergeFiles"] = True # F=store every solution and kernel in separate file; T=store all solutions in single file -globalParameters["MaxFileName"] = 128 # If a file name would be longer than this, shorten it with a hash. -globalParameters["SupportedISA"] = [(8,0,3), (9,0,0), (9,0,6), (9,0,8), (9,0,10), (10,1,0), (10,1,1)] # assembly kernels writer supports these architectures + +globalParameters["MaxFileName"] = 64 # If a file name would be longer than this, shorten it with a hash. +globalParameters["SupportedISA"] = [(8,0,3), (9,0,0), (9,0,6), (9,0,8), (9,0,10), (10,1,0), (10,1,1), (10,1,2), (10,3,0)] # assembly kernels writer supports these architectures + globalParameters["GenerateManifestAndExit"] = False # Output manifest file with list of expected library objects and exit -globalParameters["ClientBuildPath"] = "0_Build" # subdirectory for host code build directory. 
+globalParameters["ClientBuildPath"] = "0_Build" # subdirectory for host code build directory globalParameters["NewClient"] = 2 # 1=Run old+new client, 2=run new client only (All In) globalParameters["BenchmarkProblemsPath"] = "1_BenchmarkProblems" # subdirectory for benchmarking phases globalParameters["BenchmarkDataPath"] = "2_BenchmarkData" # subdirectory for storing final benchmarking data globalParameters["LibraryLogicPath"] = "3_LibraryLogic" # subdirectory for library logic produced by analysis globalParameters["LibraryClientPath"] = "4_LibraryClient" # subdirectory for building example library client -globalParameters["BenchmarkClientVersion"] = "Both" # Old, New, Both -globalParameters["ClientExecutionLockPath"] = None # Path for a file lock to ensure only one client is executed at once. filelock module is required if this is enabled. +globalParameters["ClientExecutionLockPath"] = None # Path for a file lock to ensure only one client is executed at once. filelock module is required if this is enabled. 
+globalParameters["LibraryUpdateFile"] = "" # File name for writing indices and speeds suitable for updating an existing library logic file +globalParameters["LibraryUpdateComment"] = False # Include solution name as a comment in the library update file # internal, i.e., gets set during startup globalParameters["CurrentISA"] = (0,0,0) @@ -253,6 +254,8 @@ # control if a solution is run for a given problem globalParameters["GranularityThreshold"] = 0.0 +globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile training verification or not + # Save a copy - since pytest doesn't re-run this initialization code and YAML files can override global settings - odd things can happen defaultGlobalParameters = deepcopy(globalParameters) @@ -261,7 +264,8 @@ 'all':'_','gfx000':'none', 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx906':'vega20', 'gfx906:xnack+':'vega20', 'gfx906:xnack-':'vega20', 'gfx908':'arcturus','gfx908:xnack+':'arcturus', 'gfx908:xnack-':'arcturus', - 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran' + 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran', + 'gfx1010':'navi10', 'gfx1011':'navi11', 'gfx1012':'navi12', 'gfx1030':'navi21' } def getArchitectureName(gfxName): @@ -292,7 +296,7 @@ def getArchitectureName(gfxName): validThreadTiles.append([i, j]) validActivationFormats = ('NCHW', 'NHWC', 'CNHW', 'NCDHW', 'NDHWC', 'CNDHW') -validWeightFormats = ('KCYX', "CKYX", "CYXK", 'KCZYX', 'CKZYX', 'CZYXK') +validWeightFormats = ('KCYX', "KYXC", "CKYX", "CYXK", 'KCZYX', 'CKZYX', 'CZYXK') validMacroTileSides = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 6, 12, 24, 48, 96, 192, 384, 768 ] validMacroTiles = [] validISA = [(0,0,0)] @@ -760,6 +764,10 @@ def getArchitectureName(gfxName): "ThreadTile": validThreadTiles, "MacroTile": validMacroTiles, # MT0 = wg0*tt0, MT1 = wg1*tt1 + # Which instruction to use for MAC: MAD or FMA + "MACInstruction": ["MAD", "FMA"], + "WavefrontSize": [32, 64], + #
MatrixInstruction: (M x N x K x B) # XDLOPS tile definition, only valid for gfx908, gfx90a # MxNxKxB specifies matrix instruction variants @@ -800,6 +808,9 @@ def getArchitectureName(gfxName): # -1: Use dwordx2 if support SRVW, or set SRVW to 0 "StoreRemapVectorWidth": [-1,0,1,2,4,8], + # SourceSwap: Optimizes MatrixInstruction store pattern by swapping mfma input order. + "SourceSwap": [False, True], + # Disable overlapping AB-tile vgpr and read/write addr vgprs with C-tile vgprs # Valid only for MatrixInstruction enabled kernels, which by default overlaps # C-tile w/ AB-tile until it's due for v_accvgpr_read before the writeback. Illustrated below: @@ -1126,6 +1137,8 @@ def getArchitectureName(gfxName): {"WorkGroupMappingType": [ "B" ] }, {"WorkGroupMapping": [ 8 ] }, {"ThreadTile": [ [4,4] ] }, + {"MACInstruction": [ '' ]}, + {"WavefrontSize": [ 64 ]}, {"MatrixInstruction": [ [] ] }, {"DisableVgprOverlapping": [ False ] }, {"1LDSBuffer": [ 0 ] }, @@ -1143,6 +1156,7 @@ def getArchitectureName(gfxName): {"MinVgprNumber": [0]}, {"MaxVgprNumber": [256]}, {"StoreRemapVectorWidth": [ 0 ] }, + {"SourceSwap": [ False ] }, ] # benchmark these solution independently defaultForkParameters = [] @@ -1480,16 +1494,26 @@ def GetAsmCaps(isaVersion): rv["HasMFMA_bf16_1k"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x4bf16_1k a[0:31], v[32:33], v[36:37], a[0:31]") rv["v_mac_f16"] = tryAssembler(isaVersion, "v_mac_f16 v47, v36, v34") + rv["v_fma_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0,0]") + rv["v_fmac_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34") + rv["v_pk_fma_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0]") + rv["v_pk_fmac_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34") + rv["v_mad_mix_f32"] = tryAssembler(isaVersion, "v_mad_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]") rv["v_fma_mix_f32"] = tryAssembler(isaVersion, "v_fma_mix_f32 v47, v36, v34, v47, 
op_sel:[0,0,0] op_sel_hi:[1,1,0]") rv["v_dot2_f32_f16"] = tryAssembler(isaVersion, "v_dot2_f32_f16 v20, v36, v34, v20") rv["v_dot2c_f32_f16"] = tryAssembler(isaVersion, "v_dot2c_f32_f16 v47, v36, v34") + rv["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22") + rv["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23") + rv["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22") + rv["HasAtomicAdd"] = tryAssembler(isaVersion, "buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:0") + if tryAssembler(isaVersion, "s_waitcnt vmcnt(63)"): rv["MaxVmcnt"] = 63 elif tryAssembler(isaVersion, "s_waitcnt vmcnt(15)"): @@ -1604,7 +1628,7 @@ def capRow(caps, cap): return [cap] + [('1' if cap in caps[arch] and caps[arch][cap] else '0') for arch in archs] allAsmCaps = set(itertools.chain(*[caps.keys() for arch, caps in parameters["AsmCaps"].items()])) - allAsmCaps = sorted(allAsmCaps) + allAsmCaps = sorted(allAsmCaps, key=lambda k: (k.split("_")[-1], k)) asmCapRows = [capRow(parameters["AsmCaps"], cap) for cap in allAsmCaps] allArchCaps = set(itertools.chain(*[caps.keys() for arch, caps in parameters["ArchCaps"].items()])) @@ -1673,19 +1697,18 @@ def assignGlobalParameters( config ): # read current gfx version if os.name != "nt" and globalParameters["CurrentISA"] == (0,0,0) and globalParameters["ROCmAgentEnumeratorPath"]: - process = Popen([globalParameters["ROCmAgentEnumeratorPath"], "-t", "GPU"], stdout=PIPE) - line = process.stdout.readline().decode() - while line != "": + command = [globalParameters["ROCmAgentEnumeratorPath"]]#, "-t", "GPU"] + result = subprocess.run(command, stdout=subprocess.PIPE) + for line in result.stdout.decode().split("\n"): arch = gfxArch(line.strip()) if arch is not None: if arch in globalParameters["SupportedISA"]: print1("# Detected local GPU with ISA: " + gfxName(arch)) globalParameters["CurrentISA"] = arch - line = process.stdout.readline().decode() if globalParameters["CurrentISA"] == (0,0,0): 
printWarning("Did not detect SupportedISA: %s; cannot benchmark assembly kernels." % globalParameters["SupportedISA"]) - if process.returncode: - printWarning("%s exited with code %u" % (globalParameters["ROCmAgentEnumeratorPath"], process.returncode)) + if result.returncode: + printWarning("%s exited with code %u" % (globalParameters["ROCmAgentEnumeratorPath"], result.returncode)) # TODO Remove this when rocm-smi supports gfx90a if globalParameters["CurrentISA"] == (9,0,10): @@ -1717,7 +1740,7 @@ def assignGlobalParameters( config ): output = subprocess.run(["hipcc", "--version"], check=True, stdout=subprocess.PIPE).stdout.decode() for line in output.split('\n'): - if 'HIP' in line: + if 'HIP version' in line: globalParameters['HipClangVersion'] = line.split()[2] print1("# Found hipcc version " + globalParameters['HipClangVersion']) @@ -1730,7 +1753,15 @@ def assignGlobalParameters( config ): printWarning("Global parameter %s = %s unrecognised." % ( key, value )) globalParameters[key] = value - +def setupRestoreClocks(): + import atexit + def restoreClocks(): + if globalParameters["PinClocks"]: + rsmi = globalParameters["ROCmSMIPath"] + subprocess.call([rsmi, "-d", "0", "--resetclocks"]) + subprocess.call([rsmi, "-d", "0", "--setfan", "50"]) + atexit.register(restoreClocks) +setupRestoreClocks() ################################################################################ # Assign Parameters @@ -1856,7 +1887,7 @@ def finish(self): pass # Append copyrights to all files generated by tensile since they belong to Tensile intellectual property CMakeHeader = """################################################################################ -# Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -1885,7 +1916,7 @@ def finish(self): pass """ CHeader = """/******************************************************************************* -* Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +* Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/Tensile/Components/ComputeStoreVgprs.py b/Tensile/Components/ComputeStoreVgprs.py index 9c1df64b5..19f9bd602 100644 --- a/Tensile/Components/ComputeStoreVgprs.py +++ b/Tensile/Components/ComputeStoreVgprs.py @@ -20,7 +20,6 @@ ################################################################################ from ..Component import ComputeStoreVgprs -from ..Common import globalParameters from ..AsmUtils import vectorStaticDivideAndRemainder, staticMultiply, vgpr, sgpr, inst, vectorStaticDivide, vectorStaticRemainder class ComputeStoreVgprsVALU(ComputeStoreVgprs): @@ -54,13 +53,14 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): tid0 = writer.vgprPool.checkOut(1, "tid0") tid1 = writer.vgprPool.checkOut(1, "tid1") + packedC1 = kernel["PackedC1IndicesX"] + if kernel["BufferStore"]: writer.cinRowPtr = writer.vgprPool.checkOut(1, "cinRowPtr") writer.coutRowPtr = writer.vgprPool.checkOut(1, "coutRowPtr") tmpV0 = writer.vgprPool.checkOutAligned(2,2) - kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", divisor, \ - tmpV0, tmpS0) + kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", divisor, tmpV0, tmpS0) kStr += staticMultiply(vgpr(tid0), vgpr(tid0), tid0Scale, sgpr(tmpS1)) if tid1Scale != 1: kStr += staticMultiply(vgpr(tid1), vgpr(tid1), tid1Scale, sgpr(tmpS1)) @@ -76,8 +76,6 @@ def __call__(self, writer, kernel, 
divisor, tid0Scale, tid1Scale): # TODO-packed # Eventually need to modify if supporting packed coord1, to start just assert if that case is detected #-- - packedC1 = kernel["PackedC1IndicesX"] - assert (len(packedC1) == 1) # would need to extract/scale indices from coord1 strideC1 = "StrideC%s" % (writer.indexChars[packedC1[0]]) kStr += inst("v_mul_lo_u32", vgpr(writer.cinRowPtr), vgpr(tid1), sgpr(strideC1), \ @@ -104,7 +102,7 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): # coord = tid*VW + workgroup offset kStr += inst("_v_add_co_u32", \ vgpr(tid0), \ - "vcc", \ + writer.vcc, \ sgpr(tmpS0), \ vgpr(tid0), \ "coord0 = tid0*VW + wg0*MT0") @@ -115,18 +113,22 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): "<- wg1*MT1") kStr += inst("_v_add_co_u32", \ vgpr(tid1), \ - "vcc", \ + writer.vcc, \ sgpr(wgMT1), \ vgpr(tid1), \ "coord1 = tid1*VW + wg1*MT1") + if len(packedC1) > 1: + kStr += writer.extractPackedCoord1ToRowStart(kernel, packedC1, tid1, 'D') + writer.coord0 = tid0 writer.coord1 = tid1 return kStr class ComputeStoreVgprsMFMA(ComputeStoreVgprs): - kernel = {"EnableMatrixInstruction": True} + kernel = {"EnableMatrixInstruction": True, + "SourceSwap": False} """ computeStoreVgprs @@ -163,17 +165,17 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): kStr = "" # coord 1 : wave part - kStr += vectorStaticDivide(wave_id, "Serial", globalParameters["WavefrontWidth"], tmpVgpr1, tmpSgpr) + kStr += vectorStaticDivide(wave_id, "Serial", writer.kernel["WavefrontSize"], tmpVgpr1, tmpSgpr) kStr += vectorStaticDivide(tid1, wave_id, kernel["MIWaveGroup"][0], tmpVgpr1, tmpSgpr) kStr += inst("v_mul_lo_u32", vgpr(tid1), hex(MIBShape1), vgpr(tid1), "wave coordination offset 1") # coord 1 : thread part kStr += vectorStaticRemainder(dummy, tmpVgpr0, "Serial", kernel["MatrixInstN"], tmpVgpr1, tmpSgpr) - kStr += inst("v_add_u32", vgpr(tid1), vgpr(tmpVgpr0), vgpr(tid1), "coordination 1 = wave_id1 + tid1") + kStr += 
inst("_v_add_u32", vgpr(tid1), vgpr(tmpVgpr0), vgpr(tid1), "coordination 1 = wave_id1 + tid1") if kernel["MatrixInstM"] == 4: - remainder = globalParameters["WavefrontWidth"] + remainder = writer.kernel["WavefrontSize"] divisor = kernel["MatrixInstN"] * kernel["MatrixInstBM"] if kernel["ProblemType"]["DataType"].isDouble(): divisor *= 4 @@ -181,9 +183,9 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): remainder = 16 divisor = 8 kStr += vectorStaticRemainder(dummy, tmpVgpr0, "Serial", remainder, tmpVgpr1, tmpSgpr) - kStr += vectorStaticDivide(tmpVgpr0, tmpVgpr0, divisor, tmpVgpr1, tmpSgpr) + kStr += vectorStaticDivide(tmpVgpr0, tmpVgpr0, divisor, tmpVgpr1, tmpSgpr) kStr += staticMultiply(vgpr(tmpVgpr0), vgpr(tmpVgpr0), kernel["MatrixInstN"], sgpr(tmpSgpr)) - kStr += inst("v_add_u32", vgpr(tid1), vgpr(tmpVgpr0), vgpr(tid1), "coordination 1 = wave_id1 + tid1") + kStr += inst("_v_add_u32", vgpr(tid1), vgpr(tmpVgpr0), vgpr(tid1), "coordination 1 = wave_id1 + tid1") # coord 1 : offset part packedC1 = kernel["PackedC1IndicesX"] @@ -199,17 +201,141 @@ def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): divisor = kernel["MatrixInstN"] * kernel["MatrixInstM"] kStr += vectorStaticDivide(tid0, "Serial", divisor, tmpVgpr1, tmpSgpr) kStr += vectorStaticRemainder(dummy, tid0, tid0, kernel["MatrixInstN"], tmpVgpr1, tmpSgpr) - kStr += inst("v_add_u32", vgpr(tmpVgpr0), vgpr(tmpVgpr0), vgpr(tid0), "WAAA") + kStr += inst("_v_add_u32", vgpr(tmpVgpr0), vgpr(tmpVgpr0), vgpr(tid0), "WAAA") # coord 0 : thread part - kStr += vectorStaticRemainder(dummy, tid0, "Serial", globalParameters["WavefrontWidth"], tmpVgpr1, tmpSgpr) + kStr += vectorStaticRemainder(dummy, tid0, "Serial", writer.kernel["WavefrontSize"], tmpVgpr1, tmpSgpr) kStr += vectorStaticDivide(tid0, tid0, kernel["MatrixInstM"], tmpVgpr1, tmpSgpr) if kernel["MatrixInstM"] == 4: kStr += vectorStaticRemainder(dummy, tid0, tid0, kernel["MatrixInstBM"], tmpVgpr1, tmpSgpr) if kernel["MatrixInstM"] == 
4 or not kernel["ProblemType"]["DataType"].isDouble(): kStr += inst("v_lshlrev_b32", vgpr(tid0), hex(2), vgpr(tid0), "thread0 * 4 : mfma output 4 continuous outputs") + kStr += inst("_v_add_u32", vgpr(tid0), vgpr(tmpVgpr0), vgpr(tid0), "coordination 0 = wave_id0 + tid0") + + if writer.prefetchAcrossPersistent: + wg0="PrevWorkGroup0" + wg1="PrevWorkGroup1" + else: + wg0="WorkGroup0" + wg1="WorkGroup1" + + # macro tile 0 part + kStr += inst("s_mul_i32", sgpr(tmpSgpr), kernel["MacroTile0"], sgpr(wg0), "wgp0 * MT0") + kStr += inst("_v_add_u32", vgpr(tid0), sgpr(tmpSgpr), vgpr(tid0), "coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0") + + # macro tile 1 part + kStr += inst("s_mul_i32", sgpr(tmpSgpr), kernel["MacroTile1"], sgpr(wg1), "wgp1 * MT1") + kStr += inst("_v_add_u32", vgpr(tid1), sgpr(tmpSgpr), vgpr(tid1), "coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1") + + # extract packed rowStart vgpr + if len(packedC1) > 1: + kStr += writer.extractPackedCoord1ToRowStart(kernel, packedC1, tid1, 'D') + + # release resource + writer.vgprPool.checkIn(dummy) + writer.vgprPool.checkIn(tmpVgpr1) + writer.vgprPool.checkIn(tmpVgpr0) + writer.vgprPool.checkIn(wave_id) + + # StoreRemap: calculate + # 1. local read address + # 2. local write address + # 3. global write coord0 and coord1 + if kernel["StoreRemapVectorWidth"]: + kStr += writer.storeRemapComputeStoreVgprs(kernel) + + writer.coord0 = tid0 + writer.coord1 = tid1 + + return kStr + +class ComputeStoreVgprsMFMASwap(ComputeStoreVgprs): + kernel = {"EnableMatrixInstruction": True, + "SourceSwap": True} + + """ + computeStoreVgprs + Compute workitem/TT offsets in VGPRS + and coord0/coord1 + tid0Scale specifies the number of output elements in 0/coalesced dim + that should be written by each work-item in each batch element. 
+ """ + def __call__(self, writer, kernel, divisor, tid0Scale, tid1Scale): + + # writer.coord0 + # writer.coord1 + # writer.cinRowPtr : C buffer column offset + # writer.coutRowPtr : D buffer column offset + + # alloc resources + tid0 = writer.vgprPool.checkOut(1) + tid1 = writer.vgprPool.checkOut(1) + if kernel["BufferStore"]: + writer.cinRowPtr = writer.vgprPool.checkOut(1, "cinRowPtr") + writer.coutRowPtr = writer.vgprPool.checkOut(1, "coutRowPtr") + + wave_id = writer.vgprPool.checkOut(1) + + tmpVgpr0 = writer.vgprPool.checkOut(1,"tmpVgpr0") + tmpVgpr1 = writer.vgprPool.checkOutAligned(2,2,"tmpVgpr1") + dummy = writer.vgprPool.checkOut(1,"dummy") + tmpSgpr = writer.getTmpSgpr(1).idx() + + # constant + MIBShape0 = kernel["MatrixInstM"] * kernel["MatrixInstBM"] + MIBShape1 = kernel["MatrixInstN"] * kernel["MatrixInstBN"] + + kStr = "" + + kStr += vectorStaticDivide(wave_id, "Serial", writer.kernel["WavefrontSize"], tmpVgpr1, tmpSgpr) + + # coord 1 : wave part + kStr += vectorStaticDivide(tmpVgpr0, wave_id, kernel["MIWaveGroup"][0], tmpVgpr1, tmpSgpr) + kStr += inst("v_mul_lo_u32", vgpr(tmpVgpr0), hex(MIBShape1), vgpr(tmpVgpr0), "wave coordination offset 1") + # if kernel["MatrixInstM"] == 4 and kernel["ProblemType"]["DataType"].isDouble(): + # divisor = kernel["MatrixInstN"] * kernel["MatrixInstM"] + # kStr += vectorStaticDivide(tid1, "Serial", divisor, tmpVgpr1, tmpSgpr) + # kStr += vectorStaticRemainder(dummy, tid1, tid1, kernel["MatrixInstN"], tmpVgpr1, tmpSgpr) + # kStr += inst("v_add_u32", vgpr(tmpVgpr0), vgpr(tmpVgpr0), vgpr(tid1), "WAAA") + + # coord 1 : thread part + kStr += vectorStaticRemainder(dummy, tid1, "Serial", writer.kernel["WavefrontSize"], tmpVgpr1, tmpSgpr) + kStr += vectorStaticDivide(tid1, tid1, kernel["MatrixInstM"], tmpVgpr1, tmpSgpr) + if kernel["MatrixInstM"] == 4: + kStr += vectorStaticRemainder(dummy, tid1, tid1, kernel["MatrixInstBM"], tmpVgpr1, tmpSgpr) + if kernel["MatrixInstM"] == 4 or not
kernel["ProblemType"]["DataType"].isDouble(): + kStr += inst("v_lshlrev_b32", vgpr(tid1), hex(2), vgpr(tid1), "thread0 * 4 : mfma output 4 continuous outputs") + kStr += inst("v_add_u32", vgpr(tid1), vgpr(tmpVgpr0), vgpr(tid1), "coordination 1 = wave_id1 + tid1") + + # coord 1 : offset part + packedC1 = kernel["PackedC1IndicesX"] + strideC1 = "StrideC%s" % (writer.indexChars[packedC1[0]]) + strideD1 = "StrideD%s" % (writer.indexChars[packedC1[0]]) + kStr += inst("v_mul_lo_u32", vgpr(writer.cinRowPtr), vgpr(tid1), sgpr(strideC1), " offset 1") + kStr += inst("v_mul_lo_u32", vgpr(writer.coutRowPtr), vgpr(tid1), sgpr(strideD1), " offset 1") + + # coord 0 : wave part + kStr += vectorStaticRemainder(dummy, tid0, wave_id, kernel["MIWaveGroup"][0], tmpVgpr1, tmpSgpr) + kStr += inst("v_mul_lo_u32", vgpr(tid0), hex(MIBShape0), vgpr(tid0), "wave coordination offset 0") + + # coord 0 : thread part + kStr += vectorStaticRemainder(dummy, tmpVgpr0, "Serial", kernel["MatrixInstN"], tmpVgpr1, tmpSgpr) kStr += inst("v_add_u32", vgpr(tid0), vgpr(tmpVgpr0), vgpr(tid0), "coordination 0 = wave_id0 + tid0") + + # if kernel["MatrixInstM"] == 4: + # remainder = writer.kernel["WavefrontSize"] + # divisor = kernel["MatrixInstN"] * kernel["MatrixInstBM"] + # if kernel["ProblemType"]["DataType"].isDouble(): + # divisor *= 4 + # if kernel["MatrixInstBM"] < 4: + # remainder = 16 + # divisor = 8 + # kStr += vectorStaticRemainder(dummy, tmpVgpr0, "Serial", remainder, tmpVgpr1, tmpSgpr) + # kStr += vectorStaticDivide(tmpVgpr0, tmpVgpr0, divisor, tmpVgpr1, tmpSgpr) + # kStr += staticMultiply(vgpr(tmpVgpr0), vgpr(tmpVgpr0), kernel["MatrixInstN"], sgpr(tmpSgpr)) + # kStr += inst("v_add_u32", vgpr(tid0), vgpr(tmpVgpr0), vgpr(tid0), "coordination 1 = wave_id1 + tid1") + if writer.prefetchAcrossPersistent: wg0="PrevWorkGroup0" wg1="PrevWorkGroup1" diff --git a/Tensile/Components/LocalRead.py b/Tensile/Components/LocalRead.py index 1f8bb41fa..79925ad0b 100644 --- a/Tensile/Components/LocalRead.py +++ 
b/Tensile/Components/LocalRead.py @@ -38,6 +38,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): writer.localReadDoCnt += 1 tc = tP["tensorChar"] + tile01 = tP["tile01Idx"] imod = Code.Module("LocalReadDo%s_I%s"%(tc,iui)) pack = Code.Module("pack%s_I%s"%(tc,iui)) instruction = tP["localReadInstruction"] @@ -45,7 +46,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): blockWidth = instruction.blockWidth offsetMultiplier = 1 # instruction.offsetMultiplier valuIdx = 0 - numVectorsPerTile = (kernel["ThreadTile%u"%tP["tensorIdx"]]//kernel["VectorWidth"]) + numVectorsPerTile = (kernel["ThreadTile%u"%tile01]//kernel["VectorWidth"]) numReadsPerVector = (kernel["VectorWidth"] * tP["bpe"]) // (blockWidth*4) # bytes/register for vIdx in range(0, numVectorsPerTile): @@ -58,7 +59,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): paramList.append(vgpr("LocalReadAddr%s"%tc)) for oIdx in range(0, numOffsets): - paramList.append(((rIdx*blockWidth + kernel["SubGroup%u"%tP["tensorIdx"]] * (vIdx*numOffsets+oIdx)*kernel["VectorWidth"] \ + paramList.append(((rIdx*blockWidth + kernel["SubGroup%u"%tile01] * (vIdx*numOffsets+oIdx)*kernel["VectorWidth"] \ + tP["localReadOffset"]) * tP["bpe"] + tP["localReadSwapByteOffset"]) // offsetMultiplier) # print("Debug: Matrix{}, rIdx offset {}, vIdx offset {}, bpe {}, net offset {}".format( \ # tP["tensorChar"], \ @@ -68,7 +69,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): # paramList[-1])) paramTuple = tuple(paramList) comment = "L -> Reg lro=%d swapByteOffset=%u ti=%u vIdx=%u rIdx=%u oIdx=%u buffer=%u iui=%u"\ - %(tP["localReadOffset"],tP["localReadSwapByteOffset"],kernel["SubGroup%u"%tP["tensorIdx"]], vIdx, rIdx, oIdx, bufferIdx, iui) + %(tP["localReadOffset"],tP["localReadSwapByteOffset"],kernel["SubGroup%u"%tile01], vIdx, rIdx, oIdx, bufferIdx, iui) localReadCode.addCode(Code.LocalReadInst(instruction.IssueLatency,instruction.toCodeInst(paramTuple), comment)) valuIdx += blockWidth @@ -132,7 +133,7 @@ def 
__call__(self, writer, bufferIdx, iui, epsi, tP): writer.localReadDoCntA += 1 else: writer.localReadDoCntB += 1 - tIdx = tP["tensorIdx"] + tile01 = tP["tile01Idx"] instruction = tP["localReadInstruction"] numOffsets = instruction.numOffsets @@ -147,7 +148,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): tileStride = kernel["_DepthULds"] + LdsPad UnrollStride = 1 - numVectorsPerTile = kernel["MIWaveTile"][tIdx] + numVectorsPerTile = kernel["MIWaveTile"][tile01] if tc == "A": numReadsPerVector = tP["bpe"] * writer.lrvwA // int(blockWidth * 4) # bytes/register else: @@ -206,7 +207,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): paramList.append(vgpr("LocalReadAddr%s"%tc)) for oIdx in range(0, numOffsets): - offset_val = (vIdx * numOffsets+oIdx) * MIWaveGropuShape[tIdx] * tileStride + offset_val = (vIdx * numOffsets+oIdx) * MIWaveGropuShape[tile01] * tileStride offset_val = (rIdx * UnrollStride + offset_val + tP["localReadOffset"]) * tP["bpe"] if (kernel["LdsBlockSizePerPad%s"%tc] != 0) and (kernel["LdsPad%s"%tc] != 0): offset_val = offset_val + (offset_val // kernel["LdsBlockSizePerPad%s"%tc]) * kernel["LdsPad%s"%tc] * tP["bpe"] @@ -215,7 +216,7 @@ def __call__(self, writer, bufferIdx, iui, epsi, tP): paramTuple = tuple(paramList) comment = "L -> Reg lro=%d swapByteOffset=%u ti=%u vIdx=%u rIdx=%u oIdx=%u buffer=%u iui=%u" \ - % (tP["localReadOffset"], tP["localReadSwapByteOffset"], MIWaveGropuShape[tIdx], vIdx, rIdx, oIdx, bufferIdx, iui) + % (tP["localReadOffset"], tP["localReadSwapByteOffset"], MIWaveGropuShape[tile01], vIdx, rIdx, oIdx, bufferIdx, iui) highBits = highBitsForHalf or isHigh16Bits localReadCode.addCode(Code.LocalReadInst(instruction.IssueLatency,instruction.toCodeInst(paramTuple, 0, highBits), comment)) diff --git a/Tensile/Components/LraTileAssignment.py b/Tensile/Components/LraTileAssignment.py index c36ccdccd..39cb33800 100644 --- a/Tensile/Components/LraTileAssignment.py +++ b/Tensile/Components/LraTileAssignment.py @@ -20,7 
+20,6 @@ ################################################################################ from ..Component import LraTileAssignment -from ..Common import globalParameters from ..AsmUtils import inst, vgpr, sgpr, vectorStaticDivideAndRemainder, vectorStaticDivide, staticMultiply, vectorStaticRemainder class LraTileAssignmentVALU(LraTileAssignment): @@ -38,7 +37,7 @@ def __call__(self, writer, kernel, tP): tmpVgpr = writer.vgprPool.checkOutAligned(2,2,"tmpVgpr") tmpSgpr = writer.getTmpSgpr(1).idx() - if tP["tensorChar"] == 'A': + if tP["tileIdx"] == 0: kStr += "%slr%s = serial %% SG%s%s%s" \ % (writer.commentPrefix, tP["tileChar"], tP["tileChar"], \ writer.commentSuffix, writer.endLine) @@ -52,15 +51,15 @@ def __call__(self, writer, kernel, tP): # release and return resource tP["gpr"]["lro"] = rReg - writer.tmplroB = qReg - elif tP["tensorChar"] == 'B': + writer.tmplro = qReg + else: kStr += "%slr%s = (serial / SG%s) %% SG%s%s%s" \ % (writer.commentPrefix, tP["tileChar"], tP["tileChar"], \ tP["tileChar"], writer.commentSuffix, writer.endLine) # constant divisor = kernel["SubGroup1"] - dividendReg = writer.tmplroB + dividendReg = writer.tmplro # generate instruction kStr += vectorStaticDivideAndRemainder(qReg, rReg, dividendReg, divisor, tmpVgpr, tmpSgpr) @@ -68,7 +67,7 @@ def __call__(self, writer, kernel, tP): # release and return resource tP["gpr"]["lro"] = rReg - writer.vgprPool.checkIn(writer.tmplroB) # old + writer.vgprPool.checkIn(writer.tmplro) # old writer.vgprPool.checkIn(qReg) writer.vgprPool.checkIn(tmpVgpr) @@ -100,21 +99,21 @@ def __call__(self, writer, kernel, tP): # get constant parameter tc = tP["tensorChar"] - tIdx = tP["tensorIdx"] - waveWidth = globalParameters["WavefrontWidth"] + tile01 = tP["tile01Idx"] + waveWidth = writer.kernel["WavefrontSize"] inputPerThread = max(writer.lrvwA,writer.lrvwB) LdsPad = kernel["LdsPad%s" % tc] if kernel["LdsBlockSizePerPad%s" % tc] == 0 else 0 # parameter for get each type index dividendForKId = 
kernel["MatrixInstM"] * kernel["MatrixInstB"] - num1DBlocks = kernel["MatrixInstBM"] if (tc == 'A') else kernel["MatrixInstBN"] - num1DWaves = kernel["MIWaveGroup"][0] if (tc == 'A') else kernel["MIWaveGroup"][1] - dividedForBlkId = kernel["MatrixInstM"] if (tc == 'A') else (kernel["MatrixInstM"] * kernel["MatrixInstBM"]) - dividedForWaveId = waveWidth if (tc == 'A') else (waveWidth * kernel["MIWaveGroup"][0]) + num1DBlocks = kernel["MatrixInstBM"] if (tile01 == 0) else kernel["MatrixInstBN"] + num1DWaves = kernel["MIWaveGroup"][0] if (tile01 == 0) else kernel["MIWaveGroup"][1] + dividedForBlkId = kernel["MatrixInstM"] if (tile01 == 0) else (kernel["MatrixInstM"] * kernel["MatrixInstBM"]) + dividedForWaveId = waveWidth if (tile01 == 0) else (waveWidth * kernel["MIWaveGroup"][0]) # strider for each type of index umlds = kernel["UnrollMajorLDS%s" % tP["tensorChar"]] - mt = kernel["MacroTile%u" % tIdx] + mt = kernel["MacroTile%u" % tile01] strideTile = kernel["_DepthULds"] + LdsPad if umlds else 1 strideK = inputPerThread if umlds else (mt + LdsPad) * inputPerThread strideBlock = kernel["MatrixInstM"] * strideTile @@ -135,7 +134,7 @@ def __call__(self, writer, kernel, tP): "2. block offset: bnIdx = bnIdx %% num1DBlocks(%u)" % num1DBlocks) kStr += staticMultiply(vgpr(wReg), vgpr(wReg), strideBlock, sgpr(tmpSgpr), \ "2. block offset: bnOffset = bnIdx * strideBlock(%u)" % strideBlock) - kStr += inst("v_add_u32", vgpr(tReg), vgpr(wReg), vgpr(tReg), \ + kStr += inst("_v_add_u32", vgpr(tReg), vgpr(wReg), vgpr(tReg), \ "3. add N and block offset: bnOffset = block and N offset") # unroll offset @@ -143,7 +142,7 @@ def __call__(self, writer, kernel, tP): "4. K offset: kIdx = wtid / (MIN(%u) * MIBB(%u))" % (kernel["MatrixInstN"], kernel["MatrixInstB"])) kStr += staticMultiply(vgpr(kReg), vgpr(kReg), strideK, sgpr(tmpSgpr), \ "4. 
K offset: lrKOffset = kIdx * mStride(%u)" % strideK) - kStr += inst("v_add_u32", vgpr(tReg), vgpr(kReg), vgpr(tReg), \ + kStr += inst("_v_add_u32", vgpr(tReg), vgpr(kReg), vgpr(tReg), \ "5. offset in wave: lrOffset = bnOffset + lrKOffset") # wave offset @@ -154,7 +153,7 @@ def __call__(self, writer, kernel, tP): "6. wave offset in M dimen: wtid0 = wtid / num1DWaves(%u)" % num1DWaves) kStr += staticMultiply(vgpr(wReg), vgpr(wReg), strideWave, sgpr(tmpSgpr), \ "6. wave offset in M dimen: wOffset = wtid0 * W0Stride(%u)" % strideWave) - kStr += inst("v_add_u32", vgpr(tReg), vgpr(wReg), vgpr(tReg), \ + kStr += inst("_v_add_u32", vgpr(tReg), vgpr(wReg), vgpr(tReg), \ "7. final local read offset: flrOffset = lrOffset + WOffset") # release register diff --git a/Tensile/Components/MAC_F16.py b/Tensile/Components/MAC_F16.py index febfe573c..53b9e23fd 100644 --- a/Tensile/Components/MAC_F16.py +++ b/Tensile/Components/MAC_F16.py @@ -22,7 +22,7 @@ from ..Component import Component, MAC from ..DataType import DataType -class MAC_Plain(MAC): +class MAC_F16_Plain(MAC): """ Plain MAC instruction implementation """ @@ -71,7 +71,7 @@ def __call__(self, writer, m, innerUnroll): return kStr -class FMA_NonPacked(MAC): +class FMA_F16_NonPacked(MAC): asmCaps = {"v_fma_f16": True, "v_pk_fma_f16": False} #archCaps = {} @@ -124,7 +124,7 @@ def __call__(self, writer, m, innerUnroll): kStr += priority(writer, 0, "Reset priority after macs") return kStr -class FMA_Packed(MAC): +class FMA_F16_Packed(MAC): asmCaps = {"v_pk_fma_f16": True} #archCaps = {} kernel = {"ProblemType": {"DataType": DataType(DataType.half), diff --git a/Tensile/Components/MAC_F16_HPA.py b/Tensile/Components/MAC_F16_HPA.py index f8ba6be21..1b34a7332 100644 --- a/Tensile/Components/MAC_F16_HPA.py +++ b/Tensile/Components/MAC_F16_HPA.py @@ -22,7 +22,8 @@ from ..Component import Component, MAC from ..DataType import DataType -class FMA_HPA_MAD_MIX_LDL(MAC): +class FMA_F16_HPA_MAD_MIX_LDL(MAC): + @staticmethod def 
asmCaps(caps): return (caps['v_mad_mix_f32'] or caps['v_fma_mix_f32']) \ and not caps["v_dot2c_f32_f16"] \ @@ -122,7 +123,7 @@ def __call__(self, writer, m, innerUnroll): return kStr -class FMA_HPA_MAD_MIX(MAC): +class FMA_F16_HPA_MAD_MIX(MAC): asmCaps = lambda caps: caps['v_mad_mix_f32'] or caps['v_fma_mix_f32'] #archCaps = {} kernel = {"ProblemType": {"DataType": DataType(DataType.half), @@ -153,17 +154,19 @@ def __call__(self, writer, m, innerUnroll): vars["Half_ThreadTile0"] = kernel["ThreadTile0"] // 2 vars["Half_ThreadTile1"] = kernel["ThreadTile1"] // 2 - for blockB in range(0, kernel["ThreadTile1"]//2): - for blockA in range(0, kernel["ThreadTile0"]//2): + for block1 in range(0, kernel["ThreadTile1"]//2): + for block0 in range(0, kernel["ThreadTile0"]//2): for iui in range(0, innerUnroll): - vars["blockA"] = blockA - vars["blockB"] = blockB + vars["block0"] = block0 + vars["block1"] = block1 + vars["blockA"] = block0 if writer.tPA["tileIdx"] == 0 else block1 + vars["blockB"] = block1 if writer.tPB["tileIdx"] != 0 else block0 vars["iui"] = iui vars["aBase"] = "vgprValuA_X{m}_I{iui}".format_map(vars) vars["bBase"] = "vgprValuB_X{m}_I{iui}".format_map(vars) - vars["cIdxExpr"] = "{blockA}*2 + {blockB}*{ThreadTile0}*2 + 0*2 + 0".format_map(vars) + vars["cIdxExpr"] = "{block0}*2 + {block1}*{ThreadTile0}*2 + 0*2 + 0".format_map(vars) vars["cidx"] = eval(vars["cIdxExpr"]) vars["cStr"] = "v[vgprValuC + {cIdxExpr}]".format_map(vars) # *2 b/c of fp32 @@ -173,19 +176,21 @@ def __call__(self, writer, m, innerUnroll): kStr += priority(writer, 1, "Raise priority while processing macs") - vars["cIdxExpr"] = "{blockA}*2 + {blockB}*{ThreadTile0}*2 + 0*2 + 1".format_map(vars) + vars["cIdxExpr"] = "{block0}*2 + {block1}*{ThreadTile0}*2 + 0*2 + 1".format_map(vars) vars["cidx"] = eval(vars["cIdxExpr"]) vars["cStr"] = "v[vgprValuC + {cIdxExpr}]".format_map(vars) # *2 b/c of fp32 - kStr += "{instruction} {cStr}, {aStr}, {bStr}, {cStr} op_sel:[1,0,0] op_sel_hi:[1,1,0] 
//ValuC[{cidx}]{endLine}".format_map(vars) + vars["opSel"] = "op_sel:[1,0,0]" if writer.tPA["tileIdx"] == 0 else "op_sel:[0,1,0]" + kStr += "{instruction} {cStr}, {aStr}, {bStr}, {cStr} {opSel} op_sel_hi:[1,1,0] //ValuC[{cidx}]{endLine}".format_map(vars) - vars["cIdxExpr"] = "{blockA}*2 + {blockB}*{ThreadTile0}*2 + {Half_ThreadTile0}*2 + 0".format_map(vars) + vars["cIdxExpr"] = "{block0}*2 + {block1}*{ThreadTile0}*2 + {Half_ThreadTile0}*2 + 0".format_map(vars) vars["cidx"] = eval(vars["cIdxExpr"]) vars["cStr"] = "v[vgprValuC+{cIdxExpr}]".format_map(vars) - kStr += "{instruction} {cStr}, {aStr}, {bStr}, {cStr} op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[{cidx}]{endLine}".format_map(vars) + vars["opSel"] = "op_sel:[0,1,0]" if writer.tPA["tileIdx"] == 0 else "op_sel:[1,0,0]" + kStr += "{instruction} {cStr}, {aStr}, {bStr}, {cStr} {opSel} op_sel_hi:[1,1,0] //ValuC[{cidx}]{endLine}".format_map(vars) - vars["cIdxExpr"] = "{blockA}*2+{blockB}*{ThreadTile0}*2+{Half_ThreadTile0}*2+1".format_map(vars) + vars["cIdxExpr"] = "{block0}*2+{block1}*{ThreadTile0}*2+{Half_ThreadTile0}*2+1".format_map(vars) vars["cidx"] = eval(vars["cIdxExpr"]) vars["cStr"] = "v[vgprValuC+{cIdxExpr}]".format_map(vars) @@ -195,7 +200,7 @@ def __call__(self, writer, m, innerUnroll): return kStr -class FMA_DOT2(MAC): +class FMA_F16_DOT2(MAC): asmCaps = lambda caps: caps["v_dot2c_f32_f16"] or caps["v_dot2_f32_f16"] #archCaps = {} kernel = {"ProblemType": {"DataType": DataType(DataType.half), diff --git a/Tensile/Components/MAC_F32.py b/Tensile/Components/MAC_F32.py new file mode 100644 index 000000000..00fb7feec --- /dev/null +++ b/Tensile/Components/MAC_F32.py @@ -0,0 +1,95 @@ +################################################################################ +# Copyright 2020 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################ + +from ..Component import Component, MAC +from ..DataType import DataType + +class MAC_F32_Plain(MAC): + """ + Plain MAC instruction implementation + """ + @staticmethod + def asmCaps(caps): + return caps["v_mac_f32"] or caps["v_fma_f32"] + #archCaps = {} + kernel = {"ProblemType": {"DataType": DataType(DataType.single)}} + + def __call__(self, writer, m, innerUnroll): + kernel = writer.kernel + + instruction = "v_mac_f32" + if kernel["MACInstruction"] == "FMA": + if writer.asmCaps["v_fmac_f32"]: + instruction = "v_fmac_f32" + elif writer.asmCaps["v_fma_f32"]: + instruction = "v_fma_f32" + else: + raise RuntimeError("FMA instruction specified but not supported on {}".format(kernel["ISA"])) + + if not writer.asmCaps[instruction]: + raise RuntimeError("{} instruction specified but not supported on {}".format(instruction, kernel["ISA"])) + + kStr = self.commentHeader() + + vars = {} + + vars["m"] = m + vars["kernel"] = kernel + vars["endLine"] = writer.endLine + + vars["ThreadTile0"] = kernel["ThreadTile0"] + vars["ThreadTile1"] = kernel["ThreadTile1"] + vars["PerformanceWaitCount"] = kernel["PerformanceWaitCount"] + + vars["instruction"] = instruction + + priority = Component.Priority.find(writer) + macIdx = 0 + + for idx1 in range(0, kernel["ThreadTile1"]): + for idx0 in range(0, kernel["ThreadTile0"]): + for iui in range(0, innerUnroll): + vars["idx0"] = idx0 + vars["idx1"] = idx1 + vars["a"] = idx0 if writer.tPB["tile01Idx"] else idx1 + vars["b"] = idx1 if writer.tPB["tile01Idx"] else idx0 + vars["iui"] = iui + + vars["cStr"] = "v[vgprValuC + {idx0} + {idx1}*{ThreadTile0}]".format_map(vars) + vars["aStr"] = "v[vgprValuA_X{m}_I{iui} + {a}]".format_map(vars) + vars["bStr"] = "v[vgprValuB_X{m}_I{iui} + {b}]".format_map(vars) + + if instruction == "v_fma_f32": + kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars) + else: + kStr += "{instruction} {cStr}, {aStr}, 
{bStr}{endLine}".format_map(vars) + + kStr += priority(writer, 1, "Raise priority while processing macs") + + if macIdx == kernel["PerformanceWaitLocation"]: + kStr += "s_waitcnt lgkmcnt({PerformanceWaitCount}) // extra wait for performance{endLine}".format_map(vars) + if macIdx == kernel["PerformanceSyncLocation"]: + kStr += "s_barrier // extra barrier for performance{endLine}".format_map(vars) + macIdx += 1 + + kStr += priority(writer, 0, "Reset priority after macs") + + return kStr diff --git a/Tensile/Components/NotLocalFullTileElements.py b/Tensile/Components/NotLocalFullTileElements.py index f5a8b7649..665ba2532 100644 --- a/Tensile/Components/NotLocalFullTileElements.py +++ b/Tensile/Components/NotLocalFullTileElements.py @@ -20,7 +20,6 @@ ################################################################################ from ..Component import NotLocalFullTileElements -from ..Common import globalParameters class NotLocalFullTileElementsVALU(NotLocalFullTileElements): kernel = {"EnableMatrixInstruction": False} @@ -53,7 +52,8 @@ def __call__(self, writer, kernel, edge): return (vectorwidth, elements) class NotLocalFullTileElementsMFMA(NotLocalFullTileElements): - kernel = {"EnableMatrixInstruction": True} + kernel = {"EnableMatrixInstruction": True, + "SourceSwap": False} """ Partition thread-tile into writeElements for store code @@ -77,7 +77,7 @@ def __call__(self, writer, kernel, edge): totalTT0 = kernel["MIWaveTile"][0] * MFMAcontinoutsOuptut totalTT1 = kernel["MIWaveTile"][1] else: - outputsPerThread = kernel["MatrixInstM"] * kernel["MatrixInstN"] // globalParameters["WavefrontWidth"] + outputsPerThread = kernel["MatrixInstM"] * kernel["MatrixInstN"] // writer.kernel["WavefrontSize"] totalTT0 = kernel["MatrixInstBM"] * kernel["MIWaveTile"][0] * outputsPerThread totalTT1 = kernel["MatrixInstBN"] * kernel["MIWaveTile"][1] @@ -89,3 +89,43 @@ def __call__(self, writer, kernel, edge): elements.append(element) return (vectorwidth, elements) + +class 
NotLocalFullTileElementsMFMASwap(NotLocalFullTileElements): + kernel = {"EnableMatrixInstruction": True, + "SourceSwap": True} + + """ + Partition thread-tile into writeElements for store code + This function creates the writeElement mapping for full tiles + (ie non-edge cases) + """ + def __call__(self, writer, kernel, edge): + elements = [] + vectorwidth = 0 + + if edge: + vectorwidth = kernel["StoreVectorWidth"] if kernel["_VectorStore"] else 1 + vectorwidth = min(vectorwidth, writer.maxGwvw(kernel), kernel["AssertFree0ElementMultiple"]) + else: + vectorwidth = kernel["StoreVectorWidth"] if kernel["_VectorStore"] else 1 + vectorwidth = min(vectorwidth, writer.maxGwvw(kernel)) + + MFMAcontinoutsOuptut = kernel["MIOutputVectorWidth"] + + if kernel["MatrixInstM"] == 4: + totalTT0 = kernel["MIWaveTile"][0] * MFMAcontinoutsOuptut + totalTT1 = kernel["MIWaveTile"][1] + else: + outputsPerThread = kernel["MatrixInstM"] * kernel["MatrixInstN"] // writer.kernel["WavefrontSize"] + totalTT0 = kernel["MatrixInstBM"] * kernel["MIWaveTile"][0] * outputsPerThread + totalTT1 = kernel["MatrixInstBN"] * kernel["MIWaveTile"][1] + + for tt1 in range(0, totalTT1): + for vc1 in range(0, 1): + for vc0 in range(0, MFMAcontinoutsOuptut, vectorwidth): # note step by vectorwidth + for tt0 in range(0, totalTT0 // MFMAcontinoutsOuptut): + element = (tt1, tt0, vc1, vc0) + elements.append(element) + + + return (vectorwidth, elements) diff --git a/Tensile/Components/ShiftVectorComponents.py b/Tensile/Components/ShiftVectorComponents.py index 2300725bd..6958448de 100644 --- a/Tensile/Components/ShiftVectorComponents.py +++ b/Tensile/Components/ShiftVectorComponents.py @@ -20,7 +20,6 @@ ################################################################################ from ..Component import ShiftVectorComponents -from ..Common import globalParameters from ..AsmUtils import inst, vgpr, sgpr, accvgpr, staticMultiply, vectorStaticDivide, vectorStaticRemainder, vectorStaticDivideAndRemainder, log2 
class ShiftVectorComponentsVALU(ShiftVectorComponents): @@ -50,19 +49,19 @@ def __call__(self, writer, kernel, tP): wg = tP["prevWg"] if writer.prefetchAcrossPersistent else tP["wg"] # wgMT value - tmpSgpr = writer.getTmpSgpr(2).idx() + tmpSgpr = writer.getTmpSgpr(writer.laneSGPRCount).idx() tmpVgpr = writer.vgprPool.checkOutAligned(2,2,"tmpVgpr") wgMT = writer.vgprPool.checkOut(1,"wgMT") kStr += inst("v_mov_b32", vgpr(wgMT), sgpr(wg), "") kStr += inst("v_mul_i32_i24", vgpr(wgMT), hex(-kernel[tP["mt"]]), vgpr(wgMT), \ "wg*MT") - kStr += inst("_v_add_co_u32", vgpr(wgMT), "vcc", sgpr("SizesFree+%u"%tP["idx"]), \ + kStr += inst("_v_add_co_u32", vgpr(wgMT), writer.vcc, sgpr("SizesFree+%u"%tP["idx"]), \ vgpr(wgMT), "wgMT = Size - wg*MT") kStr += inst("v_mov_b32", vgpr(tmpVgpr), hex(kernel[tP["mt"]]), "MT") - kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,2), vgpr(wgMT), \ + kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,writer.laneSGPRCount), vgpr(wgMT), \ vgpr(tmpVgpr), "wgMT < MT" ) kStr += inst("v_cndmask_b32", vgpr(wgMT), vgpr(tmpVgpr), \ - vgpr(wgMT), sgpr(tmpSgpr,2), "wgMT = (wgMT < MT) ? wgMT : MT" ) + vgpr(wgMT), sgpr(tmpSgpr,writer.laneSGPRCount), "wgMT = (wgMT < MT) ? 
wgMT : MT" ) dummy = writer.vgprPool.checkOut(1,"dummy") # qReg @@ -127,18 +126,13 @@ def __call__(self, writer, kernel, tP): #kStr += dump(vgpr(vReg)) if True:#tP["tensorIdx"] > kernel["VectorWidth"]: - kStr += inst("_v_add_co_u32", vgpr(vReg), "vcc", vgpr(mvReg), vgpr(vReg), "vId = 2 components") + kStr += inst("_v_add_co_u32", vgpr(vReg), writer.vcc, vgpr(mvReg), vgpr(vReg), "vId = 2 components") writer.vgprPool.checkIn(mvReg) writer.vgprPool.checkIn(vRegD) - kStr += inst("v_cmp_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \ - vgpr(eReg), "mask" ) - kStr += inst("v_mov_b32", vgpr(tmpVgpr+0), sgpr(tmpSgpr+0), "") - kStr += inst("v_mov_b32", vgpr(tmpVgpr+1), sgpr(tmpSgpr+1), "") - # for each remainder, jump for r in range(1, vw): - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(rReg), \ + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(rReg), \ hex(r), "wgMT%%VW == %u"%r ) kStr += inst("s_cbranch_vccnz label_%04u"\ % svrLabels[(r-1)%vw], \ @@ -159,7 +153,7 @@ def __call__(self, writer, kernel, tP): # for each vector index, jump for vectorIdx in range(0, numVectors): - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(vReg), \ + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(vReg), \ hex(vectorIdx), "wgMT/(SG*VW) == %u"%vectorIdx ) kStr += inst("s_cbranch_vccnz label_%04u"\ % sviLabels[(r-1)%vw][vectorIdx], \ @@ -170,7 +164,7 @@ def __call__(self, writer, kernel, tP): kStr += writer.comment("shift d%u r=%u v=%u"%(tP["idx"], r, vectorIdx)) kStr += "label_%04u:%s" % (sviLabels[r-1][vectorIdx], writer.endLine) # mask if last thread in thread#-tile column - kStr += inst("_v_cmpx_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \ + kStr += inst("_v_cmpx_eq_u32", sgpr(tmpSgpr,writer.laneSGPRCount), vgpr(thread), \ vgpr(eReg), "serial % SG == (wgMT/VECTOR_WIDTH)%SG" ) tto = kernel["ThreadTile%u"%((tP["idx"]+1)%2)] # thread tile orthogonal for tt in range(0, tto): @@ -247,17 +241,14 @@ def __call__(self, writer, kernel, tP): vgpr(writer.startVgprValuC+src*writer.bpeCinternal//writer.bpr+i), comment) # 
end shift reset mask and jump out - kStr += inst("s_mov_b64", sgpr(tmpSgpr,2), \ - "0xFFFFFFFFFFFFFFFF", "to restore all threads active") - kStr += inst("s_or_saveexec_b64", "vcc", sgpr(tmpSgpr,2), \ + all1mask = "0xFFFFFFFF" if (kernel["WavefrontSize"] == 32) else "0xFFFFFFFFFFFFFFFF" + kStr += inst("s_mov_b{}".format(kernel["WavefrontSize"]), sgpr(tmpSgpr,writer.laneSGPRCount), \ + all1mask, "to restore all threads active") + kStr += inst("s_or_saveexec_b{}".format(kernel["WavefrontSize"]), writer.vcc, sgpr(tmpSgpr,writer.laneSGPRCount), \ "all threads active") kStr += inst("s_branch label_%04u"%svrLabels[vw-1], \ "done shifting" ) - #kStr += inst("s_mov_b32", sgpr(sgprLoc), hex(location), "location=%u"%location) location *= 2 - #kStr += inst("v_or_b32", vgpr(vgprPath), sgpr(sgprLoc), vgpr(vgprPath), "path+=location") kStr += "label_%04u: // end shift0%s" % (svrLabels[vw-1], writer.endLine) - #kStr += inst("s_mov_b64", "exec","0xFFFFFFFFFFFFFFFF","") - #kStr += dump(vgpr(vgprPath)) # checkin scratch vgprs writer.vgprPool.checkIn(wgMT) @@ -301,7 +292,7 @@ def __call__(self, writer, kernel, tP): kStr = "" glvw = tP["glvw"] - numThreadInWave = globalParameters["WavefrontWidth"] + numThreadInWave = writer.kernel["WavefrontSize"] MIBShape0 = kernel["MatrixInstM"] * kernel["MatrixInstBM"] numContinuousOutput = kernel["MIOutputVectorWidth"] numOutputThreads1 = kernel["MatrixInstN"] @@ -340,7 +331,7 @@ def __call__(self, writer, kernel, tP): svoLabels.append(tmp2Labels) # wgMT value - tmpSgpr = writer.getTmpSgpr(2).idx() + tmpSgpr = writer.getTmpSgpr(writer.laneSGPRCount).idx() tmpVgpr = writer.vgprPool.checkOutAligned(2,2) dummy = writer.vgprPool.checkOut(1) wgMT = writer.vgprPool.checkOut(1) @@ -350,19 +341,19 @@ def __call__(self, writer, kernel, tP): mtReg = writer.vgprPool.checkOut(1) kStr += inst("v_mov_b32" , vgpr(wgMT), sgpr(wg), "") kStr += inst("v_mul_i32_i24", vgpr(wgMT), hex(-kernel[tP["mt"]]), vgpr(wgMT), "wg*MT") - kStr += inst("_v_add_co_u32", vgpr(wgMT), 
"vcc", sgpr("SizesFree+%u"%tP["idx"]), vgpr(wgMT), "wgMT = Size - wg*MT") + kStr += inst("_v_add_co_u32", vgpr(wgMT), writer.vcc, sgpr("SizesFree+%u"%tP["idx"]), vgpr(wgMT), "wgMT = Size - wg*MT") kStr += inst("v_mov_b32" , vgpr(mtReg), hex(kernel[tP["mt"]]), "MT") - kStr += inst("v_cmp_lt_u32" , sgpr(tmpSgpr,2), vgpr(wgMT), vgpr(mtReg), "wgMT < MT" ) - kStr += inst("v_cndmask_b32", vgpr(wgMT), vgpr(mtReg), vgpr(wgMT), sgpr(tmpSgpr,2), "wgMT = (wgMT < MT) ? wgMT : MT" ) + kStr += inst("v_cmp_lt_u32" , sgpr(tmpSgpr,writer.laneSGPRCount), vgpr(wgMT), vgpr(mtReg), "wgMT < MT" ) + kStr += inst("v_cndmask_b32", vgpr(wgMT), vgpr(mtReg), vgpr(wgMT), sgpr(tmpSgpr,writer.laneSGPRCount), "wgMT = (wgMT < MT) ? wgMT : MT" ) wReg = writer.vgprPool.checkOut(1) - kStr += vectorStaticDivide(wReg, "Serial", globalParameters["WavefrontWidth"], tmpVgpr, tmpSgpr) + kStr += vectorStaticDivide(wReg, "Serial", writer.kernel["WavefrontSize"], tmpVgpr, tmpSgpr) kStr += vectorStaticRemainder(dummy, wReg, wReg, kernel["MIWaveGroup"][0], tmpVgpr, tmpSgpr) sReg = writer.vgprPool.checkOut(1) kStr += vectorStaticDivide(sReg, wgMT, MIBShape0, tmpVgpr, tmpSgpr) kStr += vectorStaticRemainder(dummy, sReg, sReg, kernel["MIWaveGroup"][0], tmpVgpr, tmpSgpr) - kStr += inst("v_cmp_eq_u32" , sgpr(tmpSgpr,2), vgpr(sReg), vgpr(wReg), "wave_id0 == block_belong_to_wave0?" ) - kStr += inst("v_cndmask_b32", vgpr(wgMT), vgpr(mtReg), vgpr(wgMT), sgpr(tmpSgpr,2), "wgMT = (wgMT < MT) ? wgMT : MT" ) + kStr += inst("v_cmp_eq_u32" , sgpr(tmpSgpr,writer.laneSGPRCount), vgpr(sReg), vgpr(wReg), "wave_id0 == block_belong_to_wave0?" ) + kStr += inst("v_cndmask_b32", vgpr(wgMT), vgpr(mtReg), vgpr(wgMT), sgpr(tmpSgpr,writer.laneSGPRCount), "wgMT = (wgMT < MT) ? 
wgMT : MT" ) writer.vgprPool.checkIn(mtReg) writer.vgprPool.checkIn(sReg) @@ -371,7 +362,7 @@ def __call__(self, writer, kernel, tP): gReg = writer.vgprPool.checkOut(1) kStr += staticMultiply(vgpr(wReg), vgpr(wReg), MIBShape0 // numSubOutputPerWave0, sgpr(tmpSgpr)) kStr += vectorStaticDivide(gReg, wgMT, numSubOutputPerWave0, tmpVgpr, tmpSgpr) - kStr += inst("v_sub_u32", vgpr(gReg), vgpr(gReg), vgpr(wReg), "") + kStr += inst("_v_sub_u32", vgpr(gReg), vgpr(gReg), vgpr(wReg), "") writer.vgprPool.checkIn(wReg) # eReg : use to disguish which shift block (sub-tile) we need to deal with @@ -387,8 +378,12 @@ def __call__(self, writer, kernel, tP): kStr += writer.comment("mReg : decide which thread have to deal with this M-size") mReg = writer.vgprPool.checkOut(1) if kernel["ProblemType"]["DataType"].isDouble(): - kStr += vectorStaticRemainder(dummy, mReg, wgMT, numContinuousOutput, tmpVgpr, tmpSgpr) - kStr += vectorStaticDivide(mReg, mReg, numContinuousOutput // 2, tmpVgpr, tmpSgpr) + if kernel["SourceSwap"]: + kStr += vectorStaticRemainder(dummy, mReg, wgMT, MIBShape0, tmpVgpr, tmpSgpr) + kStr += vectorStaticDivide(mReg, mReg, glvw, tmpVgpr, tmpSgpr) + else: + kStr += vectorStaticRemainder(dummy, mReg, wgMT, numContinuousOutput, tmpVgpr, tmpSgpr) + kStr += vectorStaticDivide(mReg, mReg, numContinuousOutput // 2, tmpVgpr, tmpSgpr) else: kStr += vectorStaticDivide(mReg, wgMT, numContinuousOutput, tmpVgpr, tmpSgpr) kStr += vectorStaticRemainder(dummy, mReg, mReg, numOutputThreads0, tmpVgpr, tmpSgpr) @@ -397,8 +392,12 @@ def __call__(self, writer, kernel, tP): kStr += writer.comment("tReg : thread group id [0-31] or [32-63] for mfma 32x32x2") tReg = writer.vgprPool.checkOut(1) if kernel["ProblemType"]["DataType"].isDouble(): - kStr += vectorStaticDivide(tReg, "Serial", kernel["MatrixInstN"] * 2, tmpVgpr, tmpSgpr) - kStr += vectorStaticRemainder(dummy, tReg, tReg, numOutputThreads0 // 2, tmpVgpr, tmpSgpr) + if kernel["SourceSwap"]: + kStr += vectorStaticDivide(tReg, "Serial", 
glvw, tmpVgpr, tmpSgpr) + kStr += vectorStaticRemainder(dummy, tReg, tReg, numOutputThreads1 // 2, tmpVgpr, tmpSgpr) + else: + kStr += vectorStaticDivide(tReg, "Serial", kernel["MatrixInstN"] * 2, tmpVgpr, tmpSgpr) + kStr += vectorStaticRemainder(dummy, tReg, tReg, numOutputThreads0 // 2, tmpVgpr, tmpSgpr) else: kStr += vectorStaticDivide(tReg, "Serial", kernel["MatrixInstN"], tmpVgpr, tmpSgpr) kStr += vectorStaticRemainder(dummy, tReg, tReg, numOutputThreads0, tmpVgpr, tmpSgpr) @@ -409,7 +408,7 @@ def __call__(self, writer, kernel, tP): rReg = writer.vgprPool.checkOut(1) kStr += vectorStaticRemainder(dummy, rReg, wgMT, glvw, tmpVgpr, tmpSgpr) for r in range(1, glvw): - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(rReg), hex(r), "wgMT%%VW == %u"%r ) + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(rReg), hex(r), "wgMT%%VW == %u"%r ) kStr += inst("s_cbranch_vccnz label_%04u" % svrLabels[(r-1)], "branch to shift d%u r=%u"%(tP["idx"], r)) kStr += inst("s_branch label_%04u"%svrLabels[glvw-1], "no shifting" ) writer.vgprPool.checkIn(rReg) @@ -427,7 +426,7 @@ def __call__(self, writer, kernel, tP): for ot in range(0, numSubOutputGroupsPerWave0): packIdx = wt * numSubOutputGroupsPerWave0 + ot grpVal = wt * numSubOutputGroupsPerWave0 * kernel["MIWaveGroup"][0] + ot - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(gReg), hex(grpVal), "wgMT/8 == %u" % packIdx ) + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(gReg), hex(grpVal), "wgMT/8 == %u" % packIdx ) kStr += inst("s_cbranch_vccnz label_%04u" % sviLabels[(r-1)][packIdx], "branch to shift d%u, r=%u, v=%u" % (tP["idx"], r, packIdx)) for wt in range(0, kernel["MIWaveTile"][0]): @@ -437,17 +436,20 @@ def __call__(self, writer, kernel, tP): kStr += writer.comment("shift d%u r=%u v=%u" % (tP["idx"], r, packIdx)) kStr += "label_%04u:%s" % (sviLabels[r-1][packIdx], writer.endLine) - # mask if last thread in thread#-tile column - kStr += inst("v_cmpx_eq_u32", sgpr(tmpSgpr,2), vgpr(tReg), vgpr(mReg), "(serial % 64) / 32 == (wgMT/4)%2" ) - - # 
decide to jump to block wich handle element of shfit block (subtile) - # for vector widht 2 with continuous 4, we have 1, 3 case to handle - for outIdx in range(0, numShiftBlock): - if kernel["ProblemType"]["DataType"].isDouble(): - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(eReg), hex(outIdx), "wgMT %% 4 == %u" % (outIdx) ) - else: - kStr += inst("v_cmp_eq_u32", "vcc", vgpr(eReg), hex(outIdx*glvw+r), "wgMT %% 4 == %u" % (outIdx*2+1) ) - kStr += inst("s_cbranch_vccnz label_%04u" % svoLabels[(r-1)][packIdx][outIdx], "branch to shift d%u, r=%u, v=%u, o=%u" % (tP["idx"], r, packIdx, outIdx)) + cmt = "(serial % 64) / 32 == (wgMT/4)%2" + if kernel["SourceSwap"]: + cmt = "(serial / glvw) % wt0 == (wgMT % mib0) / glvw" + kStr += inst("v_cmpx_eq_u32", sgpr(tmpSgpr,writer.laneSGPRCount), vgpr(tReg), vgpr(mReg), cmt ) + + if not kernel["SourceSwap"]: + # decide to jump to block wich handle element of shfit block (subtile) + # for vector widht 2 with continuous 4, we have 1, 3 case to handle + for outIdx in range(0, numShiftBlock): + if kernel["ProblemType"]["DataType"].isDouble(): + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(eReg), hex(outIdx), "wgMT %% 4 == %u" % (outIdx) ) + else: + kStr += inst("v_cmp_eq_u32", writer.vcc, vgpr(eReg), hex(outIdx*glvw+r), "wgMT %% 4 == %u" % (outIdx*2+1) ) + kStr += inst("s_cbranch_vccnz label_%04u" % svoLabels[(r-1)][packIdx][outIdx], "branch to shift d%u, r=%u, v=%u, o=%u" % (tP["idx"], r, packIdx, outIdx)) # blocks to handle shfiting for outIdx in range(0, numShiftBlock): @@ -455,18 +457,22 @@ def __call__(self, writer, kernel, tP): for subTile1Idx in range(0, subTile1): for shiftIdx in range(0, r): if kernel["ProblemType"]["DataType"].isDouble(): - dstVgpr = 2 * (subTile1Idx * numOutputElements + packIdx * numContinuousOutput + outIdx + shiftIdx) tmpVgpr2 = writer.vgprPool.checkOutAligned(2,2) + dstVgpr = 2 * (subTile1Idx * numOutputElements + packIdx * numContinuousOutput + outIdx + shiftIdx) + if kernel["SourceSwap"]: + dstVgpr = 2 * 
(subTile1Idx * numOutputElements + outIdx * kernel["MIWaveTile"][0] + packIdx + shiftIdx) + + swapSize = 1 if kernel["SourceSwap"] else 16 kStr += inst("v_accvgpr_read_b32", vgpr(tmpVgpr), accvgpr(arch2acc[dstVgpr]), "") kStr += inst("s_nop", "1", "v_accvgpr read vgpr after write vgpr: 2 wait states") - kStr += inst("ds_swizzle_b32", vgpr(tmpVgpr2), vgpr(tmpVgpr), "offset:swizzle(SWAP, 16)", "swizzle edge values") + kStr += inst("ds_swizzle_b32", vgpr(tmpVgpr2), vgpr(tmpVgpr), "offset:swizzle(SWAP, {})".format(swapSize), "swizzle edge values") kStr += inst("s_waitcnt", "0", "wait for swizzle operation") kStr += inst("v_accvgpr_write_b32", accvgpr(arch2acc[dstVgpr]), vgpr(tmpVgpr2), "") kStr += inst("v_accvgpr_read_b32", vgpr(tmpVgpr), accvgpr(arch2acc[dstVgpr]+1), "") kStr += inst("s_nop", "1", "v_accvgpr read vgpr after write vgpr: 2 wait states") - kStr += inst("ds_swizzle_b32", vgpr(tmpVgpr2), vgpr(tmpVgpr), "offset:swizzle(SWAP, 16)", "swizzle edge values") + kStr += inst("ds_swizzle_b32", vgpr(tmpVgpr2), vgpr(tmpVgpr), "offset:swizzle(SWAP, {})".format(swapSize), "swizzle edge values") kStr += inst("s_waitcnt", "0", "wait for swizzle operation") kStr += inst("v_accvgpr_write_b32", accvgpr(arch2acc[dstVgpr]+1), vgpr(tmpVgpr2), "") @@ -487,8 +493,9 @@ def __call__(self, writer, kernel, tP): kStr += inst("v_mov_b32", vgpr(dstVgpr), vgpr(srcVgpr), "") # end shift reset mask and jump out - kStr += inst("s_mov_b64", sgpr(tmpSgpr,2), "0xFFFFFFFFFFFFFFFF", "to restore all threads active") - kStr += inst("s_or_saveexec_b64", "vcc", sgpr(tmpSgpr,2), "all threads active") + all1mask = "0xFFFFFFFF" if (kernel["WavefrontSize"] == 32) else "0xFFFFFFFFFFFFFFFF" + kStr += inst("s_mov_b{}".format(kernel["WavefrontSize"]), sgpr(tmpSgpr,writer.laneSGPRCount), all1mask, "to restore all threads active") + kStr += inst("s_or_saveexec_b{}".format(kernel["WavefrontSize"]), writer.vcc, sgpr(tmpSgpr,writer.laneSGPRCount), "all threads active") kStr += inst("s_branch label_%04u" % 
svrLabels[glvw-1], "done shifting" ) kStr += "label_%04u: // end shift0%s" % (svrLabels[glvw-1], writer.endLine) diff --git a/Tensile/Components/Signature.py b/Tensile/Components/Signature.py index 86fd2072b..99dae8432 100644 --- a/Tensile/Components/Signature.py +++ b/Tensile/Components/Signature.py @@ -193,7 +193,10 @@ def __call__(self, writer): kStr += " %s %u // lds bytes%s" % ( tWord, group_segment_size, writer.endLine ) if writer.archCaps["HasWave32"]: - kStr += " wavefront_size = 6 // 64-thread wavefronts%s" % writer.endLine + if kernel["WavefrontSize"] == 32: + kStr += " wavefront_size = 5 // 32-thread wavefronts%s" % writer.endLine + else: + kStr += " wavefront_size = 6 // 64-thread wavefronts%s" % writer.endLine # other kStr += " compute_pgm_rsrc2_user_sgpr = 2 // vcc%s" % writer.endLine @@ -317,7 +320,7 @@ def __call__(self, writer): kStr += " GroupSegmentFixedSize: %u%s" % ( group_segment_size, writer.endLine ) kStr += " PrivateSegmentFixedSize: %u%s" % ( 0, writer.endLine ) kStr += " KernargSegmentAlign: %u%s" % ( 8, writer.endLine ) - kStr += " WavefrontSize: %u%s" % ( 64, writer.endLine ) + kStr += " WavefrontSize: %u%s" % ( kernel["WavefrontSize"], writer.endLine ) kStr += " NumSGPRs: %u%s" % ( totalSgprs, writer.endLine ) kStr += " NumVGPRs: %u%s" % ( totalVgprs, writer.endLine ) kStr += " MaxFlatWorkGroupSize: %u%s" % ( kernel["SubGroup0"] * kernel["SubGroup1"] * kernel["LocalSplitU"], writer.endLine ) @@ -411,7 +414,10 @@ def __call__(self, writer): kStr += " %s %u // lds bytes%s" % ( tWord, group_segment_size, writer.endLine ) if writer.archCaps["HasWave32"]: - kStr += " .amdhsa_wavefront_size32 0 // 64-thread wavefronts%s" % writer.endLine + if kernel["WavefrontSize"] == 32: + kStr += " .amdhsa_wavefront_size32 1 // 32-thread wavefronts%s" % writer.endLine + else: + kStr += " .amdhsa_wavefront_size32 0 // 64-thread wavefronts%s" % writer.endLine # other kStr += " .amdhsa_private_segment_fixed_size 0%s" % writer.endLine @@ -545,7 +551,7 @@ def 
__call__(self, writer): kStr += " .sgpr_spill_count: %u%s" % ( 0, writer.endLine ) kStr += " .vgpr_count: %u%s" % ( totalVgprs, writer.endLine ) kStr += " .vgpr_spill_count: %u%s" % ( 0, writer.endLine ) - kStr += " .wavefront_size: %u%s" % ( 64, writer.endLine ) + kStr += " .wavefront_size: %u%s" % ( kernel["WavefrontSize"], writer.endLine ) kStr += "...\n" diff --git a/Tensile/Components/__init__.py b/Tensile/Components/__init__.py index 271200887..1f1b81635 100644 --- a/Tensile/Components/__init__.py +++ b/Tensile/Components/__init__.py @@ -25,6 +25,7 @@ def use(): pass __all__ = [ "MAC_F16", "MAC_F16_HPA", + "MAC_F32", "Priority", "Signature", "LocalRead", diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml new file mode 100644 index 000000000..2731919f7 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml @@ -0,0 +1,1095 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: 
[True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [6144, 6000, 1, 2560] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [512, 6000, 1, 2560] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + 
- Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1408, 1, 3328] + - Exact: [256, 193600, 1, 64] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [2048, 7000, 1, 2048] + - Exact: [5056, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [196, 1024, 64, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [512, 6000, 1, 2816] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1024, 12544, 1, 256] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [3072, 1500, 1, 1024] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 
5888, 1, 1280] + - Exact: [1024, 6000, 1, 1536] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [512, 3000, 1, 1536] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [5056, 1408, 1, 3328] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1024, 24000, 1, 2560] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 3000, 1, 2816] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [1024, 24000, 1, 
1536] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1500, 1, 2816] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4288, 4288, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [512, 3000, 1, 2048] + - Exact: [5056, 1024, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: 
[4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [8448, 6000, 1, 2816] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 5056, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [512, 6000, 1, 1536] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3072, 6000, 1, 1024] + - Exact: [2944, 1856, 1, 3328] + - Exact: [448, 5056, 1, 256] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [1024, 3000, 1, 2048] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [1408, 4288, 
1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [196, 256, 64, 1024] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [5124, 1500, 1, 2048] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [7680, 6000, 1, 2560] + - Exact: [6784, 1408, 1, 1280] + - Exact: [512, 3000, 1, 2560] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1024, 1500, 1, 1536] + - Exact: [1408, 1408, 1, 1280] + - Exact: [3072, 3000, 1, 1024] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 5056, 1, 256] + - Exact: 
[4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 3328] + - Exact: [512, 3136, 1, 2048] + - Exact: [1408, 1024, 1, 256] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [4608, 1500, 1, 1536] + - Exact: [3584, 2368, 1, 1280] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5124, 9124, 1, 2048] + - Exact: [5124, 700, 1, 2048] + - Exact: [256, 12544, 1, 1024] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 6000, 1, 2560] + - Exact: [1024, 5056, 1, 256] + - Exact: [4224, 1500, 1, 176] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4288, 6784, 1, 1280] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [8448, 3000, 1, 2816] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [7680, 1500, 1, 2560] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4608, 24000, 1, 1536] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 
5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [4608, 6000, 1, 1536] + - Exact: [7680, 3000, 1, 2560] + - Exact: [5124, 700, 1, 2560] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [5124, 1500, 1, 2560] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [1024, 3000, 1, 2560] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [1024, 6000, 1, 2816] + - Exact: [5888, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [1024, 1500, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [1024, 3000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [128, 50176, 1, 512] + - Exact: [1024, 48000, 1, 2048] + - Exact: [784, 512, 64, 128] + - Exact: [3136, 256, 64, 64] + - Exact: [12544, 1024, 1, 256] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 512, 1, 2048] + 
- Exact: [12544, 256, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [256, 2944, 1, 3328] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [512, 1500, 1, 2816] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: 
[704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [512, 1500, 1, 2048] + - Exact: [448, 2368, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [64, 193600, 1, 64] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 704, 1, 1280] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [448, 2368, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [512, 1500, 1, 1536] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [3584, 256, 1, 
256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [64, 193600, 1, 256] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [1408, 448, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [1408, 448, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - 
Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [512, 1500, 1, 2560] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 256] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [1024, 256, 1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [704, 128, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [8448, 32, 1, 2816] + - Exact: [448, 448, 1, 3328] + - Exact: 
[1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2048, 128, 1, 2048] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [128, 1500, 1, 1280] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 256] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [4608, 32, 1, 1536] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [2560, 32, 1, 2560] + - Exact: [704, 256, 1, 3328] + - Exact: [176, 1500, 1, 
1408] + - Exact: [1856, 128, 1, 256] + - Exact: [4288, 64, 1, 3328] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [2560, 16, 1, 2560] + - Exact: [1760, 16, 1, 1760] + - Exact: [8448, 4, 1, 2816] + - Exact: [7680, 16, 1, 2560] + - Exact: [4608, 1, 1, 1536] + - Exact: [7680, 4, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [3072, 2, 1, 1024] + - Exact: [6144, 16, 1, 2560] + - Exact: [7680, 1, 1, 2560] + - Exact: [4608, 4, 1, 1536] + - Exact: [3072, 1, 1, 128] + - Exact: [2048, 32, 1, 2048] + - Exact: [2048, 16, 1, 2048] + - Exact: [8448, 1, 1, 2816] + - Exact: [6144, 4, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [3072, 16, 1, 1024] + - Exact: [4096, 16, 1, 4096] + - Exact: [6144, 1, 1, 2560] + - Exact: [3072, 4, 1, 1024] + - Exact: [7680, 2, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [8448, 2, 1, 2816] + - Exact: [4608, 2, 1, 1536] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 2, 1, 2560] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - 
WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 4, 1, 500000] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 4, 1, 512] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [128, 1, 1, 1408] + - Exact: [1024, 64, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [64, 64, 1, 3328] + - 
Exact: [512, 1, 1, 512] + - Exact: [512, 2, 1, 512] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [64, 1, 1, 1216] + - Exact: [256, 256, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [1024, 2, 1, 512] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 16, 1, 512] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 1, 1, 1024] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [1024, 4, 1, 512] + - Exact: [128, 448, 1, 3328] + - Exact: [1024, 1, 1, 512] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml new file mode 100644 index 000000000..6bdd047c1 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml @@ -0,0 +1,880 @@ +# headers +GlobalParameters: + 
MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [1024, 5056, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 
5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [6784, 2368, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - 
Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [2048, 7133, 1, 2048] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [3072, 7435, 1, 1024] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [1024, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 
1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1408, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [4096, 7133, 1, 4096] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [1760, 7133, 1, 1760] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1024, 2944, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [1024, 5888, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: 
[4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [2944, 5888, 1, 1280] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [5056, 256, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 
1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [704, 4288, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2560, 7133, 1, 2560] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [1408, 1024, 1, 256] + - Exact: [2368, 
5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [7680, 5481, 1, 2560] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - 
Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 512, 256, 128] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 256, 1024] + - Exact: [3136, 256, 128, 64] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - 
Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 
2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [128, 2368, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 
3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 64] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [256, 1024, 1, 256] + - 
Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [704, 128, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [704, 256, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + 
BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [704, 64, 1, 3328] + - Exact: [512, 16, 1, 512] + - Exact: [448, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 448, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - 
Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml new file mode 100644 index 000000000..a92ef9b98 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml @@ -0,0 +1,1027 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - 
StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [1760, 6400, 1, 1760] + - Exact: [5888, 704, 1, 1280] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [4288, 2944, 1, 256] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [2560, 1600, 1, 2560] + - Exact: [6144, 5984, 1, 2048] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 
4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [2048, 1600, 1, 512] + - Exact: [2048, 7000, 1, 2048] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [8448, 12000, 1, 2816] + - Exact: [4096, 800, 1, 1024] + - Exact: [8192, 3200, 1, 2048] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2560, 800, 1, 2560] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [2560, 6400, 1, 2560] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4096, 7000, 1, 
4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6144, 24000, 1, 2048] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [4096, 400, 1, 1024] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1856, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [5056, 1408, 1, 3328] + - Exact: [8448, 5984, 1, 2816] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 24000, 1, 2560] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [4608, 5984, 1, 1536] + - Exact: [1760, 3200, 1, 1760] + - Exact: [4096, 1600, 1, 1024] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [6144, 48000, 1, 2048] + - Exact: [1024, 24000, 1, 1536] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + 
- Exact: [1408, 3584, 1, 1280] + - Exact: [8192, 1600, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [2048, 3200, 1, 512] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4608, 12000, 1, 1536] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [2048, 800, 1, 512] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [6144, 12000, 1, 2048] + - Exact: [8192, 800, 1, 2048] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 
2944, 1, 1280] + - Exact: [2048, 1600, 1, 2048] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [4096, 3200, 1, 1024] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [2048, 6400, 1, 2048] + - Exact: [512, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [4608, 24000, 1, 1536] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 
3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [2048, 800, 1, 2048] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [16384, 400, 1, 4096] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1408, 1024, 1, 256] + - Exact: [8192, 400, 1, 2048] + - Exact: [2560, 7000, 1, 2560] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 
3584, 1, 256] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5124, 9124, 1, 2048] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [1760, 1600, 1, 1760] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [7680, 12000, 1, 2560] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [2560, 3200, 1, 2560] + - Exact: [6784, 5056, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4288, 6784, 1, 1280] + - Exact: [16384, 800, 1, 4096] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [4288, 3584, 1, 256] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [2048, 3200, 1, 2048] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 2560] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1760, 800, 1, 1760] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 
2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [16384, 1600, 1, 4096] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [1024, 48000, 1, 2048] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] 
+ - Exact: [2048, 400, 1, 512] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [128, 3584, 1, 1280] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [6784, 128, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [7680, 64, 1, 2560] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: 
[1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [448, 1024, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [2944, 448, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 
3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 2368, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1024, 256, 
1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [8448, 32, 1, 2816] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2048, 128, 1, 2048] + - Exact: [2560, 32, 1, 2560] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [64, 2944, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - 
Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1024, 128, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 196] + - Exact: [256, 1024, 1, 196] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [1760, 16, 1, 1760] + - Exact: [7680, 16, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [6144, 16, 1, 2560] + - Exact: [2048, 16, 1, 2048] + - Exact: [3072, 16, 1, 1024] + - Exact: [4096, 16, 1, 4096] + - Exact: [2560, 16, 1, 2560] + - Exact: [2048, 32, 1, 2048] + - Exact: [4608, 16, 1, 1536] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + 
InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 
1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [64, 448, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 256, 1, 3136] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml new file mode 100644 index 000000000..792061b10 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml @@ -0,0 +1,849 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: 
False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [1024, 5056, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 
1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1024, 1408, 1, 3328] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - 
Exact: [2944, 3584, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [1408, 1408, 1, 1280] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [1024, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1408, 
1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1024, 2944, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [1024, 5888, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: 
[5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [5056, 256, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] 
+ - Exact: [448, 5056, 1, 1280] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [704, 4288, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [1408, 1024, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 
1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [2944, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: 
[True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - 
Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [128, 2368, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 
1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - 
ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [704, 128, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 
1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [704, 256, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [704, 64, 1, 3328] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + 
- Exact: [64, 704, 1, 256] + - Exact: [448, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml new file mode 100644 index 000000000..fec566b67 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml @@ -0,0 +1,1094 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: 
GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [6144, 6000, 1, 2560] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [512, 6000, 1, 2560] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 
3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1408, 1, 3328] + - Exact: [256, 193600, 1, 64] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [2048, 7000, 1, 2048] + - Exact: [5056, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [196, 1024, 64, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [512, 6000, 1, 2816] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1024, 12544, 1, 256] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: 
[448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [3072, 1500, 1, 1024] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [1024, 6000, 1, 1536] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [512, 3000, 1, 1536] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [5056, 1408, 1, 3328] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1024, 24000, 1, 2560] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 
1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 3000, 1, 2816] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [1024, 24000, 1, 1536] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1500, 1, 2816] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4288, 4288, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [512, 3000, 1, 2048] + - Exact: 
[5056, 1024, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [8448, 6000, 1, 2816] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 5056, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [512, 6000, 1, 1536] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3072, 6000, 1, 1024] + - Exact: [2944, 1856, 1, 3328] + - Exact: [448, 5056, 1, 256] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [1024, 3000, 1, 2048] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1024, 
1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [196, 256, 64, 1024] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [5124, 1500, 1, 2048] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [7680, 6000, 1, 2560] + - Exact: [6784, 1408, 1, 1280] + - 
Exact: [512, 3000, 1, 2560] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1024, 1500, 1, 1536] + - Exact: [1408, 1408, 1, 1280] + - Exact: [3072, 3000, 1, 1024] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 3328] + - Exact: [512, 3136, 1, 2048] + - Exact: [1408, 1024, 1, 256] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [4608, 1500, 1, 1536] + - Exact: [3584, 2368, 1, 1280] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5124, 9124, 1, 2048] + - Exact: [5124, 700, 1, 2048] + - Exact: [256, 12544, 1, 1024] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 6000, 1, 2560] + - Exact: [1024, 5056, 1, 256] + - Exact: [4224, 1500, 1, 176] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4288, 6784, 1, 1280] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [8448, 3000, 1, 2816] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [7680, 1500, 1, 2560] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: 
[704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4608, 24000, 1, 1536] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [4608, 6000, 1, 1536] + - Exact: [7680, 3000, 1, 2560] + - Exact: [5124, 700, 1, 2560] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [5124, 1500, 1, 2560] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [1024, 3000, 1, 2560] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [1024, 6000, 1, 2816] + - Exact: [5888, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [1024, 1500, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [1024, 
3000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [128, 50176, 1, 512] + - Exact: [1024, 48000, 1, 2048] + - Exact: [784, 512, 64, 128] + - Exact: [3136, 256, 64, 64] + - Exact: [12544, 1024, 1, 256] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 512, 1, 2048] + - Exact: [12544, 256, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [256, 2944, 1, 3328] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [512, 
1500, 1, 2816] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [512, 1500, 1, 2048] + - Exact: [448, 2368, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [64, 193600, 1, 64] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 704, 1, 1280] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [448, 2368, 1, 3328] + 
- Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [512, 1500, 1, 1536] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [64, 193600, 1, 256] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [1408, 448, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [1408, 448, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: 
[5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [512, 1500, 1, 2560] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 256] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [1024, 256, 1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [704, 128, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 
1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [8448, 32, 1, 2816] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2048, 128, 1, 2048] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [128, 1500, 1, 1280] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 256] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [4608, 32, 1, 1536] 
+ - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [2560, 32, 1, 2560] + - Exact: [704, 256, 1, 3328] + - Exact: [176, 1500, 1, 1408] + - Exact: [1856, 128, 1, 256] + - Exact: [4288, 64, 1, 3328] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [2560, 16, 1, 2560] + - Exact: [1760, 16, 1, 1760] + - Exact: [8448, 4, 1, 2816] + - Exact: [7680, 16, 1, 2560] + - Exact: [4608, 1, 1, 1536] + - Exact: [7680, 4, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [3072, 2, 1, 1024] + - Exact: [6144, 16, 1, 2560] + - Exact: [7680, 1, 1, 2560] + - Exact: [4608, 4, 1, 1536] + - Exact: [3072, 1, 1, 128] + - Exact: [2048, 32, 1, 2048] + - Exact: [2048, 16, 1, 2048] + - Exact: [8448, 1, 1, 2816] + - Exact: [6144, 4, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [3072, 16, 1, 1024] + - Exact: [4096, 16, 1, 4096] + - Exact: [6144, 1, 
1, 2560] + - Exact: [3072, 4, 1, 1024] + - Exact: [7680, 2, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [8448, 2, 1, 2816] + - Exact: [4608, 2, 1, 1536] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 2, 1, 2560] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 4, 1, 500000] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - 
Exact: [512, 4, 1, 512] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [128, 1, 1, 1408] + - Exact: [1024, 64, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [64, 64, 1, 3328] + - Exact: [512, 1, 1, 512] + - Exact: [512, 2, 1, 512] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [64, 1, 1, 1216] + - Exact: [256, 256, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [1024, 2, 1, 512] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 16, 1, 512] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 1, 1, 1024] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [1024, 4, 1, 512] + - Exact: [128, 448, 1, 3328] + - Exact: [1024, 1, 1, 512] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + 
ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml new file mode 100644 index 000000000..7953b15fb --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml @@ -0,0 +1,879 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [1024, 5056, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 
704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - 
Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [6784, 2368, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [2048, 7133, 1, 2048] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [3072, 7435, 1, 1024] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 
256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [1024, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1408, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [4096, 7133, 1, 4096] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [1760, 7133, 1, 1760] + - Exact: [5888, 1024, 1, 3328] + - Exact: 
[704, 6784, 1, 1280] + - Exact: [1024, 2944, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [1024, 5888, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [2944, 5888, 1, 1280] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 
1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [5056, 256, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [704, 4288, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2560, 7133, 1, 2560] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 
2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [1408, 1024, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - 
Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [7680, 5481, 1, 2560] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 512, 256, 128] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 256, 1024] + - Exact: [3136, 256, 128, 64] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 
1280] + - Exact: [2368, 128, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - 
Exact: [5056, 64, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [128, 2368, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 
5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 64] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 
3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [704, 128, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + 
- Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [704, 256, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [704, 64, 1, 3328] + - Exact: [512, 16, 1, 512] + - Exact: [448, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 448, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 
1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml new file mode 100644 index 000000000..c6d5b0793 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml @@ -0,0 +1,1026 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - 
LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [1760, 6400, 1, 1760] + - Exact: [5888, 704, 1, 1280] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [4288, 2944, 1, 256] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 
3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [2560, 1600, 1, 2560] + - Exact: [6144, 5984, 1, 2048] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [2048, 1600, 1, 512] + - Exact: [2048, 7000, 1, 2048] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [8448, 12000, 1, 2816] + - Exact: [4096, 800, 1, 1024] + - Exact: [8192, 3200, 1, 2048] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2560, 800, 1, 2560] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [2560, 6400, 1, 2560] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - 
Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6144, 24000, 1, 2048] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [4096, 400, 1, 1024] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1856, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [5056, 1408, 1, 3328] + - Exact: [8448, 5984, 1, 2816] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 24000, 1, 2560] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [4608, 5984, 1, 1536] + - Exact: [1760, 3200, 1, 1760] + - Exact: 
[4096, 1600, 1, 1024] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [6144, 48000, 1, 2048] + - Exact: [1024, 24000, 1, 1536] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [8192, 1600, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [2048, 3200, 1, 512] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4608, 12000, 1, 1536] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [2048, 800, 1, 512] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [6144, 12000, 1, 2048] + - Exact: [8192, 800, 1, 2048] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 
1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [2048, 1600, 1, 2048] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [4096, 3200, 1, 1024] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [2048, 6400, 1, 2048] + - Exact: [512, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + 
- Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [4608, 24000, 1, 1536] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [2048, 800, 1, 2048] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [16384, 400, 1, 4096] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 
1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1408, 1024, 1, 256] + - Exact: [8192, 400, 1, 2048] + - Exact: [2560, 7000, 1, 2560] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5124, 9124, 1, 2048] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [1760, 1600, 1, 1760] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [7680, 12000, 1, 2560] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [2560, 3200, 1, 2560] + - Exact: [6784, 5056, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4288, 6784, 1, 1280] + - Exact: [16384, 800, 1, 4096] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [4288, 3584, 1, 256] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [2048, 3200, 1, 2048] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 2560] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + 
- Exact: [6144, 48000, 1, 2560] + - Exact: [1760, 800, 1, 1760] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [16384, 1600, 1, 4096] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [1024, 48000, 1, 2048] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: 
[0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] + - Exact: [2048, 400, 1, 512] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [128, 3584, 1, 1280] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [6784, 128, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: 
[128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [7680, 64, 1, 2560] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [448, 1024, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 
1280] + - Exact: [2944, 448, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 2368, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + 
BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1024, 256, 1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [8448, 32, 1, 2816] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2048, 128, 1, 2048] + - Exact: [2560, 32, 1, 2560] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 
1, 3328] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [64, 2944, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1024, 128, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 196] + - Exact: [256, 1024, 1, 196] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [1760, 16, 1, 1760] + - Exact: [7680, 16, 1, 2560] + - Exact: [8448, 16, 1, 2816] + 
- Exact: [6144, 16, 1, 2560] + - Exact: [2048, 16, 1, 2048] + - Exact: [3072, 16, 1, 1024] + - Exact: [4096, 16, 1, 4096] + - Exact: [2560, 16, 1, 2560] + - Exact: [2048, 32, 1, 2048] + - Exact: [4608, 16, 1, 1536] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: 
[64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [64, 448, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 256, 1, 3136] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml new file mode 100644 index 000000000..a209dba3d --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml @@ -0,0 +1,848 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: 
True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [1024, 5056, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 
256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1024, 1408, 1, 3328] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1024, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: 
[4288, 4288, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [1408, 1408, 1, 1280] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [704, 6784, 1, 256] + - Exact: [1024, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 
3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1408, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [3584, 704, 1, 1280] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [448, 5888, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [3584, 1024, 1, 3328] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [1408, 1408, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [4288, 3584, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1024, 2944, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [1024, 5888, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 
2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [6784, 5056, 1, 256] + - Exact: [3584, 5056, 1, 3328] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [5056, 256, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - 
Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [704, 4288, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 6784, 1, 3328] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [1408, 1024, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 
256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [2944, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + +# bodys midSize + - # 
BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [1408, 704, 1, 256] + - 
Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [2368, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [4288, 128, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [128, 2368, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 
4288, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [704, 1856, 1, 1280] + - Exact: [1408, 256, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1024, 1024, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 1024, 1, 256] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] 
+ - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1024, 1, 256] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [704, 128, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [256, 448, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [64, 2944, 
1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [704, 256, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [704, 64, 1, 3328] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [128, 448, 1, 1280] + - Exact: [704, 64, 1, 
1280] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [448, 128, 1, 256] + - Exact: [256, 256, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [1024, 64, 1, 1280] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml new file mode 100644 index 000000000..8fb888aae --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml @@ -0,0 +1,2271 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + 
ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 
3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1408, 1, 3328] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: 
[1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] 
+ - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4288, 4288, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 5056, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [5056, 
6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [448, 5056, 1, 256] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - 
Exact: [4288, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 3328] + - Exact: [1408, 1024, 1, 256] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 
3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5124, 9124, 1, 1760] + - Exact: [1024, 1500, 1, 1536] + - Exact: [512, 24000, 1, 2048] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1024, 3000, 1, 2560] + - Exact: 
[512, 3136, 1, 2048] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [512, 48000, 1, 2048] + - Exact: [196, 256, 64, 1024] + - Exact: [512, 48000, 1, 1536] + - Exact: [4608, 1500, 1, 1536] + - Exact: [1024, 24000, 1, 2560] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5124, 9124, 1, 2048] + - Exact: [5124, 700, 1, 2560] + - Exact: [6144, 6000, 1, 2560] + - Exact: [1024, 1500, 1, 2816] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [4224, 1500, 1, 176] + - Exact: [1024, 6000, 1, 2816] + - Exact: [1024, 48000, 1, 1536] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4608, 24000, 1, 1536] + - Exact: [7680, 48000, 1, 2560] + - Exact: [3072, 48000, 1, 1024] + - Exact: [1024, 1500, 1, 2048] + - Exact: [1024, 3000, 1, 2048] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1760, 7000, 1, 1760] + - Exact: [8448, 3000, 1, 2816] + - Exact: [4608, 48000, 1, 1536] + - Exact: [7680, 1500, 1, 2560] + - Exact: [512, 3000, 1, 1536] + - Exact: [1024, 3000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 24000, 1, 1536] + - Exact: [7680, 6000, 1, 2560] + - Exact: [512, 6000, 1, 2560] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [1024, 24000, 1, 2816] + - Exact: [2048, 7000, 1, 2048] + - Exact: [7680, 3000, 1, 2560] + - Exact: [5124, 700, 1, 2048] + - Exact: [5124, 9124, 1, 4096] + - Exact: [256, 193600, 1, 64] + - Exact: [8448, 6000, 1, 2816] + - Exact: [5124, 1500, 1, 2560] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1024, 6000, 1, 2560] + - Exact: [196, 1024, 64, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [7680, 24000, 1, 2560] + - Exact: [512, 3000, 1, 2560] + - Exact: [8448, 24000, 1, 2816] + - Exact: [512, 6000, 1, 1536] + - Exact: [3072, 6000, 1, 1024] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [1024, 3000, 1, 1536] 
+ - Exact: [512, 6000, 1, 2816] + - Exact: [128, 50176, 1, 512] + - Exact: [256, 12544, 1, 1024] + - Exact: [1024, 12544, 1, 256] + - Exact: [512, 48000, 1, 2560] + - Exact: [512, 24000, 1, 1536] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5124, 1500, 1, 2048] + - Exact: [3072, 1500, 1, 1024] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1024, 6000, 1, 1536] + - Exact: [512, 3000, 1, 2048] + - Exact: [6144, 24000, 1, 2560] + - Exact: [4608, 6000, 1, 1536] + - Exact: [3072, 3000, 1, 1024] + - Exact: [1024, 48000, 1, 2048] + - Exact: [784, 512, 64, 128] + - Exact: [3136, 256, 64, 64] + - Exact: [12544, 1024, 1, 256] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 512, 1, 2048] + - Exact: [12544, 256, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [1024, 2048, 1, 2] + - Exact: [1024, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 2048, 1, 30528] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 4096, 1, 30528] + - Exact: [4096, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [256, 8976, 1, 1536] + - Exact: [256, 8976, 1, 2048] + - Exact: [256, 8976, 1, 2304] + - Exact: [256, 8976, 1, 2560] + - Exact: [256, 8976, 1, 2816] + - Exact: [256, 8976, 1, 3072] + - Exact: [256, 8976, 1, 4352] + - Exact: [256, 8976, 1, 4864] + - Exact: [256, 8976, 1, 5376] + - Exact: [256, 8976, 1, 5632] + - Exact: [256, 8976, 1, 5888] + - Exact: [256, 8976, 1, 6144] + - Exact: [256, 8976, 1, 6656] + - Exact: [256, 8976, 1, 7168] + - Exact: [256, 8976, 1, 7424] + - Exact: [256, 8976, 1, 8192] + - Exact: [256, 8976, 1, 8448] + - Exact: [256, 8976, 1, 8960] + - Exact: [256, 8976, 1, 9472] + - Exact: [256, 8976, 1, 9728] + - Exact: [256, 8976, 1, 
9984] + - Exact: [256, 8976, 1, 10240] + - Exact: [256, 8976, 1, 10496] + - Exact: [256, 8976, 1, 11008] + - Exact: [256, 8976, 1, 11520] + - Exact: [256, 8976, 1, 12288] + - Exact: [256, 8976, 1, 14336] + - Exact: [256, 8976, 1, 14848] + - Exact: [256, 8976, 1, 15104] + - Exact: [256, 8976, 1, 15872] + - Exact: [256, 8976, 1, 17152] + - Exact: [256, 8976, 1, 19712] + - Exact: [256, 8976, 1, 19968] + - Exact: [256, 8976, 1, 20480] + - Exact: [256, 8976, 1, 20992] + - Exact: [256, 8976, 1, 22016] + - Exact: [256, 8976, 1, 26112] + - Exact: [256, 8976, 1, 33536] + - Exact: [256, 8976, 1, 44505] + - Exact: [256, 32768, 1, 128] + - Exact: [480, 32768, 1, 1024] + - Exact: [512, 32768, 1, 256] + - Exact: [1024, 1600, 1, 1] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1792, 1, 256] + - Exact: [1024, 2048, 1, 256] + - Exact: [1024, 2560, 1, 256] + - Exact: [1024, 3072, 1, 256] + - Exact: [1024, 3328, 1, 256] + - Exact: [1024, 3840, 1, 256] + - Exact: [1024, 4096, 1, 256] + - Exact: [1024, 4608, 1, 256] + - Exact: [1024, 4864, 1, 256] + - Exact: [1024, 5120, 1, 256] + - Exact: [1024, 5632, 1, 256] + - Exact: [1024, 6144, 1, 256] + - Exact: [1024, 6400, 1, 256] + - Exact: [1024, 7168, 1, 256] + - Exact: [1024, 7424, 1, 256] + - Exact: [1024, 7680, 1, 256] + - Exact: [1024, 7936, 1, 256] + - Exact: [1024, 8192, 1, 256] + - Exact: [1024, 8448, 1, 256] + - Exact: [1024, 8704, 1, 256] + - Exact: [1024, 8960, 1, 256] + - Exact: [1024, 9728, 1, 256] + - Exact: [1024, 9984, 1, 256] + - Exact: [1024, 10240, 1, 256] + - Exact: [1024, 10496, 1, 256] + - Exact: [1024, 11008, 1, 256] + - Exact: [1024, 11264, 1, 256] + - Exact: [1024, 11520, 1, 256] + - Exact: [1024, 12288, 1, 256] + - Exact: [1024, 13312, 1, 256] + - Exact: [1024, 13568, 1, 256] + - Exact: [1024, 14336, 1, 256] + - Exact: [1024, 14592, 1, 256] + - Exact: [1024, 14848, 1, 256] + - Exact: [1024, 15104, 1, 256] + - Exact: [1024, 16128, 1, 256] + - Exact: [1024, 17152, 1, 256] + - Exact: [1024, 18944, 1, 256] + - 
Exact: [1024, 19712, 1, 256] + - Exact: [1024, 19968, 1, 256] + - Exact: [1024, 20480, 1, 256] + - Exact: [1024, 20992, 1, 256] + - Exact: [1024, 21504, 1, 256] + - Exact: [1024, 22016, 1, 256] + - Exact: [1024, 23552, 1, 256] + - Exact: [1024, 28672, 1, 256] + - Exact: [1024, 32768, 1, 512] + - Exact: [1024, 32768, 1, 1024] + - Exact: [1024, 33536, 1, 256] + - Exact: [1024, 40448, 1, 256] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1024, 1, 1] + - Exact: [2048, 1024, 1, 256] + - Exact: [3200, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 1] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 3968, 1, 42720] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 6528, 1, 42720] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7104, 1, 42720] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 7200, 1, 42720] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 9520, 1, 42720] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10080, 1, 42720] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [1024, 3240, 1, 1024] + - Exact: 
[1024, 3240, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3960, 1, 4096] + - Exact: [1024, 3960, 1, 42720] + - Exact: [4096, 3240, 1, 1024] + - Exact: [4096, 3960, 1, 1024] + - Exact: [289, 128, 64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [3136, 128, 64, 64] + - Exact: [3136, 256, 64, 128] + - Exact: [784, 512, 64, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 1024, 64, 512] + - Exact: [784, 256, 64, 512] + - Exact: [784, 512, 64, 512] + - Exact: [196, 512, 64, 1024] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 64] + - Exact: [3136, 256, 32, 128] + - Exact: [784, 512, 32, 256] + - Exact: [3136, 128, 32, 256] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 1024, 32, 512] + - Exact: [784, 256, 32, 512] + - Exact: [784, 512, 32, 512] + - Exact: [196, 512, 32, 1024] + - Exact: [196, 1024, 32, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 2] + - Exact: [4096, 512, 1, 1024] + - Exact: [1024, 1280, 1, 2] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1024, 1, 1024] + - Exact: [4096, 1280, 1, 1024] + - Exact: [1024, 4992, 1, 2] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [1024, 5120, 1, 2] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [1024, 5248, 1, 2] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [1024, 2560, 1, 2] + - Exact: [1024, 
2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 2] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3072, 1, 1024] + - Exact: [1024, 1152, 1, 2] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [479, 32768, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 8192, 1, 33712] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [1024, 9600, 1, 33712] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [1024, 1024, 64, 64] + - Exact: [1024, 16384, 1, 3072] + - Exact: [1024, 2048, 1, 30592] + - Exact: [640, 2048, 1, 2560] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 4096, 1, 4608] + - Exact: [512, 512, 256, 64] + - Exact: [2048, 1024, 1, 8192] + - Exact: [4096, 16384, 1, 1024] + - Exact: [1024, 8192, 1, 50304] + - Exact: [1536, 8192, 1, 50304] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 4096, 1, 30592] + - Exact: [1536, 4096, 1, 6144] + - Exact: [1024, 16384, 1, 4096] + - Exact: [1024, 16384, 1, 50304] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1024, 2048, 1, 3072] + - Exact: [2560, 2048, 1, 7680] + - Exact: [2048, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 30592] + - Exact: [8192, 1024, 1, 2048] + - Exact: [2560, 2048, 1, 2560] + - Exact: [1536, 8192, 1, 4608] + - Exact: [1024, 2048, 1, 50304] + - Exact: [1024, 1024, 32, 64] + - Exact: [1536, 8192, 1, 6144] + - Exact: [1024, 1024, 256, 64] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 50304] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 8192, 1, 3072] + - Exact: [1024, 1024, 128, 64] + - Exact: [1024, 4096, 1, 50304] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 16384, 1, 1024] + - Exact: [2560, 2048, 1, 1920] + - Exact: [2048, 1024, 1, 6144] + - Exact: [512, 512, 128, 64] + - Exact: [1024, 8192, 1, 30592] + - 
Exact: [1536, 4096, 1, 1536] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 8192, 1, 30528] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 512] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 6912, 1, 1024] + - Exact: [1024, 6912, 1, 512] + - Exact: [256, 55296, 1, 128] + - Exact: [256, 6912, 1, 128] + - Exact: [480, 3456, 1, 1024] + - Exact: [480, 4096, 1, 1024] + - Exact: [480, 6912, 1, 1024] + - Exact: [512, 3456, 1, 256] + - Exact: [512, 4096, 1, 256] + - Exact: [512, 55296, 1, 256] + - Exact: [512, 6912, 1, 256] + - Exact: [1024, 1280, 1, 30528] + - Exact: [1024, 1600, 1, 30528] + - Exact: [1024, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [4096, 10240, 1, 1024] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 1640, 1, 30528] + - Exact: [1024, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 4096] + - Exact: [4096, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [1024, 6144, 1, 4096] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [512, 512, 192, 64] + - Exact: [256, 6912, 1, 1] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [1024, 10224, 1, 3072] + - Exact: [4096, 10224, 1, 1024] + - Exact: [1024, 10240, 1, 3072] + - Exact: [1024, 10192, 1, 3072] + - Exact: [4096, 10192, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [1024, 10200, 1, 3072] + - Exact: [1024, 10184, 1, 1024] + - Exact: [4096, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 3072] + - Exact: [1024, 10208, 1, 4096] + - Exact: [1024, 10224, 1, 2048] + - Exact: [1024, 10240, 1, 2048] + - Exact: [1024, 10120, 1, 1024] + - Exact: [1024, 10192, 1, 2048] + - Exact: [1024, 10152, 1, 1024] + - Exact: [1024, 10080, 1, 3072] + - Exact: [100352, 512, 1, 256] + - Exact: [12544, 2048, 1, 1024] + - Exact: [200704, 512, 1, 256] + - Exact: [25088, 1024, 1, 512] + - Exact: [50176, 1024, 1, 512] + - Exact: [6272, 2048, 1, 1024] + - Exact: 
[3136, 128, 128, 256] + - Exact: [3136, 128, 256, 256] + - Exact: [784, 256, 128, 512] + - Exact: [784, 256, 256, 512] + - Exact: [128, 128, 2048, 64] + - Exact: [1024, 2560, 1, 30528] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 4096] + - Exact: [1024, 12288, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [1024, 1920, 1, 30528] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 2] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [768, 2048, 1, 768] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 2] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [768, 4608, 1, 768] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 2] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [1024, 4608, 1, 1024] + - Exact: [8192, 1024, 1, 1024] + - Exact: [8192, 4096, 1, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 256, 1024] + - Exact: [196, 512, 128, 1024] + - Exact: [196, 512, 256, 1024] + - Exact: [3072, 256, 2, 1024] + - Exact: [768, 2048, 2, 512] + - Exact: [2904, 256, 2, 1024] + - Exact: [864, 2048, 2, 512] + - Exact: [2992, 256, 2, 1024] + - Exact: [3400, 256, 2, 1024] + - Exact: [4032, 256, 2, 1024] + - Exact: [15200, 128, 2, 512] + - Exact: [12288, 128, 2, 512] + - Exact: [888, 2048, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [3456, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2688, 256, 2, 1024] + - Exact: [13824, 128, 2, 512] + - Exact: [3168, 256, 2, 1024] + - Exact: [3360, 256, 2, 1024] + - Exact: [3552, 256, 2, 1024] + - Exact: [11616, 128, 2, 512] + - Exact: [4200, 256, 2, 1024] + - Exact: [840, 2048, 2, 512] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [3264, 256, 2, 1024] + - Exact: [13600, 256, 2, 512] + - Exact: [12880, 256, 
2, 512] + - Exact: [12288, 256, 2, 512] + - Exact: [2816, 256, 2, 1024] + - Exact: [672, 2048, 2, 512] + - Exact: [13440, 128, 2, 512] + - Exact: [13824, 256, 2, 512] + - Exact: [15200, 256, 2, 512] + - Exact: [3600, 256, 2, 1024] + - Exact: [4032, 1024, 2, 256] + - Exact: [16128, 128, 2, 512] + - Exact: [15200, 128, 1, 512] + - Exact: [13600, 128, 1, 512] + - Exact: [2904, 1024, 2, 256] + - Exact: [2992, 1024, 2, 256] + - Exact: [1536, 2048, 1, 1024] + - Exact: [24576, 128, 1, 256] + - Exact: [24576, 512, 1, 256] + - Exact: [25760, 128, 1, 256] + - Exact: [25760, 512, 1, 256] + - Exact: [6144, 256, 1, 512] + - Exact: [6440, 256, 1, 512] + - Exact: [13600, 512, 1, 128] + - Exact: [9408, 512, 2, 128] + - Exact: [56000, 256, 2, 64] + - Exact: [2816, 1024, 2, 256] + - Exact: [60800, 256, 1, 64] + - Exact: [2944, 1024, 2, 256] + - Exact: [11776, 512, 2, 128] + - Exact: [11616, 512, 2, 128] + - Exact: [4200, 1024, 2, 256] + - Exact: [54400, 256, 1, 64] + - Exact: [15200, 512, 1, 128] + - Exact: [2688, 1024, 2, 256] + - Exact: [12672, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [46464, 256, 2, 64] + - Exact: [2400, 256, 2, 1024] + - Exact: [2520, 256, 2, 1024] + - Exact: [2400, 1024, 2, 256] + - Exact: [10752, 128, 2, 512] + - Exact: [45632, 256, 2, 64] + - Exact: [2520, 1024, 2, 256] + - Exact: [53760, 256, 2, 64] + - Exact: [2352, 256, 2, 1024] + - Exact: [47872, 256, 2, 64] + - Exact: [47104, 256, 2, 64] + - Exact: [50688, 256, 2, 64] + - Exact: [45056, 256, 2, 64] + - Exact: [13440, 512, 2, 128] + - Exact: [2352, 1024, 2, 256] + - Exact: [11264, 512, 2, 128] + - Exact: [10560, 128, 2, 512] + - Exact: [16128, 512, 2, 128] + - Exact: [37632, 256, 2, 64] + - Exact: [51520, 256, 2, 64] + - Exact: [14000, 512, 2, 128] + - Exact: [10560, 512, 2, 128] + - Exact: [64512, 256, 2, 64] + - Exact: [54400, 256, 2, 64] + - Exact: [3264, 1024, 2, 256] + - Exact: [10752, 512, 2, 128] + - Exact: [3168, 1024, 2, 256] + - Exact: [55296, 256, 2, 256] + - Exact: [51520, 256, 
2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [60800, 256, 2, 256] + - Exact: [54400, 256, 2, 256] + - Exact: [60800, 256, 2, 64] + - Exact: [3800, 1024, 1, 256] + - Exact: [3400, 1024, 1, 256] + - Exact: [3072, 1024, 2, 256] + - Exact: [3600, 1024, 2, 256] + - Exact: [12288, 512, 2, 128] + - Exact: [49152, 256, 2, 256] + - Exact: [12880, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [42240, 256, 2, 64] + - Exact: [1008, 2048, 2, 512] + - Exact: [3360, 1024, 2, 256] + - Exact: [14208, 512, 2, 128] + - Exact: [56832, 256, 2, 64] + - Exact: [43008, 256, 2, 64] + - Exact: [13600, 512, 2, 128] + - Exact: [2640, 1024, 2, 256] + - Exact: [13824, 512, 2, 128] + - Exact: [3800, 256, 2, 1024] + - Exact: [55296, 256, 2, 64] + - Exact: [2640, 256, 2, 1024] + - Exact: [15200, 512, 2, 128] + - Exact: [3552, 1024, 2, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [49152, 256, 2, 64] + - Exact: [3400, 1024, 2, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [6912, 256, 1, 512] + - Exact: [6800, 256, 1, 512] + - Exact: [27648, 128, 1, 256] + - Exact: [27200, 128, 1, 256] + - Exact: [30400, 128, 1, 256] + - Exact: [7600, 256, 1, 512] + - Exact: [6144, 1024, 1, 512] + - Exact: [6912, 1024, 1, 512] + - Exact: [6440, 1024, 1, 512] + - Exact: [27648, 512, 1, 256] + - Exact: [1728, 2048, 1, 1024] + - Exact: [27200, 512, 1, 256] + - Exact: [6800, 1024, 1, 512] + - Exact: [7600, 1024, 1, 512] + - Exact: [30400, 512, 1, 256] + - Exact: [12544, 1024, 1, 1024] + - Exact: [173280, 128, 1, 64] + - Exact: [231040, 128, 1, 64] + - Exact: [25992, 128, 1, 64] + - Exact: [2852, 256, 2, 1024] + - Exact: [3220, 256, 2, 1024] + - Exact: [850, 2048, 2, 512] + - Exact: [805, 2048, 2, 512] + - Exact: [3036, 256, 2, 1024] + - Exact: [713, 2048, 2, 512] + - Exact: [850, 2048, 1, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [3500, 256, 2, 1024] + - Exact: [3700, 256, 2, 1024] + - Exact: [748, 2048, 2, 512] + - Exact: [3036, 1024, 2, 256] + - Exact: [2852, 
1024, 2, 256] + - Exact: [950, 2048, 1, 512] + - Exact: [3700, 1024, 2, 256] + - Exact: [3500, 1024, 2, 256] + - Exact: [3220, 1024, 2, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [1610, 2048, 1, 1024] + - Exact: [1700, 2048, 1, 1024] + - Exact: [1900, 2048, 1, 1024] + - Exact: [1444, 256, 120, 128] + - Exact: [1444, 256, 139, 128] + - Exact: [1444, 256, 160, 128] + - Exact: [1444, 256, 18, 128] + - Exact: [1444, 256, 19, 128] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 139, 256] + - Exact: [1444, 256, 160, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 256, 120, 512] + - Exact: [361, 256, 139, 512] + - Exact: [361, 256, 160, 512] + - Exact: [361, 256, 18, 512] + - Exact: [361, 256, 19, 512] + - Exact: [200716, 128, 1, 64] + - Exact: [27436, 128, 1, 64] + - Exact: [1024, 1024, 160, 96] + - Exact: [1920, 16384, 1, 25216] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [960, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 2880] + - Exact: [1024, 1024, 40, 96] + - Exact: [1920, 4096, 1, 25216] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [960, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 2880] + - Exact: [1024, 1024, 80, 96] + - Exact: [1920, 8192, 1, 25216] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [960, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 2880] + - Exact: [1024, 1024, 96, 96] + - Exact: [2304, 16384, 1, 12672] + - Exact: [2304, 16384, 1, 2304] + - Exact: [576, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 1728] + - Exact: [1024, 1024, 24, 96] + - Exact: [2304, 4096, 1, 12672] + - Exact: [2304, 4096, 1, 2304] + - Exact: [576, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 1728] + - Exact: [1024, 1024, 48, 96] + - Exact: [2304, 8192, 1, 12672] + - Exact: [2304, 8192, 1, 2304] + - Exact: [576, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 1728] + - Exact: [1024, 1024, 16, 96] + - Exact: [3072, 4096, 1, 6400] + - Exact: [1536, 4096, 
1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [384, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1152] + - Exact: [1024, 1024, 32, 96] + - Exact: [3072, 8192, 1, 6400] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [384, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1152] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 4096, 1, 2048] + - Exact: [1024, 2283, 1, 29000] + - Exact: [1024, 2296, 1, 29000] + - Exact: [1024, 2306, 1, 29000] + - Exact: [1024, 2309, 1, 29000] + - Exact: [1024, 2318, 1, 29000] + - Exact: [1024, 2320, 1, 29000] + - Exact: [1024, 2324, 1, 29000] + - Exact: [1024, 2325, 1, 29000] + - Exact: [1024, 2329, 1, 29000] + - Exact: [1024, 2338, 1, 29000] + - Exact: [1024, 2345, 1, 29000] + - Exact: [1024, 2350, 1, 29000] + - Exact: [1024, 2362, 1, 29000] + - Exact: [1024, 2366, 1, 29000] + - Exact: [1024, 2368, 1, 29000] + - Exact: [1024, 2374, 1, 29000] + - Exact: [1024, 2390, 1, 29000] + - Exact: [512, 512, 320, 64] + - Exact: [512, 512, 80, 64] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 3072] + - Exact: [1024, 32768, 1, 4096] + - Exact: [1024, 32768, 1, 50304] + - Exact: [4096, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - 
WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 780, 1, 30522] + - Exact: [1024, 308, 1, 30522] + - Exact: [1024, 800, 1, 30522] + - Exact: [1024, 820, 1, 30522] + - Exact: [1024, 385, 1, 30522] + - Exact: [1024, 462, 1, 30522] + - Exact: [1024, 640, 1, 30528] + - Exact: [2048, 199, 1, 29000] + - Exact: [2048, 221, 1, 29000] + - Exact: [2048, 224, 1, 29000] + - Exact: [2048, 229, 1, 29000] + - Exact: [2048, 234, 1, 29000] + - Exact: [2048, 242, 1, 29000] + - Exact: [2048, 246, 1, 29000] + - Exact: [2048, 247, 1, 29000] + - Exact: [2048, 256, 1, 29000] + - Exact: [2048, 262, 1, 29000] + - Exact: [2048, 264, 1, 29000] + - Exact: [2048, 265, 1, 29000] + - Exact: [2048, 274, 1, 29000] + - Exact: [2048, 277, 1, 29000] + - Exact: [2048, 279, 1, 29000] + - Exact: [2048, 288, 1, 29000] + - Exact: [2048, 296, 1, 29000] + - Exact: [2048, 315, 1, 29000] + - Exact: [2048, 335, 1, 29000] + - Exact: [1024, 561, 1, 29000] + - Exact: [1024, 574, 1, 29000] + - Exact: [1024, 600, 1, 29000] + - Exact: [1024, 608, 1, 29000] + - Exact: [1024, 615, 1, 29000] + - Exact: [1024, 622, 1, 29000] + - Exact: [1024, 625, 1, 29000] + - Exact: [1024, 626, 1, 29000] + - Exact: [1024, 628, 1, 29000] + - Exact: [1024, 636, 1, 29000] + - Exact: [1024, 651, 1, 29000] + - Exact: [1024, 658, 1, 29000] + - Exact: [1024, 669, 1, 29000] + - Exact: [1024, 670, 1, 29000] + - Exact: [1024, 672, 1, 29000] + - Exact: [1024, 684, 1, 29000] + - Exact: [1024, 716, 1, 29000] + - Exact: [1024, 730, 1, 29000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - 
WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [256, 2944, 1, 3328] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 
256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: 
[4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [1408, 448, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [64, 193600, 1, 64] + - Exact: [1024, 700, 1, 512] + - Exact: [2560, 128, 1, 2560] + - Exact: [64, 193600, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [512, 1500, 1, 2816] + - Exact: [3072, 128, 1, 1024] + - Exact: [7680, 64, 1, 2560] + - Exact: [7680, 128, 1, 2560] + - Exact: [512, 1500, 1, 2560] + - Exact: [1024, 1024, 1, 1024] + - Exact: [512, 1500, 1, 2048] + - Exact: [512, 1500, 1, 1536] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 256] + - Exact: [64, 128, 512, 128] + - Exact: [64, 
512, 64, 512] + - Exact: [512, 1600, 1, 32] + - Exact: [512, 1600, 1, 512] + - Exact: [560, 1600, 1, 1024] + - Exact: [1024, 512, 1, 1] + - Exact: [1024, 512, 1, 64] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 960, 1, 64] + - Exact: [1024, 960, 1, 1024] + - Exact: [1600, 512, 1, 1024] + - Exact: [2048, 512, 1, 1] + - Exact: [2048, 512, 1, 2048] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 256] + - Exact: [1225, 64, 64, 288] + - Exact: [5329, 80, 64, 64] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 2048] + - Exact: [64, 448, 32, 2048] + - Exact: [1225, 64, 32, 192] + - Exact: [1225, 64, 32, 256] + - Exact: [1225, 64, 32, 288] + - Exact: [5329, 80, 32, 64] + - Exact: [289, 128, 32, 768] + - Exact: [289, 160, 32, 768] + - Exact: [289, 192, 32, 768] + - Exact: [3136, 64, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [960, 1024, 1, 1024] + - Exact: [64, 512, 16, 512] + - Exact: [64, 512, 128, 512] + - Exact: [1024, 512, 1, 2] + - Exact: [1024, 512, 1, 4096] + - Exact: [1024, 616, 1, 1024] + - Exact: [64, 128, 128, 128] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 2] + - Exact: [1024, 1024, 1, 4096] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 780, 1, 1024] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 800, 1, 1024] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 820, 1, 1024] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 385, 1, 1024] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 128, 144, 128] + - Exact: [64, 1024, 32, 1024] + - Exact: [96, 1024, 64, 1024] 
+ - Exact: [64, 1024, 256, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 40, 512] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 512] + - Exact: [256, 3456, 1, 128] + - Exact: [256, 4096, 1, 128] + - Exact: [480, 864, 1, 1024] + - Exact: [512, 864, 1, 256] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [256, 4096, 1, 1] + - Exact: [12544, 64, 1, 147] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [3400, 256, 1, 1024] + - Exact: [3800, 256, 1, 1024] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [51520, 64, 2, 256] + - Exact: [46464, 64, 2, 256] + - Exact: [49152, 64, 2, 256] + - Exact: [1536, 512, 1, 1024] + - Exact: [1728, 512, 1, 1024] + - Exact: [1024, 1024, 1, 320] + - Exact: [51520, 64, 2, 64] + - Exact: [55296, 64, 2, 64] + - Exact: [49152, 64, 2, 64] + - Exact: [54400, 64, 2, 64] + - Exact: [42240, 64, 2, 256] + - Exact: [672, 512, 2, 2048] + - Exact: [54400, 64, 2, 256] + - Exact: [56832, 64, 2, 256] + - Exact: [55296, 64, 2, 256] + - Exact: [60800, 64, 2, 64] + - Exact: [768, 512, 2, 2048] + - Exact: [43008, 64, 2, 256] + - Exact: [864, 256, 2, 2048] + - Exact: [768, 256, 2, 2048] + - Exact: [45632, 64, 2, 256] + - Exact: [60800, 64, 2, 256] + - Exact: [1024, 1024, 1, 81] + - Exact: [950, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [950, 256, 2, 2048] + - Exact: [1900, 512, 1, 1024] + - Exact: [1700, 512, 1, 1024] + - Exact: [1610, 512, 1, 1024] + - Exact: [660, 512, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [713, 512, 2, 2048] + - Exact: [805, 256, 2, 2048] + - Exact: [850, 256, 2, 2048] + - 
Exact: [100, 128, 120, 512] + - Exact: [100, 128, 139, 512] + - Exact: [100, 128, 160, 512] + - Exact: [22500, 64, 1, 147] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 77, 1, 30522] + - Exact: [1024, 200, 1, 30522] + - Exact: [1024, 160, 1, 30522] + - Exact: [1024, 180, 1, 30522] + - Exact: [1024, 160, 1, 30528] + - Exact: [1024, 240, 1, 30528] + - Exact: [2560, 109, 1, 29000] + - Exact: [2560, 121, 1, 29000] + - Exact: [2560, 65, 1, 29000] + - Exact: [2560, 66, 1, 29000] + - Exact: [2560, 67, 1, 29000] + - Exact: [2560, 69, 1, 29000] + - Exact: [2560, 70, 1, 29000] + - Exact: [2560, 71, 1, 29000] + - Exact: [2560, 73, 1, 29000] + - Exact: [2560, 74, 1, 29000] + - Exact: [2560, 75, 1, 29000] + - Exact: [2560, 77, 1, 29000] + - Exact: [2560, 78, 1, 29000] + - Exact: [2560, 80, 1, 29000] + - Exact: [2560, 81, 1, 29000] + - Exact: [2560, 82, 1, 29000] + - Exact: [2560, 83, 1, 29000] + - 
Exact: [2560, 84, 1, 29000] + - Exact: [2560, 88, 1, 29000] + - Exact: [2560, 89, 1, 29000] + - Exact: [2560, 90, 1, 29000] + - Exact: [2560, 92, 1, 29000] + - Exact: [2560, 95, 1, 29000] + - Exact: [2560, 98, 1, 29000] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [1024, 256, 1, 3328] + - Exact: [704, 128, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + 
- Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 256] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [2560, 32, 1, 2560] + - Exact: [2048, 128, 1, 2048] + - Exact: [4608, 32, 1, 1536] + - Exact: [3072, 64, 1, 1024] + - Exact: [128, 1500, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [4096, 64, 1, 4096] + - Exact: [7680, 32, 1, 2560] + - Exact: [2560, 64, 1, 2560] + - Exact: [3072, 32, 1, 1024] + - Exact: [6144, 32, 1, 2560] + - Exact: [176, 1500, 1, 1408] + - Exact: [2048, 64, 1, 2048] + - Exact: 
[8448, 32, 1, 2816] + - Exact: [512, 512, 1, 64] + - Exact: [32, 33, 1600, 33] + - Exact: [256, 1024, 1, 1] + - Exact: [257, 1024, 1, 4096] + - Exact: [512, 200, 1, 1] + - Exact: [512, 200, 1, 32] + - Exact: [512, 215, 1, 2048] + - Exact: [512, 256, 1, 2048] + - Exact: [560, 200, 1, 1024] + - Exact: [768, 215, 1, 2048] + - Exact: [768, 256, 1, 2048] + - Exact: [1024, 200, 1, 1] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 32] + - Exact: [64, 35, 4608, 35] + - Exact: [64, 33, 1920, 27] + - Exact: [64, 33, 1920, 33] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - Exact: [1225, 48, 64, 288] + - Exact: [1225, 32, 32, 192] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [1225, 48, 32, 288] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [256, 864, 1, 128] + - Exact: [3136, 64, 1, 576] + - Exact: [784, 128, 1, 1152] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 128, 1, 2] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 96, 1, 2] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [100, 128, 18, 512] + - Exact: [100, 128, 19, 512] + - Exact: [1444, 128, 1, 576] + - Exact: [361, 512, 1, 2304] + - Exact: [2560, 35, 1, 29000] + - Exact: [2560, 36, 1, 29000] + - Exact: [2560, 39, 1, 29000] + - Exact: [2560, 40, 1, 29000] + - Exact: [2560, 42, 1, 29000] + - Exact: [2560, 43, 1, 29000] + - Exact: [2560, 44, 1, 29000] + - Exact: [2560, 46, 1, 29000] + 
- Exact: [2560, 48, 1, 29000] + - Exact: [2560, 49, 1, 29000] + - Exact: [2560, 50, 1, 29000] + - Exact: [2560, 51, 1, 29000] + - Exact: [2560, 53, 1, 29000] + - Exact: [2560, 54, 1, 29000] + - Exact: [2560, 55, 1, 29000] + - Exact: [2560, 56, 1, 29000] + - Exact: [2560, 57, 1, 29000] + - Exact: [2560, 58, 1, 29000] + - Exact: [2560, 59, 1, 29000] + - Exact: [2560, 61, 1, 29000] + - Exact: [2560, 63, 1, 29000] + - Exact: [1909283, 40, 1, 40] + - Exact: [3818566, 40, 1, 40] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [7680, 4, 1, 2560] + - Exact: [3072, 16, 1, 1024] + - Exact: [2048, 16, 1, 2048] + - Exact: [3072, 1, 1, 128] + - Exact: [8448, 16, 1, 2816] + - Exact: [7680, 2, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [7680, 1, 1, 2560] + - Exact: [6144, 2, 1, 2560] + - Exact: [1760, 16, 1, 1760] + - Exact: [6144, 4, 1, 2560] + - Exact: [3072, 4, 1, 1024] + - Exact: [2048, 32, 1, 2048] + - Exact: [4608, 16, 1, 1536] + - Exact: [3072, 2, 1, 1024] + - Exact: [8448, 1, 1, 2816] + - Exact: [6144, 1, 1, 2560] + - Exact: [4608, 1, 1, 1536] + - Exact: [8448, 4, 1, 2816] + - Exact: [4608, 2, 1, 1536] + - Exact: [2560, 16, 1, 2560] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 
4096] + - Exact: [7680, 16, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [8448, 2, 1, 2816] + - Exact: [4608, 4, 1, 1536] + - Exact: [2048, 2, 1, 2048] + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 4, 1, 2] + - Exact: [2560, 4, 1, 2560] + - Exact: [2048, 1, 1, 512] + - Exact: [12288, 12, 2, 256] + - Exact: [12288, 3, 2, 256] + - Exact: [51520, 12, 2, 256] + - Exact: [51520, 3, 2, 256] + - Exact: [15200, 12, 2, 256] + - Exact: [15200, 3, 2, 256] + - Exact: [3456, 3, 2, 256] + - Exact: [13600, 12, 2, 256] + - Exact: [12880, 3, 2, 256] + - Exact: [3400, 3, 2, 256] + - Exact: [12880, 12, 2, 256] + - Exact: [13824, 12, 2, 256] + - Exact: [13824, 3, 2, 256] + - Exact: [13600, 3, 2, 256] + - Exact: [3456, 12, 2, 256] + - Exact: [3800, 3, 2, 256] + - Exact: [3400, 12, 2, 256] + - Exact: [3800, 12, 2, 256] + - Exact: [55296, 3, 2, 256] + - Exact: [3072, 3, 2, 256] + - Exact: [3072, 12, 2, 256] + - Exact: [54400, 3, 2, 256] + - Exact: [60800, 12, 2, 256] + - Exact: [60800, 3, 2, 256] + - Exact: [3220, 3, 2, 256] + - Exact: [3220, 12, 2, 256] + - Exact: [2048, 8, 1, 2] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2] + - Exact: [2560, 2, 1, 2560] + - Exact: [2560, 27, 1, 29000] + - Exact: [1909283, 11, 1, 11] + - Exact: [3818566, 11, 1, 11] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + 
BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [512, 4, 1, 500000] + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [49, 512, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 256, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [1, 1, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: 
[64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 256, 1, 256] + - Exact: [1, 1, 1, 256] + - Exact: [128, 448, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 16, 1, 512] + - Exact: [128, 1, 1, 1408] + - Exact: [64, 1, 1, 1216] + - Exact: [1024, 2, 1, 512] + - Exact: [512, 1, 1, 512] + - Exact: [1024, 4, 1, 512] + - Exact: [512, 4, 1, 512] + - Exact: [1024, 32, 1, 512] + - Exact: [512, 2, 1, 512] + - Exact: [1024, 1, 1, 512] + - Exact: [512, 32, 1, 512] + - Exact: [128, 1, 1, 1024] + - Exact: [64, 14, 1, 14] + - Exact: [64, 14, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 21] + - Exact: [64, 21, 1, 21] + - Exact: [64, 24, 1, 24] + - Exact: [64, 24, 1, 34] + - Exact: [64, 30, 1, 30] + - Exact: [64, 31, 1, 30] + - Exact: [64, 31, 1, 31] + - Exact: [64, 32, 1, 32] + - Exact: [64, 34, 1, 34] + - Exact: [64, 35, 1, 32] + - Exact: [64, 35, 1, 35] + - Exact: [64, 512, 1, 512] + - Exact: [1024, 4, 1, 2] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 32, 1, 2] + - Exact: [1024, 32, 1, 1024] + - Exact: [32, 200, 1, 1] + - Exact: [64, 3, 512, 3] + - Exact: [64, 5, 512, 5] + - Exact: [64, 5, 960, 5] + - Exact: [64, 9, 512, 9] + - Exact: [64, 512, 1, 1] + - Exact: [67, 512, 1, 2048] + - Exact: 
[74, 512, 1, 2048] + - Exact: [74, 960, 1, 2048] + - Exact: [100, 512, 1, 2048] + - Exact: [128, 27, 32768, 27] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 14, 10880, 15] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 15, 7680, 17] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 17, 6144, 21] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 24, 4736, 34] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 31, 2048, 30] + - Exact: [64, 31, 2048, 31] + - Exact: [64, 27, 1920, 27] + - Exact: [1024, 8, 1, 1024] + - Exact: [1024, 77, 1, 1024] + - Exact: [1024, 10, 1, 2] + - Exact: [1024, 10, 1, 1024] + - Exact: [1024, 39, 1, 2] + - Exact: [1024, 39, 1, 1024] + - Exact: [1024, 40, 1, 2] + - Exact: [1024, 40, 1, 1024] + - Exact: [1024, 41, 1, 2] + - Exact: [1024, 41, 1, 1024] + - Exact: [1024, 5, 1, 2] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 6, 1, 2] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 8, 1, 2] + - Exact: [1024, 9, 1, 2] + - Exact: [1024, 9, 1, 1024] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [128, 128, 1, 64] + - Exact: [64, 128, 1, 128] + - Exact: [64, 5, 1, 5] + - Exact: [32, 33, 1, 33] + - Exact: [1024, 16, 1, 2] + - Exact: [1024, 16, 1, 1024] + - Exact: [1024, 1, 1, 2] + - Exact: [1024, 1, 1, 1024] + - Exact: [1024, 1, 1, 200] + - Exact: [1024, 1, 1, 1600] + - Exact: [1024, 64, 1, 2] + - Exact: [1024, 64, 1, 1024] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 80, 1, 2] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 82, 1, 2] + - Exact: [1024, 12, 1, 1024] + - Exact: [1024, 12, 1, 2] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [196, 256, 1, 2304] + - Exact: [768, 3, 2, 256] + - Exact: [768, 12, 2, 256] + - Exact: [864, 12, 2, 256] + - Exact: [864, 3, 2, 256] + - Exact: [216, 3, 2, 256] + - Exact: [176, 12, 2, 256] + - Exact: [176, 3, 2, 256] + - Exact: [192, 12, 2, 256] + - Exact: [192, 3, 2, 256] + - Exact: 
[216, 12, 2, 256] + - Exact: [850, 3, 2, 256] + - Exact: [850, 12, 2, 256] + - Exact: [805, 12, 2, 256] + - Exact: [805, 3, 2, 256] + - Exact: [247, 3, 2, 256] + - Exact: [950, 3, 2, 256] + - Exact: [187, 12, 2, 256] + - Exact: [247, 12, 2, 256] + - Exact: [187, 3, 2, 256] + - Exact: [228, 12, 2, 256] + - Exact: [221, 12, 2, 256] + - Exact: [950, 12, 2, 256] + - Exact: [228, 3, 2, 256] + - Exact: [221, 3, 2, 256] + - Exact: [25, 128, 120, 256] + - Exact: [25, 128, 139, 256] + - Exact: [25, 128, 160, 256] + - Exact: [25, 128, 18, 256] + - Exact: [25, 128, 19, 256] + - Exact: [9, 128, 120, 256] + - Exact: [9, 128, 139, 256] + - Exact: [9, 128, 160, 256] + - Exact: [9, 128, 18, 256] + - Exact: [9, 128, 19, 256] + - Exact: [100, 512, 1, 2304] + - Exact: [25, 256, 1, 1152] + - Exact: [9, 256, 1, 1152] + - Exact: [1024, 20, 1, 1024] + - Exact: [1024, 20, 1, 2] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml new file mode 100644 index 000000000..eba81d7be --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml @@ -0,0 +1,1633 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + 
TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4096, 7133, 1, 4096] + - Exact: [2048, 7133, 1, 2048] + - Exact: [2560, 7133, 1, 2560] + - Exact: [3072, 7435, 1, 1024] + - Exact: [1760, 7133, 1, 1760] + - Exact: [7680, 5481, 1, 2560] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 512, 256, 128] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 256, 1024] + - Exact: [3136, 256, 128, 64] + - Exact: [1024, 4096, 1, 2048] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 30528, 1, 2048] + - Exact: [1024, 30528, 1, 4096] + - Exact: [4096, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 4096] + - Exact: [256, 4864, 1, 8976] + - Exact: [256, 5120, 1, 8976] + - Exact: [256, 5632, 1, 8976] + - Exact: [256, 5888, 1, 8976] + - Exact: [256, 6144, 1, 8976] + - Exact: [256, 7168, 1, 8976] + - Exact: [256, 8192, 1, 8976] + - Exact: [256, 
8960, 1, 8976] + - Exact: [256, 9728, 1, 8976] + - Exact: [256, 9984, 1, 8976] + - Exact: [256, 10240, 1, 8976] + - Exact: [256, 10496, 1, 8976] + - Exact: [256, 11008, 1, 8976] + - Exact: [256, 11264, 1, 8976] + - Exact: [256, 11520, 1, 8976] + - Exact: [256, 11776, 1, 8976] + - Exact: [256, 12544, 1, 8976] + - Exact: [256, 12800, 1, 8976] + - Exact: [256, 13312, 1, 8976] + - Exact: [256, 13568, 1, 8976] + - Exact: [256, 14336, 1, 8976] + - Exact: [256, 14848, 1, 8976] + - Exact: [256, 15104, 1, 8976] + - Exact: [256, 15872, 1, 8976] + - Exact: [256, 16128, 1, 8976] + - Exact: [256, 17152, 1, 8976] + - Exact: [256, 17408, 1, 8976] + - Exact: [256, 18688, 1, 8976] + - Exact: [256, 19968, 1, 8976] + - Exact: [256, 20480, 1, 8976] + - Exact: [256, 20992, 1, 8976] + - Exact: [256, 21248, 1, 8976] + - Exact: [256, 22016, 1, 8976] + - Exact: [256, 26112, 1, 8976] + - Exact: [256, 32512, 1, 8976] + - Exact: [256, 32768, 1, 1] + - Exact: [256, 33536, 1, 8976] + - Exact: [256, 44505, 1, 8976] + - Exact: [768, 2048, 1, 256] + - Exact: [1600, 1024, 1, 512] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 960, 1, 1] + - Exact: [2048, 2048, 1, 512] + - Exact: [2048, 2048, 1, 960] + - Exact: [2048, 2048, 1, 1024] + - Exact: [3200, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 3840] + - Exact: [1024, 4096, 1, 3968] + - Exact: [1024, 4096, 1, 6528] + - Exact: [1024, 4096, 1, 7104] + - Exact: [1024, 4096, 1, 7200] + - Exact: [1024, 4096, 1, 8064] + - Exact: [1024, 4096, 1, 8160] + - Exact: [1024, 4096, 1, 9216] + - Exact: [1024, 4096, 1, 9520] + - Exact: [1024, 4096, 1, 10064] + - Exact: [1024, 4096, 1, 10080] + - Exact: [1024, 4096, 1, 10200] + - Exact: [1024, 42720, 1, 3968] + - Exact: [1024, 42720, 1, 6528] + - Exact: [1024, 42720, 1, 7104] + - Exact: [1024, 42720, 1, 7200] + - Exact: [1024, 42720, 1, 9520] + - Exact: [1024, 42720, 1, 10080] + - Exact: [4096, 1024, 1, 3840] + - Exact: [4096, 1024, 1, 3968] + - Exact: [4096, 1024, 1, 6528] + - 
Exact: [4096, 1024, 1, 7104] + - Exact: [4096, 1024, 1, 7200] + - Exact: [4096, 1024, 1, 8064] + - Exact: [4096, 1024, 1, 8160] + - Exact: [4096, 1024, 1, 9216] + - Exact: [4096, 1024, 1, 9520] + - Exact: [4096, 1024, 1, 10064] + - Exact: [4096, 1024, 1, 10080] + - Exact: [4096, 1024, 1, 10200] + - Exact: [1024, 4096, 1, 3240] + - Exact: [1024, 4096, 1, 3960] + - Exact: [1024, 42720, 1, 3960] + - Exact: [4096, 1024, 1, 3240] + - Exact: [4096, 1024, 1, 3960] + - Exact: [1225, 192, 64, 32] + - Exact: [1225, 192, 64, 48] + - Exact: [1225, 192, 64, 64] + - Exact: [1225, 256, 64, 48] + - Exact: [1225, 256, 64, 64] + - Exact: [1225, 288, 64, 48] + - Exact: [1225, 288, 64, 64] + - Exact: [289, 768, 64, 128] + - Exact: [289, 768, 64, 160] + - Exact: [289, 768, 64, 192] + - Exact: [1225, 192, 32, 32] + - Exact: [1225, 192, 32, 48] + - Exact: [1225, 192, 32, 64] + - Exact: [1225, 256, 32, 48] + - Exact: [1225, 256, 32, 64] + - Exact: [1225, 288, 32, 48] + - Exact: [1225, 288, 32, 64] + - Exact: [289, 768, 32, 128] + - Exact: [289, 768, 32, 160] + - Exact: [289, 768, 32, 192] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 128, 32, 512] + - Exact: [784, 512, 32, 128] + - Exact: [196, 1024, 32, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [784, 256, 64, 512] + - Exact: [3136, 256, 64, 128] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 512, 64, 1024] + - Exact: [784, 512, 64, 256] + - Exact: [784, 512, 64, 512] + - Exact: [196, 1024, 64, 512] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 256] + - Exact: [784, 256, 32, 512] + - Exact: [3136, 256, 32, 128] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 512, 32, 1024] + - Exact: [784, 512, 32, 256] + - Exact: [784, 512, 32, 512] + - Exact: [196, 1024, 32, 512] + - Exact: [196, 1024, 32, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [8192, 
8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 30522, 1, 77] + - Exact: [4096, 1024, 1, 512] + - Exact: [1024, 4096, 1, 1280] + - Exact: [1024, 30522, 1, 200] + - Exact: [4096, 1024, 1, 1280] + - Exact: [1024, 4096, 1, 4992] + - Exact: [1024, 30522, 1, 780] + - Exact: [4096, 1024, 1, 4992] + - Exact: [1024, 30522, 1, 308] + - Exact: [1024, 4096, 1, 5120] + - Exact: [1024, 30522, 1, 800] + - Exact: [4096, 1024, 1, 5120] + - Exact: [1024, 4096, 1, 5248] + - Exact: [1024, 30522, 1, 820] + - Exact: [4096, 1024, 1, 5248] + - Exact: [1024, 4096, 1, 2560] + - Exact: [1024, 30522, 1, 385] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1024, 30522, 1, 462] + - Exact: [4096, 1024, 1, 3072] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 30522, 1, 160] + - Exact: [4096, 1024, 1, 1024] + - Exact: [1024, 4096, 1, 1152] + - Exact: [1024, 30522, 1, 180] + - Exact: [4096, 1024, 1, 1152] + - Exact: [1024, 4096, 1, 8192] + - Exact: [1024, 4096, 1, 9600] + - Exact: [1024, 33712, 1, 8192] + - Exact: [1024, 33712, 1, 9600] + - Exact: [4096, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 9600] + - Exact: [1024, 1600, 1, 1] + - Exact: [2560, 1920, 1, 2048] + - Exact: [1024, 3072, 1, 4096] + - Exact: [2560, 2560, 1, 2048] + - Exact: [2048, 2048, 1, 2] + - Exact: [1024, 30592, 1, 2048] + - Exact: [1024, 3072, 1, 16384] + - Exact: [6144, 1536, 1, 4096] + - Exact: [1536, 4608, 1, 8192] + - Exact: [640, 2560, 1, 2048] + - Exact: [1024, 4096, 1, 16384] + - Exact: [1536, 6144, 1, 4096] + - Exact: [1024, 30592, 1, 4096] + - Exact: [2560, 2560, 1, 4] + - Exact: [1536, 1536, 1, 4096] + - Exact: [2560, 7680, 1, 2048] + - Exact: [1536, 50304, 1, 4096] + - Exact: [2048, 8192, 1, 1024] + - Exact: [1024, 30592, 1, 8192] + - Exact: [4096, 1024, 1, 16384] + - Exact: [8192, 2048, 1, 1024] + - Exact: [1024, 50304, 1, 4096] + - Exact: [1536, 4608, 1, 4096] + - Exact: [6144, 1536, 1, 8192] + - 
Exact: [1024, 3072, 1, 8192] + - Exact: [1536, 1536, 1, 8192] + - Exact: [1536, 50304, 1, 8192] + - Exact: [2048, 6144, 1, 1024] + - Exact: [2048, 30592, 1, 1024] + - Exact: [1536, 6144, 1, 8192] + - Exact: [1024, 50304, 1, 2048] + - Exact: [1024, 50304, 1, 8192] + - Exact: [1024, 3072, 1, 2048] + - Exact: [1024, 50304, 1, 16384] + - Exact: [1024, 30528, 1, 8192] + - Exact: [256, 6912, 1, 1] + - Exact: [30528, 1024, 1, 640] + - Exact: [30528, 1024, 1, 1280] + - Exact: [4096, 1024, 1, 10240] + - Exact: [1024, 4096, 1, 10240] + - Exact: [30528, 1024, 1, 1600] + - Exact: [1024, 4096, 1, 10496] + - Exact: [30528, 1024, 1, 1640] + - Exact: [4096, 1024, 1, 10496] + - Exact: [30528, 1024, 1, 160] + - Exact: [1024, 4096, 1, 6144] + - Exact: [30528, 1024, 1, 240] + - Exact: [4096, 1024, 1, 6144] + - Exact: [1024, 4096, 1, 10224] + - Exact: [4096, 1024, 1, 10224] + - Exact: [1024, 3072, 1, 10224] + - Exact: [1024, 3072, 1, 10240] + - Exact: [4096, 1024, 1, 10192] + - Exact: [1024, 3072, 1, 10192] + - Exact: [1024, 4096, 1, 10192] + - Exact: [1024, 3072, 1, 10200] + - Exact: [4096, 1024, 1, 10208] + - Exact: [1024, 3072, 1, 10208] + - Exact: [1024, 4096, 1, 10208] + - Exact: [1024, 2048, 1, 10224] + - Exact: [1024, 2048, 1, 10240] + - Exact: [1024, 2048, 1, 10192] + - Exact: [1024, 3072, 1, 10080] + - Exact: [100352, 256, 1, 512] + - Exact: [12544, 1024, 1, 2048] + - Exact: [12544, 147, 1, 64] + - Exact: [200704, 256, 1, 512] + - Exact: [25088, 512, 1, 1024] + - Exact: [3136, 576, 1, 64] + - Exact: [50176, 512, 1, 1024] + - Exact: [6272, 1024, 1, 2048] + - Exact: [3136, 256, 128, 128] + - Exact: [3136, 256, 256, 128] + - Exact: [784, 512, 128, 256] + - Exact: [784, 512, 256, 256] + - Exact: [30528, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 12288] + - Exact: [30528, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [25600, 128, 25, 128] + - Exact: [12544, 128, 36, 128] + - Exact: [9216, 128, 49, 128] + - Exact: [6400, 128, 64, 128] + - Exact: [6400, 256, 25, 256] + - 
Exact: [4096, 256, 36, 256] + - Exact: [2304, 256, 49, 256] + - Exact: [2304, 256, 64, 256] + - Exact: [2304, 512, 25, 512] + - Exact: [1024, 512, 36, 512] + - Exact: [1024, 512, 49, 512] + - Exact: [1024, 512, 64, 512] + - Exact: [3072, 768, 1, 2048] + - Exact: [768, 3072, 1, 2048] + - Exact: [3072, 768, 1, 4608] + - Exact: [768, 3072, 1, 4608] + - Exact: [4096, 1024, 1, 4608] + - Exact: [1024, 4096, 1, 4608] + - Exact: [196, 1024, 128, 512] + - Exact: [196, 1024, 256, 512] + - Exact: [4880, 256, 49, 256] + - Exact: [3128, 256, 64, 256] + - Exact: [4680, 256, 49, 256] + - Exact: [5280, 256, 36, 256] + - Exact: [2640, 256, 64, 256] + - Exact: [5304, 256, 49, 256] + - Exact: [2760, 256, 64, 256] + - Exact: [6440, 256, 36, 256] + - Exact: [5704, 256, 36, 256] + - Exact: [2128, 256, 64, 256] + - Exact: [1160, 256, 49, 256] + - Exact: [4056, 256, 49, 256] + - Exact: [6144, 256, 36, 256] + - Exact: [6336, 256, 36, 256] + - Exact: [13600, 512, 2, 128] + - Exact: [15200, 512, 2, 128] + - Exact: [15200, 128, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [5632, 256, 36, 256] + - Exact: [12288, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [11408, 128, 2, 512] + - Exact: [13824, 512, 2, 128] + - Exact: [13824, 128, 2, 512] + - Exact: [10560, 128, 2, 512] + - Exact: [10752, 128, 2, 512] + - Exact: [13600, 512, 2, 256] + - Exact: [15200, 512, 2, 256] + - Exact: [768, 2048, 2, 512] + - Exact: [12880, 512, 2, 128] + - Exact: [11616, 128, 2, 512] + - Exact: [14208, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [6912, 256, 36, 256] + - Exact: [13824, 512, 2, 256] + - Exact: [11616, 512, 2, 128] + - Exact: [12288, 512, 2, 128] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [10560, 512, 2, 128] + - Exact: [672, 2048, 2, 512] + - Exact: [9408, 128, 2, 512] + - Exact: [10752, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [1240, 256, 49, 256] + - Exact: [4032, 256, 2, 1024] + - Exact: [888, 
2048, 2, 512] + - Exact: [12880, 512, 2, 256] + - Exact: [12288, 512, 2, 256] + - Exact: [13440, 128, 2, 512] + - Exact: [864, 2048, 2, 256] + - Exact: [12672, 128, 2, 512] + - Exact: [11264, 128, 2, 512] + - Exact: [11776, 128, 2, 512] + - Exact: [16128, 128, 2, 512] + - Exact: [4032, 1024, 2, 256] + - Exact: [14000, 128, 2, 512] + - Exact: [13440, 512, 2, 128] + - Exact: [768, 2048, 2, 256] + - Exact: [3264, 1024, 2, 256] + - Exact: [4200, 256, 2, 1024] + - Exact: [2352, 1024, 2, 256] + - Exact: [2400, 1024, 2, 256] + - Exact: [15200, 256, 2, 12] + - Exact: [12880, 256, 2, 12] + - Exact: [2520, 1024, 2, 256] + - Exact: [13600, 256, 2, 12] + - Exact: [15200, 256, 2, 3] + - Exact: [12880, 256, 2, 3] + - Exact: [4200, 1024, 2, 256] + - Exact: [12288, 256, 2, 12] + - Exact: [13824, 256, 2, 12] + - Exact: [13600, 256, 2, 3] + - Exact: [7600, 512, 1, 256] + - Exact: [6144, 512, 1, 256] + - Exact: [12544, 1024, 1, 1024] + - Exact: [3800, 256, 2, 3] + - Exact: [13824, 256, 2, 3] + - Exact: [12288, 256, 2, 3] + - Exact: [2688, 256, 2, 1024] + - Exact: [3072, 256, 2, 12] + - Exact: [3800, 256, 2, 12] + - Exact: [3072, 256, 2, 3] + - Exact: [2520, 256, 2, 1024] + - Exact: [16128, 512, 2, 128] + - Exact: [2400, 256, 2, 1024] + - Exact: [2352, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2992, 1024, 2, 256] + - Exact: [2816, 256, 2, 1024] + - Exact: [2904, 1024, 2, 256] + - Exact: [3456, 256, 2, 3] + - Exact: [3400, 256, 2, 3] + - Exact: [2816, 1024, 2, 256] + - Exact: [3456, 256, 2, 12] + - Exact: [2944, 1024, 2, 256] + - Exact: [3168, 256, 2, 1024] + - Exact: [2992, 256, 2, 1024] + - Exact: [51520, 256, 2, 12] + - Exact: [3072, 256, 2, 1024] + - Exact: [2640, 1024, 2, 256] + - Exact: [2688, 1024, 2, 256] + - Exact: [2904, 256, 2, 1024] + - Exact: [3264, 256, 2, 1024] + - Exact: [54400, 256, 2, 12] + - Exact: [55296, 256, 2, 3] + - Exact: [60800, 256, 2, 12] + - Exact: [51520, 256, 2, 3] + - Exact: [55296, 256, 2, 12] + - Exact: [3600, 1024, 2, 256] + - Exact: 
[60800, 256, 2, 3] + - Exact: [952, 256, 64, 256] + - Exact: [49152, 256, 2, 12] + - Exact: [3360, 256, 2, 1024] + - Exact: [736, 256, 64, 256] + - Exact: [600, 256, 64, 256] + - Exact: [1440, 256, 49, 256] + - Exact: [3168, 1024, 2, 256] + - Exact: [1368, 256, 49, 256] + - Exact: [49152, 256, 2, 3] + - Exact: [3600, 256, 2, 1024] + - Exact: [3360, 1024, 2, 256] + - Exact: [54400, 256, 2, 3] + - Exact: [3072, 1024, 2, 256] + - Exact: [2640, 256, 2, 1024] + - Exact: [616, 256, 64, 256] + - Exact: [3008, 256, 64, 256] + - Exact: [896, 256, 64, 256] + - Exact: [768, 256, 64, 256] + - Exact: [3552, 256, 2, 1024] + - Exact: [3552, 1024, 2, 256] + - Exact: [800, 256, 64, 256] + - Exact: [1120, 256, 49, 256] + - Exact: [2408, 256, 64, 256] + - Exact: [3456, 256, 2, 1024] + - Exact: [672, 256, 64, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [1064, 256, 49, 256] + - Exact: [3400, 256, 2, 1024] + - Exact: [704, 256, 64, 256] + - Exact: [3400, 1024, 2, 256] + - Exact: [3264, 256, 64, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [3800, 256, 2, 1024] + - Exact: [6440, 512, 1, 256] + - Exact: [6912, 512, 1, 256] + - Exact: [6800, 512, 1, 256] + - Exact: [6800, 512, 1, 1024] + - Exact: [6440, 512, 1, 1024] + - Exact: [6912, 512, 1, 1024] + - Exact: [1728, 1024, 1, 512] + - Exact: [1536, 1024, 1, 512] + - Exact: [7600, 512, 1, 1024] + - Exact: [6144, 512, 1, 1024] + - Exact: [1728, 1024, 1, 2048] + - Exact: [1536, 1024, 1, 2048] + - Exact: [4524, 256, 49, 256] + - Exact: [2666, 256, 64, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [3220, 1024, 2, 256] + - Exact: [782, 128, 64, 128] + - Exact: [850, 2048, 2, 512] + - Exact: [805, 2048, 2, 512] + - Exact: [713, 2048, 2, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [805, 2048, 2, 256] + - Exact: [1251, 256, 49, 256] + - Exact: [1900, 1024, 1, 2048] + - Exact: [1610, 1024, 1, 2048] + - Exact: [1900, 1024, 1, 512] + - Exact: [3220, 256, 2, 12] + - Exact: [3220, 256, 2, 3] + - Exact: [3036, 
1024, 2, 256] + - Exact: [3036, 256, 2, 1024] + - Exact: [850, 2048, 2, 256] + - Exact: [2852, 1024, 2, 256] + - Exact: [950, 2048, 2, 256] + - Exact: [3700, 1024, 2, 256] + - Exact: [2852, 256, 2, 1024] + - Exact: [3700, 256, 2, 1024] + - Exact: [1269, 256, 49, 256] + - Exact: [1467, 256, 49, 256] + - Exact: [3500, 256, 2, 1024] + - Exact: [1449, 256, 49, 256] + - Exact: [1278, 256, 49, 256] + - Exact: [1413, 256, 49, 256] + - Exact: [1341, 256, 49, 256] + - Exact: [1287, 256, 49, 256] + - Exact: [1332, 256, 49, 256] + - Exact: [1359, 256, 49, 256] + - Exact: [1395, 256, 49, 256] + - Exact: [1323, 256, 49, 256] + - Exact: [1404, 256, 49, 256] + - Exact: [1386, 256, 49, 256] + - Exact: [1350, 256, 49, 256] + - Exact: [3500, 1024, 2, 256] + - Exact: [3220, 256, 2, 1024] + - Exact: [690, 256, 64, 256] + - Exact: [660, 256, 64, 256] + - Exact: [782, 256, 64, 256] + - Exact: [884, 256, 64, 256] + - Exact: [1610, 1024, 1, 512] + - Exact: [1700, 1024, 1, 512] + - Exact: [1700, 1024, 1, 2048] + - Exact: [1444, 128, 120, 256] + - Exact: [1444, 128, 18, 256] + - Exact: [1444, 128, 19, 256] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 512, 120, 256] + - Exact: [361, 512, 18, 256] + - Exact: [361, 512, 19, 256] + - Exact: [1920, 25216, 1, 16384] + - Exact: [3840, 1920, 1, 16384] + - Exact: [1920, 3840, 1, 16384] + - Exact: [960, 1920, 1, 16384] + - Exact: [1920, 2880, 1, 16384] + - Exact: [1920, 25216, 1, 4096] + - Exact: [3840, 1920, 1, 4096] + - Exact: [1920, 3840, 1, 4096] + - Exact: [960, 1920, 1, 4096] + - Exact: [1920, 2880, 1, 4096] + - Exact: [1920, 25216, 1, 8192] + - Exact: [3840, 1920, 1, 8192] + - Exact: [1920, 3840, 1, 8192] + - Exact: [960, 1920, 1, 8192] + - Exact: [1920, 2880, 1, 8192] + - Exact: [2304, 12672, 1, 16384] + - Exact: [2304, 2304, 1, 16384] + - Exact: [576, 2304, 1, 16384] + - Exact: [2304, 1728, 1, 16384] + - Exact: [2304, 12672, 1, 4096] + - Exact: [2304, 2304, 1, 4096] + - 
Exact: [576, 2304, 1, 4096] + - Exact: [2304, 1728, 1, 4096] + - Exact: [2304, 12672, 1, 8192] + - Exact: [2304, 2304, 1, 8192] + - Exact: [576, 2304, 1, 8192] + - Exact: [2304, 1728, 1, 8192] + - Exact: [3072, 6400, 1, 4096] + - Exact: [1536, 3072, 1, 4096] + - Exact: [3072, 1536, 1, 4096] + - Exact: [384, 3072, 1, 4096] + - Exact: [3072, 1152, 1, 4096] + - Exact: [3072, 6400, 1, 8192] + - Exact: [1536, 3072, 1, 8192] + - Exact: [3072, 1536, 1, 8192] + - Exact: [384, 3072, 1, 8192] + - Exact: [3072, 1152, 1, 8192] + - Exact: [2048, 2048, 1, 4096] + - Exact: [2048, 2048, 1, 8] + - Exact: [2048, 29000, 1, 199] + - Exact: [2048, 29000, 1, 221] + - Exact: [2048, 29000, 1, 224] + - Exact: [2048, 29000, 1, 229] + - Exact: [2048, 29000, 1, 234] + - Exact: [2048, 29000, 1, 242] + - Exact: [2048, 29000, 1, 246] + - Exact: [2048, 29000, 1, 247] + - Exact: [2048, 29000, 1, 256] + - Exact: [2048, 29000, 1, 262] + - Exact: [2048, 29000, 1, 264] + - Exact: [2048, 29000, 1, 265] + - Exact: [2048, 29000, 1, 274] + - Exact: [2048, 29000, 1, 277] + - Exact: [2048, 29000, 1, 279] + - Exact: [2048, 29000, 1, 288] + - Exact: [2048, 29000, 1, 296] + - Exact: [2048, 29000, 1, 315] + - Exact: [2048, 29000, 1, 335] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 4096] + - Exact: [1024, 29000, 1, 2283] + - Exact: [1024, 29000, 1, 2296] + - Exact: [1024, 29000, 1, 2306] + - Exact: [1024, 29000, 1, 2309] + - Exact: [1024, 29000, 1, 2318] + - Exact: [1024, 29000, 1, 2320] + - Exact: [1024, 29000, 1, 2324] + - Exact: [1024, 29000, 1, 2325] + - Exact: [1024, 29000, 1, 2329] + - Exact: [1024, 29000, 1, 2338] + - Exact: [1024, 29000, 1, 2345] + - Exact: [1024, 29000, 1, 2350] + - Exact: [1024, 29000, 1, 2362] + - Exact: [1024, 29000, 1, 2366] + - Exact: [1024, 29000, 1, 2368] + - Exact: [1024, 29000, 1, 2374] + - Exact: [1024, 29000, 1, 2390] + - Exact: [1024, 29000, 1, 561] + - Exact: [1024, 29000, 1, 574] + - Exact: [1024, 29000, 1, 600] + - Exact: [1024, 29000, 1, 608] + - Exact: 
[1024, 29000, 1, 615] + - Exact: [1024, 29000, 1, 622] + - Exact: [1024, 29000, 1, 625] + - Exact: [1024, 29000, 1, 626] + - Exact: [1024, 29000, 1, 628] + - Exact: [1024, 29000, 1, 636] + - Exact: [1024, 29000, 1, 651] + - Exact: [1024, 29000, 1, 658] + - Exact: [1024, 29000, 1, 669] + - Exact: [1024, 29000, 1, 670] + - Exact: [1024, 29000, 1, 672] + - Exact: [1024, 29000, 1, 684] + - Exact: [1024, 29000, 1, 716] + - Exact: [1024, 29000, 1, 730] + - Exact: [2560, 2560, 1, 1024] + - Exact: [2560, 2560, 1, 2] + - Exact: [2560, 29000, 1, 109] + - Exact: [2560, 29000, 1, 121] + - Exact: [2560, 29000, 1, 27] + - Exact: [2560, 29000, 1, 35] + - Exact: [2560, 29000, 1, 36] + - Exact: [2560, 29000, 1, 39] + - Exact: [2560, 29000, 1, 40] + - Exact: [2560, 29000, 1, 42] + - Exact: [2560, 29000, 1, 43] + - Exact: [2560, 29000, 1, 44] + - Exact: [2560, 29000, 1, 46] + - Exact: [2560, 29000, 1, 48] + - Exact: [2560, 29000, 1, 49] + - Exact: [2560, 29000, 1, 50] + - Exact: [2560, 29000, 1, 51] + - Exact: [2560, 29000, 1, 53] + - Exact: [2560, 29000, 1, 54] + - Exact: [2560, 29000, 1, 55] + - Exact: [2560, 29000, 1, 56] + - Exact: [2560, 29000, 1, 57] + - Exact: [2560, 29000, 1, 58] + - Exact: [2560, 29000, 1, 59] + - Exact: [2560, 29000, 1, 61] + - Exact: [2560, 29000, 1, 63] + - Exact: [2560, 29000, 1, 65] + - Exact: [2560, 29000, 1, 66] + - Exact: [2560, 29000, 1, 67] + - Exact: [2560, 29000, 1, 69] + - Exact: [2560, 29000, 1, 70] + - Exact: [2560, 29000, 1, 71] + - Exact: [2560, 29000, 1, 73] + - Exact: [2560, 29000, 1, 74] + - Exact: [2560, 29000, 1, 75] + - Exact: [2560, 29000, 1, 77] + - Exact: [2560, 29000, 1, 78] + - Exact: [2560, 29000, 1, 80] + - Exact: [2560, 29000, 1, 81] + - Exact: [2560, 29000, 1, 82] + - Exact: [2560, 29000, 1, 83] + - Exact: [2560, 29000, 1, 84] + - Exact: [2560, 29000, 1, 88] + - Exact: [2560, 29000, 1, 89] + - Exact: [2560, 29000, 1, 90] + - Exact: [2560, 29000, 1, 92] + - Exact: [2560, 29000, 1, 95] + - Exact: [2560, 29000, 1, 98] + - Exact: 
[2560, 4096, 1, 1024] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 32768] + - Exact: [1024, 4096, 1, 32768] + - Exact: [1024, 50304, 1, 32768] + - Exact: [4096, 1024, 1, 32768] + - Exact: [1024, 128, 24, 1024] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 2560, 1, 8976] + - Exact: [256, 2816, 1, 8976] + - Exact: [256, 3328, 1, 8976] + - Exact: [256, 3584, 1, 8976] + - Exact: [256, 3840, 1, 8976] + - Exact: [256, 4096, 1, 8976] + - Exact: [256, 4352, 1, 8976] + - Exact: [480, 1024, 1, 32768] + - Exact: [1024, 256, 1, 21248] + - Exact: [1024, 256, 1, 21504] + - Exact: [1024, 256, 1, 22016] + - Exact: [1024, 256, 1, 28672] + - Exact: [1024, 256, 1, 33536] + - Exact: [1024, 512, 1, 32768] + - Exact: [1024, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 9216] + - Exact: [1024, 1024, 1, 9520] + - Exact: [1024, 1024, 1, 10064] + - Exact: [1024, 1024, 1, 10080] + - Exact: [1024, 1024, 1, 10200] + - Exact: [479, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 1024, 1, 9600] + - Exact: [1024, 1024, 1, 16384] + - Exact: [512, 256, 1, 55296] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 10496] + - Exact: [1024, 1024, 1, 10224] + - Exact: [1024, 1024, 1, 
10192] + - Exact: [1024, 1024, 1, 10208] + - Exact: [1024, 1024, 1, 10184] + - Exact: [1024, 1024, 1, 10120] + - Exact: [1024, 1024, 1, 10152] + - Exact: [1024, 1024, 1, 12288] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 5888, 1, 1280] + - Exact: [64, 5056, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [5888, 64, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [64, 5056, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [64, 6784, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [64, 5888, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [6784, 64, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 64] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [128, 64, 512, 128] + - Exact: [512, 64, 64, 512] + - Exact: [1024, 1024, 1, 4] + - Exact: [1024, 1024, 1, 32] + - Exact: [1024, 1024, 1, 2048] + - Exact: [1024, 1024, 1, 4096] + - 
Exact: [256, 1280, 1, 8976] + - Exact: [257, 4096, 1, 1024] + - Exact: [512, 2048, 1, 256] + - Exact: [560, 1024, 1, 200] + - Exact: [560, 1024, 1, 1600] + - Exact: [1024, 1024, 1, 200] + - Exact: [1024, 1024, 1, 512] + - Exact: [1024, 1024, 1, 960] + - Exact: [1024, 1024, 1, 1600] + - Exact: [2048, 256, 1, 1024] + - Exact: [1024, 1024, 1, 3840] + - Exact: [1024, 1024, 1, 3968] + - Exact: [1024, 1024, 1, 6528] + - Exact: [1024, 1024, 1, 7104] + - Exact: [1024, 1024, 1, 7200] + - Exact: [1024, 1024, 1, 8064] + - Exact: [1024, 1024, 1, 8160] + - Exact: [1024, 1024, 1, 3240] + - Exact: [1024, 1024, 1, 3960] + - Exact: [64, 1280, 64, 192] + - Exact: [64, 1280, 64, 320] + - Exact: [64, 1280, 64, 384] + - Exact: [64, 1280, 64, 448] + - Exact: [64, 2048, 64, 192] + - Exact: [64, 2048, 64, 320] + - Exact: [64, 2048, 64, 384] + - Exact: [64, 2048, 64, 448] + - Exact: [5329, 64, 64, 80] + - Exact: [64, 1280, 32, 192] + - Exact: [64, 1280, 32, 320] + - Exact: [64, 1280, 32, 384] + - Exact: [64, 1280, 32, 448] + - Exact: [64, 2048, 32, 192] + - Exact: [64, 2048, 32, 320] + - Exact: [64, 2048, 32, 384] + - Exact: [64, 2048, 32, 448] + - Exact: [5329, 64, 32, 80] + - Exact: [3136, 64, 32, 256] + - Exact: [3136, 64, 32, 64] + - Exact: [196, 256, 32, 1024] + - Exact: [3136, 64, 64, 128] + - Exact: [3136, 64, 32, 128] + - Exact: [960, 1024, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [64, 512, 16, 512] + - Exact: [1024, 1024, 1, 1] + - Exact: [1024, 1024, 1, 77] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 10] + - Exact: [1024, 1024, 1, 1280] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 1024, 1, 39] + - Exact: [1024, 1024, 1, 780] + - Exact: [1024, 1024, 1, 4992] + - Exact: [1024, 1024, 1, 308] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 1024, 1, 40] + - Exact: [1024, 1024, 1, 800] + - Exact: [1024, 1024, 1, 5120] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 1024, 1, 41] + - Exact: [1024, 1024, 1, 820] + - Exact: [1024, 1024, 1, 5248] + - Exact: 
[64, 512, 80, 512] + - Exact: [1024, 1024, 1, 5] + - Exact: [1024, 1024, 1, 385] + - Exact: [1024, 1024, 1, 2560] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 1024, 1, 6] + - Exact: [1024, 1024, 1, 462] + - Exact: [1024, 1024, 1, 3072] + - Exact: [64, 128, 128, 128] + - Exact: [1024, 1024, 1, 8] + - Exact: [1024, 1024, 1, 160] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 1024, 1, 9] + - Exact: [1024, 1024, 1, 180] + - Exact: [1024, 1024, 1, 1152] + - Exact: [2048, 512, 1, 1] + - Exact: [64, 1024, 32, 1024] + - Exact: [1024, 64, 128, 1024] + - Exact: [1024, 64, 32, 1024] + - Exact: [1024, 96, 64, 1024] + - Exact: [1024, 1024, 1, 16] + - Exact: [64, 512, 40, 512] + - Exact: [64, 1024, 256, 1024] + - Exact: [96, 1024, 64, 1024] + - Exact: [512, 64, 256, 512] + - Exact: [1024, 96, 128, 1024] + - Exact: [64, 512, 128, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [512, 64, 128, 512] + - Exact: [64, 1024, 128, 1024] + - Exact: [1024, 64, 64, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [1024, 64, 256, 1024] + - Exact: [512, 64, 40, 512] + - Exact: [1024, 1024, 1, 64] + - Exact: [64, 128, 1024, 128] + - Exact: [128, 64, 1024, 128] + - Exact: [1024, 1024, 1, 3456] + - Exact: [1024, 1024, 1, 6912] + - Exact: [1024, 1024, 1, 864] + - Exact: [1024, 512, 1, 3456] + - Exact: [1024, 512, 1, 4096] + - Exact: [1024, 512, 1, 6912] + - Exact: [1024, 512, 1, 864] + - Exact: [256, 3456, 1, 1] + - Exact: [256, 4096, 1, 1] + - Exact: [480, 1024, 1, 3456] + - Exact: [480, 1024, 1, 4096] + - Exact: [480, 1024, 1, 6912] + - Exact: [480, 1024, 1, 864] + - Exact: [1024, 1024, 1, 80] + - Exact: [64, 128, 1280, 128] + - Exact: [128, 64, 1280, 128] + - Exact: [1024, 1024, 1, 82] + - Exact: [128, 64, 1312, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [1024, 1024, 1, 12] + - Exact: [1024, 1024, 1, 6144] + - Exact: [64, 512, 192, 512] + - Exact: [512, 64, 192, 512] + - Exact: [784, 1152, 1, 128] + - Exact: [64, 128, 2048, 128] + - Exact: [128, 64, 
2048, 128] + - Exact: [1024, 1024, 1, 128] + - Exact: [128, 64, 1536, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [1024, 1024, 1, 96] + - Exact: [92416, 64, 25, 64] + - Exact: [50176, 64, 36, 64] + - Exact: [36864, 64, 49, 64] + - Exact: [25600, 64, 64, 64] + - Exact: [64, 128, 192, 128] + - Exact: [128, 64, 192, 128] + - Exact: [768, 768, 1, 2048] + - Exact: [64, 384, 144, 384] + - Exact: [384, 64, 144, 384] + - Exact: [768, 768, 1, 4608] + - Exact: [64, 512, 48, 512] + - Exact: [512, 64, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [128, 64, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [384, 64, 192, 384] + - Exact: [1024, 1024, 1, 4608] + - Exact: [196, 2304, 1, 256] + - Exact: [768, 512, 2, 2048] + - Exact: [672, 512, 2, 2048] + - Exact: [1008, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [840, 512, 2, 2048] + - Exact: [768, 256, 2, 12] + - Exact: [864, 256, 2, 3] + - Exact: [864, 256, 2, 12] + - Exact: [768, 256, 2, 3] + - Exact: [1024, 320, 1, 1024] + - Exact: [173280, 64, 1, 128] + - Exact: [25992, 64, 1, 128] + - Exact: [713, 512, 2, 2048] + - Exact: [660, 512, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [748, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [850, 256, 2, 3] + - Exact: [805, 256, 2, 12] + - Exact: [805, 256, 2, 3] + - Exact: [850, 256, 2, 12] + - Exact: [950, 256, 2, 12] + - Exact: [950, 256, 2, 3] + - Exact: [100, 512, 120, 128] + - Exact: [100, 512, 18, 128] + - Exact: [100, 512, 19, 128] + - Exact: [1444, 576, 1, 128] + - Exact: [27436, 64, 1, 128] + - Exact: [361, 2304, 1, 512] + - Exact: [96, 1024, 160, 1024] + - Exact: [1024, 96, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [1024, 96, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [1024, 96, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [1024, 96, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [1024, 96, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - 
Exact: [1024, 96, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [1024, 96, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [1024, 96, 32, 1024] + - Exact: [512, 64, 320, 512] + - Exact: [64, 512, 320, 512] + - Exact: [1024, 1024, 1, 20] + - Exact: [512, 64, 80, 512] + - Exact: [1024, 64, 512, 1024] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 256, 1, 32768] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 256, 1, 8448] + - Exact: [1024, 256, 1, 9728] + - Exact: [1024, 256, 1, 9984] + - Exact: [1024, 256, 1, 10496] + - Exact: [1024, 256, 1, 11520] + - Exact: [1024, 256, 1, 12032] + - Exact: [1024, 256, 1, 13568] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 14848] + - Exact: [1024, 256, 1, 15104] + - Exact: [1024, 256, 1, 15872] + - Exact: [1024, 256, 1, 16128] + - Exact: [1024, 256, 1, 17152] + - Exact: [1024, 256, 1, 17408] + - Exact: [1024, 256, 1, 18944] + - Exact: [1024, 256, 1, 19712] + - Exact: [1024, 256, 1, 19968] + - Exact: [256, 128, 1, 55296] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: 
["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [2368, 64, 1, 256] + - Exact: [2368, 64, 1, 3328] + - Exact: [64, 1408, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [64, 3584, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 2368, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [64, 2368, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [64, 4288, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 3328] + - Exact: [1856, 64, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [64, 2368, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [33, 32, 200, 33] + - Exact: [33, 32, 1600, 33] + - Exact: [67, 2048, 1, 512] + - Exact: [74, 2048, 1, 512] + - Exact: [74, 2048, 1, 960] + - Exact: [100, 2048, 1, 512] + - Exact: [512, 512, 1, 200] + - Exact: [512, 512, 1, 1600] + - Exact: [1024, 256, 1, 1024] + - Exact: [1024, 256, 1, 
1280] + - Exact: [1024, 256, 1, 2304] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 256, 1, 3072] + - Exact: [1024, 256, 1, 3328] + - Exact: [1024, 256, 1, 3584] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 256, 1, 4352] + - Exact: [1024, 256, 1, 4608] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 256, 1, 5376] + - Exact: [1024, 256, 1, 5632] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 256, 1, 6400] + - Exact: [1024, 256, 1, 7680] + - Exact: [1024, 256, 1, 7936] + - Exact: [32, 64, 4608, 32] + - Exact: [32, 64, 4608, 35] + - Exact: [34, 64, 4736, 24] + - Exact: [34, 64, 4736, 34] + - Exact: [35, 64, 4608, 35] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 32, 4608, 35] + - Exact: [64, 34, 4736, 24] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 35] + - Exact: [33, 64, 1920, 33] + - Exact: [64, 33, 1920, 33] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [256, 864, 1, 1] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 4096] + - Exact: [512, 256, 1, 6912] + - Exact: [512, 256, 1, 864] + - Exact: [49, 4608, 1, 512] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [56, 512, 64, 512] + - Exact: [176, 256, 2, 3] + - Exact: [176, 256, 2, 12] + - Exact: [216, 256, 2, 3] + - Exact: [192, 256, 2, 12] + - Exact: [192, 256, 2, 3] + - Exact: [216, 256, 2, 12] + - Exact: [228, 256, 2, 12] + - Exact: [228, 256, 2, 3] + - Exact: [187, 256, 2, 12] + - Exact: [247, 256, 2, 12] + - Exact: [187, 256, 2, 3] + - Exact: [221, 256, 2, 3] + - Exact: [221, 256, 2, 12] + - Exact: [247, 256, 2, 3] + - Exact: [100, 2304, 1, 512] + +# bodys smaSizeGSU + - # 
BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 128, 1, 32768] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 2, 1, 4] + - Exact: [2048, 2, 1, 8] + - Exact: [2560, 2, 1, 2] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: 
[True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [25, 1152, 1, 256] + - Exact: [9, 1152, 1, 256] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [13, 512, 1, 32768] + - Exact: [1024, 2, 1, 4992] + - Exact: [1024, 2, 1, 5120] + - Exact: [1024, 2, 1, 5248] + - Exact: [256, 128, 1, 6912] + - Exact: [13, 512, 1, 55296] + - Exact: [13, 512, 1, 6912] + - Exact: [768, 2, 1, 4608] + - Exact: [1024, 2, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 
16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 448, 1, 3328] + - Exact: [1, 64, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [1, 64, 1, 1280] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 64, 1, 1280] + - Exact: [1, 64, 1, 1] + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [448, 64, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [64, 256, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [1, 1, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [1, 1, 1, 1280] + - Exact: [1024, 64, 1, 256] + - Exact: [256, 64, 1, 3328] + - Exact: [64, 64, 1, 3328] + - Exact: [1, 1, 1, 3328] + - Exact: [64, 704, 1, 1280] + - Exact: [512, 16, 1, 512] + - Exact: [1024, 32, 1, 512] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 32, 1, 512] + - Exact: [14, 64, 1, 14] + - Exact: [15, 64, 1, 14] + - Exact: [15, 64, 1, 15] + - Exact: [15, 64, 1, 15] + - Exact: [17, 64, 1, 15] + - Exact: [17, 64, 1, 17] + - Exact: [17, 64, 1, 17] + - Exact: [21, 64, 1, 17] + - Exact: [21, 64, 1, 21] + - Exact: [24, 64, 1, 24] + - Exact: [30, 64, 1, 30] + - Exact: [30, 64, 1, 31] + - Exact: [31, 
64, 1, 31] + - Exact: [32, 64, 1, 32] + - Exact: [32, 64, 1, 35] + - Exact: [34, 64, 1, 24] + - Exact: [34, 64, 1, 34] + - Exact: [35, 64, 1, 35] + - Exact: [64, 14, 1, 14] + - Exact: [64, 15, 1, 14] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 17, 1, 15] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 21, 1, 17] + - Exact: [64, 21, 1, 21] + - Exact: [64, 24, 1, 24] + - Exact: [64, 30, 1, 30] + - Exact: [64, 30, 1, 31] + - Exact: [64, 31, 1, 31] + - Exact: [64, 32, 1, 32] + - Exact: [64, 32, 1, 35] + - Exact: [64, 34, 1, 24] + - Exact: [64, 34, 1, 34] + - Exact: [64, 35, 1, 35] + - Exact: [64, 512, 1, 512] + - Exact: [512, 64, 1, 512] + - Exact: [1024, 2, 1, 4] + - Exact: [1024, 2, 1, 32] + - Exact: [1024, 2, 1, 2048] + - Exact: [3, 64, 512, 3] + - Exact: [5, 64, 512, 5] + - Exact: [5, 64, 960, 5] + - Exact: [9, 64, 512, 9] + - Exact: [27, 128, 32768, 27] + - Exact: [512, 32, 1, 200] + - Exact: [512, 32, 1, 1600] + - Exact: [1024, 64, 1, 512] + - Exact: [1024, 64, 1, 960] + - Exact: [14, 64, 10880, 14] + - Exact: [15, 64, 10880, 14] + - Exact: [15, 64, 7680, 15] + - Exact: [15, 64, 10880, 15] + - Exact: [17, 64, 7680, 15] + - Exact: [17, 64, 6144, 17] + - Exact: [17, 64, 7680, 17] + - Exact: [21, 64, 6144, 17] + - Exact: [21, 64, 6144, 21] + - Exact: [24, 64, 4736, 24] + - Exact: [30, 64, 2048, 30] + - Exact: [30, 64, 2048, 31] + - Exact: [31, 64, 2048, 31] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 15, 10880, 14] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 17, 7680, 15] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 21, 6144, 17] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 30, 2048, 31] + - Exact: [64, 31, 2048, 31] + - Exact: [27, 64, 1920, 27] + - Exact: [27, 64, 1920, 33] + - Exact: [64, 27, 1920, 27] + - Exact: [64, 27, 1920, 33] + - Exact: [1024, 2, 1, 1] + - Exact: [1024, 2, 1, 512] + - 
Exact: [1024, 2, 1, 10] + - Exact: [1024, 2, 1, 1280] + - Exact: [1024, 2, 1, 39] + - Exact: [1024, 2, 1, 40] + - Exact: [1024, 2, 1, 41] + - Exact: [1024, 2, 1, 5] + - Exact: [1024, 2, 1, 2560] + - Exact: [1024, 2, 1, 6] + - Exact: [1024, 2, 1, 3072] + - Exact: [1024, 2, 1, 8] + - Exact: [1024, 2, 1, 1024] + - Exact: [1024, 2, 1, 9] + - Exact: [1024, 2, 1, 1152] + - Exact: [4, 64, 32768, 4] + - Exact: [4, 64, 38400, 4] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [64, 128, 1, 128] + - Exact: [128, 64, 1, 128] + - Exact: [5, 64, 1, 5] + - Exact: [33, 32, 1, 33] + - Exact: [1024, 2, 1, 16] + - Exact: [1024, 2, 1, 64] + - Exact: [256, 128, 1, 3456] + - Exact: [256, 128, 1, 4096] + - Exact: [256, 128, 1, 864] + - Exact: [1024, 2, 1, 80] + - Exact: [1024, 2, 1, 82] + - Exact: [1024, 2, 1, 12] + - Exact: [13, 512, 1, 3456] + - Exact: [13, 512, 1, 4096] + - Exact: [13, 512, 1, 864] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [1024, 2, 1, 128] + - Exact: [1024, 2, 1, 96] + - Exact: [768, 2, 1, 2048] + - Exact: [1024, 81, 1, 1024] + - Exact: [25, 256, 120, 128] + - Exact: [25, 256, 18, 128] + - Exact: [25, 256, 19, 128] + - Exact: [9, 256, 120, 128] + - Exact: [9, 256, 18, 128] + - Exact: [9, 256, 19, 128] + - Exact: [1024, 2, 1, 20] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml new file mode 100644 index 000000000..64e1ffcaa --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml @@ -0,0 +1,3226 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: 
False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 1024, 1, 1] + - Exact: [5056, 1408, 1, 3328] + - Exact: [5056, 1856, 1, 3328] + - Exact: [448, 3584, 1, 3328] + - Exact: [5056, 4288, 1, 32] + - Exact: [3584, 1024, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 3328] + - Exact: [448, 3584, 1, 32] + - Exact: [4288, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 32] + - Exact: [1024, 2944, 1, 1] + - Exact: [2944, 3584, 1, 3328] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 32] + - Exact: [5056, 256, 1, 256] + - Exact: [1856, 2368, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [3584, 4288, 1, 32] + - Exact: 
[3584, 3584, 1, 1] + - Exact: [1408, 2368, 1, 1] + - Exact: [5056, 6784, 1, 1280] + - Exact: [4288, 5056, 1, 1] + - Exact: [5056, 4288, 1, 1] + - Exact: [1408, 4288, 1, 1280] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 1] + - Exact: [704, 3584, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1408, 1024, 1, 1280] + - Exact: [5056, 5888, 1, 3328] + - Exact: [3584, 3584, 1, 1280] + - Exact: [2368, 3584, 1, 32] + - Exact: [2944, 2368, 1, 1] + - Exact: [704, 4288, 1, 1] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1024, 3584, 1, 1] + - Exact: [256, 5056, 1, 32] + - Exact: [2368, 5056, 1, 32] + - Exact: [6784, 1856, 1, 32] + - Exact: [5056, 704, 1, 1] + - Exact: [2944, 4288, 1, 256] + - Exact: [5056, 704, 1, 32] + - Exact: [1856, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 32] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 1] + - Exact: [1856, 2944, 1, 1] + - Exact: [1856, 2368, 1, 32] + - Exact: [4288, 1856, 1, 32] + - Exact: [5056, 2368, 1, 256] + - Exact: [1408, 5888, 1, 256] + - Exact: [5056, 6784, 1, 1] + - Exact: [1024, 1408, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [704, 2368, 1, 1] + - Exact: [3584, 4288, 1, 1280] + - Exact: [3584, 2368, 1, 1] + - Exact: [4288, 448, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [2368, 4288, 1, 32] + - Exact: [704, 5056, 1, 1280] + - Exact: [3584, 6784, 1, 32] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 4288, 1, 3328] + - Exact: [1408, 3584, 1, 1] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [3584, 704, 1, 1] + - Exact: [448, 5056, 1, 1] + - Exact: [5888, 5888, 1, 256] + - Exact: [3584, 704, 1, 32] + - Exact: [448, 6784, 1, 3328] + - Exact: [6784, 4288, 1, 1] + - Exact: [3584, 6784, 1, 1] + - Exact: [1408, 2368, 1, 32] + - Exact: [448, 5056, 1, 32] + - Exact: [4288, 4288, 1, 1280] + - Exact: [6784, 1408, 1, 1] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1856, 1, 3328] + - 
Exact: [5056, 5888, 1, 1] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [3584, 3584, 1, 32] + - Exact: [2944, 2944, 1, 1280] + - Exact: [1408, 5056, 1, 1] + - Exact: [2368, 6784, 1, 1] + - Exact: [6784, 4288, 1, 1280] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [704, 2944, 1, 3328] + - Exact: [5888, 256, 1, 1] + - Exact: [5056, 6784, 1, 32] + - Exact: [448, 5056, 1, 1280] + - Exact: [256, 5888, 1, 3328] + - Exact: [5888, 1024, 1, 1] + - Exact: [5888, 448, 1, 32] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [448, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 1] + - Exact: [1408, 1856, 1, 3328] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 5888, 1, 3328] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1408, 1024, 1, 1] + - Exact: [1408, 1024, 1, 256] + - Exact: [5056, 3584, 1, 1] + - Exact: [6784, 6784, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [5056, 3584, 1, 32] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [5888, 4288, 1, 1] + - Exact: [5056, 704, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [2944, 1856, 1, 1] + - Exact: [5888, 4288, 1, 1280] + - Exact: [5888, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [2944, 1856, 1, 1280] + - Exact: [1024, 2368, 1, 1] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 4288, 1, 256] + - Exact: [448, 3584, 1, 1] + - Exact: [2368, 2944, 1, 32] + - Exact: [4288, 704, 1, 256] + - Exact: [1856, 1024, 1, 256] + - Exact: [704, 6784, 1, 32] + - Exact: [1024, 4288, 1, 1] + - Exact: [1408, 5888, 1, 1280] + - Exact: [5056, 1856, 1, 256] + - Exact: [6784, 704, 1, 1280] + - 
Exact: [5888, 1024, 1, 256] + - Exact: [6784, 1856, 1, 3328] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 6784, 1, 32] + - Exact: [6784, 6784, 1, 32] + - Exact: [6784, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [6784, 448, 1, 1] + - Exact: [6784, 1856, 1, 1] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 448, 1, 32] + - Exact: [6784, 448, 1, 3328] + - Exact: [448, 3584, 1, 1280] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [5888, 704, 1, 1] + - Exact: [3584, 1856, 1, 1] + - Exact: [5056, 2944, 1, 32] + - Exact: [4288, 6784, 1, 1] + - Exact: [1024, 6784, 1, 1] + - Exact: [2368, 5888, 1, 32] + - Exact: [3584, 4288, 1, 1] + - Exact: [5888, 1024, 1, 3328] + - Exact: [6784, 5888, 1, 256] + - Exact: [5056, 1024, 1, 1] + - Exact: [4288, 2368, 1, 32] + - Exact: [704, 3584, 1, 1] + - Exact: [6784, 704, 1, 32] + - Exact: [704, 5888, 1, 256] + - Exact: [2368, 3584, 1, 1280] + - Exact: [3584, 5056, 1, 32] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 3328] + - Exact: [2368, 5056, 1, 1] + - Exact: [5888, 1408, 1, 256] + - Exact: [2368, 1024, 1, 32] + - Exact: [4288, 1024, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [5056, 5888, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5888, 5056, 1, 1280] + - Exact: [5888, 2944, 1, 1] + - Exact: [1408, 4288, 1, 3328] + - Exact: [704, 2944, 1, 32] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 448, 1, 256] + - Exact: [256, 6784, 1, 256] + - Exact: [1024, 1408, 1, 1] + - Exact: [256, 5888, 1, 1] + - Exact: [2944, 2944, 1, 1] + - Exact: [6784, 3584, 1, 256] + - Exact: [1408, 1856, 1, 256] + - Exact: [2944, 2944, 1, 32] + - Exact: [2944, 2944, 1, 3328] + - Exact: [6784, 1408, 1, 32] + - Exact: [2368, 6784, 1, 3328] + - Exact: [4288, 
3584, 1, 32] + - Exact: [3584, 704, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [2944, 5888, 1, 32] + - Exact: [3584, 5056, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [4288, 4288, 1, 256] + - Exact: [448, 5056, 1, 256] + - Exact: [4288, 704, 1, 1280] + - Exact: [2368, 704, 1, 1] + - Exact: [1408, 1856, 1, 1280] + - Exact: [3584, 4288, 1, 3328] + - Exact: [448, 4288, 1, 32] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 1024, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 32] + - Exact: [5056, 3584, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [2368, 3584, 1, 1] + - Exact: [4288, 2368, 1, 256] + - Exact: [1408, 2944, 1, 3328] + - Exact: [5888, 3584, 1, 1] + - Exact: [6784, 5056, 1, 3328] + - Exact: [6784, 5056, 1, 1] + - Exact: [5888, 3584, 1, 32] + - Exact: [5888, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 256] + - Exact: [6784, 5888, 1, 32] + - Exact: [2368, 6784, 1, 32] + - Exact: [5056, 1408, 1, 1280] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2944, 3584, 1, 1] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3584, 1024, 1, 1] + - Exact: [2944, 1856, 1, 3328] + - Exact: [2944, 3584, 1, 32] + - Exact: [5888, 256, 1, 32] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 1] + - Exact: [1024, 1856, 1, 32] + - Exact: [5888, 2368, 1, 1] + - Exact: [2368, 2368, 1, 1] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 32] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 704, 1, 3328] + - Exact: [1856, 1856, 1, 32] + - Exact: [4288, 2944, 1, 32] + - Exact: [256, 5056, 1, 1] + - Exact: [5056, 5056, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [4288, 704, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [448, 3584, 1, 
256] + - Exact: [6784, 256, 1, 32] + - Exact: [6784, 1408, 1, 1280] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 1408, 1, 1280] + - Exact: [5888, 1856, 1, 32] + - Exact: [5888, 704, 1, 3328] + - Exact: [448, 6784, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [1856, 1408, 1, 32] + - Exact: [5888, 2944, 1, 1280] + - Exact: [448, 5888, 1, 1] + - Exact: [3584, 1408, 1, 1] + - Exact: [448, 5888, 1, 32] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 6784, 1, 1] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 32] + - Exact: [1856, 6784, 1, 1280] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 6784, 1, 32] + - Exact: [3584, 5888, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1856, 2368, 1, 1] + - Exact: [1408, 5056, 1, 3328] + - Exact: [5056, 4288, 1, 256] + - Exact: [5056, 5056, 1, 32] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [4288, 5888, 1, 1] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2368, 4288, 1, 1] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 5888, 1, 32] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1024, 5888, 1, 32] + - Exact: [1024, 5888, 1, 3328] + - Exact: [5056, 2368, 1, 32] + - Exact: [1408, 2368, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [704, 5056, 1, 32] + - Exact: [5056, 4288, 1, 1280] + - Exact: [4288, 448, 1, 1] + - Exact: [5888, 5888, 1, 1] + - Exact: [2944, 704, 1, 1280] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1] + - Exact: [5056, 256, 1, 32] + - Exact: [5056, 1024, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 4288, 1, 32] + - Exact: [5888, 448, 1, 1] + - Exact: [5056, 5056, 1, 1280] + - Exact: [6784, 1408, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5888, 1408, 1, 32] + - Exact: [256, 6784, 1, 3328] 
+ - Exact: [6784, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 1] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 1408, 1, 32] + - Exact: [1408, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [704, 2368, 1, 32] + - Exact: [704, 6784, 1, 1] + - Exact: [2368, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 2944, 1, 32] + - Exact: [5888, 2368, 1, 3328] + - Exact: [2368, 704, 1, 1280] + - Exact: [1024, 1408, 1, 1280] + - Exact: [2944, 5056, 1, 32] + - Exact: [704, 2368, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1024, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [1856, 1856, 1, 1280] + - Exact: [4288, 704, 1, 1] + - Exact: [1856, 1024, 1, 1] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 32] + - Exact: [1856, 1024, 1, 32] + - Exact: [2944, 6784, 1, 1] + - Exact: [6784, 2368, 1, 32] + - Exact: [5888, 5056, 1, 1] + - Exact: [704, 5888, 1, 1] + - Exact: [6784, 6784, 1, 1] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 32] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [1408, 1408, 1, 32] + - Exact: [1408, 1408, 1, 3328] + - Exact: [2944, 1856, 1, 256] + - Exact: [4288, 2944, 1, 1] + - Exact: [6784, 5056, 1, 32] + - Exact: [2944, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [2368, 5888, 1, 1] + - Exact: [1408, 1856, 1, 32] + - Exact: [1856, 6784, 1, 3328] + - Exact: [1024, 2368, 1, 32] + - Exact: [2368, 2368, 1, 3328] + - Exact: [3584, 5888, 1, 32] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [3584, 1024, 1, 32] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 5888, 1, 32] + - Exact: [1856, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 1] + - Exact: [704, 4288, 1, 3328] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 32] + - Exact: 
[5888, 1856, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [1856, 5056, 1, 1] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 448, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [1856, 5056, 1, 32] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 2944, 1, 32] + - Exact: [1408, 6784, 1, 256] + - Exact: [1024, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1] + - Exact: [2944, 5888, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 704, 1, 256] + - Exact: [6784, 4288, 1, 256] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [4288, 1024, 1, 32] + - Exact: [5888, 1856, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 704, 1, 32] + - Exact: [2944, 704, 1, 1] + - Exact: [6784, 1856, 1, 256] + - Exact: [1856, 1856, 1, 1] + - Exact: [2944, 704, 1, 3328] + - Exact: [2368, 1856, 1, 32] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 448, 1, 3328] + - Exact: [256, 6784, 1, 1] + - Exact: [1024, 3584, 1, 32] + - Exact: [256, 6784, 1, 32] + - Exact: [2944, 1408, 1, 32] + - Exact: [4288, 3584, 1, 1] + - Exact: [5056, 448, 1, 3328] + - Exact: [6784, 3584, 1, 32] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [2944, 5888, 1, 1] + - Exact: [1024, 1856, 1, 3328] + - Exact: [5888, 1024, 1, 32] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5056, 6784, 1, 256] + - Exact: [2944, 5056, 1, 1] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [2368, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 1] + - Exact: [2944, 1024, 1, 32] + - Exact: [2944, 1024, 1, 1280] + - Exact: [5056, 5056, 1, 1] + - Exact: [2368, 4288, 1, 256] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [3584, 2368, 1, 32] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 256] + - Exact: [1408, 1024, 1, 3328] + - Exact: 
[5056, 2368, 1, 1280] + - Exact: [2944, 1024, 1, 1] + - Exact: [3584, 704, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [4288, 2368, 1, 1] + - Exact: [1408, 3584, 1, 32] + - Exact: [2944, 4288, 1, 32] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1] + - Exact: [1024, 1856, 1, 1] + - Exact: [704, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 1408, 1, 1] + - Exact: [4288, 5056, 1, 1280] + - Exact: [1856, 1856, 1, 3328] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [5888, 3584, 1, 256] + - Exact: [1024, 5056, 1, 32] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 1408, 1, 256] + - Exact: [3584, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 1280] + - Exact: [3584, 5888, 1, 1] + - Exact: [5888, 4288, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2944, 3584, 1, 256] + - Exact: [5888, 1856, 1, 1] + - Exact: [6784, 2368, 1, 3328] + - Exact: [1408, 4288, 1, 32] + - Exact: [1856, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 256] + - Exact: [5056, 1856, 1, 1] + - Exact: [5888, 2368, 1, 256] + - Exact: [1408, 1024, 1, 32] + - Exact: [5056, 1856, 1, 32] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [5888, 6784, 1, 1] + - Exact: [5888, 4288, 1, 32] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 256, 1, 1] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [2944, 1856, 1, 32] + - Exact: [2368, 1408, 1, 256] + - Exact: [2368, 1856, 1, 1] + - Exact: [4288, 1408, 1, 1] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1408, 2944, 1, 1] + - Exact: [4288, 1408, 1, 32] + - Exact: [5888, 2944, 1, 256] + - Exact: [1408, 2944, 1, 32] + - Exact: [5888, 6784, 1, 256] + - Exact: [6784, 5888, 1, 1] + - Exact: [6784, 5888, 1, 1280] + - Exact: [1024, 4288, 1, 32] + - Exact: [3584, 5888, 1, 256] + - Exact: [5056, 
2368, 1, 1] + - Exact: [5056, 448, 1, 1] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [5056, 448, 1, 32] + - Exact: [3584, 2944, 1, 32] + - Exact: [3584, 1856, 1, 32] + - Exact: [4288, 1408, 1, 1280] + - Exact: [6784, 2368, 1, 1] + - Exact: [704, 5056, 1, 1] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5888, 1408, 1, 1] + - Exact: [1024, 4288, 1, 1280] + - Exact: [1856, 4288, 1, 1] + - Exact: [3584, 4288, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [704, 5056, 1, 256] + - Exact: [1856, 4288, 1, 32] + - Exact: [4288, 1024, 1, 1280] + - Exact: [4288, 6784, 1, 32] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [4288, 5888, 1, 256] + - Exact: [3584, 3584, 1, 3328] + - Exact: [2944, 6784, 1, 32] + - Exact: [5056, 256, 1, 1] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [1024, 1408, 1, 32] + - Exact: [6784, 704, 1, 1] + - Exact: [704, 3584, 1, 32] + - Exact: [4288, 4288, 1, 1] + - Exact: [5056, 2944, 1, 1] + - Exact: [6784, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2368, 1856, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [3584, 448, 1, 1] + - Exact: [2944, 1024, 1, 3328] + - Exact: [5888, 5056, 1, 32] + - Exact: [704, 2944, 1, 1] + - Exact: [3584, 448, 1, 32] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [1856, 2368, 1, 1280] + - Exact: [6784, 1024, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 1408, 1, 1] + - Exact: [1408, 4288, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [448, 6784, 1, 1] + - Exact: [704, 2944, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 6784, 1, 32] + - Exact: [1408, 1856, 1, 1] + - Exact: [4288, 448, 1, 32] + - 
Exact: [4288, 448, 1, 1280] + - Exact: [2944, 704, 1, 32] + - Exact: [448, 4288, 1, 1] + - Exact: [3584, 5056, 1, 1] + - Exact: [1408, 3584, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2368, 1, 32] + - Exact: [5888, 2944, 1, 32] + - Exact: [1856, 2944, 1, 32] + - Exact: [5056, 1408, 1, 1] + - Exact: [5888, 1408, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [6784, 1024, 1, 1] + - Exact: [6784, 1024, 1, 32] + - Exact: [6784, 3584, 1, 1] + - Exact: [2944, 2368, 1, 32] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1408, 1, 256] + - Exact: [5056, 1024, 1, 32] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 1] + - Exact: [256, 5888, 1, 32] + - Exact: [4288, 5056, 1, 32] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1856, 5888, 1, 32] + - Exact: [1856, 5888, 1, 1280] + - Exact: [704, 2368, 1, 1280] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [2944, 4288, 1, 1] + - Exact: [5056, 5888, 1, 32] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 6784, 1, 32] + - Exact: [3584, 2944, 1, 3328] + - Exact: [1408, 5888, 1, 1] + - Exact: [704, 4288, 1, 32] + - Exact: [1408, 5888, 1, 32] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 2368, 1, 256] + - Exact: [1408, 6784, 1, 1] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1024, 5888, 1, 1] + - Exact: [1856, 3584, 1, 256] + - Exact: [2368, 704, 1, 32] + - Exact: [256, 5888, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [5056, 2368, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [2560, 7000, 1, 2560] + - Exact: [7680, 12000, 1, 2560] + - Exact: [5124, 9124, 1, 1760] + - Exact: [512, 24000, 1, 1536] + - Exact: [3072, 24000, 1, 1024] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 48000, 1, 2048] + - Exact: [2048, 1600, 1, 2048] + - Exact: [512, 48000, 1, 1536] + 
- Exact: [8448, 5984, 1, 2816] + - Exact: [4096, 3200, 1, 1024] + - Exact: [1024, 24000, 1, 2560] + - Exact: [1760, 6400, 1, 1760] + - Exact: [5124, 9124, 1, 2048] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1024, 48000, 1, 2560] + - Exact: [8448, 48000, 1, 2816] + - Exact: [2560, 3200, 1, 2560] + - Exact: [16384, 800, 1, 4096] + - Exact: [4608, 24000, 1, 1536] + - Exact: [7680, 48000, 1, 2560] + - Exact: [3072, 48000, 1, 1024] + - Exact: [8192, 3200, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [4096, 400, 1, 1024] + - Exact: [6144, 48000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [2048, 800, 1, 512] + - Exact: [4608, 5984, 1, 1536] + - Exact: [4096, 1600, 1, 1024] + - Exact: [6144, 5984, 1, 2048] + - Exact: [7680, 24000, 1, 2560] + - Exact: [6144, 48000, 1, 2048] + - Exact: [2048, 3200, 1, 2048] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 24000, 1, 1536] + - Exact: [2560, 6400, 1, 2560] + - Exact: [512, 24000, 1, 2560] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 2560] + - Exact: [2048, 1600, 1, 512] + - Exact: [2048, 7000, 1, 2048] + - Exact: [1760, 800, 1, 1760] + - Exact: [2560, 1600, 1, 2560] + - Exact: [2048, 3200, 1, 512] + - Exact: [2560, 800, 1, 2560] + - Exact: [4608, 12000, 1, 1536] + - Exact: [6144, 24000, 1, 2048] + - Exact: [8192, 800, 1, 2048] + - Exact: [5124, 9124, 1, 4096] + - Exact: [8448, 24000, 1, 2816] + - Exact: [1024, 48000, 1, 1536] + - Exact: [8192, 1600, 1, 2048] + - Exact: [4096, 800, 1, 1024] + - Exact: [2048, 800, 1, 2048] + - Exact: [1760, 3200, 1, 1760] + - Exact: [512, 48000, 1, 2560] + - Exact: [512, 24000, 1, 2048] + - Exact: [16384, 1600, 1, 4096] + - Exact: [1024, 24000, 1, 2048] + - Exact: [8192, 400, 1, 2048] + - Exact: [2048, 6400, 1, 2048] + - Exact: [6144, 12000, 1, 2048] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 48000, 1, 2816] + - Exact: [4096, 7000, 1, 4096] + - Exact: [6144, 24000, 1, 2560] + - Exact: [8448, 12000, 1, 2816] + - Exact: [16384, 400, 1, 4096] + - 
Exact: [1760, 1600, 1, 1760] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2049] + - Exact: [8192, 8191, 1, 8192] + - Exact: [8192, 8192, 1, 8192] + - Exact: [2047, 2048, 1, 2048] + - Exact: [2048, 2049, 1, 2048] + - Exact: [8192, 8192, 1, 8191] + - Exact: [3072, 513, 1, 3072] + - Exact: [8191, 8192, 1, 8192] + - Exact: [8192, 8193, 1, 8192] + - Exact: [4096, 4097, 1, 4096] + - Exact: [8192, 8192, 1, 8193] + - Exact: [4096, 4095, 1, 4096] + - Exact: [4096, 4096, 1, 4097] + - Exact: [2048, 2048, 1, 2048] + - Exact: [4095, 4096, 1, 4096] + - Exact: [8193, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4095] + - Exact: [3072, 511, 1, 3072] + - Exact: [2049, 2048, 1, 2048] + - Exact: [2048, 2047, 1, 2048] + - Exact: [2048, 2048, 1, 2047] + - Exact: [4097, 4096, 1, 4096] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [1024, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [30528, 2048, 1, 1024] + - Exact: [30528, 4096, 1, 1024] + - Exact: [128, 32768, 1, 256] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + - Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - 
Exact: [256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 32768, 1, 512] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [512, 32768, 1, 13] + - Exact: [512, 32768, 1, 1024] + - Exact: [684, 8976, 1, 256] + - Exact: [1024, 1600, 1, 560] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 32768, 1, 480] + - Exact: [1024, 32768, 1, 1024] + - Exact: [1280, 8976, 1, 256] + - Exact: [1792, 8976, 1, 256] + - Exact: [2048, 684, 1, 512] + - Exact: [2048, 684, 1, 768] + - Exact: [2048, 960, 1, 74] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1536, 1, 512] + - Exact: [2048, 1536, 1, 768] + - Exact: [2048, 8976, 1, 256] + - Exact: [2304, 8976, 1, 256] + - Exact: [2560, 8976, 1, 256] + - Exact: [2816, 8976, 1, 256] + - Exact: [3072, 8976, 1, 256] + - Exact: [3328, 8976, 1, 256] + - Exact: [3840, 8976, 1, 256] + - Exact: [4096, 8976, 1, 256] + - Exact: [4352, 8976, 1, 256] + - Exact: [4608, 8976, 1, 256] + - Exact: [4864, 8976, 1, 256] + - Exact: [5120, 8976, 1, 256] + - Exact: [5376, 8976, 1, 256] + - Exact: [5632, 8976, 1, 256] + - Exact: [5888, 8976, 1, 256] + - Exact: [6144, 8976, 1, 256] + - Exact: [6400, 8976, 1, 256] + - Exact: [7168, 8976, 1, 256] + - Exact: [7936, 8976, 1, 256] + - Exact: [8192, 8976, 1, 256] + - Exact: [8448, 8976, 1, 256] + - Exact: [8960, 8976, 1, 256] + - Exact: [9472, 8976, 1, 256] + - Exact: [9728, 8976, 1, 256] + - Exact: [9984, 8976, 1, 256] + - Exact: [10240, 8976, 1, 256] + - Exact: [10496, 8976, 1, 256] + - Exact: [11264, 8976, 1, 256] + - Exact: [11776, 8976, 1, 256] + - Exact: [12544, 8976, 1, 256] + - Exact: [13312, 8976, 1, 256] + - Exact: [13568, 8976, 1, 256] 
+ - Exact: [13824, 8976, 1, 256] + - Exact: [15104, 8976, 1, 256] + - Exact: [15360, 8976, 1, 256] + - Exact: [15872, 8976, 1, 256] + - Exact: [16128, 8976, 1, 256] + - Exact: [17152, 8976, 1, 256] + - Exact: [18176, 8976, 1, 256] + - Exact: [18688, 8976, 1, 256] + - Exact: [18944, 8976, 1, 256] + - Exact: [19712, 8976, 1, 256] + - Exact: [19968, 8976, 1, 256] + - Exact: [20480, 8976, 1, 256] + - Exact: [20992, 8976, 1, 256] + - Exact: [21248, 8976, 1, 256] + - Exact: [23552, 8976, 1, 256] + - Exact: [28672, 8976, 1, 256] + - Exact: [31488, 8976, 1, 256] + - Exact: [33536, 8976, 1, 256] + - Exact: [44505, 8976, 1, 256] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [42720, 3968, 1, 1024] + - Exact: [42720, 6528, 1, 1024] + - Exact: [42720, 7104, 1, 1024] + - Exact: [42720, 7200, 1, 1024] + - Exact: [42720, 9520, 1, 1024] + - 
Exact: [42720, 10080, 1, 1024] + - Exact: [1024, 3240, 1, 1024] + - Exact: [1024, 3240, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3960, 1, 4096] + - Exact: [4096, 3240, 1, 1024] + - Exact: [4096, 3960, 1, 1024] + - Exact: [42720, 3960, 1, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [512, 512, 16, 64] + - Exact: [512, 512, 128, 64] + - Exact: [4096, 512, 1, 1024] + - Exact: [30522, 616, 1, 1024] + - Exact: [128, 128, 128, 64] + - Exact: [128, 128, 160, 64] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [30522, 160, 1, 1024] + - Exact: [30522, 200, 1, 1024] + - Exact: [128, 128, 624, 64] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [30522, 780, 1, 1024] + - Exact: [30522, 308, 1, 1024] + - Exact: [128, 128, 640, 64] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [30522, 800, 1, 1024] + - Exact: [128, 128, 656, 64] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [30522, 820, 1, 1024] + - Exact: [512, 512, 80, 64] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [30522, 385, 1, 1024] + - Exact: [512, 512, 96, 64] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3072, 1, 1024] + - Exact: [30522, 462, 1, 1024] + - Exact: [4096, 1024, 1, 1024] + - Exact: [128, 128, 144, 64] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [30522, 180, 1, 1024] + - Exact: [1024, 32768, 1, 479] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 9600, 1, 1024] + - Exact: 
[1024, 9600, 1, 4096] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [33712, 8192, 1, 1024] + - Exact: [33712, 9600, 1, 1024] + - Exact: [1024, 1024, 128, 96] + - Exact: [30592, 4096, 1, 1024] + - Exact: [1536, 8192, 1, 1536] + - Exact: [3072, 8192, 1, 1024] + - Exact: [3072, 2048, 1, 1024] + - Exact: [50304, 8192, 1, 1024] + - Exact: [2048, 1024, 1, 8192] + - Exact: [50304, 2048, 1, 1024] + - Exact: [1536, 8192, 1, 6144] + - Exact: [50304, 4096, 1, 1536] + - Exact: [8192, 1024, 1, 2048] + - Exact: [2560, 2048, 1, 640] + - Exact: [1024, 1024, 128, 64] + - Exact: [2048, 1024, 1, 2048] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 64] + - Exact: [30592, 8192, 1, 1024] + - Exact: [50304, 16384, 1, 1024] + - Exact: [4608, 4096, 1, 1536] + - Exact: [2560, 2048, 1, 2560] + - Exact: [7680, 2048, 1, 2560] + - Exact: [50304, 4096, 1, 1024] + - Exact: [1920, 2048, 1, 2560] + - Exact: [1024, 1024, 64, 96] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 6144] + - Exact: [512, 512, 256, 64] + - Exact: [50304, 8192, 1, 1536] + - Exact: [6144, 8192, 1, 1536] + - Exact: [4096, 16384, 1, 1024] + - Exact: [30592, 1024, 1, 2048] + - Exact: [1024, 16384, 1, 4096] + - Exact: [512, 512, 40, 64] + - Exact: [6144, 1024, 1, 2048] + - Exact: [4608, 8192, 1, 1536] + - Exact: [30592, 2048, 1, 1024] + - Exact: [3072, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 1024, 32, 64] + - Exact: [3072, 4096, 1, 1024] + - Exact: [30528, 8192, 1, 1024] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 480] + - Exact: [1024, 4096, 1, 480] + - Exact: [1024, 6912, 1, 1024] + - Exact: [1024, 6912, 1, 480] + - Exact: [128, 55296, 1, 256] + - Exact: [256, 55296, 1, 512] + - Exact: [256, 6912, 1, 512] + - Exact: [512, 3456, 1, 1024] + - Exact: [512, 3456, 1, 13] + - Exact: [512, 4096, 1, 1024] + - Exact: [512, 4096, 1, 13] + - Exact: [512, 55296, 1, 13] + - 
Exact: [512, 6912, 1, 1024] + - Exact: [512, 6912, 1, 13] + - Exact: [30528, 640, 1, 1024] + - Exact: [30528, 1280, 1, 1024] + - Exact: [30528, 1600, 1, 1024] + - Exact: [1024, 10240, 1, 1024] + - Exact: [4096, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 10496, 1, 4096] + - Exact: [30528, 1640, 1, 1024] + - Exact: [4096, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [30528, 160, 1, 1024] + - Exact: [30528, 240, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 4096] + - Exact: [512, 512, 192, 64] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [4096, 10224, 1, 1024] + - Exact: [3072, 10224, 1, 1024] + - Exact: [3072, 10240, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [4096, 10192, 1, 1024] + - Exact: [3072, 10192, 1, 1024] + - Exact: [3072, 10200, 1, 1024] + - Exact: [1024, 10184, 1, 1024] + - Exact: [3072, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 4096] + - Exact: [4096, 10208, 1, 1024] + - Exact: [2048, 10224, 1, 1024] + - Exact: [2048, 10240, 1, 1024] + - Exact: [1024, 10120, 1, 1024] + - Exact: [2048, 10192, 1, 1024] + - Exact: [1024, 10152, 1, 1024] + - Exact: [3072, 10080, 1, 1024] + - Exact: [1024, 2048, 1, 49] + - Exact: [4608, 512, 1, 49] + - Exact: [256, 256, 25, 12544] + - Exact: [256, 256, 49, 3200] + - Exact: [256, 256, 25, 6272] + - Exact: [256, 256, 49, 6400] + - Exact: [512, 512, 49, 1152] + - Exact: [512, 512, 25, 2048] + - Exact: [512, 512, 49, 2304] + - Exact: [512, 512, 25, 4096] + - Exact: [128, 128, 2048, 64] + - Exact: [30528, 2560, 1, 1024] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 1024] + - Exact: [1024, 12288, 1, 4096] + - Exact: [30528, 1920, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [128, 128, 81, 12544] + - Exact: [128, 128, 121, 9216] + - Exact: 
[128, 128, 169, 6400] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 49, 2304] + - Exact: [256, 256, 64, 2304] + - Exact: [256, 256, 81, 4096] + - Exact: [256, 256, 121, 2304] + - Exact: [256, 256, 169, 2304] + - Exact: [512, 512, 81, 1024] + - Exact: [512, 512, 121, 1024] + - Exact: [512, 512, 169, 1024] + - Exact: [512, 512, 36, 1024] + - Exact: [512, 512, 49, 1024] + - Exact: [512, 512, 64, 1024] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 768] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 1024] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3072, 3072, 1, 3072] + - Exact: [3072, 512, 1, 3072] + - Exact: [4096, 512, 1, 4096] + - Exact: [512, 3072, 1, 3072] + - Exact: [512, 4096, 1, 4096] + - Exact: [512, 8192, 1, 8192] + - Exact: [8192, 512, 1, 8192] + - Exact: [256, 256, 36, 432] + - Exact: [256, 256, 36, 456] + - Exact: [256, 256, 36, 504] + - Exact: [256, 256, 49, 1120] + - Exact: [256, 256, 36, 442] + - Exact: [256, 256, 49, 950] + - Exact: [256, 256, 64, 616] + - Exact: [256, 256, 64, 660] + - Exact: [256, 256, 36, 408] + - Exact: [256, 256, 49, 1008] + - Exact: [256, 256, 36, 462] + - Exact: [256, 256, 36, 468] + - Exact: [256, 256, 36, 494] + - Exact: [512, 512, 64, 48] + - Exact: [256, 256, 64, 140] + - Exact: [512, 512, 64, 56] + - Exact: [512, 512, 49, 90] + - Exact: [512, 512, 49, 60] + - Exact: [256, 256, 49, 864] + - Exact: [256, 256, 64, 224] + - Exact: [256, 256, 64, 176] + - Exact: [256, 256, 64, 154] + - Exact: [512, 512, 49, 80] + - Exact: [256, 256, 49, 1200] + - Exact: [256, 256, 64, 704] + - Exact: [256, 256, 64, 768] + - Exact: [256, 256, 49, 1160] + - Exact: [256, 256, 49, 320] + - Exact: [512, 512, 
49, 70] + - Exact: [256, 256, 49, 1240] + - Exact: [256, 256, 36, 384] + - Exact: [1024, 2048, 1, 888] + - Exact: [1024, 2048, 1, 713] + - Exact: [1024, 2048, 1, 660] + - Exact: [1024, 2048, 1, 726] + - Exact: [1024, 2048, 1, 672] + - Exact: [1024, 2048, 1, 850] + - Exact: [1024, 2048, 1, 805] + - Exact: [1024, 2048, 1, 864] + - Exact: [1024, 2048, 1, 768] + - Exact: [1024, 2048, 1, 950] + - Exact: [256, 128, 49, 1152] + - Exact: [256, 128, 121, 120] + - Exact: [256, 128, 169, 120] + - Exact: [256, 128, 36, 120] + - Exact: [256, 128, 49, 120] + - Exact: [256, 128, 64, 120] + - Exact: [256, 128, 36, 12000] + - Exact: [256, 128, 49, 1216] + - Exact: [256, 128, 121, 18] + - Exact: [256, 128, 169, 18] + - Exact: [256, 128, 36, 18] + - Exact: [256, 128, 49, 18] + - Exact: [256, 128, 64, 18] + - Exact: [256, 128, 36, 1800] + - Exact: [256, 128, 121, 19] + - Exact: [256, 128, 169, 19] + - Exact: [256, 128, 36, 19] + - Exact: [256, 128, 49, 19] + - Exact: [256, 128, 64, 19] + - Exact: [256, 128, 36, 1900] + - Exact: [256, 128, 49, 480] + - Exact: [256, 128, 81, 480] + - Exact: [256, 128, 64, 5880] + - Exact: [256, 128, 49, 72] + - Exact: [256, 128, 81, 72] + - Exact: [256, 128, 49, 76] + - Exact: [256, 128, 81, 76] + - Exact: [256, 128, 49, 7680] + - Exact: [256, 128, 64, 882] + - Exact: [256, 128, 64, 931] + - Exact: [256, 256, 49, 1152] + - Exact: [256, 256, 36, 12000] + - Exact: [256, 256, 49, 1216] + - Exact: [256, 256, 36, 1800] + - Exact: [256, 256, 36, 1900] + - Exact: [256, 256, 64, 5880] + - Exact: [256, 256, 49, 7680] + - Exact: [256, 256, 64, 882] + - Exact: [256, 256, 64, 931] + - Exact: [512, 256, 81, 1080] + - Exact: [512, 256, 25, 12000] + - Exact: [512, 256, 81, 162] + - Exact: [512, 256, 81, 171] + - Exact: [512, 256, 25, 1800] + - Exact: [512, 256, 25, 1900] + - Exact: [512, 256, 121, 1920] + - Exact: [512, 256, 169, 1920] + - Exact: [512, 256, 49, 1920] + - Exact: [512, 256, 121, 288] + - Exact: [512, 256, 169, 288] + - Exact: [512, 256, 49, 288] + - 
Exact: [512, 256, 25, 3000] + - Exact: [512, 256, 81, 3000] + - Exact: [512, 256, 121, 304] + - Exact: [512, 256, 169, 304] + - Exact: [512, 256, 49, 304] + - Exact: [512, 256, 25, 450] + - Exact: [512, 256, 81, 450] + - Exact: [512, 256, 25, 475] + - Exact: [512, 256, 81, 475] + - Exact: [512, 256, 121, 480] + - Exact: [512, 256, 169, 480] + - Exact: [512, 256, 49, 5880] + - Exact: [512, 256, 121, 72] + - Exact: [512, 256, 169, 72] + - Exact: [512, 256, 121, 76] + - Exact: [512, 256, 169, 76] + - Exact: [512, 256, 49, 882] + - Exact: [512, 256, 49, 931] + - Exact: [2304, 512, 1, 100] + - Exact: [2304, 512, 1, 361] + - Exact: [4608, 510, 1, 100] + - Exact: [4608, 510, 1, 361] + - Exact: [340, 256, 49, 1152] + - Exact: [340, 256, 36, 120] + - Exact: [340, 256, 49, 120] + - Exact: [340, 256, 64, 120] + - Exact: [340, 256, 36, 12000] + - Exact: [340, 256, 49, 1216] + - Exact: [340, 256, 36, 18] + - Exact: [340, 256, 49, 18] + - Exact: [340, 256, 64, 18] + - Exact: [340, 256, 36, 1800] + - Exact: [340, 256, 36, 19] + - Exact: [340, 256, 49, 19] + - Exact: [340, 256, 64, 19] + - Exact: [340, 256, 36, 1900] + - Exact: [340, 256, 64, 5880] + - Exact: [340, 256, 49, 7680] + - Exact: [340, 256, 64, 882] + - Exact: [340, 256, 64, 931] + - Exact: [510, 256, 49, 120] + - Exact: [510, 256, 64, 120] + - Exact: [510, 256, 49, 18] + - Exact: [510, 256, 64, 18] + - Exact: [510, 256, 49, 19] + - Exact: [510, 256, 64, 19] + - Exact: [510, 256, 36, 480] + - Exact: [510, 256, 36, 72] + - Exact: [510, 256, 36, 76] + - Exact: [510, 512, 36, 1080] + - Exact: [510, 512, 36, 162] + - Exact: [510, 512, 36, 171] + - Exact: [510, 512, 49, 1920] + - Exact: [510, 512, 64, 1920] + - Exact: [510, 512, 49, 288] + - Exact: [510, 512, 64, 288] + - Exact: [510, 512, 36, 3000] + - Exact: [510, 512, 49, 304] + - Exact: [510, 512, 64, 304] + - Exact: [510, 512, 36, 450] + - Exact: [510, 512, 36, 475] + - Exact: [510, 512, 49, 480] + - Exact: [510, 512, 64, 480] + - Exact: [510, 512, 49, 72] + - Exact: 
[510, 512, 64, 72] + - Exact: [510, 512, 49, 76] + - Exact: [510, 512, 64, 76] + - Exact: [1024, 1024, 160, 96] + - Exact: [2880, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 960] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [25216, 16384, 1, 1920] + - Exact: [1024, 1024, 40, 96] + - Exact: [2880, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 960] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [25216, 4096, 1, 1920] + - Exact: [1024, 1024, 80, 96] + - Exact: [2880, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 960] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [25216, 8192, 1, 1920] + - Exact: [1024, 1024, 96, 96] + - Exact: [1728, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 576] + - Exact: [2304, 16384, 1, 2304] + - Exact: [12672, 16384, 1, 2304] + - Exact: [1024, 1024, 24, 96] + - Exact: [1728, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 576] + - Exact: [2304, 4096, 1, 2304] + - Exact: [12672, 4096, 1, 2304] + - Exact: [1024, 1024, 48, 96] + - Exact: [1728, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 576] + - Exact: [2304, 8192, 1, 2304] + - Exact: [12672, 8192, 1, 2304] + - Exact: [1024, 1024, 16, 96] + - Exact: [1152, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 384] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [6400, 4096, 1, 3072] + - Exact: [1024, 1024, 32, 96] + - Exact: [1152, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 384] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [6400, 8192, 1, 3072] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [29000, 199, 1, 2048] + - Exact: [29000, 221, 1, 2048] + - Exact: [29000, 224, 1, 2048] + - Exact: [29000, 229, 1, 2048] + - Exact: [29000, 234, 1, 2048] + - Exact: [29000, 242, 1, 2048] + - Exact: [29000, 246, 1, 2048] + - Exact: [29000, 247, 1, 2048] + - Exact: [29000, 256, 1, 2048] + - Exact: [29000, 262, 1, 2048] + - Exact: [29000, 264, 1, 2048] + - 
Exact: [29000, 265, 1, 2048] + - Exact: [29000, 274, 1, 2048] + - Exact: [29000, 277, 1, 2048] + - Exact: [29000, 279, 1, 2048] + - Exact: [29000, 288, 1, 2048] + - Exact: [29000, 296, 1, 2048] + - Exact: [29000, 315, 1, 2048] + - Exact: [29000, 335, 1, 2048] + - Exact: [4096, 4096, 1, 2048] + - Exact: [29000, 2283, 1, 1024] + - Exact: [29000, 2296, 1, 1024] + - Exact: [29000, 2306, 1, 1024] + - Exact: [29000, 2309, 1, 1024] + - Exact: [29000, 2318, 1, 1024] + - Exact: [29000, 2320, 1, 1024] + - Exact: [29000, 2324, 1, 1024] + - Exact: [29000, 2325, 1, 1024] + - Exact: [29000, 2329, 1, 1024] + - Exact: [29000, 2338, 1, 1024] + - Exact: [29000, 2345, 1, 1024] + - Exact: [29000, 2350, 1, 1024] + - Exact: [29000, 2362, 1, 1024] + - Exact: [29000, 2366, 1, 1024] + - Exact: [29000, 2368, 1, 1024] + - Exact: [29000, 2374, 1, 1024] + - Exact: [29000, 2390, 1, 1024] + - Exact: [512, 512, 320, 64] + - Exact: [29000, 561, 1, 1024] + - Exact: [29000, 574, 1, 1024] + - Exact: [29000, 600, 1, 1024] + - Exact: [29000, 608, 1, 1024] + - Exact: [29000, 615, 1, 1024] + - Exact: [29000, 622, 1, 1024] + - Exact: [29000, 625, 1, 1024] + - Exact: [29000, 626, 1, 1024] + - Exact: [29000, 628, 1, 1024] + - Exact: [29000, 636, 1, 1024] + - Exact: [29000, 651, 1, 1024] + - Exact: [29000, 658, 1, 1024] + - Exact: [29000, 669, 1, 1024] + - Exact: [29000, 670, 1, 1024] + - Exact: [29000, 672, 1, 1024] + - Exact: [29000, 684, 1, 1024] + - Exact: [29000, 716, 1, 1024] + - Exact: [29000, 730, 1, 1024] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 4096] + - Exact: [3072, 32768, 1, 1024] + - Exact: [4096, 32768, 1, 1024] + - Exact: [50304, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 
64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 49, 12800] + - Exact: [128, 128, 25, 25088] + - Exact: [128, 128, 49, 25600] + - Exact: [128, 128, 25, 50176] + - Exact: [128, 128, 36, 12544] + - Exact: [128, 128, 49, 9216] + - Exact: [1024, 1024, 1, 12544] + - Exact: [1024, 1000, 1, 12544] + - Exact: [128, 128, 36, 12000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [5888, 128, 1, 1] + - Exact: [1856, 256, 1, 1] + - Exact: [256, 1856, 1, 32] + - Exact: [128, 3584, 1, 1280] + - Exact: [2944, 128, 1, 32] + - Exact: [64, 6784, 1, 1] + - Exact: [64, 5056, 1, 3328] + - Exact: [704, 1024, 1, 1] + - Exact: [256, 1856, 1, 1280] + - Exact: 
[256, 1408, 1, 1] + - Exact: [1024, 1024, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [2944, 256, 1, 1] + - Exact: [2944, 256, 1, 32] + - Exact: [128, 4288, 1, 3328] + - Exact: [5056, 128, 1, 256] + - Exact: [1856, 704, 1, 1280] + - Exact: [2368, 256, 1, 32] + - Exact: [5056, 64, 1, 32] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1024, 1024, 1, 3328] + - Exact: [5888, 64, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [5056, 64, 1, 3328] + - Exact: [1024, 448, 1, 32] + - Exact: [128, 2944, 1, 3328] + - Exact: [256, 1856, 1, 1] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [128, 5888, 1, 32] + - Exact: [128, 5888, 1, 1280] + - Exact: [3584, 256, 1, 256] + - Exact: [1856, 256, 1, 256] + - Exact: [1024, 704, 1, 1] + - Exact: [448, 1408, 1, 3328] + - Exact: [1024, 704, 1, 32] + - Exact: [448, 2944, 1, 256] + - Exact: [5888, 128, 1, 3328] + - Exact: [2944, 448, 1, 1] + - Exact: [5056, 64, 1, 1280] + - Exact: [704, 704, 1, 32] + - Exact: [256, 4288, 1, 256] + - Exact: [5056, 128, 1, 1] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [128, 6784, 1, 32] + - Exact: [128, 6784, 1, 3328] + - Exact: [4288, 128, 1, 1] + - Exact: [256, 2368, 1, 32] + - Exact: [448, 1024, 1, 256] + - Exact: [256, 1408, 1, 32] + - Exact: [256, 3584, 1, 32] + - Exact: [128, 4288, 1, 32] + - Exact: [448, 1856, 1, 1] + - Exact: [448, 1856, 1, 32] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [704, 1856, 1, 32] + - Exact: [704, 1408, 1, 32] + - Exact: [5888, 128, 1, 32] + - Exact: [128, 4288, 1, 1280] + - Exact: [1856, 704, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [704, 1856, 1, 3328] + - Exact: [2944, 128, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [128, 2368, 1, 1] + - Exact: [128, 2944, 1, 1280] + - 
Exact: [1024, 448, 1, 1] + - Exact: [256, 2944, 1, 256] + - Exact: [704, 448, 1, 32] + - Exact: [704, 1024, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [256, 2368, 1, 1] + - Exact: [5888, 64, 1, 3328] + - Exact: [704, 448, 1, 3328] + - Exact: [4288, 256, 1, 1] + - Exact: [1856, 448, 1, 3328] + - Exact: [4288, 256, 1, 1280] + - Exact: [448, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1] + - Exact: [2368, 448, 1, 32] + - Exact: [1408, 704, 1, 1] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [256, 2944, 1, 1] + - Exact: [6784, 64, 1, 1] + - Exact: [6784, 64, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [64, 5056, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [5888, 64, 1, 1] + - Exact: [256, 3584, 1, 1] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [128, 5056, 1, 32] + - Exact: [128, 5056, 1, 1280] + - Exact: [5888, 64, 1, 32] + - Exact: [2368, 256, 1, 1] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1] + - Exact: [1408, 448, 1, 32] + - Exact: [5056, 128, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [448, 704, 1, 1] + - Exact: [448, 704, 1, 32] + - Exact: [448, 704, 1, 3328] + - Exact: [64, 5888, 1, 1] + - Exact: [2368, 128, 1, 32] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 128, 1, 3328] + - Exact: [4288, 256, 1, 32] + - Exact: [448, 1408, 1, 1] + - Exact: [1408, 256, 1, 256] + - Exact: [256, 4288, 1, 32] + - Exact: [1408, 256, 1, 1280] + - Exact: [448, 1408, 1, 256] + - Exact: [128, 2944, 1, 32] + - Exact: [1856, 448, 1, 1] + - Exact: [704, 704, 1, 1] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [3584, 256, 1, 3328] + - Exact: [448, 2368, 1, 1] + - Exact: [128, 6784, 1, 1] + - Exact: [256, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [704, 704, 1, 256] + - Exact: [448, 1024, 1, 32] 
+ - Exact: [256, 2368, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [704, 1856, 1, 1] + - Exact: [704, 448, 1, 256] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 5056, 1, 1] + - Exact: [256, 2368, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 1408, 1, 1] + - Exact: [4288, 128, 1, 3328] + - Exact: [128, 6784, 1, 1280] + - Exact: [3584, 256, 1, 32] + - Exact: [1408, 256, 1, 32] + - Exact: [5888, 128, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [1024, 448, 1, 3328] + - Exact: [3584, 128, 1, 1] + - Exact: [128, 2368, 1, 256] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [128, 5888, 1, 1] + - Exact: [64, 5056, 1, 1] + - Exact: [1856, 256, 1, 32] + - Exact: [64, 5056, 1, 32] + - Exact: [1408, 704, 1, 32] + - Exact: [1408, 704, 1, 1280] + - Exact: [1024, 1024, 1, 32] + - Exact: [5056, 128, 1, 3328] + - Exact: [128, 4288, 1, 1] + - Exact: [2944, 128, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [2944, 128, 1, 256] + - Exact: [6784, 128, 1, 1] + - Exact: [1408, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 128, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [2944, 448, 1, 1280] + - Exact: [704, 448, 1, 1] + - Exact: [256, 1408, 1, 3328] + - Exact: [2944, 128, 1, 1] + - Exact: [704, 1024, 1, 32] + - Exact: [3584, 256, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 1280] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 3328] + - Exact: [448, 2944, 1, 1] + - Exact: [448, 1856, 1, 1280] + - Exact: [2368, 448, 1, 1] + - Exact: [448, 2944, 1, 32] + - Exact: [448, 2944, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [448, 2944, 1, 3328] + - Exact: [2368, 128, 1, 1280] + - Exact: [128, 3584, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [1856, 704, 1, 32] + - Exact: [2944, 448, 1, 32] + - Exact: [5056, 128, 1, 32] + - Exact: [6784, 128, 1, 1280] + - Exact: [1408, 704, 1, 
3328] + - Exact: [1856, 704, 1, 1] + - Exact: [256, 1856, 1, 3328] + - Exact: [4288, 128, 1, 1280] + - Exact: [128, 4288, 1, 256] + - Exact: [6784, 128, 1, 32] + - Exact: [1408, 448, 1, 1] + - Exact: [64, 5056, 1, 1280] + - Exact: [448, 1408, 1, 32] + - Exact: [128, 5056, 1, 256] + - Exact: [1024, 1024, 1, 1] + - Exact: [256, 1408, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [6784, 64, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [448, 1408, 1, 1280] + - Exact: [2368, 128, 1, 1] + - Exact: [5056, 64, 1, 256] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 6784, 1, 32] + - Exact: [256, 4288, 1, 1] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 32] + - Exact: [3584, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [128, 2944, 1, 1] + - Exact: [704, 1408, 1, 1280] + - Exact: [6784, 64, 1, 32] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1408, 1, 256] + - Exact: [4288, 128, 1, 32] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 1024, 1, 256] + - Exact: [256, 2944, 1, 32] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 2368, 1, 32] + - Exact: [704, 1856, 1, 256] + - Exact: [1856, 448, 1, 32] + - Exact: [1408, 448, 1, 256] + - Exact: [448, 1024, 1, 1] + - Exact: [1024, 704, 1, 256] + - Exact: [64, 5888, 1, 32] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [1408, 256, 1, 1] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 3584, 1, 1] + - Exact: [128, 3584, 1, 32] + - Exact: [448, 2368, 1, 32] + - Exact: [448, 2368, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [2048, 400, 1, 512] + - Exact: [2560, 128, 1, 2560] + - Exact: [1024, 700, 1, 512] + - Exact: [4096, 128, 1, 4096] + - Exact: [3072, 128, 1, 1024] + - Exact: [7680, 64, 1, 2560] + - Exact: [7680, 128, 1, 2560] + - Exact: [1024, 1024, 1, 1024] + - Exact: [2049, 512, 1, 2048] + - Exact: [1023, 
512, 1, 1024] + - Exact: [1024, 512, 1, 1025] + - Exact: [1024, 1024, 1, 1023] + - Exact: [1024, 1025, 1, 1024] + - Exact: [1024, 1023, 1, 1024] + - Exact: [2048, 511, 1, 2048] + - Exact: [2047, 512, 1, 2048] + - Exact: [1025, 1024, 1, 1024] + - Exact: [1024, 1024, 1, 1025] + - Exact: [1025, 512, 1, 1024] + - Exact: [1024, 512, 1, 1023] + - Exact: [2048, 513, 1, 2048] + - Exact: [1024, 511, 1, 1024] + - Exact: [2048, 512, 1, 2047] + - Exact: [1024, 513, 1, 1024] + - Exact: [2048, 512, 1, 2049] + - Exact: [1023, 1024, 1, 1024] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3328, 1, 1024] + - Exact: [256, 3584, 1, 1024] + - Exact: [512, 1600, 1, 512] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 1600] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1600] + - Exact: [2048, 215, 1, 512] + - Exact: [2048, 215, 1, 768] + - Exact: [2048, 256, 1, 512] + - Exact: [2048, 256, 1, 768] + - Exact: [2048, 512, 1, 67] + - Exact: [2048, 512, 1, 74] + - Exact: [2048, 512, 1, 100] + - Exact: [2048, 512, 1, 2048] + - Exact: [1024, 512, 1, 4096] + - Exact: [30522, 77, 1, 1024] + - Exact: [1024, 780, 1, 1024] + - Exact: [1024, 800, 1, 1024] + - Exact: [1024, 820, 1, 1024] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [1024, 1024, 1, 4096] + - Exact: [480, 1024, 1, 1024] + - Exact: [480, 2048, 1, 2048] + - Exact: [1024, 480, 1, 1024] + - Exact: [2048, 480, 1, 2048] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 512, 128, 512] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [960, 1024, 1, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - 
Exact: [1024, 864, 1, 480] + - Exact: [128, 3456, 1, 256] + - Exact: [128, 4096, 1, 256] + - Exact: [128, 6912, 1, 256] + - Exact: [256, 3456, 1, 512] + - Exact: [256, 4096, 1, 512] + - Exact: [512, 864, 1, 1024] + - Exact: [512, 864, 1, 13] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [1024, 512, 1, 196] + - Exact: [2048, 512, 1, 49] + - Exact: [2304, 256, 1, 196] + - Exact: [512, 1024, 1, 196] + - Exact: [512, 2048, 1, 49] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [128, 128, 64, 6400] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [512, 1024, 1, 1024] + - Exact: [512, 2048, 1, 2048] + - Exact: [128, 128, 49, 1120] + - Exact: [128, 128, 49, 1064] + - Exact: [128, 128, 49, 1040] + - Exact: [128, 128, 64, 600] + - Exact: [128, 128, 64, 616] + - Exact: [128, 128, 49, 950] + - Exact: [128, 128, 49, 972] + - Exact: [128, 128, 64, 560] + - Exact: [128, 128, 49, 1008] + - Exact: [128, 128, 64, 532] + - Exact: [128, 128, 49, 1080] + - Exact: [128, 128, 64, 588] + - Exact: [128, 128, 49, 1160] + - Exact: [128, 128, 49, 988] + - Exact: [128, 128, 49, 936] + - Exact: [512, 1024, 1, 3800] + - Exact: [512, 1024, 1, 3400] + - Exact: [512, 1024, 1, 3456] + - Exact: [512, 1024, 1, 3072] + - Exact: [2048, 512, 1, 950] + - Exact: [512, 1024, 1, 3552] + - Exact: [512, 1024, 1, 3220] + - Exact: [2048, 512, 1, 850] + - Exact: [512, 2048, 1, 864] + - Exact: [512, 2048, 1, 768] + - Exact: [2048, 512, 1, 805] + - Exact: [512, 1024, 1, 2852] + - Exact: [512, 2048, 1, 888] + - Exact: [2048, 512, 1, 864] + - Exact: [2048, 512, 1, 768] + - Exact: [2048, 512, 1, 888] + - Exact: [2048, 256, 1, 950] + - Exact: [2048, 512, 1, 713] + - Exact: [512, 1024, 1, 2688] + - Exact: [512, 1024, 1, 2640] + - Exact: [512, 1024, 1, 2904] + - Exact: [1024, 512, 1, 950] + - Exact: [512, 2048, 1, 672] + 
- Exact: [512, 2048, 1, 660] + - Exact: [512, 2048, 1, 1008] + - Exact: [2048, 256, 1, 850] + - Exact: [2048, 512, 1, 726] + - Exact: [1024, 512, 1, 850] + - Exact: [2048, 512, 1, 660] + - Exact: [2048, 512, 1, 672] + - Exact: [512, 2048, 1, 840] + - Exact: [2048, 512, 1, 1008] + - Exact: [512, 2048, 1, 792] + - Exact: [1024, 512, 1, 805] + - Exact: [512, 2048, 1, 1050] + - Exact: [2048, 512, 1, 748] + - Exact: [2048, 256, 1, 864] + - Exact: [1024, 512, 1, 768] + - Exact: [1024, 512, 1, 864] + - Exact: [2048, 512, 1, 875] + - Exact: [2048, 512, 1, 840] + - Exact: [2048, 512, 1, 792] + - Exact: [512, 2048, 1, 736] + - Exact: [2048, 256, 1, 888] + - Exact: [512, 2048, 1, 704] + - Exact: [512, 2048, 1, 588] + - Exact: [1024, 512, 1, 888] + - Exact: [512, 2048, 1, 816] + - Exact: [1024, 512, 1, 713] + - Exact: [2048, 512, 1, 736] + - Exact: [2048, 512, 1, 588] + - Exact: [2048, 512, 1, 704] + - Exact: [1024, 512, 1, 660] + - Exact: [2048, 256, 1, 660] + - Exact: [2048, 256, 1, 672] + - Exact: [1024, 512, 1, 672] + - Exact: [1024, 512, 1, 726] + - Exact: [512, 2048, 1, 630] + - Exact: [512, 2048, 1, 600] + - Exact: [2048, 256, 1, 805] + - Exact: [2048, 256, 1, 713] + - Exact: [2048, 256, 1, 726] + - Exact: [320, 1024, 1, 1024] + - Exact: [1024, 1000, 1, 1024] + - Exact: [320, 1000, 1, 1024] + - Exact: [128, 128, 49, 1280] + - Exact: [128, 128, 49, 1360] + - Exact: [128, 128, 49, 1200] + - Exact: [128, 128, 49, 1240] + - Exact: [2304, 256, 1, 704] + - Exact: [2304, 256, 1, 736] + - Exact: [2304, 256, 1, 792] + - Exact: [2304, 256, 1, 748] + - Exact: [2304, 256, 1, 726] + - Exact: [2304, 256, 1, 713] + - Exact: [2304, 256, 1, 768] + - Exact: [512, 2048, 1, 759] + - Exact: [512, 2048, 1, 925] + - Exact: [2304, 256, 1, 805] + - Exact: [512, 2048, 1, 900] + - Exact: [512, 2048, 1, 875] + - Exact: [512, 2048, 1, 748] + - Exact: [512, 2048, 1, 726] + - Exact: [512, 2048, 1, 713] + - Exact: [512, 2048, 1, 805] + - Exact: [512, 2048, 1, 850] + - Exact: [512, 2048, 1, 950] + - 
Exact: [128, 128, 49, 1152] + - Exact: [128, 128, 49, 1216] + - Exact: [128, 128, 36, 1800] + - Exact: [128, 128, 36, 1900] + - Exact: [128, 128, 64, 5880] + - Exact: [128, 128, 49, 7680] + - Exact: [128, 128, 64, 882] + - Exact: [128, 128, 64, 931] + - Exact: [128, 64, 121, 1152] + - Exact: [128, 64, 81, 12000] + - Exact: [128, 64, 121, 1216] + - Exact: [128, 64, 81, 1800] + - Exact: [128, 64, 81, 1900] + - Exact: [128, 64, 49, 20280] + - Exact: [128, 64, 49, 3042] + - Exact: [128, 64, 49, 3211] + - Exact: [128, 64, 169, 5880] + - Exact: [128, 64, 121, 7680] + - Exact: [128, 64, 169, 882] + - Exact: [128, 64, 169, 931] + - Exact: [256, 128, 25, 1080] + - Exact: [256, 128, 25, 162] + - Exact: [256, 128, 25, 171] + - Exact: [1152, 256, 1, 1] + - Exact: [1152, 256, 1, 1444] + - Exact: [1152, 256, 1, 25] + - Exact: [1152, 256, 1, 9] + - Exact: [2304, 256, 1, 1444] + - Exact: [2304, 340, 1, 1] + - Exact: [2304, 340, 1, 1444] + - Exact: [2304, 340, 1, 9] + - Exact: [2304, 510, 1, 25] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 512, 80, 512] + - Exact: [29000, 109, 1, 2560] + - Exact: [29000, 121, 1, 2560] + - Exact: [29000, 65, 1, 2560] + - Exact: [29000, 66, 1, 2560] + - Exact: [29000, 67, 1, 2560] + - Exact: [29000, 69, 1, 2560] + - Exact: [29000, 70, 1, 2560] + - Exact: [29000, 71, 1, 2560] + - Exact: [29000, 73, 1, 2560] + - Exact: [29000, 74, 1, 2560] + - Exact: [29000, 75, 1, 2560] + - Exact: [29000, 77, 1, 2560] + - Exact: [29000, 78, 1, 2560] + - Exact: [29000, 80, 1, 2560] + - Exact: [29000, 81, 1, 2560] + - Exact: [29000, 82, 1, 2560] + - Exact: [29000, 83, 1, 2560] + - Exact: [29000, 84, 1, 2560] + - Exact: [29000, 88, 1, 2560] + - Exact: [29000, 89, 1, 2560] + - Exact: [29000, 90, 1, 2560] + - Exact: 
[29000, 92, 1, 2560] + - Exact: [29000, 95, 1, 2560] + - Exact: [29000, 98, 1, 2560] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 36, 50176] + - Exact: [64, 64, 49, 36864] + - Exact: [64, 64, 64, 25600] + - Exact: [256, 256, 1, 60800] + - Exact: [256, 256, 1, 54400] + - Exact: [256, 256, 1, 51520] + - Exact: [256, 256, 1, 55296] + - Exact: [256, 256, 1, 56832] + - Exact: [256, 256, 1, 45632] + - Exact: [256, 256, 1, 49152] + - Exact: [256, 512, 1, 13600] + - Exact: [256, 256, 1, 43008] + - Exact: [256, 512, 1, 15200] + - Exact: [256, 512, 1, 12880] + - Exact: [256, 512, 1, 13824] + - Exact: [512, 256, 1, 13824] + - Exact: [256, 512, 1, 14208] + - Exact: [512, 256, 1, 14208] + - Exact: [512, 256, 1, 15200] + - Exact: [256, 512, 1, 12288] + - Exact: [512, 256, 1, 12288] + - Exact: [128, 64, 25, 43320] + - Exact: [64, 64, 64, 20280] + - Exact: [64, 64, 49, 27000] + - Exact: [64, 64, 36, 43320] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - 
PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [32, 5056, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [2368, 64, 1, 1] + - Exact: [1408, 128, 1, 32] + - Exact: [32, 2944, 1, 3328] + - Exact: [2368, 32, 1, 256] + - Exact: [1024, 128, 1, 32] + - Exact: [32, 4288, 1, 1280] + - Exact: [32, 5056, 1, 32] + - Exact: [5888, 32, 1, 32] + - Exact: [64, 2368, 1, 1280] + - Exact: [128, 704, 1, 32] + - Exact: [32, 4288, 1, 3328] + - Exact: [1408, 64, 1, 1] + - Exact: [1856, 64, 1, 256] + - Exact: [1024, 256, 1, 256] + - Exact: [1856, 128, 1, 32] + - Exact: [1856, 128, 1, 1280] + - Exact: [4288, 32, 1, 3328] + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 1856, 1, 256] + - Exact: [3584, 64, 1, 32] + - Exact: [1408, 128, 1, 3328] + - Exact: [32, 6784, 1, 3328] + - Exact: [32, 3584, 1, 256] + - Exact: [704, 256, 1, 32] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [5056, 32, 1, 3328] + - Exact: [2944, 32, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 1280] + - Exact: [448, 448, 1, 256] + - Exact: [1024, 256, 1, 1] + - Exact: [1856, 64, 1, 32] + - Exact: [4288, 64, 1, 256] + - Exact: [1408, 64, 1, 256] + - Exact: [6784, 32, 1, 32] + - Exact: [448, 448, 1, 1280] + - Exact: [32, 5888, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [1856, 64, 1, 1280] + - Exact: [128, 1408, 1, 1] + - Exact: [32, 2368, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [2944, 32, 1, 32] + - Exact: [448, 448, 1, 32] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 
2944, 1, 1] + - Exact: [64, 2944, 1, 32] + - Exact: [64, 2944, 1, 1280] + - Exact: [32, 3584, 1, 1280] + - Exact: [32, 2944, 1, 32] + - Exact: [32, 6784, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [704, 128, 1, 1280] + - Exact: [32, 3584, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [64, 4288, 1, 1] + - Exact: [3584, 32, 1, 32] + - Exact: [3584, 64, 1, 1] + - Exact: [32, 4288, 1, 32] + - Exact: [64, 1408, 1, 1] + - Exact: [256, 1024, 1, 256] + - Exact: [1408, 128, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [64, 3584, 1, 1] + - Exact: [1024, 128, 1, 1280] + - Exact: [2368, 32, 1, 32] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [2368, 64, 1, 256] + - Exact: [32, 2368, 1, 3328] + - Exact: [128, 1856, 1, 1] + - Exact: [128, 1856, 1, 32] + - Exact: [3584, 32, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [32, 2944, 1, 1280] + - Exact: [4288, 32, 1, 32] + - Exact: [1856, 64, 1, 1] + - Exact: [128, 1024, 1, 3328] + - Exact: [1408, 128, 1, 1] + - Exact: [5056, 32, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [3584, 32, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [704, 256, 1, 1280] + - Exact: [1856, 128, 1, 1] + - Exact: [256, 704, 1, 1] + - Exact: [1024, 128, 1, 1] + - Exact: [1856, 128, 1, 256] + - Exact: [1024, 256, 1, 1280] + - Exact: [64, 2368, 1, 32] + - Exact: [32, 2368, 1, 256] + - Exact: [32, 6784, 1, 1280] + - Exact: [32, 6784, 1, 32] + - Exact: [64, 3584, 1, 3328] + - Exact: [32, 5888, 1, 1280] + - Exact: [448, 256, 1, 1] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [2368, 32, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1024, 1, 32] + - Exact: [32, 2368, 1, 32] + - Exact: [64, 1856, 1, 1280] + - Exact: [32, 3584, 1, 32] + - Exact: [704, 256, 1, 1] + - Exact: [1024, 256, 1, 3328] + - Exact: [128, 1856, 1, 1280] + - Exact: [448, 256, 1, 32] + - Exact: [64, 4288, 1, 32] + - Exact: [128, 704, 1, 1] + - Exact: [4288, 64, 1, 1280] + - Exact: [448, 448, 1, 1] + - 
Exact: [32, 5888, 1, 32] + - Exact: [1024, 128, 1, 3328] + - Exact: [4288, 64, 1, 32] + - Exact: [2368, 64, 1, 32] + - Exact: [64, 1408, 1, 32] + - Exact: [32, 2944, 1, 256] + - Exact: [2944, 64, 1, 1] + - Exact: [2944, 64, 1, 32] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1408, 64, 1, 3328] + - Exact: [6784, 32, 1, 1280] + - Exact: [2944, 64, 1, 1280] + - Exact: [2944, 32, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [5888, 32, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 32] + - Exact: [64, 1856, 1, 3328] + - Exact: [5888, 32, 1, 1280] + - Exact: [256, 704, 1, 32] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 32] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [2368, 64, 1, 3328] + - Exact: [32, 5056, 1, 3328] + - Exact: [64, 1856, 1, 1] + - Exact: [704, 128, 1, 32] + - Exact: [4288, 64, 1, 1] + - Exact: [5056, 32, 1, 1280] + - Exact: [128, 1024, 1, 1] + - Exact: [256, 1024, 1, 1] + - Exact: [1408, 64, 1, 1280] + - Exact: [1024, 256, 1, 32] + - Exact: [2368, 32, 1, 1280] + - Exact: [704, 128, 1, 1] + - Exact: [256, 448, 1, 256] + - Exact: [32, 4288, 1, 256] + - Exact: [128, 1408, 1, 32] + - Exact: [704, 128, 1, 3328] + - Exact: [64, 4288, 1, 256] + - Exact: [4288, 32, 1, 1280] + - Exact: [32, 5056, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [256, 1024, 1, 32] + - Exact: [256, 1024, 1, 1280] + - Exact: [6784, 32, 1, 256] + - Exact: [64, 2368, 1, 1] + - Exact: [1408, 128, 1, 256] + - Exact: [5888, 32, 1, 3328] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [2944, 32, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 448, 1, 1] + - Exact: [256, 448, 1, 32] + - Exact: [64, 3584, 1, 32] + - Exact: [64, 3584, 1, 1280] + - Exact: [4288, 32, 1, 256] + - Exact: 
[448, 256, 1, 1280] + - Exact: [128, 1856, 1, 256] + - Exact: [3584, 32, 1, 3328] + - Exact: [6784, 32, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [5056, 32, 1, 32] + - Exact: [32, 5888, 1, 3328] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 196] + - Exact: [256, 1024, 1, 196] + - Exact: [1760, 64, 1, 1760] + - Exact: [2560, 32, 1, 2560] + - Exact: [4608, 32, 1, 1536] + - Exact: [3072, 64, 1, 1024] + - Exact: [2048, 128, 1, 2048] + - Exact: [4096, 64, 1, 4096] + - Exact: [7680, 32, 1, 2560] + - Exact: [2560, 64, 1, 2560] + - Exact: [1760, 128, 1, 1760] + - Exact: [3072, 32, 1, 1024] + - Exact: [6144, 32, 1, 2560] + - Exact: [4096, 32, 1, 4096] + - Exact: [2048, 64, 1, 2048] + - Exact: [8448, 32, 1, 2816] + - Exact: [512, 512, 1, 512] + - Exact: [511, 512, 1, 512] + - Exact: [512, 512, 1, 511] + - Exact: [512, 513, 1, 512] + - Exact: [512, 511, 1, 512] + - Exact: [513, 512, 1, 512] + - Exact: [512, 512, 1, 513] + - Exact: [512, 512, 1, 64] + - Exact: [33, 33, 1600, 32] + - Exact: [256, 684, 1, 1024] + - Exact: [1024, 200, 1, 560] + - Exact: [2048, 114, 1, 512] + - Exact: [2048, 114, 1, 768] + - Exact: [32, 32, 4608, 64] + - Exact: [32, 35, 4608, 64] + - Exact: [34, 34, 4736, 64] + - Exact: [35, 35, 4608, 64] + - Exact: [33, 33, 1920, 64] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [128, 864, 1, 256] + - Exact: [256, 864, 1, 512] + - Exact: [1152, 128, 1, 784] + - Exact: [256, 512, 1, 784] + - Exact: [512, 256, 1, 784] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 256, 1, 3800] + - Exact: [1024, 256, 1, 3400] + - Exact: [256, 1024, 1, 3400] + - Exact: [1024, 256, 1, 3220] + - Exact: [256, 1024, 1, 3220] + - Exact: [1024, 256, 1, 3456] + - Exact: [256, 1024, 1, 3456] + - Exact: [1024, 256, 1, 3072] + - Exact: [256, 1024, 1, 3072] + - 
Exact: [1024, 256, 1, 3552] + - Exact: [256, 1024, 1, 3552] + - Exact: [256, 1024, 1, 2852] + - Exact: [1024, 256, 1, 2852] + - Exact: [256, 512, 1, 10752] + - Exact: [256, 1024, 1, 3800] + - Exact: [256, 512, 1, 10560] + - Exact: [256, 1024, 1, 2992] + - Exact: [256, 1024, 1, 2688] + - Exact: [1024, 256, 1, 2688] + - Exact: [256, 1024, 1, 2904] + - Exact: [1024, 256, 1, 2904] + - Exact: [256, 1024, 1, 2640] + - Exact: [1024, 256, 1, 2640] + - Exact: [1024, 256, 1, 4032] + - Exact: [1024, 256, 1, 2992] + - Exact: [256, 1024, 1, 3360] + - Exact: [1024, 256, 1, 3360] + - Exact: [1024, 256, 1, 3500] + - Exact: [256, 1024, 1, 3500] + - Exact: [1024, 256, 1, 3168] + - Exact: [256, 1024, 1, 3168] + - Exact: [256, 1024, 1, 3036] + - Exact: [1024, 256, 1, 4200] + - Exact: [1024, 256, 1, 3600] + - Exact: [256, 1024, 1, 3600] + - Exact: [256, 1024, 1, 2944] + - Exact: [1024, 256, 1, 2944] + - Exact: [1024, 256, 1, 3700] + - Exact: [256, 1024, 1, 2352] + - Exact: [1024, 256, 1, 2352] + - Exact: [1024, 256, 1, 2816] + - Exact: [256, 1024, 1, 3700] + - Exact: [256, 1024, 1, 2816] + - Exact: [256, 512, 1, 11408] + - Exact: [1024, 256, 1, 3036] + - Exact: [1024, 256, 1, 3264] + - Exact: [256, 1024, 1, 3264] + - Exact: [1024, 256, 1, 3864] + - Exact: [256, 1024, 1, 4032] + - Exact: [1024, 256, 1, 3128] + - Exact: [256, 1024, 1, 3128] + - Exact: [256, 1024, 1, 3200] + - Exact: [256, 512, 1, 11616] + - Exact: [1024, 256, 1, 3200] + - Exact: [1024, 256, 1, 4000] + - Exact: [256, 1024, 1, 2520] + - Exact: [1024, 256, 1, 2520] + - Exact: [256, 1024, 1, 2976] + - Exact: [256, 1024, 1, 2400] + - Exact: [1024, 256, 1, 2400] + - Exact: [1024, 256, 1, 3696] + - Exact: [1024, 256, 1, 3900] + - Exact: [1024, 256, 1, 3772] + - Exact: [256, 1024, 1, 3696] + - Exact: [256, 1024, 1, 2728] + - Exact: [1024, 256, 1, 2728] + - Exact: [1024, 256, 1, 2480] + - Exact: [256, 1024, 1, 2480] + - Exact: [1024, 256, 1, 2880] + - Exact: [512, 256, 1, 3220] + - Exact: [256, 1024, 1, 2880] + - Exact: [256, 
1024, 1, 4200] + - Exact: [1024, 256, 1, 3648] + - Exact: [1024, 256, 1, 3312] + - Exact: [256, 1024, 1, 3648] + - Exact: [1024, 256, 1, 3300] + - Exact: [1024, 256, 1, 3528] + - Exact: [256, 1024, 1, 2604] + - Exact: [1024, 256, 1, 2604] + - Exact: [512, 256, 1, 11408] + - Exact: [256, 1024, 1, 3312] + - Exact: [256, 1024, 1, 3300] + - Exact: [512, 256, 1, 3072] + - Exact: [256, 1024, 1, 3528] + - Exact: [1024, 256, 1, 2976] + - Exact: [1024, 256, 1, 2760] + - Exact: [512, 256, 1, 3800] + - Exact: [256, 1024, 1, 2760] + - Exact: [1024, 256, 1, 2160] + - Exact: [256, 1024, 1, 2160] + - Exact: [512, 256, 1, 11616] + - Exact: [512, 256, 1, 2852] + - Exact: [256, 1024, 1, 3864] + - Exact: [512, 256, 1, 2640] + - Exact: [256, 1024, 1, 4000] + - Exact: [512, 256, 1, 2904] + - Exact: [256, 1024, 1, 3900] + - Exact: [512, 256, 1, 2688] + - Exact: [256, 1024, 1, 3772] + - Exact: [512, 256, 1, 3400] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 3552] + - Exact: [128, 64, 25, 6498] + - Exact: [128, 64, 25, 6859] + - Exact: [64, 64, 64, 3042] + - Exact: [64, 64, 64, 3211] + - Exact: [64, 64, 49, 4050] + - Exact: [64, 64, 49, 4275] + - Exact: [64, 64, 36, 6498] + - Exact: [64, 64, 36, 6859] + - Exact: [1152, 128, 1, 1444] + - Exact: [512, 256, 1, 361] + - Exact: [576, 128, 1, 1444] + - Exact: [29000, 35, 1, 2560] + - Exact: [29000, 36, 1, 2560] + - Exact: [29000, 39, 1, 2560] + - Exact: [29000, 40, 1, 2560] + - Exact: [29000, 42, 1, 2560] + - Exact: [29000, 43, 1, 2560] + - Exact: [29000, 44, 1, 2560] + - Exact: [29000, 46, 1, 2560] + - Exact: [29000, 48, 1, 2560] + - Exact: [29000, 49, 1, 2560] + - Exact: [29000, 50, 1, 2560] + - Exact: [29000, 51, 1, 2560] + - Exact: [29000, 53, 1, 2560] + - Exact: [29000, 54, 1, 2560] + - Exact: [29000, 55, 1, 2560] + - Exact: [29000, 56, 1, 2560] + - Exact: [29000, 57, 1, 2560] + - Exact: [29000, 58, 1, 2560] + - Exact: [29000, 59, 1, 2560] + - Exact: [29000, 61, 1, 2560] + - Exact: [29000, 63, 1, 2560] + +# bodys smaSizeGSU + - 
# BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 128, 1, 13600] + - Exact: [256, 128, 1, 12880] + - Exact: [128, 512, 1, 15200] + - Exact: [512, 128, 1, 15200] + - Exact: [128, 512, 1, 11408] + - Exact: [256, 128, 1, 13824] + - Exact: [128, 512, 1, 11616] + - Exact: [256, 128, 1, 14208] + - Exact: [128, 512, 1, 14208] + - Exact: [256, 128, 1, 15200] + - Exact: [512, 128, 1, 11408] + - Exact: [512, 128, 1, 16800] + - Exact: [128, 512, 1, 11264] + - Exact: [512, 128, 1, 11616] + - Exact: [512, 128, 1, 16128] + - Exact: [512, 128, 1, 11968] + - Exact: [128, 512, 1, 11968] + - Exact: [512, 128, 1, 12288] + - Exact: [128, 512, 1, 12288] + - Exact: [128, 512, 1, 12672] + - Exact: [512, 128, 1, 11776] + - Exact: [512, 128, 1, 12144] + - Exact: [512, 128, 1, 11264] + - Exact: [128, 512, 1, 12144] + - Exact: [512, 128, 1, 12672] + - Exact: [128, 512, 1, 12512] + - Exact: [128, 512, 1, 11776] + - Exact: [256, 128, 1, 12288] + - Exact: [40, 40, 1, 1909283] + - Exact: [40, 40, 1, 3818566] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: 
[True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [5888, 1, 1, 3328] + - Exact: [5056, 1, 1, 3328] + - Exact: [6784, 1, 1, 1280] + - Exact: [2944, 1, 1, 3328] + - Exact: [3584, 1, 1, 1280] + - Exact: [6784, 1, 1, 256] + - Exact: [4288, 1, 1, 1280] + - Exact: [5056, 1, 1, 1280] + - Exact: [3584, 1, 1, 256] + - Exact: [6784, 1, 1, 3328] + - Exact: [1408, 1, 1, 1280] + - Exact: [1408, 32, 1, 3328] + - Exact: [4288, 1, 1, 256] + - Exact: [2368, 1, 1, 256] + - Exact: [1856, 32, 1, 32] + - Exact: [5056, 1, 1, 256] + - Exact: [5056, 1, 1, 1] + - Exact: [1408, 1, 1, 256] + - Exact: [1408, 1, 1, 1] + - Exact: [4288, 1, 1, 3328] + - Exact: [2368, 1, 1, 1280] + - Exact: [1856, 1, 1, 1] + - Exact: [1856, 32, 1, 256] + - Exact: [1408, 32, 1, 32] + - Exact: [1856, 32, 1, 1280] + - Exact: [1408, 1, 1, 3328] + - Exact: [5888, 1, 1, 256] + - Exact: [5888, 1, 1, 1] + - Exact: [1856, 32, 1, 3328] + - Exact: [2368, 1, 1, 3328] + - Exact: [6784, 1, 1, 1] + - Exact: [5888, 1, 1, 1280] + - Exact: [2944, 1, 1, 256] + - Exact: [2944, 1, 1, 1] + - Exact: [1408, 32, 1, 1280] + - Exact: [1856, 1, 1, 1280] + - Exact: [3584, 1, 1, 1] + - Exact: [2944, 1, 1, 1280] + - Exact: [3584, 1, 1, 3328] + - Exact: [1856, 1, 1, 3328] + - Exact: [4288, 1, 1, 1] + - Exact: [1856, 1, 1, 256] + - Exact: [1408, 32, 1, 256] + - Exact: [2368, 1, 1, 1] + - Exact: [1760, 32, 1, 1760] + - Exact: [3072, 16, 1, 1024] + - Exact: [2560, 16, 1, 2560] + - Exact: [2048, 32, 1, 2048] + - Exact: [1760, 16, 1, 1760] + - Exact: [7680, 16, 1, 2560] + - 
Exact: [8448, 16, 1, 2816] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [2048, 16, 1, 2048] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2560] + - Exact: [32768, 1, 1, 256] + - Exact: [1600, 1, 1, 1024] + - Exact: [3456, 1, 1, 256] + - Exact: [4096, 1, 1, 256] + - Exact: [6912, 1, 1, 256] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2560] + - Exact: [29000, 27, 1, 2560] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1, 4288, 1, 1280] + - Exact: [32, 1408, 1, 32] + - Exact: [1, 1408, 1, 3328] + - Exact: [1, 2368, 1, 1280] + - Exact: [1, 5888, 1, 3328] + - Exact: [1, 1856, 1, 256] + - Exact: [1, 3584, 1, 3328] + - Exact: [1, 6784, 1, 3328] + - Exact: [1, 2368, 1, 256] + - Exact: [32, 1856, 1, 3328] + - Exact: [1, 2944, 1, 1280] + - Exact: [1, 1856, 1, 3328] + - Exact: [1, 1408, 1, 1] + - Exact: [1, 6784, 1, 256] + - Exact: [1, 6784, 1, 1] + - Exact: [1, 4288, 1, 3328] + - Exact: [1, 2368, 1, 3328] + - Exact: [1, 5888, 1, 1280] + - Exact: [1, 2944, 1, 256] + - Exact: [1, 6784, 1, 1280] + - Exact: [1, 5056, 1, 1] + - Exact: [32, 1856, 1, 32] + - Exact: [32, 1408, 1, 256] + - Exact: [1, 5888, 1, 1] + - Exact: [1, 2944, 1, 3328] + - Exact: [1, 3584, 
1, 1] + - Exact: [1, 1408, 1, 256] + - Exact: [1, 1856, 1, 1] + - Exact: [1, 5056, 1, 1280] + - Exact: [1, 5888, 1, 256] + - Exact: [32, 1856, 1, 1280] + - Exact: [1, 2368, 1, 1] + - Exact: [1, 1408, 1, 1280] + - Exact: [1, 5056, 1, 256] + - Exact: [1, 3584, 1, 1280] + - Exact: [1, 4288, 1, 256] + - Exact: [1, 4288, 1, 1] + - Exact: [1, 2944, 1, 1] + - Exact: [32, 1408, 1, 3328] + - Exact: [1, 5056, 1, 3328] + - Exact: [32, 1856, 1, 256] + - Exact: [1, 1856, 1, 1280] + - Exact: [1, 3584, 1, 256] + - Exact: [32, 1408, 1, 1280] + - Exact: [2, 2048, 1, 1024] + - Exact: [32, 1600, 1, 512] + - Exact: [1, 4096, 1, 256] + - Exact: [1, 6912, 1, 256] + - Exact: [2, 2048, 1, 768] + - Exact: [2, 4608, 1, 768] + - Exact: [2, 4608, 1, 1024] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [147, 64, 1, 12544] + - Exact: [256, 128, 1, 10752] + - Exact: [256, 128, 1, 10560] + - Exact: [256, 128, 1, 11408] + - Exact: [256, 12, 1, 11408] + - Exact: [256, 128, 1, 11616] + - Exact: [256, 12, 1, 11616] + - Exact: [256, 12, 1, 12288] + - Exact: [576, 64, 1, 5625] + - Exact: [147, 64, 1, 22500] + - Exact: [11, 11, 1, 1909283] + - Exact: [11, 11, 1, 
3818566] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 1, 1, 256] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [448, 64, 1, 1] + - Exact: [64, 1024, 1, 1280] + - Exact: [1024, 1, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [448, 128, 1, 256] + - Exact: [1, 1024, 1, 3328] + - Exact: [704, 64, 1, 32] + - Exact: [32, 448, 1, 3328] + - Exact: [448, 1, 1, 1] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 128, 1, 1] + - Exact: [256, 128, 1, 1] + - Exact: [256, 32, 1, 3328] + - Exact: [1, 1, 1, 3328] + - Exact: [32, 448, 1, 1280] + - Exact: [32, 448, 1, 32] + - Exact: [64, 1024, 1, 32] + - Exact: [128, 1, 1, 1] + - Exact: [1024, 32, 1, 3328] + - Exact: [448, 1, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [256, 256, 1, 32] + - Exact: [1024, 1, 1, 256] + - Exact: [128, 32, 1, 32] + - Exact: [448, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [1, 64, 1, 3328] + - Exact: [64, 1024, 1, 1] + - Exact: [64, 1024, 1, 3328] + - Exact: [32, 704, 1, 3328] + - Exact: [32, 1024, 1, 3328] + - Exact: [64, 1, 1, 256] + - Exact: [1024, 64, 1, 32] + - Exact: [1024, 64, 1, 3328] + - Exact: [32, 1024, 1, 256] + - Exact: [64, 1, 1, 1] + - Exact: 
[256, 1, 1, 256] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 64, 1, 1] + - Exact: [32, 704, 1, 1280] + - Exact: [256, 1, 1, 1280] + - Exact: [128, 32, 1, 1280] + - Exact: [128, 256, 1, 1] + - Exact: [1, 256, 1, 256] + - Exact: [1, 256, 1, 1] + - Exact: [1024, 1, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [1024, 32, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [704, 32, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [32, 32, 1, 32] + - Exact: [1024, 32, 1, 32] + - Exact: [128, 64, 1, 32] + - Exact: [64, 1, 1, 1280] + - Exact: [448, 32, 1, 1280] + - Exact: [704, 32, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 256, 1, 1] + - Exact: [256, 256, 1, 256] + - Exact: [448, 1, 1, 3328] + - Exact: [256, 1, 1, 1] + - Exact: [32, 1024, 1, 1280] + - Exact: [1, 256, 1, 3328] + - Exact: [256, 32, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [256, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [32, 1024, 1, 32] + - Exact: [128, 256, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [704, 1, 1, 1] + - Exact: [128, 448, 1, 1280] + - Exact: [448, 32, 1, 32] + - Exact: [704, 64, 1, 1] + - Exact: [704, 32, 1, 256] + - Exact: [32, 704, 1, 32] + - Exact: [128, 64, 1, 256] + - Exact: [448, 32, 1, 3328] + - Exact: [64, 704, 1, 32] + - Exact: [64, 704, 1, 3328] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 448, 1, 32] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 1] + - Exact: [1, 1024, 1, 1] + - Exact: [256, 1, 1, 3328] + - Exact: [32, 64, 1, 32] + - Exact: [256, 256, 1, 1] + - Exact: [32, 256, 1, 32] + - Exact: [128, 1, 1, 256] + - Exact: [32, 64, 1, 3328] + - Exact: [1, 128, 1, 3328] + - Exact: [32, 256, 1, 256] + - Exact: [1, 448, 1, 1] + - Exact: [1, 704, 1, 3328] + - Exact: [64, 1, 1, 3328] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 32, 1, 1280] + - Exact: [128, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 32, 1, 32] + - Exact: [1, 448, 1, 3328] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 704, 1, 
1280] + - Exact: [64, 32, 1, 3328] + - Exact: [64, 448, 1, 1] + - Exact: [128, 128, 1, 1280] + - Exact: [64, 128, 1, 256] + - Exact: [64, 448, 1, 32] + - Exact: [128, 64, 1, 3328] + - Exact: [32, 64, 1, 1280] + - Exact: [448, 32, 1, 256] + - Exact: [1024, 32, 1, 256] + - Exact: [1, 128, 1, 256] + - Exact: [32, 256, 1, 1280] + - Exact: [32, 128, 1, 3328] + - Exact: [32, 128, 1, 32] + - Exact: [1, 128, 1, 1] + - Exact: [128, 64, 1, 1] + - Exact: [32, 448, 1, 256] + - Exact: [1, 704, 1, 256] + - Exact: [32, 256, 1, 3328] + - Exact: [256, 32, 1, 32] + - Exact: [64, 256, 1, 3328] + - Exact: [1, 704, 1, 1] + - Exact: [128, 448, 1, 1] + - Exact: [64, 128, 1, 32] + - Exact: [704, 1, 1, 1280] + - Exact: [1024, 1, 1, 1] + - Exact: [256, 128, 1, 32] + - Exact: [448, 128, 1, 1] + - Exact: [704, 32, 1, 32] + - Exact: [128, 32, 1, 256] + - Exact: [64, 32, 1, 1280] + - Exact: [448, 128, 1, 32] + - Exact: [128, 448, 1, 256] + - Exact: [32, 32, 1, 256] + - Exact: [256, 64, 1, 32] + - Exact: [1, 1024, 1, 1280] + - Exact: [32, 32, 1, 3328] + - Exact: [1, 256, 1, 1280] + - Exact: [1, 128, 1, 1280] + - Exact: [1, 64, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [32, 704, 1, 256] + - Exact: [1, 64, 1, 1] + - Exact: [704, 64, 1, 1280] + - Exact: [1, 704, 1, 1280] + - Exact: [128, 128, 1, 32] + - Exact: [1024, 64, 1, 1] + - Exact: [704, 1, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 64, 1, 32] + - Exact: [1, 1, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [1, 448, 1, 1280] + - Exact: [64, 256, 1, 32] + - Exact: [32, 128, 1, 1280] + - Exact: [128, 128, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [32, 64, 1, 256] + - Exact: [128, 256, 1, 32] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [448, 64, 1, 32] + - Exact: [64, 128, 1, 1280] + - Exact: [1, 1024, 1, 256] + - Exact: [128, 1, 1, 3328] + - Exact: [128, 128, 1, 1] + - Exact: [32, 128, 1, 256] + - Exact: [1, 64, 1, 1280] + - Exact: [448, 128, 1, 1280] + - Exact: [256, 64, 1, 1] + - Exact: 
[256, 256, 1, 1280] + - Exact: [704, 1, 1, 3328] + - Exact: [128, 32, 1, 3328] + - Exact: [32, 32, 1, 1280] + - Exact: [1, 1, 1, 256] + - Exact: [1, 448, 1, 256] + - Exact: [256, 64, 1, 3328] + - Exact: [64, 32, 1, 256] + - Exact: [128, 1, 1, 1280] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 256, 1, 3136] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + - Exact: [14, 14, 1, 64] + - Exact: [15, 14, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [17, 15, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [21, 17, 1, 64] + - Exact: [21, 21, 1, 64] + - Exact: [24, 24, 1, 64] + - Exact: [30, 30, 1, 64] + - Exact: [30, 31, 1, 64] + - Exact: [31, 31, 1, 64] + - Exact: [32, 32, 1, 64] + - Exact: [32, 35, 1, 64] + - Exact: [34, 24, 1, 64] + - Exact: [34, 34, 1, 64] + - Exact: [35, 35, 1, 64] + - Exact: [27, 27, 1, 64] + - Exact: [27, 33, 1, 64] + - Exact: [33, 33, 1, 64] + - Exact: [2, 4, 1, 1024] + - Exact: [2, 32, 1, 1024] + - Exact: [64, 512, 1, 512] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 32, 1, 1024] + - Exact: [3, 3, 512, 64] + - Exact: [5, 5, 512, 64] + - Exact: [5, 5, 960, 64] + - Exact: [9, 9, 512, 64] + - Exact: [27, 27, 32768, 128] + - Exact: [64, 512, 1, 1024] + - Exact: [64, 960, 1, 1024] + - Exact: [14, 14, 10880, 64] + - Exact: [15, 14, 10880, 64] + - Exact: [15, 15, 7680, 64] + - Exact: [15, 15, 10880, 64] + - Exact: [17, 15, 7680, 64] + - Exact: [17, 17, 7680, 64] + - Exact: [21, 17, 6144, 64] + - Exact: [21, 21, 6144, 64] + - Exact: [24, 24, 4736, 64] + - Exact: [30, 30, 2048, 64] + - Exact: [30, 31, 2048, 64] + - Exact: [31, 31, 2048, 64] + - Exact: [34, 24, 4736, 64] + - Exact: [27, 27, 1920, 64] + - Exact: [27, 33, 1920, 64] + - Exact: [2, 8, 1, 1024] + - Exact: [1024, 77, 1, 1024] + - Exact: [2, 10, 1, 1024] + - Exact: [1024, 10, 1, 1024] + - Exact: [2, 39, 1, 1024] + - Exact: [1024, 39, 1, 1024] + - Exact: [2, 40, 1, 1024] + 
- Exact: [1024, 40, 1, 1024] + - Exact: [2, 41, 1, 1024] + - Exact: [1024, 41, 1, 1024] + - Exact: [2, 5, 1, 1024] + - Exact: [1024, 5, 1, 1024] + - Exact: [2, 6, 1, 1024] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 8, 1, 1024] + - Exact: [2, 9, 1, 1024] + - Exact: [1024, 9, 1, 1024] + - Exact: [4, 4, 32768, 64] + - Exact: [4, 4, 38400, 64] + - Exact: [17, 17, 6144, 64] + - Exact: [128, 128, 1, 64] + - Exact: [64, 128, 1, 128] + - Exact: [2, 1024, 1, 1024] + - Exact: [5, 5, 1, 64] + - Exact: [33, 33, 1, 32] + - Exact: [1024, 16, 1, 1024] + - Exact: [2, 4, 1, 2560] + - Exact: [2, 16, 1, 1024] + - Exact: [2, 2, 1, 2048] + - Exact: [1024, 1, 1, 1024] + - Exact: [512, 1, 1, 2048] + - Exact: [200, 1, 1, 1024] + - Exact: [960, 1, 1, 2048] + - Exact: [1024, 64, 1, 1024] + - Exact: [864, 1, 1, 256] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 12, 1, 1024] + - Exact: [2, 64, 1, 1024] + - Exact: [2, 80, 1, 1024] + - Exact: [2, 82, 1, 1024] + - Exact: [2, 12, 1, 1024] + - Exact: [2, 1, 1, 1024] + - Exact: [24, 24, 6816, 64] + - Exact: [256, 128, 1, 3136] + - Exact: [576, 64, 1, 3136] + - Exact: [768, 16, 1, 768] + - Exact: [768, 12, 1, 768] + - Exact: [768, 4, 1, 768] + - Exact: [64, 1024, 1, 1024] + - Exact: [26, 26, 6272, 64] + - Exact: [2, 128, 1, 1024] + - Exact: [2, 96, 1, 1024] + - Exact: [256, 80, 1, 784] + - Exact: [256, 12, 1, 3800] + - Exact: [256, 3, 1, 3800] + - Exact: [256, 12, 1, 950] + - Exact: [256, 3, 1, 950] + - Exact: [256, 12, 1, 3220] + - Exact: [256, 3, 1, 3220] + - Exact: [256, 12, 1, 3072] + - Exact: [256, 3, 1, 3072] + - Exact: [256, 12, 1, 850] + - Exact: [256, 3, 1, 850] + - Exact: [256, 12, 1, 2852] + - Exact: [256, 3, 1, 2852] + - Exact: [256, 12, 1, 805] + - Exact: [256, 3, 1, 805] + - Exact: [256, 3, 1, 864] + - Exact: [256, 3, 1, 768] + - Exact: [256, 12, 1, 864] + - Exact: [256, 12, 1, 768] + - Exact: [256, 12, 1, 2904] + - Exact: [256, 3, 1, 2904] + - Exact: [256, 3, 1, 713] + - Exact: [256, 12, 1, 888] 
+ - Exact: [256, 3, 1, 888] + - Exact: [256, 12, 1, 713] + - Exact: [256, 3, 1, 660] + - Exact: [256, 3, 1, 672] + - Exact: [256, 12, 1, 660] + - Exact: [256, 3, 1, 726] + - Exact: [256, 12, 1, 672] + - Exact: [256, 3, 1, 247] + - Exact: [256, 12, 1, 726] + - Exact: [256, 3, 1, 216] + - Exact: [256, 3, 1, 3400] + - Exact: [256, 3, 1, 221] + - Exact: [256, 12, 1, 3552] + - Exact: [256, 3, 1, 3456] + - Exact: [256, 3, 1, 204] + - Exact: [256, 12, 1, 3400] + - Exact: [256, 12, 1, 3456] + - Exact: [256, 12, 1, 221] + - Exact: [256, 3, 1, 3552] + - Exact: [256, 3, 1, 228] + - Exact: [256, 3, 1, 234] + - Exact: [256, 12, 1, 234] + - Exact: [256, 12, 1, 228] + - Exact: [256, 3, 1, 252] + - Exact: [256, 12, 1, 252] + - Exact: [256, 12, 1, 247] + - Exact: [128, 256, 1, 1444] + - Exact: [256, 128, 1, 25] + - Exact: [256, 128, 1, 9] + - Exact: [256, 256, 1, 1444] + - Exact: [512, 128, 1, 100] + - Exact: [64, 128, 1, 1444] + - Exact: [81, 1024, 1, 1024] + - Exact: [81, 1000, 1, 1024] + - Exact: [1024, 20, 1, 1024] + - Exact: [2, 8, 1, 2048] + - Exact: [2, 20, 1, 1024] + - Exact: [2, 2, 1, 2560] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml new file mode 100644 index 000000000..5b58c274b --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml @@ -0,0 +1,288 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + 
DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 5056, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [64, 5056, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [64, 5888, 1, 256] + - Exact: [1024, 1024, 1, 1024] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] 
+ - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 1408, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [64, 2368, 1, 3328] + - Exact: [64, 3584, 1, 1280] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2368, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [64, 1856, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [64, 4288, 1, 256] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 4288, 1, 3328] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1, 2944, 1, 3328] + - Exact: [1, 2368, 1, 1280] + - Exact: [1, 1408, 1, 1280] + - Exact: [1, 2368, 1, 3328] + - Exact: [1, 3584, 1, 1280] + - Exact: [1, 2944, 1, 1] + - Exact: [1, 1408, 1, 3328] + - Exact: [1, 2944, 1, 256] + - Exact: [1, 5056, 1, 256] + - Exact: [1, 1856, 1, 256] + - Exact: [1, 1856, 1, 1280] + - Exact: [1, 4288, 1, 1280] + - Exact: [1, 1408, 1, 1] + - Exact: [1, 1408, 1, 256] + - Exact: [1, 
2368, 1, 256] + - Exact: [1, 4288, 1, 1] + - Exact: [1, 1856, 1, 3328] + - Exact: [1, 4288, 1, 3328] + - Exact: [1, 2368, 1, 1] + - Exact: [1, 3584, 1, 3328] + - Exact: [1, 5056, 1, 1] + - Exact: [1, 3584, 1, 256] + - Exact: [1, 5056, 1, 1280] + - Exact: [1, 3584, 1, 1] + - Exact: [1, 2944, 1, 1280] + - Exact: [1, 1856, 1, 1] + - Exact: [1, 4288, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 448, 1, 3328] + - Exact: [1, 64, 1, 1280] + - Exact: [64, 128, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [1, 128, 1, 1] + - Exact: [64, 1024, 1, 3328] + - Exact: [1, 64, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [1, 256, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [1, 1, 1, 3328] + - Exact: [1, 64, 1, 1] + - Exact: [1, 128, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [1, 1024, 1, 256] + - Exact: [1, 704, 1, 3328] + - Exact: [1, 256, 1, 1] + - Exact: [128, 64, 1, 256] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [1, 1, 1, 256] + - Exact: [1, 704, 1, 1] + - Exact: [1, 704, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [1, 448, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [1, 1, 1, 1] + - Exact: [256, 64, 1, 256] + - Exact: [1, 448, 1, 3328] + - 
Exact: [1, 128, 1, 256] + - Exact: [1, 1024, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [1, 1024, 1, 1280] + - Exact: [1, 1, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [1, 448, 1, 256] + - Exact: [1, 1024, 1, 1] + - Exact: [64, 704, 1, 3328] + - Exact: [1, 256, 1, 256] + - Exact: [1, 704, 1, 256] + - Exact: [1, 128, 1, 3328] + - Exact: [64, 64, 1, 3328] + - Exact: [1, 448, 1, 1] + - Exact: [1, 64, 1, 256] + - Exact: [64, 704, 1, 1280] + - Exact: [1, 256, 1, 3328] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml new file mode 100644 index 000000000..6d78bddae --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml @@ -0,0 +1,2270 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: 
["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [5056, 6784, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [3584, 1024, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 2944, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] 
+ - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1408, 1408, 1, 3328] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 704, 1, 256] + - Exact: [1408, 3584, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1280] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [3584, 5888, 1, 256] + - Exact: [2944, 3584, 1, 256] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [1856, 
1024, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [5056, 1408, 1, 3328] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 1856, 1, 1280] + - Exact: [3584, 448, 1, 256] + - Exact: [3584, 3584, 1, 1280] + - Exact: [256, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [6784, 4288, 1, 1280] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [5056, 1856, 1, 256] + - Exact: [704, 4288, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 4288, 1, 3328] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [1408, 2944, 1, 3328] + - Exact: [4288, 4288, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: 
[4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 3328] + - Exact: [448, 4288, 1, 1280] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [704, 5056, 1, 1280] + - Exact: [2944, 1856, 1, 256] + - Exact: [5888, 5056, 1, 3328] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 5056, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [448, 5056, 1, 256] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 
1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [2944, 704, 1, 3328] + - Exact: [2944, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 256] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [1856, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 3328] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 5056, 1, 1280] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [256, 5888, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [4288, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 5056, 1, 3328] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: 
[448, 3584, 1, 256] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [2944, 5888, 1, 3328] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 3328] + - Exact: [1408, 1024, 1, 256] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4288, 1408, 1, 256] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + 
- Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [448, 5888, 1, 1280] + - Exact: [4288, 704, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [5888, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [256, 5888, 1, 1280] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5124, 9124, 1, 1760] + - Exact: [1024, 1500, 1, 1536] + - Exact: [512, 24000, 1, 2048] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1024, 3000, 1, 2560] + - Exact: [512, 3136, 1, 2048] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [512, 48000, 1, 2048] + - Exact: [196, 256, 64, 1024] + - Exact: [512, 48000, 1, 1536] + - Exact: [4608, 1500, 1, 1536] + - Exact: [1024, 24000, 1, 2560] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5124, 9124, 1, 2048] + - Exact: [5124, 700, 1, 2560] + - Exact: [6144, 6000, 1, 2560] + - Exact: [1024, 1500, 1, 2816] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [4224, 1500, 1, 176] + - Exact: [1024, 6000, 1, 2816] + - Exact: [1024, 48000, 1, 1536] + - Exact: [1024, 48000, 1, 2560] + - Exact: [4608, 24000, 1, 1536] + - 
Exact: [7680, 48000, 1, 2560] + - Exact: [3072, 48000, 1, 1024] + - Exact: [1024, 1500, 1, 2048] + - Exact: [1024, 3000, 1, 2048] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1760, 7000, 1, 1760] + - Exact: [8448, 3000, 1, 2816] + - Exact: [4608, 48000, 1, 1536] + - Exact: [7680, 1500, 1, 2560] + - Exact: [512, 3000, 1, 1536] + - Exact: [1024, 3000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 24000, 1, 1536] + - Exact: [7680, 6000, 1, 2560] + - Exact: [512, 6000, 1, 2560] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [1024, 24000, 1, 2816] + - Exact: [2048, 7000, 1, 2048] + - Exact: [7680, 3000, 1, 2560] + - Exact: [5124, 700, 1, 2048] + - Exact: [5124, 9124, 1, 4096] + - Exact: [256, 193600, 1, 64] + - Exact: [8448, 6000, 1, 2816] + - Exact: [5124, 1500, 1, 2560] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1024, 6000, 1, 2560] + - Exact: [196, 1024, 64, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [7680, 24000, 1, 2560] + - Exact: [512, 3000, 1, 2560] + - Exact: [8448, 24000, 1, 2816] + - Exact: [512, 6000, 1, 1536] + - Exact: [3072, 6000, 1, 1024] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [1024, 3000, 1, 1536] + - Exact: [512, 6000, 1, 2816] + - Exact: [128, 50176, 1, 512] + - Exact: [256, 12544, 1, 1024] + - Exact: [1024, 12544, 1, 256] + - Exact: [512, 48000, 1, 2560] + - Exact: [512, 24000, 1, 1536] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5124, 1500, 1, 2048] + - Exact: [3072, 1500, 1, 1024] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 48000, 1, 2816] + - Exact: [1024, 6000, 1, 1536] + - Exact: [512, 3000, 1, 2048] + - Exact: [6144, 24000, 1, 2560] + - Exact: [4608, 6000, 1, 1536] + - Exact: [3072, 3000, 1, 1024] + - Exact: [1024, 48000, 1, 2048] + - Exact: [784, 512, 64, 128] + - Exact: [3136, 256, 64, 64] + - Exact: [12544, 1024, 1, 
256] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 512, 1, 2048] + - Exact: [12544, 256, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [1024, 2048, 1, 2] + - Exact: [1024, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 2048, 1, 30528] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 4096, 1, 30528] + - Exact: [4096, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [256, 8976, 1, 1536] + - Exact: [256, 8976, 1, 2048] + - Exact: [256, 8976, 1, 2304] + - Exact: [256, 8976, 1, 2560] + - Exact: [256, 8976, 1, 2816] + - Exact: [256, 8976, 1, 3072] + - Exact: [256, 8976, 1, 4352] + - Exact: [256, 8976, 1, 4864] + - Exact: [256, 8976, 1, 5376] + - Exact: [256, 8976, 1, 5632] + - Exact: [256, 8976, 1, 5888] + - Exact: [256, 8976, 1, 6144] + - Exact: [256, 8976, 1, 6656] + - Exact: [256, 8976, 1, 7168] + - Exact: [256, 8976, 1, 7424] + - Exact: [256, 8976, 1, 8192] + - Exact: [256, 8976, 1, 8448] + - Exact: [256, 8976, 1, 8960] + - Exact: [256, 8976, 1, 9472] + - Exact: [256, 8976, 1, 9728] + - Exact: [256, 8976, 1, 9984] + - Exact: [256, 8976, 1, 10240] + - Exact: [256, 8976, 1, 10496] + - Exact: [256, 8976, 1, 11008] + - Exact: [256, 8976, 1, 11520] + - Exact: [256, 8976, 1, 12288] + - Exact: [256, 8976, 1, 14336] + - Exact: [256, 8976, 1, 14848] + - Exact: [256, 8976, 1, 15104] + - Exact: [256, 8976, 1, 15872] + - Exact: [256, 8976, 1, 17152] + - Exact: [256, 8976, 1, 19712] + - Exact: [256, 8976, 1, 19968] + - Exact: [256, 8976, 1, 20480] + - Exact: [256, 8976, 1, 20992] + - Exact: [256, 8976, 1, 22016] + - Exact: [256, 8976, 1, 26112] + - Exact: [256, 8976, 1, 33536] + - Exact: [256, 8976, 1, 44505] + - Exact: [256, 32768, 1, 128] + - Exact: [480, 
32768, 1, 1024] + - Exact: [512, 32768, 1, 256] + - Exact: [1024, 1600, 1, 1] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1792, 1, 256] + - Exact: [1024, 2048, 1, 256] + - Exact: [1024, 2560, 1, 256] + - Exact: [1024, 3072, 1, 256] + - Exact: [1024, 3328, 1, 256] + - Exact: [1024, 3840, 1, 256] + - Exact: [1024, 4096, 1, 256] + - Exact: [1024, 4608, 1, 256] + - Exact: [1024, 4864, 1, 256] + - Exact: [1024, 5120, 1, 256] + - Exact: [1024, 5632, 1, 256] + - Exact: [1024, 6144, 1, 256] + - Exact: [1024, 6400, 1, 256] + - Exact: [1024, 7168, 1, 256] + - Exact: [1024, 7424, 1, 256] + - Exact: [1024, 7680, 1, 256] + - Exact: [1024, 7936, 1, 256] + - Exact: [1024, 8192, 1, 256] + - Exact: [1024, 8448, 1, 256] + - Exact: [1024, 8704, 1, 256] + - Exact: [1024, 8960, 1, 256] + - Exact: [1024, 9728, 1, 256] + - Exact: [1024, 9984, 1, 256] + - Exact: [1024, 10240, 1, 256] + - Exact: [1024, 10496, 1, 256] + - Exact: [1024, 11008, 1, 256] + - Exact: [1024, 11264, 1, 256] + - Exact: [1024, 11520, 1, 256] + - Exact: [1024, 12288, 1, 256] + - Exact: [1024, 13312, 1, 256] + - Exact: [1024, 13568, 1, 256] + - Exact: [1024, 14336, 1, 256] + - Exact: [1024, 14592, 1, 256] + - Exact: [1024, 14848, 1, 256] + - Exact: [1024, 15104, 1, 256] + - Exact: [1024, 16128, 1, 256] + - Exact: [1024, 17152, 1, 256] + - Exact: [1024, 18944, 1, 256] + - Exact: [1024, 19712, 1, 256] + - Exact: [1024, 19968, 1, 256] + - Exact: [1024, 20480, 1, 256] + - Exact: [1024, 20992, 1, 256] + - Exact: [1024, 21504, 1, 256] + - Exact: [1024, 22016, 1, 256] + - Exact: [1024, 23552, 1, 256] + - Exact: [1024, 28672, 1, 256] + - Exact: [1024, 32768, 1, 512] + - Exact: [1024, 32768, 1, 1024] + - Exact: [1024, 33536, 1, 256] + - Exact: [1024, 40448, 1, 256] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1024, 1, 1] + - Exact: [2048, 1024, 1, 256] + - Exact: [3200, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 1] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - 
Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 3968, 1, 42720] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 6528, 1, 42720] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7104, 1, 42720] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 7200, 1, 42720] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 9520, 1, 42720] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10080, 1, 42720] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [1024, 3240, 1, 1024] + - Exact: [1024, 3240, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3960, 1, 4096] + - Exact: [1024, 3960, 1, 42720] + - Exact: [4096, 3240, 1, 1024] + - Exact: [4096, 3960, 1, 1024] + - Exact: [289, 128, 64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [3136, 128, 64, 64] + - Exact: [3136, 256, 64, 128] + - Exact: [784, 512, 64, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 1024, 64, 512] + - Exact: [784, 256, 64, 512] + - Exact: [784, 512, 64, 
512] + - Exact: [196, 512, 64, 1024] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 64] + - Exact: [3136, 256, 32, 128] + - Exact: [784, 512, 32, 256] + - Exact: [3136, 128, 32, 256] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 1024, 32, 512] + - Exact: [784, 256, 32, 512] + - Exact: [784, 512, 32, 512] + - Exact: [196, 512, 32, 1024] + - Exact: [196, 1024, 32, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 2] + - Exact: [4096, 512, 1, 1024] + - Exact: [1024, 1280, 1, 2] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1024, 1, 1024] + - Exact: [4096, 1280, 1, 1024] + - Exact: [1024, 4992, 1, 2] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [1024, 5120, 1, 2] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [1024, 5248, 1, 2] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [1024, 2560, 1, 2] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 2] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3072, 1, 1024] + - Exact: [1024, 1152, 1, 2] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [479, 32768, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 8192, 1, 33712] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [1024, 9600, 1, 33712] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [1024, 
1024, 64, 64] + - Exact: [1024, 16384, 1, 3072] + - Exact: [1024, 2048, 1, 30592] + - Exact: [640, 2048, 1, 2560] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 4096, 1, 4608] + - Exact: [512, 512, 256, 64] + - Exact: [2048, 1024, 1, 8192] + - Exact: [4096, 16384, 1, 1024] + - Exact: [1024, 8192, 1, 50304] + - Exact: [1536, 8192, 1, 50304] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 4096, 1, 30592] + - Exact: [1536, 4096, 1, 6144] + - Exact: [1024, 16384, 1, 4096] + - Exact: [1024, 16384, 1, 50304] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1024, 2048, 1, 3072] + - Exact: [2560, 2048, 1, 7680] + - Exact: [2048, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 30592] + - Exact: [8192, 1024, 1, 2048] + - Exact: [2560, 2048, 1, 2560] + - Exact: [1536, 8192, 1, 4608] + - Exact: [1024, 2048, 1, 50304] + - Exact: [1024, 1024, 32, 64] + - Exact: [1536, 8192, 1, 6144] + - Exact: [1024, 1024, 256, 64] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 50304] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 8192, 1, 3072] + - Exact: [1024, 1024, 128, 64] + - Exact: [1024, 4096, 1, 50304] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 16384, 1, 1024] + - Exact: [2560, 2048, 1, 1920] + - Exact: [2048, 1024, 1, 6144] + - Exact: [512, 512, 128, 64] + - Exact: [1024, 8192, 1, 30592] + - Exact: [1536, 4096, 1, 1536] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 8192, 1, 30528] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 512] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 6912, 1, 1024] + - Exact: [1024, 6912, 1, 512] + - Exact: [256, 55296, 1, 128] + - Exact: [256, 6912, 1, 128] + - Exact: [480, 3456, 1, 1024] + - Exact: [480, 4096, 1, 1024] + - Exact: [480, 6912, 1, 1024] + - Exact: [512, 3456, 1, 256] + - Exact: [512, 4096, 1, 256] + - Exact: [512, 55296, 1, 256] + - Exact: [512, 6912, 1, 256] + - Exact: [1024, 1280, 1, 30528] + - Exact: [1024, 1600, 1, 30528] + - Exact: [1024, 10240, 1, 1024] + - Exact: 
[1024, 10240, 1, 4096] + - Exact: [4096, 10240, 1, 1024] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 1640, 1, 30528] + - Exact: [1024, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 4096] + - Exact: [4096, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [1024, 6144, 1, 4096] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [512, 512, 192, 64] + - Exact: [256, 6912, 1, 1] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [1024, 10224, 1, 3072] + - Exact: [4096, 10224, 1, 1024] + - Exact: [1024, 10240, 1, 3072] + - Exact: [1024, 10192, 1, 3072] + - Exact: [4096, 10192, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [1024, 10200, 1, 3072] + - Exact: [1024, 10184, 1, 1024] + - Exact: [4096, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 3072] + - Exact: [1024, 10208, 1, 4096] + - Exact: [1024, 10224, 1, 2048] + - Exact: [1024, 10240, 1, 2048] + - Exact: [1024, 10120, 1, 1024] + - Exact: [1024, 10192, 1, 2048] + - Exact: [1024, 10152, 1, 1024] + - Exact: [1024, 10080, 1, 3072] + - Exact: [100352, 512, 1, 256] + - Exact: [12544, 2048, 1, 1024] + - Exact: [200704, 512, 1, 256] + - Exact: [25088, 1024, 1, 512] + - Exact: [50176, 1024, 1, 512] + - Exact: [6272, 2048, 1, 1024] + - Exact: [3136, 128, 128, 256] + - Exact: [3136, 128, 256, 256] + - Exact: [784, 256, 128, 512] + - Exact: [784, 256, 256, 512] + - Exact: [128, 128, 2048, 64] + - Exact: [1024, 2560, 1, 30528] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 4096] + - Exact: [1024, 12288, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [1024, 1920, 1, 30528] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 2] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [768, 2048, 1, 768] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 2] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [768, 4608, 1, 
768] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 2] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [1024, 4608, 1, 1024] + - Exact: [8192, 1024, 1, 1024] + - Exact: [8192, 4096, 1, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 256, 1024] + - Exact: [196, 512, 128, 1024] + - Exact: [196, 512, 256, 1024] + - Exact: [3072, 256, 2, 1024] + - Exact: [768, 2048, 2, 512] + - Exact: [2904, 256, 2, 1024] + - Exact: [864, 2048, 2, 512] + - Exact: [2992, 256, 2, 1024] + - Exact: [3400, 256, 2, 1024] + - Exact: [4032, 256, 2, 1024] + - Exact: [15200, 128, 2, 512] + - Exact: [12288, 128, 2, 512] + - Exact: [888, 2048, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [3456, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2688, 256, 2, 1024] + - Exact: [13824, 128, 2, 512] + - Exact: [3168, 256, 2, 1024] + - Exact: [3360, 256, 2, 1024] + - Exact: [3552, 256, 2, 1024] + - Exact: [11616, 128, 2, 512] + - Exact: [4200, 256, 2, 1024] + - Exact: [840, 2048, 2, 512] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [3264, 256, 2, 1024] + - Exact: [13600, 256, 2, 512] + - Exact: [12880, 256, 2, 512] + - Exact: [12288, 256, 2, 512] + - Exact: [2816, 256, 2, 1024] + - Exact: [672, 2048, 2, 512] + - Exact: [13440, 128, 2, 512] + - Exact: [13824, 256, 2, 512] + - Exact: [15200, 256, 2, 512] + - Exact: [3600, 256, 2, 1024] + - Exact: [4032, 1024, 2, 256] + - Exact: [16128, 128, 2, 512] + - Exact: [15200, 128, 1, 512] + - Exact: [13600, 128, 1, 512] + - Exact: [2904, 1024, 2, 256] + - Exact: [2992, 1024, 2, 256] + - Exact: [1536, 2048, 1, 1024] + - Exact: [24576, 128, 1, 256] + - Exact: [24576, 512, 1, 256] + - Exact: [25760, 128, 1, 256] + - Exact: [25760, 512, 1, 256] + - Exact: [6144, 256, 1, 512] + - Exact: [6440, 256, 1, 512] + - Exact: 
[13600, 512, 1, 128] + - Exact: [9408, 512, 2, 128] + - Exact: [56000, 256, 2, 64] + - Exact: [2816, 1024, 2, 256] + - Exact: [60800, 256, 1, 64] + - Exact: [2944, 1024, 2, 256] + - Exact: [11776, 512, 2, 128] + - Exact: [11616, 512, 2, 128] + - Exact: [4200, 1024, 2, 256] + - Exact: [54400, 256, 1, 64] + - Exact: [15200, 512, 1, 128] + - Exact: [2688, 1024, 2, 256] + - Exact: [12672, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [46464, 256, 2, 64] + - Exact: [2400, 256, 2, 1024] + - Exact: [2520, 256, 2, 1024] + - Exact: [2400, 1024, 2, 256] + - Exact: [10752, 128, 2, 512] + - Exact: [45632, 256, 2, 64] + - Exact: [2520, 1024, 2, 256] + - Exact: [53760, 256, 2, 64] + - Exact: [2352, 256, 2, 1024] + - Exact: [47872, 256, 2, 64] + - Exact: [47104, 256, 2, 64] + - Exact: [50688, 256, 2, 64] + - Exact: [45056, 256, 2, 64] + - Exact: [13440, 512, 2, 128] + - Exact: [2352, 1024, 2, 256] + - Exact: [11264, 512, 2, 128] + - Exact: [10560, 128, 2, 512] + - Exact: [16128, 512, 2, 128] + - Exact: [37632, 256, 2, 64] + - Exact: [51520, 256, 2, 64] + - Exact: [14000, 512, 2, 128] + - Exact: [10560, 512, 2, 128] + - Exact: [64512, 256, 2, 64] + - Exact: [54400, 256, 2, 64] + - Exact: [3264, 1024, 2, 256] + - Exact: [10752, 512, 2, 128] + - Exact: [3168, 1024, 2, 256] + - Exact: [55296, 256, 2, 256] + - Exact: [51520, 256, 2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [60800, 256, 2, 256] + - Exact: [54400, 256, 2, 256] + - Exact: [60800, 256, 2, 64] + - Exact: [3800, 1024, 1, 256] + - Exact: [3400, 1024, 1, 256] + - Exact: [3072, 1024, 2, 256] + - Exact: [3600, 1024, 2, 256] + - Exact: [12288, 512, 2, 128] + - Exact: [49152, 256, 2, 256] + - Exact: [12880, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [42240, 256, 2, 64] + - Exact: [1008, 2048, 2, 512] + - Exact: [3360, 1024, 2, 256] + - Exact: [14208, 512, 2, 128] + - Exact: [56832, 256, 2, 64] + - Exact: [43008, 256, 2, 64] + - Exact: [13600, 512, 2, 128] + - Exact: [2640, 1024, 2, 256] + - Exact: 
[13824, 512, 2, 128] + - Exact: [3800, 256, 2, 1024] + - Exact: [55296, 256, 2, 64] + - Exact: [2640, 256, 2, 1024] + - Exact: [15200, 512, 2, 128] + - Exact: [3552, 1024, 2, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [49152, 256, 2, 64] + - Exact: [3400, 1024, 2, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [6912, 256, 1, 512] + - Exact: [6800, 256, 1, 512] + - Exact: [27648, 128, 1, 256] + - Exact: [27200, 128, 1, 256] + - Exact: [30400, 128, 1, 256] + - Exact: [7600, 256, 1, 512] + - Exact: [6144, 1024, 1, 512] + - Exact: [6912, 1024, 1, 512] + - Exact: [6440, 1024, 1, 512] + - Exact: [27648, 512, 1, 256] + - Exact: [1728, 2048, 1, 1024] + - Exact: [27200, 512, 1, 256] + - Exact: [6800, 1024, 1, 512] + - Exact: [7600, 1024, 1, 512] + - Exact: [30400, 512, 1, 256] + - Exact: [12544, 1024, 1, 1024] + - Exact: [173280, 128, 1, 64] + - Exact: [231040, 128, 1, 64] + - Exact: [25992, 128, 1, 64] + - Exact: [2852, 256, 2, 1024] + - Exact: [3220, 256, 2, 1024] + - Exact: [850, 2048, 2, 512] + - Exact: [805, 2048, 2, 512] + - Exact: [3036, 256, 2, 1024] + - Exact: [713, 2048, 2, 512] + - Exact: [850, 2048, 1, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [3500, 256, 2, 1024] + - Exact: [3700, 256, 2, 1024] + - Exact: [748, 2048, 2, 512] + - Exact: [3036, 1024, 2, 256] + - Exact: [2852, 1024, 2, 256] + - Exact: [950, 2048, 1, 512] + - Exact: [3700, 1024, 2, 256] + - Exact: [3500, 1024, 2, 256] + - Exact: [3220, 1024, 2, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [1610, 2048, 1, 1024] + - Exact: [1700, 2048, 1, 1024] + - Exact: [1900, 2048, 1, 1024] + - Exact: [1444, 256, 120, 128] + - Exact: [1444, 256, 139, 128] + - Exact: [1444, 256, 160, 128] + - Exact: [1444, 256, 18, 128] + - Exact: [1444, 256, 19, 128] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 139, 256] + - Exact: [1444, 256, 160, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 256, 120, 512] + - Exact: [361, 256, 139, 
512] + - Exact: [361, 256, 160, 512] + - Exact: [361, 256, 18, 512] + - Exact: [361, 256, 19, 512] + - Exact: [200716, 128, 1, 64] + - Exact: [27436, 128, 1, 64] + - Exact: [1024, 1024, 160, 96] + - Exact: [1920, 16384, 1, 25216] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [960, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 2880] + - Exact: [1024, 1024, 40, 96] + - Exact: [1920, 4096, 1, 25216] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [960, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 2880] + - Exact: [1024, 1024, 80, 96] + - Exact: [1920, 8192, 1, 25216] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [960, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 2880] + - Exact: [1024, 1024, 96, 96] + - Exact: [2304, 16384, 1, 12672] + - Exact: [2304, 16384, 1, 2304] + - Exact: [576, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 1728] + - Exact: [1024, 1024, 24, 96] + - Exact: [2304, 4096, 1, 12672] + - Exact: [2304, 4096, 1, 2304] + - Exact: [576, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 1728] + - Exact: [1024, 1024, 48, 96] + - Exact: [2304, 8192, 1, 12672] + - Exact: [2304, 8192, 1, 2304] + - Exact: [576, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 1728] + - Exact: [1024, 1024, 16, 96] + - Exact: [3072, 4096, 1, 6400] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [384, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1152] + - Exact: [1024, 1024, 32, 96] + - Exact: [3072, 8192, 1, 6400] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [384, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1152] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 4096, 1, 2048] + - Exact: [1024, 2283, 1, 29000] + - Exact: [1024, 2296, 1, 29000] + - Exact: [1024, 2306, 1, 29000] + - Exact: [1024, 2309, 1, 29000] + - Exact: [1024, 2318, 1, 29000] + - Exact: [1024, 2320, 1, 29000] + - Exact: [1024, 2324, 1, 29000] + - Exact: 
[1024, 2325, 1, 29000] + - Exact: [1024, 2329, 1, 29000] + - Exact: [1024, 2338, 1, 29000] + - Exact: [1024, 2345, 1, 29000] + - Exact: [1024, 2350, 1, 29000] + - Exact: [1024, 2362, 1, 29000] + - Exact: [1024, 2366, 1, 29000] + - Exact: [1024, 2368, 1, 29000] + - Exact: [1024, 2374, 1, 29000] + - Exact: [1024, 2390, 1, 29000] + - Exact: [512, 512, 320, 64] + - Exact: [512, 512, 80, 64] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 3072] + - Exact: [1024, 32768, 1, 4096] + - Exact: [1024, 32768, 1, 50304] + - Exact: [4096, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 780, 1, 30522] + - Exact: [1024, 308, 1, 30522] + - Exact: [1024, 800, 1, 30522] + - Exact: [1024, 820, 1, 30522] + - Exact: [1024, 385, 1, 30522] + - Exact: [1024, 462, 1, 30522] + - Exact: [1024, 640, 1, 30528] + - Exact: [2048, 199, 1, 29000] + - Exact: [2048, 221, 1, 29000] + - Exact: [2048, 224, 1, 29000] + - Exact: [2048, 229, 1, 29000] + - Exact: [2048, 234, 1, 29000] + - Exact: [2048, 242, 1, 29000] + - Exact: [2048, 246, 1, 29000] + - Exact: [2048, 
247, 1, 29000] + - Exact: [2048, 256, 1, 29000] + - Exact: [2048, 262, 1, 29000] + - Exact: [2048, 264, 1, 29000] + - Exact: [2048, 265, 1, 29000] + - Exact: [2048, 274, 1, 29000] + - Exact: [2048, 277, 1, 29000] + - Exact: [2048, 279, 1, 29000] + - Exact: [2048, 288, 1, 29000] + - Exact: [2048, 296, 1, 29000] + - Exact: [2048, 315, 1, 29000] + - Exact: [2048, 335, 1, 29000] + - Exact: [1024, 561, 1, 29000] + - Exact: [1024, 574, 1, 29000] + - Exact: [1024, 600, 1, 29000] + - Exact: [1024, 608, 1, 29000] + - Exact: [1024, 615, 1, 29000] + - Exact: [1024, 622, 1, 29000] + - Exact: [1024, 625, 1, 29000] + - Exact: [1024, 626, 1, 29000] + - Exact: [1024, 628, 1, 29000] + - Exact: [1024, 636, 1, 29000] + - Exact: [1024, 651, 1, 29000] + - Exact: [1024, 658, 1, 29000] + - Exact: [1024, 669, 1, 29000] + - Exact: [1024, 670, 1, 29000] + - Exact: [1024, 672, 1, 29000] + - Exact: [1024, 684, 1, 29000] + - Exact: [1024, 716, 1, 29000] + - Exact: [1024, 730, 1, 29000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [256, 1856, 1, 1280] + - 
Exact: [256, 2944, 1, 3328] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 256, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [2368, 128, 1, 256] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [1024, 704, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [3584, 256, 1, 3328] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [448, 2368, 1, 1280] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 256] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [256, 2944, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [704, 448, 1, 3328] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 
1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [448, 2368, 1, 3328] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [256, 4288, 1, 1280] + - Exact: [4288, 128, 1, 256] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [704, 1408, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [2944, 128, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [704, 448, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [256, 2368, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [1408, 448, 1, 256] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - 
Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 6784, 1, 256] + - Exact: [704, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [448, 1856, 1, 256] + - Exact: [1856, 704, 1, 3328] + - Exact: [64, 193600, 1, 64] + - Exact: [1024, 700, 1, 512] + - Exact: [2560, 128, 1, 2560] + - Exact: [64, 193600, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [512, 1500, 1, 2816] + - Exact: [3072, 128, 1, 1024] + - Exact: [7680, 64, 1, 2560] + - Exact: [7680, 128, 1, 2560] + - Exact: [512, 1500, 1, 2560] + - Exact: [1024, 1024, 1, 1024] + - Exact: [512, 1500, 1, 2048] + - Exact: [512, 1500, 1, 1536] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 256] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [512, 1600, 1, 32] + - Exact: [512, 1600, 1, 512] + - Exact: [560, 1600, 1, 1024] + - Exact: [1024, 512, 1, 1] + - Exact: [1024, 512, 1, 64] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 960, 1, 64] + - Exact: [1024, 960, 1, 1024] + - Exact: [1600, 512, 1, 1024] + - Exact: [2048, 512, 1, 1] + - Exact: [2048, 512, 1, 2048] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 
256] + - Exact: [1225, 64, 64, 288] + - Exact: [5329, 80, 64, 64] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 2048] + - Exact: [64, 448, 32, 2048] + - Exact: [1225, 64, 32, 192] + - Exact: [1225, 64, 32, 256] + - Exact: [1225, 64, 32, 288] + - Exact: [5329, 80, 32, 64] + - Exact: [289, 128, 32, 768] + - Exact: [289, 160, 32, 768] + - Exact: [289, 192, 32, 768] + - Exact: [3136, 64, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [960, 1024, 1, 1024] + - Exact: [64, 512, 16, 512] + - Exact: [64, 512, 128, 512] + - Exact: [1024, 512, 1, 2] + - Exact: [1024, 512, 1, 4096] + - Exact: [1024, 616, 1, 1024] + - Exact: [64, 128, 128, 128] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 2] + - Exact: [1024, 1024, 1, 4096] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 780, 1, 1024] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 800, 1, 1024] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 820, 1, 1024] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 385, 1, 1024] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 128, 144, 128] + - Exact: [64, 1024, 32, 1024] + - Exact: [96, 1024, 64, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 40, 512] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 512] + - Exact: [256, 3456, 1, 128] + - Exact: [256, 4096, 1, 128] + - Exact: [480, 864, 1, 1024] + - Exact: [512, 864, 1, 256] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [256, 4096, 1, 1] + - Exact: [12544, 64, 1, 147] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [64, 128, 192, 
128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [3400, 256, 1, 1024] + - Exact: [3800, 256, 1, 1024] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [51520, 64, 2, 256] + - Exact: [46464, 64, 2, 256] + - Exact: [49152, 64, 2, 256] + - Exact: [1536, 512, 1, 1024] + - Exact: [1728, 512, 1, 1024] + - Exact: [1024, 1024, 1, 320] + - Exact: [51520, 64, 2, 64] + - Exact: [55296, 64, 2, 64] + - Exact: [49152, 64, 2, 64] + - Exact: [54400, 64, 2, 64] + - Exact: [42240, 64, 2, 256] + - Exact: [672, 512, 2, 2048] + - Exact: [54400, 64, 2, 256] + - Exact: [56832, 64, 2, 256] + - Exact: [55296, 64, 2, 256] + - Exact: [60800, 64, 2, 64] + - Exact: [768, 512, 2, 2048] + - Exact: [43008, 64, 2, 256] + - Exact: [864, 256, 2, 2048] + - Exact: [768, 256, 2, 2048] + - Exact: [45632, 64, 2, 256] + - Exact: [60800, 64, 2, 256] + - Exact: [1024, 1024, 1, 81] + - Exact: [950, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [950, 256, 2, 2048] + - Exact: [1900, 512, 1, 1024] + - Exact: [1700, 512, 1, 1024] + - Exact: [1610, 512, 1, 1024] + - Exact: [660, 512, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [713, 512, 2, 2048] + - Exact: [805, 256, 2, 2048] + - Exact: [850, 256, 2, 2048] + - Exact: [100, 128, 120, 512] + - Exact: [100, 128, 139, 512] + - Exact: [100, 128, 160, 512] + - Exact: [22500, 64, 1, 147] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - 
EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 77, 1, 30522] + - Exact: [1024, 200, 1, 30522] + - Exact: [1024, 160, 1, 30522] + - Exact: [1024, 180, 1, 30522] + - Exact: [1024, 160, 1, 30528] + - Exact: [1024, 240, 1, 30528] + - Exact: [2560, 109, 1, 29000] + - Exact: [2560, 121, 1, 29000] + - Exact: [2560, 65, 1, 29000] + - Exact: [2560, 66, 1, 29000] + - Exact: [2560, 67, 1, 29000] + - Exact: [2560, 69, 1, 29000] + - Exact: [2560, 70, 1, 29000] + - Exact: [2560, 71, 1, 29000] + - Exact: [2560, 73, 1, 29000] + - Exact: [2560, 74, 1, 29000] + - Exact: [2560, 75, 1, 29000] + - Exact: [2560, 77, 1, 29000] + - Exact: [2560, 78, 1, 29000] + - Exact: [2560, 80, 1, 29000] + - Exact: [2560, 81, 1, 29000] + - Exact: [2560, 82, 1, 29000] + - Exact: [2560, 83, 1, 29000] + - Exact: [2560, 84, 1, 29000] + - Exact: [2560, 88, 1, 29000] + - Exact: [2560, 89, 1, 29000] + - Exact: [2560, 90, 1, 29000] + - Exact: [2560, 92, 1, 29000] + - Exact: [2560, 95, 1, 29000] + - Exact: [2560, 98, 1, 29000] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 
16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 64, 1, 3328] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [1024, 256, 1, 3328] + - Exact: [704, 128, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [256, 448, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [704, 256, 1, 256] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: 
[1024, 128, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [1024, 128, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [704, 256, 1, 3328] + - Exact: [1856, 128, 1, 256] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [2560, 32, 1, 2560] + - Exact: [2048, 128, 1, 2048] + - Exact: [4608, 32, 1, 1536] + - Exact: [3072, 64, 1, 1024] + - Exact: [128, 1500, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [4096, 64, 1, 4096] + - Exact: [7680, 32, 1, 2560] + - Exact: [2560, 64, 1, 2560] + - Exact: [3072, 32, 1, 1024] + - Exact: [6144, 32, 1, 2560] + - Exact: [176, 1500, 1, 1408] + - Exact: [2048, 64, 1, 2048] + - Exact: [8448, 32, 1, 2816] + - Exact: [512, 512, 1, 64] + - Exact: [32, 33, 1600, 33] + - Exact: [256, 1024, 1, 1] + - Exact: [257, 1024, 1, 4096] + - Exact: [512, 200, 1, 1] + - Exact: [512, 200, 1, 32] + - Exact: [512, 215, 1, 2048] + - Exact: [512, 256, 1, 2048] + - Exact: [560, 200, 1, 1024] + - Exact: [768, 215, 1, 2048] + - Exact: [768, 256, 1, 2048] + - Exact: [1024, 200, 1, 1] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 32] + - Exact: [64, 35, 4608, 35] + - Exact: [64, 33, 1920, 27] + - Exact: [64, 33, 1920, 33] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - 
Exact: [1225, 48, 64, 288] + - Exact: [1225, 32, 32, 192] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [1225, 48, 32, 288] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [256, 864, 1, 128] + - Exact: [3136, 64, 1, 576] + - Exact: [784, 128, 1, 1152] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 128, 1, 2] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 96, 1, 2] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [100, 128, 18, 512] + - Exact: [100, 128, 19, 512] + - Exact: [1444, 128, 1, 576] + - Exact: [361, 512, 1, 2304] + - Exact: [2560, 35, 1, 29000] + - Exact: [2560, 36, 1, 29000] + - Exact: [2560, 39, 1, 29000] + - Exact: [2560, 40, 1, 29000] + - Exact: [2560, 42, 1, 29000] + - Exact: [2560, 43, 1, 29000] + - Exact: [2560, 44, 1, 29000] + - Exact: [2560, 46, 1, 29000] + - Exact: [2560, 48, 1, 29000] + - Exact: [2560, 49, 1, 29000] + - Exact: [2560, 50, 1, 29000] + - Exact: [2560, 51, 1, 29000] + - Exact: [2560, 53, 1, 29000] + - Exact: [2560, 54, 1, 29000] + - Exact: [2560, 55, 1, 29000] + - Exact: [2560, 56, 1, 29000] + - Exact: [2560, 57, 1, 29000] + - Exact: [2560, 58, 1, 29000] + - Exact: [2560, 59, 1, 29000] + - Exact: [2560, 61, 1, 29000] + - Exact: [2560, 63, 1, 29000] + - Exact: [1909283, 40, 1, 40] + - Exact: [3818566, 40, 1, 40] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - 
KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [7680, 4, 1, 2560] + - Exact: [3072, 16, 1, 1024] + - Exact: [2048, 16, 1, 2048] + - Exact: [3072, 1, 1, 128] + - Exact: [8448, 16, 1, 2816] + - Exact: [7680, 2, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [7680, 1, 1, 2560] + - Exact: [6144, 2, 1, 2560] + - Exact: [1760, 16, 1, 1760] + - Exact: [6144, 4, 1, 2560] + - Exact: [3072, 4, 1, 1024] + - Exact: [2048, 32, 1, 2048] + - Exact: [4608, 16, 1, 1536] + - Exact: [3072, 2, 1, 1024] + - Exact: [8448, 1, 1, 2816] + - Exact: [6144, 1, 1, 2560] + - Exact: [4608, 1, 1, 1536] + - Exact: [8448, 4, 1, 2816] + - Exact: [4608, 2, 1, 1536] + - Exact: [2560, 16, 1, 2560] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [7680, 16, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [8448, 2, 1, 2816] + - Exact: [4608, 4, 1, 1536] + - Exact: [2048, 2, 1, 2048] + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 4, 1, 2] + - Exact: [2560, 4, 1, 2560] + - Exact: [2048, 1, 1, 512] + - Exact: [12288, 12, 2, 256] + - Exact: [12288, 3, 2, 256] + - Exact: [51520, 12, 2, 256] + - Exact: [51520, 3, 2, 256] + - Exact: [15200, 12, 2, 256] + - Exact: [15200, 3, 2, 256] + - Exact: [3456, 3, 2, 256] + - Exact: [13600, 12, 2, 256] + - Exact: [12880, 3, 2, 256] + - Exact: [3400, 3, 2, 256] + - Exact: [12880, 12, 2, 256] + - Exact: [13824, 12, 2, 256] + - Exact: [13824, 
3, 2, 256] + - Exact: [13600, 3, 2, 256] + - Exact: [3456, 12, 2, 256] + - Exact: [3800, 3, 2, 256] + - Exact: [3400, 12, 2, 256] + - Exact: [3800, 12, 2, 256] + - Exact: [55296, 3, 2, 256] + - Exact: [3072, 3, 2, 256] + - Exact: [3072, 12, 2, 256] + - Exact: [54400, 3, 2, 256] + - Exact: [60800, 12, 2, 256] + - Exact: [60800, 3, 2, 256] + - Exact: [3220, 3, 2, 256] + - Exact: [3220, 12, 2, 256] + - Exact: [2048, 8, 1, 2] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2] + - Exact: [2560, 2, 1, 2560] + - Exact: [2560, 27, 1, 29000] + - Exact: [1909283, 11, 1, 11] + - Exact: [3818566, 11, 1, 11] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [512, 4, 1, 500000] + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [49, 512, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: 
[True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 64, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [448, 64, 1, 3328] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 256, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [1, 1, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [256, 64, 1, 256] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 256, 1, 3328] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 
3328] + - Exact: [64, 256, 1, 256] + - Exact: [1, 1, 1, 256] + - Exact: [128, 448, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 16, 1, 512] + - Exact: [128, 1, 1, 1408] + - Exact: [64, 1, 1, 1216] + - Exact: [1024, 2, 1, 512] + - Exact: [512, 1, 1, 512] + - Exact: [1024, 4, 1, 512] + - Exact: [512, 4, 1, 512] + - Exact: [1024, 32, 1, 512] + - Exact: [512, 2, 1, 512] + - Exact: [1024, 1, 1, 512] + - Exact: [512, 32, 1, 512] + - Exact: [128, 1, 1, 1024] + - Exact: [64, 14, 1, 14] + - Exact: [64, 14, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 21] + - Exact: [64, 21, 1, 21] + - Exact: [64, 24, 1, 24] + - Exact: [64, 24, 1, 34] + - Exact: [64, 30, 1, 30] + - Exact: [64, 31, 1, 30] + - Exact: [64, 31, 1, 31] + - Exact: [64, 32, 1, 32] + - Exact: [64, 34, 1, 34] + - Exact: [64, 35, 1, 32] + - Exact: [64, 35, 1, 35] + - Exact: [64, 512, 1, 512] + - Exact: [1024, 4, 1, 2] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 32, 1, 2] + - Exact: [1024, 32, 1, 1024] + - Exact: [32, 200, 1, 1] + - Exact: [64, 3, 512, 3] + - Exact: [64, 5, 512, 5] + - Exact: [64, 5, 960, 5] + - Exact: [64, 9, 512, 9] + - Exact: [64, 512, 1, 1] + - Exact: [67, 512, 1, 2048] + - Exact: [74, 512, 1, 2048] + - Exact: [74, 960, 1, 2048] + - Exact: [100, 512, 1, 2048] + - Exact: [128, 27, 32768, 27] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 14, 10880, 15] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 15, 7680, 17] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 17, 6144, 21] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 24, 4736, 34] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 31, 2048, 30] + - Exact: [64, 31, 2048, 31] + - Exact: [64, 27, 1920, 27] + - Exact: [1024, 8, 1, 1024] + - Exact: [1024, 77, 1, 1024] + - Exact: [1024, 10, 1, 2] + - Exact: 
[1024, 10, 1, 1024] + - Exact: [1024, 39, 1, 2] + - Exact: [1024, 39, 1, 1024] + - Exact: [1024, 40, 1, 2] + - Exact: [1024, 40, 1, 1024] + - Exact: [1024, 41, 1, 2] + - Exact: [1024, 41, 1, 1024] + - Exact: [1024, 5, 1, 2] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 6, 1, 2] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 8, 1, 2] + - Exact: [1024, 9, 1, 2] + - Exact: [1024, 9, 1, 1024] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [128, 128, 1, 64] + - Exact: [64, 128, 1, 128] + - Exact: [64, 5, 1, 5] + - Exact: [32, 33, 1, 33] + - Exact: [1024, 16, 1, 2] + - Exact: [1024, 16, 1, 1024] + - Exact: [1024, 1, 1, 2] + - Exact: [1024, 1, 1, 1024] + - Exact: [1024, 1, 1, 200] + - Exact: [1024, 1, 1, 1600] + - Exact: [1024, 64, 1, 2] + - Exact: [1024, 64, 1, 1024] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 80, 1, 2] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 82, 1, 2] + - Exact: [1024, 12, 1, 1024] + - Exact: [1024, 12, 1, 2] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [196, 256, 1, 2304] + - Exact: [768, 3, 2, 256] + - Exact: [768, 12, 2, 256] + - Exact: [864, 12, 2, 256] + - Exact: [864, 3, 2, 256] + - Exact: [216, 3, 2, 256] + - Exact: [176, 12, 2, 256] + - Exact: [176, 3, 2, 256] + - Exact: [192, 12, 2, 256] + - Exact: [192, 3, 2, 256] + - Exact: [216, 12, 2, 256] + - Exact: [850, 3, 2, 256] + - Exact: [850, 12, 2, 256] + - Exact: [805, 12, 2, 256] + - Exact: [805, 3, 2, 256] + - Exact: [247, 3, 2, 256] + - Exact: [950, 3, 2, 256] + - Exact: [187, 12, 2, 256] + - Exact: [247, 12, 2, 256] + - Exact: [187, 3, 2, 256] + - Exact: [228, 12, 2, 256] + - Exact: [221, 12, 2, 256] + - Exact: [950, 12, 2, 256] + - Exact: [228, 3, 2, 256] + - Exact: [221, 3, 2, 256] + - Exact: [25, 128, 120, 256] + - Exact: [25, 128, 139, 256] + - Exact: [25, 128, 160, 256] + - Exact: [25, 128, 18, 256] + - Exact: [25, 128, 19, 256] + - Exact: [9, 128, 120, 256] + - Exact: [9, 128, 139, 256] + - Exact: [9, 128, 160, 256] + - 
Exact: [9, 128, 18, 256] + - Exact: [9, 128, 19, 256] + - Exact: [100, 512, 1, 2304] + - Exact: [25, 256, 1, 1152] + - Exact: [9, 256, 1, 1152] + - Exact: [1024, 20, 1, 1024] + - Exact: [1024, 20, 1, 2] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml new file mode 100644 index 000000000..2cede9248 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml @@ -0,0 +1,1632 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - 
StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4096, 7133, 1, 4096] + - Exact: [2048, 7133, 1, 2048] + - Exact: [2560, 7133, 1, 2560] + - Exact: [3072, 7435, 1, 1024] + - Exact: [1760, 7133, 1, 1760] + - Exact: [7680, 5481, 1, 2560] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 128, 128] + - Exact: [784, 512, 256, 128] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 256, 1024] + - Exact: [3136, 256, 128, 64] + - Exact: [1024, 4096, 1, 2048] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 30528, 1, 2048] + - Exact: [1024, 30528, 1, 4096] + - Exact: [4096, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 4096] + - Exact: [256, 4864, 1, 8976] + - Exact: [256, 5120, 1, 8976] + - Exact: [256, 5632, 1, 8976] + - Exact: [256, 5888, 1, 8976] + - Exact: [256, 6144, 1, 8976] + - Exact: [256, 7168, 1, 8976] + - Exact: [256, 8192, 1, 8976] + - Exact: [256, 8960, 1, 8976] + - Exact: [256, 9728, 1, 8976] + - Exact: [256, 9984, 1, 8976] + - Exact: [256, 10240, 1, 8976] + - Exact: [256, 10496, 1, 8976] + - Exact: [256, 11008, 1, 8976] + - Exact: [256, 11264, 1, 8976] + - Exact: [256, 11520, 1, 8976] + - Exact: [256, 11776, 1, 8976] + - Exact: [256, 12544, 1, 8976] + - Exact: [256, 12800, 1, 8976] + - Exact: [256, 13312, 1, 8976] + - Exact: [256, 13568, 1, 8976] + - Exact: [256, 14336, 1, 8976] + - Exact: [256, 14848, 1, 8976] + - Exact: [256, 15104, 1, 8976] + - Exact: [256, 15872, 1, 8976] + - Exact: [256, 16128, 1, 8976] + - Exact: [256, 17152, 1, 8976] + - Exact: [256, 17408, 1, 8976] + - Exact: [256, 18688, 1, 8976] + - 
Exact: [256, 19968, 1, 8976] + - Exact: [256, 20480, 1, 8976] + - Exact: [256, 20992, 1, 8976] + - Exact: [256, 21248, 1, 8976] + - Exact: [256, 22016, 1, 8976] + - Exact: [256, 26112, 1, 8976] + - Exact: [256, 32512, 1, 8976] + - Exact: [256, 32768, 1, 1] + - Exact: [256, 33536, 1, 8976] + - Exact: [256, 44505, 1, 8976] + - Exact: [768, 2048, 1, 256] + - Exact: [1600, 1024, 1, 512] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 960, 1, 1] + - Exact: [2048, 2048, 1, 512] + - Exact: [2048, 2048, 1, 960] + - Exact: [2048, 2048, 1, 1024] + - Exact: [3200, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 3840] + - Exact: [1024, 4096, 1, 3968] + - Exact: [1024, 4096, 1, 6528] + - Exact: [1024, 4096, 1, 7104] + - Exact: [1024, 4096, 1, 7200] + - Exact: [1024, 4096, 1, 8064] + - Exact: [1024, 4096, 1, 8160] + - Exact: [1024, 4096, 1, 9216] + - Exact: [1024, 4096, 1, 9520] + - Exact: [1024, 4096, 1, 10064] + - Exact: [1024, 4096, 1, 10080] + - Exact: [1024, 4096, 1, 10200] + - Exact: [1024, 42720, 1, 3968] + - Exact: [1024, 42720, 1, 6528] + - Exact: [1024, 42720, 1, 7104] + - Exact: [1024, 42720, 1, 7200] + - Exact: [1024, 42720, 1, 9520] + - Exact: [1024, 42720, 1, 10080] + - Exact: [4096, 1024, 1, 3840] + - Exact: [4096, 1024, 1, 3968] + - Exact: [4096, 1024, 1, 6528] + - Exact: [4096, 1024, 1, 7104] + - Exact: [4096, 1024, 1, 7200] + - Exact: [4096, 1024, 1, 8064] + - Exact: [4096, 1024, 1, 8160] + - Exact: [4096, 1024, 1, 9216] + - Exact: [4096, 1024, 1, 9520] + - Exact: [4096, 1024, 1, 10064] + - Exact: [4096, 1024, 1, 10080] + - Exact: [4096, 1024, 1, 10200] + - Exact: [1024, 4096, 1, 3240] + - Exact: [1024, 4096, 1, 3960] + - Exact: [1024, 42720, 1, 3960] + - Exact: [4096, 1024, 1, 3240] + - Exact: [4096, 1024, 1, 3960] + - Exact: [1225, 192, 64, 32] + - Exact: [1225, 192, 64, 48] + - Exact: [1225, 192, 64, 64] + - Exact: [1225, 256, 64, 48] + - Exact: [1225, 256, 64, 64] + - Exact: [1225, 288, 64, 48] + - Exact: [1225, 288, 64, 64] + - 
Exact: [289, 768, 64, 128] + - Exact: [289, 768, 64, 160] + - Exact: [289, 768, 64, 192] + - Exact: [1225, 192, 32, 32] + - Exact: [1225, 192, 32, 48] + - Exact: [1225, 192, 32, 64] + - Exact: [1225, 256, 32, 48] + - Exact: [1225, 256, 32, 64] + - Exact: [1225, 288, 32, 48] + - Exact: [1225, 288, 32, 64] + - Exact: [289, 768, 32, 128] + - Exact: [289, 768, 32, 160] + - Exact: [289, 768, 32, 192] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 128, 32, 512] + - Exact: [784, 512, 32, 128] + - Exact: [196, 1024, 32, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [784, 256, 64, 512] + - Exact: [3136, 256, 64, 128] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 512, 64, 1024] + - Exact: [784, 512, 64, 256] + - Exact: [784, 512, 64, 512] + - Exact: [196, 1024, 64, 512] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 256] + - Exact: [784, 256, 32, 512] + - Exact: [3136, 256, 32, 128] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 512, 32, 1024] + - Exact: [784, 512, 32, 256] + - Exact: [784, 512, 32, 512] + - Exact: [196, 1024, 32, 512] + - Exact: [196, 1024, 32, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 30522, 1, 77] + - Exact: [4096, 1024, 1, 512] + - Exact: [1024, 4096, 1, 1280] + - Exact: [1024, 30522, 1, 200] + - Exact: [4096, 1024, 1, 1280] + - Exact: [1024, 4096, 1, 4992] + - Exact: [1024, 30522, 1, 780] + - Exact: [4096, 1024, 1, 4992] + - Exact: [1024, 30522, 1, 308] + - Exact: [1024, 4096, 1, 5120] + - Exact: [1024, 30522, 1, 800] + - Exact: [4096, 1024, 1, 5120] + - Exact: [1024, 4096, 1, 5248] + - Exact: [1024, 30522, 1, 820] + - Exact: [4096, 1024, 1, 5248] + - Exact: [1024, 4096, 1, 2560] + - Exact: [1024, 30522, 1, 385] + - Exact: 
[4096, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1024, 30522, 1, 462] + - Exact: [4096, 1024, 1, 3072] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 30522, 1, 160] + - Exact: [4096, 1024, 1, 1024] + - Exact: [1024, 4096, 1, 1152] + - Exact: [1024, 30522, 1, 180] + - Exact: [4096, 1024, 1, 1152] + - Exact: [1024, 4096, 1, 8192] + - Exact: [1024, 4096, 1, 9600] + - Exact: [1024, 33712, 1, 8192] + - Exact: [1024, 33712, 1, 9600] + - Exact: [4096, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 9600] + - Exact: [1024, 1600, 1, 1] + - Exact: [2560, 1920, 1, 2048] + - Exact: [1024, 3072, 1, 4096] + - Exact: [2560, 2560, 1, 2048] + - Exact: [2048, 2048, 1, 2] + - Exact: [1024, 30592, 1, 2048] + - Exact: [1024, 3072, 1, 16384] + - Exact: [6144, 1536, 1, 4096] + - Exact: [1536, 4608, 1, 8192] + - Exact: [640, 2560, 1, 2048] + - Exact: [1024, 4096, 1, 16384] + - Exact: [1536, 6144, 1, 4096] + - Exact: [1024, 30592, 1, 4096] + - Exact: [2560, 2560, 1, 4] + - Exact: [1536, 1536, 1, 4096] + - Exact: [2560, 7680, 1, 2048] + - Exact: [1536, 50304, 1, 4096] + - Exact: [2048, 8192, 1, 1024] + - Exact: [1024, 30592, 1, 8192] + - Exact: [4096, 1024, 1, 16384] + - Exact: [8192, 2048, 1, 1024] + - Exact: [1024, 50304, 1, 4096] + - Exact: [1536, 4608, 1, 4096] + - Exact: [6144, 1536, 1, 8192] + - Exact: [1024, 3072, 1, 8192] + - Exact: [1536, 1536, 1, 8192] + - Exact: [1536, 50304, 1, 8192] + - Exact: [2048, 6144, 1, 1024] + - Exact: [2048, 30592, 1, 1024] + - Exact: [1536, 6144, 1, 8192] + - Exact: [1024, 50304, 1, 2048] + - Exact: [1024, 50304, 1, 8192] + - Exact: [1024, 3072, 1, 2048] + - Exact: [1024, 50304, 1, 16384] + - Exact: [1024, 30528, 1, 8192] + - Exact: [256, 6912, 1, 1] + - Exact: [30528, 1024, 1, 640] + - Exact: [30528, 1024, 1, 1280] + - Exact: [4096, 1024, 1, 10240] + - Exact: [1024, 4096, 1, 10240] + - Exact: [30528, 1024, 1, 1600] + - Exact: [1024, 4096, 1, 10496] + - Exact: [30528, 1024, 1, 1640] + - Exact: [4096, 1024, 1, 10496] + - Exact: [30528, 
1024, 1, 160] + - Exact: [1024, 4096, 1, 6144] + - Exact: [30528, 1024, 1, 240] + - Exact: [4096, 1024, 1, 6144] + - Exact: [1024, 4096, 1, 10224] + - Exact: [4096, 1024, 1, 10224] + - Exact: [1024, 3072, 1, 10224] + - Exact: [1024, 3072, 1, 10240] + - Exact: [4096, 1024, 1, 10192] + - Exact: [1024, 3072, 1, 10192] + - Exact: [1024, 4096, 1, 10192] + - Exact: [1024, 3072, 1, 10200] + - Exact: [4096, 1024, 1, 10208] + - Exact: [1024, 3072, 1, 10208] + - Exact: [1024, 4096, 1, 10208] + - Exact: [1024, 2048, 1, 10224] + - Exact: [1024, 2048, 1, 10240] + - Exact: [1024, 2048, 1, 10192] + - Exact: [1024, 3072, 1, 10080] + - Exact: [100352, 256, 1, 512] + - Exact: [12544, 1024, 1, 2048] + - Exact: [12544, 147, 1, 64] + - Exact: [200704, 256, 1, 512] + - Exact: [25088, 512, 1, 1024] + - Exact: [3136, 576, 1, 64] + - Exact: [50176, 512, 1, 1024] + - Exact: [6272, 1024, 1, 2048] + - Exact: [3136, 256, 128, 128] + - Exact: [3136, 256, 256, 128] + - Exact: [784, 512, 128, 256] + - Exact: [784, 512, 256, 256] + - Exact: [30528, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 12288] + - Exact: [30528, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [25600, 128, 25, 128] + - Exact: [12544, 128, 36, 128] + - Exact: [9216, 128, 49, 128] + - Exact: [6400, 128, 64, 128] + - Exact: [6400, 256, 25, 256] + - Exact: [4096, 256, 36, 256] + - Exact: [2304, 256, 49, 256] + - Exact: [2304, 256, 64, 256] + - Exact: [2304, 512, 25, 512] + - Exact: [1024, 512, 36, 512] + - Exact: [1024, 512, 49, 512] + - Exact: [1024, 512, 64, 512] + - Exact: [3072, 768, 1, 2048] + - Exact: [768, 3072, 1, 2048] + - Exact: [3072, 768, 1, 4608] + - Exact: [768, 3072, 1, 4608] + - Exact: [4096, 1024, 1, 4608] + - Exact: [1024, 4096, 1, 4608] + - Exact: [196, 1024, 128, 512] + - Exact: [196, 1024, 256, 512] + - Exact: [4880, 256, 49, 256] + - Exact: [3128, 256, 64, 256] + - Exact: [4680, 256, 49, 256] + - Exact: [5280, 256, 36, 256] + - Exact: [2640, 256, 64, 256] + - Exact: [5304, 256, 49, 256] + - Exact: 
[2760, 256, 64, 256] + - Exact: [6440, 256, 36, 256] + - Exact: [5704, 256, 36, 256] + - Exact: [2128, 256, 64, 256] + - Exact: [1160, 256, 49, 256] + - Exact: [4056, 256, 49, 256] + - Exact: [6144, 256, 36, 256] + - Exact: [6336, 256, 36, 256] + - Exact: [13600, 512, 2, 128] + - Exact: [15200, 512, 2, 128] + - Exact: [15200, 128, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [5632, 256, 36, 256] + - Exact: [12288, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [11408, 128, 2, 512] + - Exact: [13824, 512, 2, 128] + - Exact: [13824, 128, 2, 512] + - Exact: [10560, 128, 2, 512] + - Exact: [10752, 128, 2, 512] + - Exact: [13600, 512, 2, 256] + - Exact: [15200, 512, 2, 256] + - Exact: [768, 2048, 2, 512] + - Exact: [12880, 512, 2, 128] + - Exact: [11616, 128, 2, 512] + - Exact: [14208, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [6912, 256, 36, 256] + - Exact: [13824, 512, 2, 256] + - Exact: [11616, 512, 2, 128] + - Exact: [12288, 512, 2, 128] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [10560, 512, 2, 128] + - Exact: [672, 2048, 2, 512] + - Exact: [9408, 128, 2, 512] + - Exact: [10752, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [1240, 256, 49, 256] + - Exact: [4032, 256, 2, 1024] + - Exact: [888, 2048, 2, 512] + - Exact: [12880, 512, 2, 256] + - Exact: [12288, 512, 2, 256] + - Exact: [13440, 128, 2, 512] + - Exact: [864, 2048, 2, 256] + - Exact: [12672, 128, 2, 512] + - Exact: [11264, 128, 2, 512] + - Exact: [11776, 128, 2, 512] + - Exact: [16128, 128, 2, 512] + - Exact: [4032, 1024, 2, 256] + - Exact: [14000, 128, 2, 512] + - Exact: [13440, 512, 2, 128] + - Exact: [768, 2048, 2, 256] + - Exact: [3264, 1024, 2, 256] + - Exact: [4200, 256, 2, 1024] + - Exact: [2352, 1024, 2, 256] + - Exact: [2400, 1024, 2, 256] + - Exact: [15200, 256, 2, 12] + - Exact: [12880, 256, 2, 12] + - Exact: [2520, 1024, 2, 256] + - Exact: [13600, 256, 2, 12] + - Exact: [15200, 256, 2, 3] + - 
Exact: [12880, 256, 2, 3] + - Exact: [4200, 1024, 2, 256] + - Exact: [12288, 256, 2, 12] + - Exact: [13824, 256, 2, 12] + - Exact: [13600, 256, 2, 3] + - Exact: [7600, 512, 1, 256] + - Exact: [6144, 512, 1, 256] + - Exact: [12544, 1024, 1, 1024] + - Exact: [3800, 256, 2, 3] + - Exact: [13824, 256, 2, 3] + - Exact: [12288, 256, 2, 3] + - Exact: [2688, 256, 2, 1024] + - Exact: [3072, 256, 2, 12] + - Exact: [3800, 256, 2, 12] + - Exact: [3072, 256, 2, 3] + - Exact: [2520, 256, 2, 1024] + - Exact: [16128, 512, 2, 128] + - Exact: [2400, 256, 2, 1024] + - Exact: [2352, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2992, 1024, 2, 256] + - Exact: [2816, 256, 2, 1024] + - Exact: [2904, 1024, 2, 256] + - Exact: [3456, 256, 2, 3] + - Exact: [3400, 256, 2, 3] + - Exact: [2816, 1024, 2, 256] + - Exact: [3456, 256, 2, 12] + - Exact: [2944, 1024, 2, 256] + - Exact: [3168, 256, 2, 1024] + - Exact: [2992, 256, 2, 1024] + - Exact: [51520, 256, 2, 12] + - Exact: [3072, 256, 2, 1024] + - Exact: [2640, 1024, 2, 256] + - Exact: [2688, 1024, 2, 256] + - Exact: [2904, 256, 2, 1024] + - Exact: [3264, 256, 2, 1024] + - Exact: [54400, 256, 2, 12] + - Exact: [55296, 256, 2, 3] + - Exact: [60800, 256, 2, 12] + - Exact: [51520, 256, 2, 3] + - Exact: [55296, 256, 2, 12] + - Exact: [3600, 1024, 2, 256] + - Exact: [60800, 256, 2, 3] + - Exact: [952, 256, 64, 256] + - Exact: [49152, 256, 2, 12] + - Exact: [3360, 256, 2, 1024] + - Exact: [736, 256, 64, 256] + - Exact: [600, 256, 64, 256] + - Exact: [1440, 256, 49, 256] + - Exact: [3168, 1024, 2, 256] + - Exact: [1368, 256, 49, 256] + - Exact: [49152, 256, 2, 3] + - Exact: [3600, 256, 2, 1024] + - Exact: [3360, 1024, 2, 256] + - Exact: [54400, 256, 2, 3] + - Exact: [3072, 1024, 2, 256] + - Exact: [2640, 256, 2, 1024] + - Exact: [616, 256, 64, 256] + - Exact: [3008, 256, 64, 256] + - Exact: [896, 256, 64, 256] + - Exact: [768, 256, 64, 256] + - Exact: [3552, 256, 2, 1024] + - Exact: [3552, 1024, 2, 256] + - Exact: [800, 256, 64, 256] + - 
Exact: [1120, 256, 49, 256] + - Exact: [2408, 256, 64, 256] + - Exact: [3456, 256, 2, 1024] + - Exact: [672, 256, 64, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [1064, 256, 49, 256] + - Exact: [3400, 256, 2, 1024] + - Exact: [704, 256, 64, 256] + - Exact: [3400, 1024, 2, 256] + - Exact: [3264, 256, 64, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [3800, 256, 2, 1024] + - Exact: [6440, 512, 1, 256] + - Exact: [6912, 512, 1, 256] + - Exact: [6800, 512, 1, 256] + - Exact: [6800, 512, 1, 1024] + - Exact: [6440, 512, 1, 1024] + - Exact: [6912, 512, 1, 1024] + - Exact: [1728, 1024, 1, 512] + - Exact: [1536, 1024, 1, 512] + - Exact: [7600, 512, 1, 1024] + - Exact: [6144, 512, 1, 1024] + - Exact: [1728, 1024, 1, 2048] + - Exact: [1536, 1024, 1, 2048] + - Exact: [4524, 256, 49, 256] + - Exact: [2666, 256, 64, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [3220, 1024, 2, 256] + - Exact: [782, 128, 64, 128] + - Exact: [850, 2048, 2, 512] + - Exact: [805, 2048, 2, 512] + - Exact: [713, 2048, 2, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [805, 2048, 2, 256] + - Exact: [1251, 256, 49, 256] + - Exact: [1900, 1024, 1, 2048] + - Exact: [1610, 1024, 1, 2048] + - Exact: [1900, 1024, 1, 512] + - Exact: [3220, 256, 2, 12] + - Exact: [3220, 256, 2, 3] + - Exact: [3036, 1024, 2, 256] + - Exact: [3036, 256, 2, 1024] + - Exact: [850, 2048, 2, 256] + - Exact: [2852, 1024, 2, 256] + - Exact: [950, 2048, 2, 256] + - Exact: [3700, 1024, 2, 256] + - Exact: [2852, 256, 2, 1024] + - Exact: [3700, 256, 2, 1024] + - Exact: [1269, 256, 49, 256] + - Exact: [1467, 256, 49, 256] + - Exact: [3500, 256, 2, 1024] + - Exact: [1449, 256, 49, 256] + - Exact: [1278, 256, 49, 256] + - Exact: [1413, 256, 49, 256] + - Exact: [1341, 256, 49, 256] + - Exact: [1287, 256, 49, 256] + - Exact: [1332, 256, 49, 256] + - Exact: [1359, 256, 49, 256] + - Exact: [1395, 256, 49, 256] + - Exact: [1323, 256, 49, 256] + - Exact: [1404, 256, 49, 256] + - Exact: [1386, 256, 49, 256] + - 
Exact: [1350, 256, 49, 256] + - Exact: [3500, 1024, 2, 256] + - Exact: [3220, 256, 2, 1024] + - Exact: [690, 256, 64, 256] + - Exact: [660, 256, 64, 256] + - Exact: [782, 256, 64, 256] + - Exact: [884, 256, 64, 256] + - Exact: [1610, 1024, 1, 512] + - Exact: [1700, 1024, 1, 512] + - Exact: [1700, 1024, 1, 2048] + - Exact: [1444, 128, 120, 256] + - Exact: [1444, 128, 18, 256] + - Exact: [1444, 128, 19, 256] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 512, 120, 256] + - Exact: [361, 512, 18, 256] + - Exact: [361, 512, 19, 256] + - Exact: [1920, 25216, 1, 16384] + - Exact: [3840, 1920, 1, 16384] + - Exact: [1920, 3840, 1, 16384] + - Exact: [960, 1920, 1, 16384] + - Exact: [1920, 2880, 1, 16384] + - Exact: [1920, 25216, 1, 4096] + - Exact: [3840, 1920, 1, 4096] + - Exact: [1920, 3840, 1, 4096] + - Exact: [960, 1920, 1, 4096] + - Exact: [1920, 2880, 1, 4096] + - Exact: [1920, 25216, 1, 8192] + - Exact: [3840, 1920, 1, 8192] + - Exact: [1920, 3840, 1, 8192] + - Exact: [960, 1920, 1, 8192] + - Exact: [1920, 2880, 1, 8192] + - Exact: [2304, 12672, 1, 16384] + - Exact: [2304, 2304, 1, 16384] + - Exact: [576, 2304, 1, 16384] + - Exact: [2304, 1728, 1, 16384] + - Exact: [2304, 12672, 1, 4096] + - Exact: [2304, 2304, 1, 4096] + - Exact: [576, 2304, 1, 4096] + - Exact: [2304, 1728, 1, 4096] + - Exact: [2304, 12672, 1, 8192] + - Exact: [2304, 2304, 1, 8192] + - Exact: [576, 2304, 1, 8192] + - Exact: [2304, 1728, 1, 8192] + - Exact: [3072, 6400, 1, 4096] + - Exact: [1536, 3072, 1, 4096] + - Exact: [3072, 1536, 1, 4096] + - Exact: [384, 3072, 1, 4096] + - Exact: [3072, 1152, 1, 4096] + - Exact: [3072, 6400, 1, 8192] + - Exact: [1536, 3072, 1, 8192] + - Exact: [3072, 1536, 1, 8192] + - Exact: [384, 3072, 1, 8192] + - Exact: [3072, 1152, 1, 8192] + - Exact: [2048, 2048, 1, 4096] + - Exact: [2048, 2048, 1, 8] + - Exact: [2048, 29000, 1, 199] + - Exact: [2048, 29000, 1, 221] + - Exact: [2048, 29000, 1, 224] + - 
Exact: [2048, 29000, 1, 229] + - Exact: [2048, 29000, 1, 234] + - Exact: [2048, 29000, 1, 242] + - Exact: [2048, 29000, 1, 246] + - Exact: [2048, 29000, 1, 247] + - Exact: [2048, 29000, 1, 256] + - Exact: [2048, 29000, 1, 262] + - Exact: [2048, 29000, 1, 264] + - Exact: [2048, 29000, 1, 265] + - Exact: [2048, 29000, 1, 274] + - Exact: [2048, 29000, 1, 277] + - Exact: [2048, 29000, 1, 279] + - Exact: [2048, 29000, 1, 288] + - Exact: [2048, 29000, 1, 296] + - Exact: [2048, 29000, 1, 315] + - Exact: [2048, 29000, 1, 335] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 4096] + - Exact: [1024, 29000, 1, 2283] + - Exact: [1024, 29000, 1, 2296] + - Exact: [1024, 29000, 1, 2306] + - Exact: [1024, 29000, 1, 2309] + - Exact: [1024, 29000, 1, 2318] + - Exact: [1024, 29000, 1, 2320] + - Exact: [1024, 29000, 1, 2324] + - Exact: [1024, 29000, 1, 2325] + - Exact: [1024, 29000, 1, 2329] + - Exact: [1024, 29000, 1, 2338] + - Exact: [1024, 29000, 1, 2345] + - Exact: [1024, 29000, 1, 2350] + - Exact: [1024, 29000, 1, 2362] + - Exact: [1024, 29000, 1, 2366] + - Exact: [1024, 29000, 1, 2368] + - Exact: [1024, 29000, 1, 2374] + - Exact: [1024, 29000, 1, 2390] + - Exact: [1024, 29000, 1, 561] + - Exact: [1024, 29000, 1, 574] + - Exact: [1024, 29000, 1, 600] + - Exact: [1024, 29000, 1, 608] + - Exact: [1024, 29000, 1, 615] + - Exact: [1024, 29000, 1, 622] + - Exact: [1024, 29000, 1, 625] + - Exact: [1024, 29000, 1, 626] + - Exact: [1024, 29000, 1, 628] + - Exact: [1024, 29000, 1, 636] + - Exact: [1024, 29000, 1, 651] + - Exact: [1024, 29000, 1, 658] + - Exact: [1024, 29000, 1, 669] + - Exact: [1024, 29000, 1, 670] + - Exact: [1024, 29000, 1, 672] + - Exact: [1024, 29000, 1, 684] + - Exact: [1024, 29000, 1, 716] + - Exact: [1024, 29000, 1, 730] + - Exact: [2560, 2560, 1, 1024] + - Exact: [2560, 2560, 1, 2] + - Exact: [2560, 29000, 1, 109] + - Exact: [2560, 29000, 1, 121] + - Exact: [2560, 29000, 1, 27] + - Exact: [2560, 29000, 1, 35] + - Exact: [2560, 29000, 1, 36] + - Exact: 
[2560, 29000, 1, 39] + - Exact: [2560, 29000, 1, 40] + - Exact: [2560, 29000, 1, 42] + - Exact: [2560, 29000, 1, 43] + - Exact: [2560, 29000, 1, 44] + - Exact: [2560, 29000, 1, 46] + - Exact: [2560, 29000, 1, 48] + - Exact: [2560, 29000, 1, 49] + - Exact: [2560, 29000, 1, 50] + - Exact: [2560, 29000, 1, 51] + - Exact: [2560, 29000, 1, 53] + - Exact: [2560, 29000, 1, 54] + - Exact: [2560, 29000, 1, 55] + - Exact: [2560, 29000, 1, 56] + - Exact: [2560, 29000, 1, 57] + - Exact: [2560, 29000, 1, 58] + - Exact: [2560, 29000, 1, 59] + - Exact: [2560, 29000, 1, 61] + - Exact: [2560, 29000, 1, 63] + - Exact: [2560, 29000, 1, 65] + - Exact: [2560, 29000, 1, 66] + - Exact: [2560, 29000, 1, 67] + - Exact: [2560, 29000, 1, 69] + - Exact: [2560, 29000, 1, 70] + - Exact: [2560, 29000, 1, 71] + - Exact: [2560, 29000, 1, 73] + - Exact: [2560, 29000, 1, 74] + - Exact: [2560, 29000, 1, 75] + - Exact: [2560, 29000, 1, 77] + - Exact: [2560, 29000, 1, 78] + - Exact: [2560, 29000, 1, 80] + - Exact: [2560, 29000, 1, 81] + - Exact: [2560, 29000, 1, 82] + - Exact: [2560, 29000, 1, 83] + - Exact: [2560, 29000, 1, 84] + - Exact: [2560, 29000, 1, 88] + - Exact: [2560, 29000, 1, 89] + - Exact: [2560, 29000, 1, 90] + - Exact: [2560, 29000, 1, 92] + - Exact: [2560, 29000, 1, 95] + - Exact: [2560, 29000, 1, 98] + - Exact: [2560, 4096, 1, 1024] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 32768] + - Exact: [1024, 4096, 1, 32768] + - Exact: [1024, 50304, 1, 32768] + - Exact: [4096, 1024, 1, 32768] + - Exact: [1024, 128, 24, 1024] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - 
DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 2560, 1, 8976] + - Exact: [256, 2816, 1, 8976] + - Exact: [256, 3328, 1, 8976] + - Exact: [256, 3584, 1, 8976] + - Exact: [256, 3840, 1, 8976] + - Exact: [256, 4096, 1, 8976] + - Exact: [256, 4352, 1, 8976] + - Exact: [480, 1024, 1, 32768] + - Exact: [1024, 256, 1, 21248] + - Exact: [1024, 256, 1, 21504] + - Exact: [1024, 256, 1, 22016] + - Exact: [1024, 256, 1, 28672] + - Exact: [1024, 256, 1, 33536] + - Exact: [1024, 512, 1, 32768] + - Exact: [1024, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 9216] + - Exact: [1024, 1024, 1, 9520] + - Exact: [1024, 1024, 1, 10064] + - Exact: [1024, 1024, 1, 10080] + - Exact: [1024, 1024, 1, 10200] + - Exact: [479, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 1024, 1, 9600] + - Exact: [1024, 1024, 1, 16384] + - Exact: [512, 256, 1, 55296] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 10496] + - Exact: [1024, 1024, 1, 10224] + - Exact: [1024, 1024, 1, 10192] + - Exact: [1024, 1024, 1, 10208] + - Exact: [1024, 1024, 1, 10184] + - Exact: [1024, 1024, 1, 10120] + - Exact: [1024, 1024, 1, 10152] + - Exact: [1024, 1024, 1, 12288] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - 
InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 5888, 1, 1280] + - Exact: [64, 5056, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [5888, 64, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [64, 5056, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [64, 6784, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [64, 5888, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [6784, 64, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 1024, 1, 1024] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 256, 64] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [128, 64, 512, 128] + - Exact: [512, 64, 64, 512] + - Exact: [1024, 1024, 1, 4] + - Exact: [1024, 1024, 1, 32] + - Exact: [1024, 1024, 1, 2048] + - Exact: [1024, 1024, 1, 4096] + - Exact: [256, 1280, 1, 8976] + - Exact: [257, 4096, 1, 1024] + - Exact: [512, 2048, 1, 256] + - Exact: [560, 1024, 1, 200] + - Exact: [560, 1024, 1, 1600] + - Exact: [1024, 1024, 1, 200] + - Exact: [1024, 1024, 1, 512] + - Exact: [1024, 1024, 1, 960] + - Exact: [1024, 1024, 1, 1600] + - Exact: [2048, 256, 1, 1024] + - Exact: [1024, 1024, 1, 3840] + - Exact: [1024, 1024, 1, 3968] + - Exact: [1024, 1024, 1, 6528] + - Exact: [1024, 1024, 1, 7104] + - Exact: [1024, 1024, 1, 7200] + - Exact: [1024, 1024, 1, 8064] + - Exact: [1024, 1024, 1, 8160] + - Exact: [1024, 1024, 1, 3240] + - Exact: [1024, 1024, 1, 3960] + - Exact: [64, 1280, 64, 192] + - Exact: [64, 1280, 64, 320] + - Exact: 
[64, 1280, 64, 384] + - Exact: [64, 1280, 64, 448] + - Exact: [64, 2048, 64, 192] + - Exact: [64, 2048, 64, 320] + - Exact: [64, 2048, 64, 384] + - Exact: [64, 2048, 64, 448] + - Exact: [5329, 64, 64, 80] + - Exact: [64, 1280, 32, 192] + - Exact: [64, 1280, 32, 320] + - Exact: [64, 1280, 32, 384] + - Exact: [64, 1280, 32, 448] + - Exact: [64, 2048, 32, 192] + - Exact: [64, 2048, 32, 320] + - Exact: [64, 2048, 32, 384] + - Exact: [64, 2048, 32, 448] + - Exact: [5329, 64, 32, 80] + - Exact: [3136, 64, 32, 256] + - Exact: [3136, 64, 32, 64] + - Exact: [196, 256, 32, 1024] + - Exact: [3136, 64, 64, 128] + - Exact: [3136, 64, 32, 128] + - Exact: [960, 1024, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [64, 512, 16, 512] + - Exact: [1024, 1024, 1, 1] + - Exact: [1024, 1024, 1, 77] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 10] + - Exact: [1024, 1024, 1, 1280] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 1024, 1, 39] + - Exact: [1024, 1024, 1, 780] + - Exact: [1024, 1024, 1, 4992] + - Exact: [1024, 1024, 1, 308] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 1024, 1, 40] + - Exact: [1024, 1024, 1, 800] + - Exact: [1024, 1024, 1, 5120] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 1024, 1, 41] + - Exact: [1024, 1024, 1, 820] + - Exact: [1024, 1024, 1, 5248] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 1024, 1, 5] + - Exact: [1024, 1024, 1, 385] + - Exact: [1024, 1024, 1, 2560] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 1024, 1, 6] + - Exact: [1024, 1024, 1, 462] + - Exact: [1024, 1024, 1, 3072] + - Exact: [64, 128, 128, 128] + - Exact: [1024, 1024, 1, 8] + - Exact: [1024, 1024, 1, 160] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 1024, 1, 9] + - Exact: [1024, 1024, 1, 180] + - Exact: [1024, 1024, 1, 1152] + - Exact: [2048, 512, 1, 1] + - Exact: [64, 1024, 32, 1024] + - Exact: [1024, 64, 128, 1024] + - Exact: [1024, 64, 32, 1024] + - Exact: [1024, 96, 64, 1024] + - Exact: [1024, 1024, 1, 16] + - Exact: [64, 512, 40, 512] + - Exact: 
[64, 1024, 256, 1024] + - Exact: [96, 1024, 64, 1024] + - Exact: [512, 64, 256, 512] + - Exact: [1024, 96, 128, 1024] + - Exact: [64, 512, 128, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [512, 64, 128, 512] + - Exact: [64, 1024, 128, 1024] + - Exact: [1024, 64, 64, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [1024, 64, 256, 1024] + - Exact: [512, 64, 40, 512] + - Exact: [1024, 1024, 1, 64] + - Exact: [64, 128, 1024, 128] + - Exact: [128, 64, 1024, 128] + - Exact: [1024, 1024, 1, 3456] + - Exact: [1024, 1024, 1, 6912] + - Exact: [1024, 1024, 1, 864] + - Exact: [1024, 512, 1, 3456] + - Exact: [1024, 512, 1, 4096] + - Exact: [1024, 512, 1, 6912] + - Exact: [1024, 512, 1, 864] + - Exact: [256, 3456, 1, 1] + - Exact: [256, 4096, 1, 1] + - Exact: [480, 1024, 1, 3456] + - Exact: [480, 1024, 1, 4096] + - Exact: [480, 1024, 1, 6912] + - Exact: [480, 1024, 1, 864] + - Exact: [1024, 1024, 1, 80] + - Exact: [64, 128, 1280, 128] + - Exact: [128, 64, 1280, 128] + - Exact: [1024, 1024, 1, 82] + - Exact: [128, 64, 1312, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [1024, 1024, 1, 12] + - Exact: [1024, 1024, 1, 6144] + - Exact: [64, 512, 192, 512] + - Exact: [512, 64, 192, 512] + - Exact: [784, 1152, 1, 128] + - Exact: [64, 128, 2048, 128] + - Exact: [128, 64, 2048, 128] + - Exact: [1024, 1024, 1, 128] + - Exact: [128, 64, 1536, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [1024, 1024, 1, 96] + - Exact: [92416, 64, 25, 64] + - Exact: [50176, 64, 36, 64] + - Exact: [36864, 64, 49, 64] + - Exact: [25600, 64, 64, 64] + - Exact: [64, 128, 192, 128] + - Exact: [128, 64, 192, 128] + - Exact: [768, 768, 1, 2048] + - Exact: [64, 384, 144, 384] + - Exact: [384, 64, 144, 384] + - Exact: [768, 768, 1, 4608] + - Exact: [64, 512, 48, 512] + - Exact: [512, 64, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [128, 64, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [384, 64, 192, 384] + - Exact: [1024, 1024, 1, 4608] + - Exact: [196, 2304, 1, 
256] + - Exact: [768, 512, 2, 2048] + - Exact: [672, 512, 2, 2048] + - Exact: [1008, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [840, 512, 2, 2048] + - Exact: [768, 256, 2, 12] + - Exact: [864, 256, 2, 3] + - Exact: [864, 256, 2, 12] + - Exact: [768, 256, 2, 3] + - Exact: [1024, 320, 1, 1024] + - Exact: [173280, 64, 1, 128] + - Exact: [25992, 64, 1, 128] + - Exact: [713, 512, 2, 2048] + - Exact: [660, 512, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [748, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [850, 256, 2, 3] + - Exact: [805, 256, 2, 12] + - Exact: [805, 256, 2, 3] + - Exact: [850, 256, 2, 12] + - Exact: [950, 256, 2, 12] + - Exact: [950, 256, 2, 3] + - Exact: [100, 512, 120, 128] + - Exact: [100, 512, 18, 128] + - Exact: [100, 512, 19, 128] + - Exact: [1444, 576, 1, 128] + - Exact: [27436, 64, 1, 128] + - Exact: [361, 2304, 1, 512] + - Exact: [96, 1024, 160, 1024] + - Exact: [1024, 96, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [1024, 96, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [1024, 96, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [1024, 96, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [1024, 96, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [1024, 96, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [1024, 96, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [1024, 96, 32, 1024] + - Exact: [512, 64, 320, 512] + - Exact: [64, 512, 320, 512] + - Exact: [1024, 1024, 1, 20] + - Exact: [512, 64, 80, 512] + - Exact: [1024, 64, 512, 1024] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 
4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 256, 1, 32768] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 256, 1, 8448] + - Exact: [1024, 256, 1, 9728] + - Exact: [1024, 256, 1, 9984] + - Exact: [1024, 256, 1, 10496] + - Exact: [1024, 256, 1, 11520] + - Exact: [1024, 256, 1, 12032] + - Exact: [1024, 256, 1, 13568] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 14848] + - Exact: [1024, 256, 1, 15104] + - Exact: [1024, 256, 1, 15872] + - Exact: [1024, 256, 1, 16128] + - Exact: [1024, 256, 1, 17152] + - Exact: [1024, 256, 1, 17408] + - Exact: [1024, 256, 1, 18944] + - Exact: [1024, 256, 1, 19712] + - Exact: [1024, 256, 1, 19968] + - Exact: [256, 128, 1, 55296] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [3584, 64, 1, 3328] + - Exact: 
[64, 4288, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [64, 1856, 1, 256] + - Exact: [2368, 64, 1, 256] + - Exact: [2368, 64, 1, 3328] + - Exact: [64, 1408, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [64, 3584, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 2368, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [64, 2368, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [64, 4288, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2944, 1, 3328] + - Exact: [1856, 64, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [1408, 64, 1, 1280] + - Exact: [64, 2368, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [33, 32, 200, 33] + - Exact: [33, 32, 1600, 33] + - Exact: [67, 2048, 1, 512] + - Exact: [74, 2048, 1, 512] + - Exact: [74, 2048, 1, 960] + - Exact: [100, 2048, 1, 512] + - Exact: [512, 512, 1, 200] + - Exact: [512, 512, 1, 1600] + - Exact: [1024, 256, 1, 1024] + - Exact: [1024, 256, 1, 1280] + - Exact: [1024, 256, 1, 2304] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 256, 1, 3072] + - Exact: [1024, 256, 1, 3328] + - Exact: [1024, 256, 1, 3584] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 256, 1, 4352] + - Exact: [1024, 256, 1, 4608] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 256, 1, 5376] + - Exact: [1024, 256, 1, 5632] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 256, 1, 6400] + - Exact: [1024, 256, 1, 7680] + - Exact: [1024, 256, 1, 7936] + - Exact: [32, 64, 4608, 32] + - Exact: [32, 64, 4608, 35] + - Exact: [34, 64, 4736, 24] + - Exact: [34, 64, 4736, 34] + - Exact: [35, 64, 4608, 35] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 32, 4608, 35] + - 
Exact: [64, 34, 4736, 24] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 35] + - Exact: [33, 64, 1920, 33] + - Exact: [64, 33, 1920, 33] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [256, 864, 1, 1] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 4096] + - Exact: [512, 256, 1, 6912] + - Exact: [512, 256, 1, 864] + - Exact: [49, 4608, 1, 512] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [56, 512, 64, 512] + - Exact: [176, 256, 2, 3] + - Exact: [176, 256, 2, 12] + - Exact: [216, 256, 2, 3] + - Exact: [192, 256, 2, 12] + - Exact: [192, 256, 2, 3] + - Exact: [216, 256, 2, 12] + - Exact: [228, 256, 2, 12] + - Exact: [228, 256, 2, 3] + - Exact: [187, 256, 2, 12] + - Exact: [247, 256, 2, 12] + - Exact: [187, 256, 2, 3] + - Exact: [221, 256, 2, 3] + - Exact: [221, 256, 2, 12] + - Exact: [247, 256, 2, 3] + - Exact: [100, 2304, 1, 512] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + 
BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 128, 1, 32768] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 2, 1, 4] + - Exact: [2048, 2, 1, 8] + - Exact: [2560, 2, 1, 2] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [25, 1152, 1, 256] + - Exact: [9, 1152, 1, 256] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - 
WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [13, 512, 1, 32768] + - Exact: [1024, 2, 1, 4992] + - Exact: [1024, 2, 1, 5120] + - Exact: [1024, 2, 1, 5248] + - Exact: [256, 128, 1, 6912] + - Exact: [13, 512, 1, 55296] + - Exact: [13, 512, 1, 6912] + - Exact: [768, 2, 1, 4608] + - Exact: [1024, 2, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 448, 1, 3328] + - Exact: [1, 64, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [64, 1024, 1, 3328] + - Exact: [1, 64, 1, 1280] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 704, 1, 3328] + - Exact: [64, 64, 1, 1280] + - Exact: [1, 64, 1, 1] + - Exact: [448, 64, 1, 
1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [704, 64, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [128, 64, 1, 256] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [448, 64, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [1024, 64, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [64, 256, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [1, 1, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [1, 1, 1, 1280] + - Exact: [1024, 64, 1, 256] + - Exact: [256, 64, 1, 3328] + - Exact: [64, 64, 1, 3328] + - Exact: [1, 1, 1, 3328] + - Exact: [64, 704, 1, 1280] + - Exact: [512, 16, 1, 512] + - Exact: [1024, 32, 1, 512] + - Exact: [1024, 16, 1, 512] + - Exact: [512, 32, 1, 512] + - Exact: [14, 64, 1, 14] + - Exact: [15, 64, 1, 14] + - Exact: [15, 64, 1, 15] + - Exact: [15, 64, 1, 15] + - Exact: [17, 64, 1, 15] + - Exact: [17, 64, 1, 17] + - Exact: [17, 64, 1, 17] + - Exact: [21, 64, 1, 17] + - Exact: [21, 64, 1, 21] + - Exact: [24, 64, 1, 24] + - Exact: [30, 64, 1, 30] + - Exact: [30, 64, 1, 31] + - Exact: [31, 64, 1, 31] + - Exact: [32, 64, 1, 32] + - Exact: [32, 64, 1, 35] + - Exact: [34, 64, 1, 24] + - Exact: [34, 64, 1, 34] + - Exact: [35, 64, 1, 35] + - Exact: [64, 14, 1, 14] + - Exact: [64, 15, 1, 14] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 17, 1, 15] + - Exact: [64, 17, 1, 17] + - Exact: [64, 17, 1, 17] + - Exact: [64, 21, 1, 17] + - Exact: [64, 21, 1, 21] + - Exact: [64, 24, 1, 24] + - Exact: [64, 30, 1, 30] + - Exact: [64, 30, 1, 31] + - Exact: [64, 31, 1, 31] + - Exact: [64, 32, 1, 32] + - Exact: [64, 32, 1, 35] + - Exact: [64, 34, 1, 24] + - Exact: [64, 34, 1, 34] + - Exact: [64, 35, 1, 35] + - Exact: [64, 512, 1, 512] + - Exact: [512, 64, 1, 
512] + - Exact: [1024, 2, 1, 4] + - Exact: [1024, 2, 1, 32] + - Exact: [1024, 2, 1, 2048] + - Exact: [3, 64, 512, 3] + - Exact: [5, 64, 512, 5] + - Exact: [5, 64, 960, 5] + - Exact: [9, 64, 512, 9] + - Exact: [27, 128, 32768, 27] + - Exact: [512, 32, 1, 200] + - Exact: [512, 32, 1, 1600] + - Exact: [1024, 64, 1, 512] + - Exact: [1024, 64, 1, 960] + - Exact: [14, 64, 10880, 14] + - Exact: [15, 64, 10880, 14] + - Exact: [15, 64, 7680, 15] + - Exact: [15, 64, 10880, 15] + - Exact: [17, 64, 7680, 15] + - Exact: [17, 64, 6144, 17] + - Exact: [17, 64, 7680, 17] + - Exact: [21, 64, 6144, 17] + - Exact: [21, 64, 6144, 21] + - Exact: [24, 64, 4736, 24] + - Exact: [30, 64, 2048, 30] + - Exact: [30, 64, 2048, 31] + - Exact: [31, 64, 2048, 31] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 15, 10880, 14] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 17, 7680, 15] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 21, 6144, 17] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 30, 2048, 31] + - Exact: [64, 31, 2048, 31] + - Exact: [27, 64, 1920, 27] + - Exact: [27, 64, 1920, 33] + - Exact: [64, 27, 1920, 27] + - Exact: [64, 27, 1920, 33] + - Exact: [1024, 2, 1, 1] + - Exact: [1024, 2, 1, 512] + - Exact: [1024, 2, 1, 10] + - Exact: [1024, 2, 1, 1280] + - Exact: [1024, 2, 1, 39] + - Exact: [1024, 2, 1, 40] + - Exact: [1024, 2, 1, 41] + - Exact: [1024, 2, 1, 5] + - Exact: [1024, 2, 1, 2560] + - Exact: [1024, 2, 1, 6] + - Exact: [1024, 2, 1, 3072] + - Exact: [1024, 2, 1, 8] + - Exact: [1024, 2, 1, 1024] + - Exact: [1024, 2, 1, 9] + - Exact: [1024, 2, 1, 1152] + - Exact: [4, 64, 32768, 4] + - Exact: [4, 64, 38400, 4] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [64, 128, 1, 128] + - Exact: [128, 64, 1, 128] + - Exact: [5, 64, 1, 5] + - Exact: [33, 32, 1, 33] + - Exact: [1024, 2, 1, 16] + - Exact: [1024, 2, 1, 64] + - Exact: [256, 128, 1, 3456] + - 
Exact: [256, 128, 1, 4096] + - Exact: [256, 128, 1, 864] + - Exact: [1024, 2, 1, 80] + - Exact: [1024, 2, 1, 82] + - Exact: [1024, 2, 1, 12] + - Exact: [13, 512, 1, 3456] + - Exact: [13, 512, 1, 4096] + - Exact: [13, 512, 1, 864] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [1024, 2, 1, 128] + - Exact: [1024, 2, 1, 96] + - Exact: [768, 2, 1, 2048] + - Exact: [1024, 81, 1, 1024] + - Exact: [25, 256, 120, 128] + - Exact: [25, 256, 18, 128] + - Exact: [25, 256, 19, 128] + - Exact: [9, 256, 120, 128] + - Exact: [9, 256, 18, 128] + - Exact: [9, 256, 19, 128] + - Exact: [1024, 2, 1, 20] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml new file mode 100644 index 000000000..16fd3a0da --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml @@ -0,0 +1,3225 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - 
KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [8] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2368, 1024, 1, 1] + - Exact: [5056, 1408, 1, 3328] + - Exact: [5056, 1856, 1, 3328] + - Exact: [448, 3584, 1, 3328] + - Exact: [5056, 4288, 1, 32] + - Exact: [3584, 1024, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 3328] + - Exact: [448, 3584, 1, 32] + - Exact: [4288, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 32] + - Exact: [1024, 2944, 1, 1] + - Exact: [2944, 3584, 1, 3328] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 32] + - Exact: [5056, 256, 1, 256] + - Exact: [1856, 2368, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [3584, 4288, 1, 32] + - Exact: [3584, 3584, 1, 1] + - Exact: [1408, 2368, 1, 1] + - Exact: [5056, 6784, 1, 1280] + - Exact: [4288, 5056, 1, 1] + - Exact: [5056, 4288, 1, 1] + - Exact: [1408, 4288, 1, 1280] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 1] + - Exact: [704, 3584, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1408, 1024, 1, 1280] + - Exact: [5056, 5888, 1, 3328] + - Exact: [3584, 3584, 1, 1280] + - Exact: [2368, 3584, 1, 32] + - Exact: [2944, 2368, 1, 1] + - Exact: [704, 4288, 1, 1] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1024, 3584, 1, 1] + - Exact: [256, 5056, 1, 32] + - Exact: [2368, 5056, 1, 32] + - Exact: [6784, 1856, 1, 32] + - Exact: [5056, 704, 1, 1] + - Exact: [2944, 4288, 1, 
256] + - Exact: [5056, 704, 1, 32] + - Exact: [1856, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 32] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 1] + - Exact: [1856, 2944, 1, 1] + - Exact: [1856, 2368, 1, 32] + - Exact: [4288, 1856, 1, 32] + - Exact: [5056, 2368, 1, 256] + - Exact: [1408, 5888, 1, 256] + - Exact: [5056, 6784, 1, 1] + - Exact: [1024, 1408, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [704, 2368, 1, 1] + - Exact: [3584, 4288, 1, 1280] + - Exact: [3584, 2368, 1, 1] + - Exact: [4288, 448, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [2368, 4288, 1, 32] + - Exact: [704, 5056, 1, 1280] + - Exact: [3584, 6784, 1, 32] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 4288, 1, 3328] + - Exact: [1408, 3584, 1, 1] + - Exact: [4288, 1856, 1, 3328] + - Exact: [1856, 2944, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [3584, 704, 1, 1] + - Exact: [448, 5056, 1, 1] + - Exact: [5888, 5888, 1, 256] + - Exact: [3584, 704, 1, 32] + - Exact: [448, 6784, 1, 3328] + - Exact: [6784, 4288, 1, 1] + - Exact: [3584, 6784, 1, 1] + - Exact: [1408, 2368, 1, 32] + - Exact: [448, 5056, 1, 32] + - Exact: [4288, 4288, 1, 1280] + - Exact: [6784, 1408, 1, 1] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1856, 1, 3328] + - Exact: [5056, 5888, 1, 1] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [3584, 3584, 1, 32] + - Exact: [2944, 2944, 1, 1280] + - Exact: [1408, 5056, 1, 1] + - Exact: [2368, 6784, 1, 1] + - Exact: [6784, 4288, 1, 1280] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 6784, 1, 1280] + - Exact: [704, 2944, 1, 3328] + - Exact: [5888, 256, 1, 1] + - Exact: [5056, 6784, 1, 32] + - Exact: [448, 5056, 1, 1280] + - Exact: [256, 5888, 1, 3328] + - Exact: [5888, 1024, 1, 1] + - Exact: [5888, 448, 1, 32] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [448, 5888, 1, 3328] + - Exact: 
[1408, 4288, 1, 1] + - Exact: [1408, 1856, 1, 3328] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 5888, 1, 3328] + - Exact: [448, 4288, 1, 3328] + - Exact: [704, 2368, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1408, 1024, 1, 1] + - Exact: [1408, 1024, 1, 256] + - Exact: [5056, 3584, 1, 1] + - Exact: [6784, 6784, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [5056, 3584, 1, 32] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 1856, 1, 256] + - Exact: [5888, 4288, 1, 1] + - Exact: [5056, 704, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [1024, 5056, 1, 256] + - Exact: [5888, 448, 1, 256] + - Exact: [6784, 5056, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [704, 6784, 1, 3328] + - Exact: [2944, 1856, 1, 1] + - Exact: [5888, 4288, 1, 1280] + - Exact: [5888, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [1024, 2944, 1, 256] + - Exact: [2944, 1856, 1, 1280] + - Exact: [1024, 2368, 1, 1] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 4288, 1, 256] + - Exact: [448, 3584, 1, 1] + - Exact: [2368, 2944, 1, 32] + - Exact: [4288, 704, 1, 256] + - Exact: [1856, 1024, 1, 256] + - Exact: [704, 6784, 1, 32] + - Exact: [1024, 4288, 1, 1] + - Exact: [1408, 5888, 1, 1280] + - Exact: [5056, 1856, 1, 256] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 256] + - Exact: [6784, 1856, 1, 3328] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 6784, 1, 32] + - Exact: [6784, 6784, 1, 32] + - Exact: [6784, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [6784, 448, 1, 1] + - Exact: [6784, 1856, 1, 1] + - Exact: [2944, 2368, 1, 1280] + - Exact: [6784, 448, 1, 32] + - Exact: [6784, 448, 1, 3328] + - Exact: [448, 3584, 1, 1280] + - Exact: [1408, 6784, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [5888, 704, 1, 1] + - Exact: [3584, 1856, 1, 1] + - Exact: [5056, 2944, 1, 32] + - Exact: [4288, 6784, 1, 1] + - Exact: [1024, 6784, 1, 1] + - Exact: 
[2368, 5888, 1, 32] + - Exact: [3584, 4288, 1, 1] + - Exact: [5888, 1024, 1, 3328] + - Exact: [6784, 5888, 1, 256] + - Exact: [5056, 1024, 1, 1] + - Exact: [4288, 2368, 1, 32] + - Exact: [704, 3584, 1, 1] + - Exact: [6784, 704, 1, 32] + - Exact: [704, 5888, 1, 256] + - Exact: [2368, 3584, 1, 1280] + - Exact: [3584, 5056, 1, 32] + - Exact: [6784, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 3328] + - Exact: [2368, 5056, 1, 1] + - Exact: [5888, 1408, 1, 256] + - Exact: [2368, 1024, 1, 32] + - Exact: [4288, 1024, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [5056, 5888, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [5888, 5056, 1, 1280] + - Exact: [5888, 2944, 1, 1] + - Exact: [1408, 4288, 1, 3328] + - Exact: [704, 2944, 1, 32] + - Exact: [2944, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 256] + - Exact: [2368, 1856, 1, 256] + - Exact: [2368, 4288, 1, 1280] + - Exact: [3584, 448, 1, 256] + - Exact: [256, 6784, 1, 256] + - Exact: [1024, 1408, 1, 1] + - Exact: [256, 5888, 1, 1] + - Exact: [2944, 2944, 1, 1] + - Exact: [6784, 3584, 1, 256] + - Exact: [1408, 1856, 1, 256] + - Exact: [2944, 2944, 1, 32] + - Exact: [2944, 2944, 1, 3328] + - Exact: [6784, 1408, 1, 32] + - Exact: [2368, 6784, 1, 3328] + - Exact: [4288, 3584, 1, 32] + - Exact: [3584, 704, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [5056, 256, 1, 1280] + - Exact: [2944, 5888, 1, 32] + - Exact: [3584, 5056, 1, 256] + - Exact: [3584, 2368, 1, 256] + - Exact: [4288, 4288, 1, 256] + - Exact: [448, 5056, 1, 256] + - Exact: [4288, 704, 1, 1280] + - Exact: [2368, 704, 1, 1] + - Exact: [1408, 1856, 1, 1280] + - Exact: [3584, 4288, 1, 3328] + - Exact: [448, 4288, 1, 32] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 1024, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 32] + - Exact: [5056, 3584, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [2368, 
3584, 1, 1] + - Exact: [4288, 2368, 1, 256] + - Exact: [1408, 2944, 1, 3328] + - Exact: [5888, 3584, 1, 1] + - Exact: [6784, 5056, 1, 3328] + - Exact: [6784, 5056, 1, 1] + - Exact: [5888, 3584, 1, 32] + - Exact: [5888, 3584, 1, 3328] + - Exact: [1024, 6784, 1, 256] + - Exact: [6784, 5888, 1, 32] + - Exact: [2368, 6784, 1, 32] + - Exact: [5056, 1408, 1, 1280] + - Exact: [3584, 1408, 1, 3328] + - Exact: [2944, 3584, 1, 1] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3584, 1024, 1, 1] + - Exact: [2944, 1856, 1, 3328] + - Exact: [2944, 3584, 1, 32] + - Exact: [5888, 256, 1, 32] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 3584, 1, 1280] + - Exact: [256, 5888, 1, 256] + - Exact: [1024, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 1] + - Exact: [1024, 1856, 1, 32] + - Exact: [5888, 2368, 1, 1] + - Exact: [2368, 2368, 1, 1] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 32] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 3328] + - Exact: [6784, 704, 1, 3328] + - Exact: [1856, 1856, 1, 32] + - Exact: [4288, 2944, 1, 32] + - Exact: [256, 5056, 1, 1] + - Exact: [5056, 5056, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 6784, 1, 256] + - Exact: [3584, 704, 1, 3328] + - Exact: [4288, 704, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [448, 3584, 1, 256] + - Exact: [6784, 256, 1, 32] + - Exact: [6784, 1408, 1, 1280] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 1408, 1, 1280] + - Exact: [5888, 1856, 1, 32] + - Exact: [5888, 704, 1, 3328] + - Exact: [448, 6784, 1, 256] + - Exact: [2944, 5888, 1, 256] + - Exact: [1856, 1408, 1, 32] + - Exact: [5888, 2944, 1, 1280] + - Exact: [448, 5888, 1, 1] + - Exact: [3584, 1408, 1, 1] + - Exact: [448, 5888, 1, 32] + - Exact: [5056, 704, 1, 1280] + - Exact: [1856, 6784, 1, 1] + - Exact: [2368, 1024, 1, 256] + - Exact: [1856, 6784, 1, 32] + - Exact: [1856, 6784, 1, 1280] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 6784, 1, 32] + - Exact: [3584, 5888, 1, 3328] + - Exact: [4288, 1408, 1, 256] 
+ - Exact: [6784, 2368, 1, 256] + - Exact: [1856, 1408, 1, 1280] + - Exact: [1856, 2368, 1, 1] + - Exact: [1408, 5056, 1, 3328] + - Exact: [5056, 4288, 1, 256] + - Exact: [5056, 5056, 1, 32] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 256] + - Exact: [4288, 5888, 1, 1] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2368, 4288, 1, 1] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 5888, 1, 32] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [2944, 2368, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1024, 5888, 1, 32] + - Exact: [1024, 5888, 1, 3328] + - Exact: [5056, 2368, 1, 32] + - Exact: [1408, 2368, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [704, 5056, 1, 32] + - Exact: [5056, 4288, 1, 1280] + - Exact: [4288, 448, 1, 1] + - Exact: [5888, 5888, 1, 1] + - Exact: [2944, 704, 1, 1280] + - Exact: [1024, 3584, 1, 1280] + - Exact: [2368, 2944, 1, 1] + - Exact: [5056, 256, 1, 32] + - Exact: [5056, 1024, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 4288, 1, 32] + - Exact: [5888, 448, 1, 1] + - Exact: [5056, 5056, 1, 1280] + - Exact: [6784, 1408, 1, 3328] + - Exact: [5888, 5888, 1, 3328] + - Exact: [5888, 1408, 1, 32] + - Exact: [256, 6784, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 1] + - Exact: [6784, 1024, 1, 256] + - Exact: [5056, 1408, 1, 32] + - Exact: [1408, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 3328] + - Exact: [704, 2368, 1, 32] + - Exact: [704, 6784, 1, 1] + - Exact: [2368, 6784, 1, 256] + - Exact: [1856, 3584, 1, 3328] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 2944, 1, 32] + - Exact: [5888, 2368, 1, 3328] + - Exact: [2368, 704, 1, 1280] + - Exact: [1024, 1408, 1, 1280] + - Exact: [2944, 5056, 1, 32] + - Exact: [704, 2368, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [3584, 1024, 1, 1280] + - Exact: [5056, 3584, 1, 256] + - Exact: [2368, 704, 1, 256] + - Exact: [1856, 1856, 1, 1280] 
+ - Exact: [4288, 704, 1, 1] + - Exact: [1856, 1024, 1, 1] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 32] + - Exact: [1856, 1024, 1, 32] + - Exact: [2944, 6784, 1, 1] + - Exact: [6784, 2368, 1, 32] + - Exact: [5888, 5056, 1, 1] + - Exact: [704, 5888, 1, 1] + - Exact: [6784, 6784, 1, 1] + - Exact: [5888, 448, 1, 3328] + - Exact: [704, 5888, 1, 32] + - Exact: [704, 5888, 1, 1280] + - Exact: [1024, 6784, 1, 3328] + - Exact: [704, 2944, 1, 1280] + - Exact: [4288, 6784, 1, 256] + - Exact: [1408, 1408, 1, 32] + - Exact: [1408, 1408, 1, 3328] + - Exact: [2944, 1856, 1, 256] + - Exact: [4288, 2944, 1, 1] + - Exact: [6784, 5056, 1, 32] + - Exact: [2944, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 256] + - Exact: [2368, 5888, 1, 1] + - Exact: [1408, 1856, 1, 32] + - Exact: [1856, 6784, 1, 3328] + - Exact: [1024, 2368, 1, 32] + - Exact: [2368, 2368, 1, 3328] + - Exact: [3584, 5888, 1, 32] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 704, 1, 256] + - Exact: [3584, 1024, 1, 32] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 5888, 1, 32] + - Exact: [1856, 1408, 1, 3328] + - Exact: [4288, 1024, 1, 1] + - Exact: [704, 4288, 1, 3328] + - Exact: [2944, 5056, 1, 1280] + - Exact: [6784, 2944, 1, 1280] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 5056, 1, 32] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [1856, 5056, 1, 1] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 448, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [1856, 5056, 1, 32] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 2944, 1, 32] + - Exact: [1408, 6784, 1, 256] + - Exact: [1024, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1] + - Exact: [2944, 5888, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [5888, 704, 1, 256] + - Exact: [6784, 4288, 1, 256] + - Exact: [1408, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 256] + - Exact: [4288, 1024, 1, 32] + - Exact: [5888, 1856, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: 
[4288, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 704, 1, 32] + - Exact: [2944, 704, 1, 1] + - Exact: [6784, 1856, 1, 256] + - Exact: [1856, 1856, 1, 1] + - Exact: [2944, 704, 1, 3328] + - Exact: [2368, 1856, 1, 32] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 448, 1, 3328] + - Exact: [256, 6784, 1, 1] + - Exact: [1024, 3584, 1, 32] + - Exact: [256, 6784, 1, 32] + - Exact: [2944, 1408, 1, 32] + - Exact: [4288, 3584, 1, 1] + - Exact: [5056, 448, 1, 3328] + - Exact: [6784, 3584, 1, 32] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [2944, 5888, 1, 1] + - Exact: [1024, 1856, 1, 3328] + - Exact: [5888, 1024, 1, 32] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5056, 6784, 1, 256] + - Exact: [2944, 5056, 1, 1] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [2368, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 1] + - Exact: [2944, 1024, 1, 32] + - Exact: [2944, 1024, 1, 1280] + - Exact: [5056, 5056, 1, 1] + - Exact: [2368, 4288, 1, 256] + - Exact: [2944, 6784, 1, 1280] + - Exact: [256, 6784, 1, 1280] + - Exact: [3584, 2368, 1, 32] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2944, 2944, 1, 256] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5056, 2368, 1, 1280] + - Exact: [2944, 1024, 1, 1] + - Exact: [3584, 704, 1, 256] + - Exact: [2368, 5888, 1, 3328] + - Exact: [4288, 2368, 1, 1] + - Exact: [1408, 3584, 1, 32] + - Exact: [2944, 4288, 1, 32] + - Exact: [5888, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [3584, 2944, 1, 1] + - Exact: [1024, 1856, 1, 1] + - Exact: [704, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 1408, 1, 1] + - Exact: [4288, 5056, 1, 1280] + - Exact: [1856, 1856, 1, 3328] + - Exact: [1024, 2368, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [5888, 3584, 1, 256] + - Exact: [1024, 5056, 1, 32] + - Exact: 
[5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [1024, 1408, 1, 256] + - Exact: [3584, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 1280] + - Exact: [3584, 5888, 1, 1] + - Exact: [5888, 4288, 1, 256] + - Exact: [1024, 2944, 1, 1280] + - Exact: [2944, 3584, 1, 256] + - Exact: [5888, 1856, 1, 1] + - Exact: [6784, 2368, 1, 3328] + - Exact: [1408, 4288, 1, 32] + - Exact: [1856, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 256] + - Exact: [5056, 1856, 1, 1] + - Exact: [5888, 2368, 1, 256] + - Exact: [1408, 1024, 1, 32] + - Exact: [5056, 1856, 1, 32] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 5888, 1, 3328] + - Exact: [5056, 704, 1, 3328] + - Exact: [5888, 6784, 1, 1] + - Exact: [5888, 4288, 1, 32] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 256, 1, 1] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [2944, 1856, 1, 32] + - Exact: [2368, 1408, 1, 256] + - Exact: [2368, 1856, 1, 1] + - Exact: [4288, 1408, 1, 1] + - Exact: [3584, 2368, 1, 1280] + - Exact: [1408, 2944, 1, 1] + - Exact: [4288, 1408, 1, 32] + - Exact: [5888, 2944, 1, 256] + - Exact: [1408, 2944, 1, 32] + - Exact: [5888, 6784, 1, 256] + - Exact: [6784, 5888, 1, 1] + - Exact: [6784, 5888, 1, 1280] + - Exact: [1024, 4288, 1, 32] + - Exact: [3584, 5888, 1, 256] + - Exact: [5056, 2368, 1, 1] + - Exact: [5056, 448, 1, 1] + - Exact: [2368, 1024, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [5056, 448, 1, 32] + - Exact: [3584, 2944, 1, 32] + - Exact: [3584, 1856, 1, 32] + - Exact: [4288, 1408, 1, 1280] + - Exact: [6784, 2368, 1, 1] + - Exact: [704, 5056, 1, 1] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5888, 1408, 1, 1] + - Exact: [1024, 4288, 1, 1280] + - Exact: [1856, 4288, 1, 1] + - Exact: [3584, 4288, 1, 256] + - Exact: [2368, 2944, 1, 256] + - Exact: [704, 5056, 1, 256] + - Exact: [1856, 4288, 1, 32] + - Exact: [4288, 1024, 1, 1280] + - Exact: [4288, 6784, 1, 32] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 
1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [5888, 704, 1, 1280] + - Exact: [4288, 5888, 1, 256] + - Exact: [3584, 3584, 1, 3328] + - Exact: [2944, 6784, 1, 32] + - Exact: [5056, 256, 1, 1] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [6784, 2944, 1, 3328] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [1024, 1408, 1, 32] + - Exact: [6784, 704, 1, 1] + - Exact: [704, 3584, 1, 32] + - Exact: [4288, 4288, 1, 1] + - Exact: [5056, 2944, 1, 1] + - Exact: [6784, 4288, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2368, 1856, 1, 3328] + - Exact: [1856, 4288, 1, 1280] + - Exact: [3584, 448, 1, 1] + - Exact: [2944, 1024, 1, 3328] + - Exact: [5888, 5056, 1, 32] + - Exact: [704, 2944, 1, 1] + - Exact: [3584, 448, 1, 32] + - Exact: [3584, 448, 1, 1280] + - Exact: [2944, 6784, 1, 3328] + - Exact: [1856, 2368, 1, 1280] + - Exact: [6784, 1024, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1408, 1408, 1, 1] + - Exact: [1408, 4288, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [448, 6784, 1, 1] + - Exact: [704, 2944, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [448, 6784, 1, 32] + - Exact: [1408, 1856, 1, 1] + - Exact: [4288, 448, 1, 32] + - Exact: [4288, 448, 1, 1280] + - Exact: [2944, 704, 1, 32] + - Exact: [448, 4288, 1, 1] + - Exact: [3584, 5056, 1, 1] + - Exact: [1408, 3584, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2368, 1, 32] + - Exact: [5888, 2944, 1, 32] + - Exact: [1856, 2944, 1, 32] + - Exact: [5056, 1408, 1, 1] + - Exact: [5888, 1408, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [6784, 1024, 1, 1] + - Exact: [6784, 1024, 1, 32] + - Exact: [6784, 3584, 1, 1] + - Exact: [2944, 2368, 1, 32] + - Exact: [3584, 6784, 1, 3328] + - Exact: [6784, 1408, 1, 256] + - Exact: [5056, 1024, 1, 32] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: 
[448, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 1] + - Exact: [256, 5888, 1, 32] + - Exact: [4288, 5056, 1, 32] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1856, 5888, 1, 32] + - Exact: [1856, 5888, 1, 1280] + - Exact: [704, 2368, 1, 1280] + - Exact: [4288, 2368, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [2944, 4288, 1, 1] + - Exact: [5056, 5888, 1, 32] + - Exact: [2368, 5056, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 6784, 1, 32] + - Exact: [3584, 2944, 1, 3328] + - Exact: [1408, 5888, 1, 1] + - Exact: [704, 4288, 1, 32] + - Exact: [1408, 5888, 1, 32] + - Exact: [6784, 1024, 1, 3328] + - Exact: [5056, 1408, 1, 256] + - Exact: [2944, 1408, 1, 256] + - Exact: [2368, 2368, 1, 256] + - Exact: [1408, 6784, 1, 1] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1024, 5888, 1, 1] + - Exact: [1856, 3584, 1, 256] + - Exact: [2368, 704, 1, 32] + - Exact: [256, 5888, 1, 1280] + - Exact: [1856, 1024, 1, 3328] + - Exact: [5056, 2368, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [2560, 7000, 1, 2560] + - Exact: [7680, 12000, 1, 2560] + - Exact: [5124, 9124, 1, 1760] + - Exact: [512, 24000, 1, 1536] + - Exact: [3072, 24000, 1, 1024] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 48000, 1, 2048] + - Exact: [2048, 1600, 1, 2048] + - Exact: [512, 48000, 1, 1536] + - Exact: [8448, 5984, 1, 2816] + - Exact: [4096, 3200, 1, 1024] + - Exact: [1024, 24000, 1, 2560] + - Exact: [1760, 6400, 1, 1760] + - Exact: [5124, 9124, 1, 2048] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1024, 48000, 1, 2560] + - Exact: [8448, 48000, 1, 2816] + - Exact: [2560, 3200, 1, 2560] + - Exact: [16384, 800, 1, 4096] + - Exact: [4608, 24000, 1, 1536] + - Exact: [7680, 48000, 1, 2560] + - Exact: [3072, 48000, 1, 1024] + - Exact: [8192, 3200, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [4096, 400, 1, 1024] + - Exact: [6144, 48000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [2048, 800, 1, 512] + - Exact: [4608, 5984, 1, 1536] + - Exact: [4096, 1600, 1, 1024] 
+ - Exact: [6144, 5984, 1, 2048] + - Exact: [7680, 24000, 1, 2560] + - Exact: [6144, 48000, 1, 2048] + - Exact: [2048, 3200, 1, 2048] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 24000, 1, 1536] + - Exact: [2560, 6400, 1, 2560] + - Exact: [512, 24000, 1, 2560] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 2560] + - Exact: [2048, 1600, 1, 512] + - Exact: [2048, 7000, 1, 2048] + - Exact: [1760, 800, 1, 1760] + - Exact: [2560, 1600, 1, 2560] + - Exact: [2048, 3200, 1, 512] + - Exact: [2560, 800, 1, 2560] + - Exact: [4608, 12000, 1, 1536] + - Exact: [6144, 24000, 1, 2048] + - Exact: [8192, 800, 1, 2048] + - Exact: [5124, 9124, 1, 4096] + - Exact: [8448, 24000, 1, 2816] + - Exact: [1024, 48000, 1, 1536] + - Exact: [8192, 1600, 1, 2048] + - Exact: [4096, 800, 1, 1024] + - Exact: [2048, 800, 1, 2048] + - Exact: [1760, 3200, 1, 1760] + - Exact: [512, 48000, 1, 2560] + - Exact: [512, 24000, 1, 2048] + - Exact: [16384, 1600, 1, 4096] + - Exact: [1024, 24000, 1, 2048] + - Exact: [8192, 400, 1, 2048] + - Exact: [2048, 6400, 1, 2048] + - Exact: [6144, 12000, 1, 2048] + - Exact: [1760, 7000, 1, 1760] + - Exact: [1024, 48000, 1, 2816] + - Exact: [4096, 7000, 1, 4096] + - Exact: [6144, 24000, 1, 2560] + - Exact: [8448, 12000, 1, 2816] + - Exact: [16384, 400, 1, 4096] + - Exact: [1760, 1600, 1, 1760] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2049] + - Exact: [8192, 8191, 1, 8192] + - Exact: [8192, 8192, 1, 8192] + - Exact: [2047, 2048, 1, 2048] + - Exact: [2048, 2049, 1, 2048] + - Exact: [8192, 8192, 1, 8191] + - Exact: [3072, 513, 1, 3072] + - Exact: [8191, 8192, 1, 8192] + - Exact: [8192, 8193, 1, 8192] + - Exact: [4096, 4097, 1, 4096] + - Exact: [8192, 8192, 1, 8193] + - Exact: [4096, 4095, 1, 4096] + - Exact: [4096, 4096, 1, 4097] + - Exact: [2048, 2048, 1, 2048] + - Exact: [4095, 4096, 1, 4096] + - Exact: [8193, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4095] + - Exact: [3072, 511, 1, 3072] + - Exact: 
[2049, 2048, 1, 2048] + - Exact: [2048, 2047, 1, 2048] + - Exact: [2048, 2048, 1, 2047] + - Exact: [4097, 4096, 1, 4096] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [1024, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [30528, 2048, 1, 1024] + - Exact: [30528, 4096, 1, 1024] + - Exact: [128, 32768, 1, 256] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + - Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - Exact: [256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 32768, 1, 512] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [512, 32768, 1, 13] + - Exact: [512, 32768, 1, 1024] + - Exact: [684, 8976, 1, 256] + - Exact: [1024, 1600, 1, 560] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 32768, 1, 480] + - Exact: [1024, 
32768, 1, 1024] + - Exact: [1280, 8976, 1, 256] + - Exact: [1792, 8976, 1, 256] + - Exact: [2048, 684, 1, 512] + - Exact: [2048, 684, 1, 768] + - Exact: [2048, 960, 1, 74] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1536, 1, 512] + - Exact: [2048, 1536, 1, 768] + - Exact: [2048, 8976, 1, 256] + - Exact: [2304, 8976, 1, 256] + - Exact: [2560, 8976, 1, 256] + - Exact: [2816, 8976, 1, 256] + - Exact: [3072, 8976, 1, 256] + - Exact: [3328, 8976, 1, 256] + - Exact: [3840, 8976, 1, 256] + - Exact: [4096, 8976, 1, 256] + - Exact: [4352, 8976, 1, 256] + - Exact: [4608, 8976, 1, 256] + - Exact: [4864, 8976, 1, 256] + - Exact: [5120, 8976, 1, 256] + - Exact: [5376, 8976, 1, 256] + - Exact: [5632, 8976, 1, 256] + - Exact: [5888, 8976, 1, 256] + - Exact: [6144, 8976, 1, 256] + - Exact: [6400, 8976, 1, 256] + - Exact: [7168, 8976, 1, 256] + - Exact: [7936, 8976, 1, 256] + - Exact: [8192, 8976, 1, 256] + - Exact: [8448, 8976, 1, 256] + - Exact: [8960, 8976, 1, 256] + - Exact: [9472, 8976, 1, 256] + - Exact: [9728, 8976, 1, 256] + - Exact: [9984, 8976, 1, 256] + - Exact: [10240, 8976, 1, 256] + - Exact: [10496, 8976, 1, 256] + - Exact: [11264, 8976, 1, 256] + - Exact: [11776, 8976, 1, 256] + - Exact: [12544, 8976, 1, 256] + - Exact: [13312, 8976, 1, 256] + - Exact: [13568, 8976, 1, 256] + - Exact: [13824, 8976, 1, 256] + - Exact: [15104, 8976, 1, 256] + - Exact: [15360, 8976, 1, 256] + - Exact: [15872, 8976, 1, 256] + - Exact: [16128, 8976, 1, 256] + - Exact: [17152, 8976, 1, 256] + - Exact: [18176, 8976, 1, 256] + - Exact: [18688, 8976, 1, 256] + - Exact: [18944, 8976, 1, 256] + - Exact: [19712, 8976, 1, 256] + - Exact: [19968, 8976, 1, 256] + - Exact: [20480, 8976, 1, 256] + - Exact: [20992, 8976, 1, 256] + - Exact: [21248, 8976, 1, 256] + - Exact: [23552, 8976, 1, 256] + - Exact: [28672, 8976, 1, 256] + - Exact: [31488, 8976, 1, 256] + - Exact: [33536, 8976, 1, 256] + - Exact: [44505, 8976, 1, 256] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - 
Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [42720, 3968, 1, 1024] + - Exact: [42720, 6528, 1, 1024] + - Exact: [42720, 7104, 1, 1024] + - Exact: [42720, 7200, 1, 1024] + - Exact: [42720, 9520, 1, 1024] + - Exact: [42720, 10080, 1, 1024] + - Exact: [1024, 3240, 1, 1024] + - Exact: [1024, 3240, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3960, 1, 4096] + - Exact: [4096, 3240, 1, 1024] + - Exact: [4096, 3960, 1, 1024] + - Exact: [42720, 3960, 1, 1024] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [512, 512, 16, 64] + - Exact: [512, 512, 128, 64] + - Exact: [4096, 512, 1, 1024] + - Exact: [30522, 616, 1, 1024] + - Exact: [128, 128, 128, 64] + - Exact: [128, 128, 160, 64] + - Exact: [1024, 1280, 1, 1024] + - Exact: 
[1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [30522, 160, 1, 1024] + - Exact: [30522, 200, 1, 1024] + - Exact: [128, 128, 624, 64] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [30522, 780, 1, 1024] + - Exact: [30522, 308, 1, 1024] + - Exact: [128, 128, 640, 64] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [30522, 800, 1, 1024] + - Exact: [128, 128, 656, 64] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [30522, 820, 1, 1024] + - Exact: [512, 512, 80, 64] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [30522, 385, 1, 1024] + - Exact: [512, 512, 96, 64] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3072, 1, 1024] + - Exact: [30522, 462, 1, 1024] + - Exact: [4096, 1024, 1, 1024] + - Exact: [128, 128, 144, 64] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [30522, 180, 1, 1024] + - Exact: [1024, 32768, 1, 479] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [33712, 8192, 1, 1024] + - Exact: [33712, 9600, 1, 1024] + - Exact: [1024, 1024, 128, 96] + - Exact: [30592, 4096, 1, 1024] + - Exact: [1536, 8192, 1, 1536] + - Exact: [3072, 8192, 1, 1024] + - Exact: [3072, 2048, 1, 1024] + - Exact: [50304, 8192, 1, 1024] + - Exact: [2048, 1024, 1, 8192] + - Exact: [50304, 2048, 1, 1024] + - Exact: [1536, 8192, 1, 6144] + - Exact: [50304, 4096, 1, 1536] + - Exact: [8192, 1024, 1, 2048] + - Exact: [2560, 2048, 1, 640] + - Exact: [1024, 1024, 128, 64] + - Exact: [2048, 1024, 1, 2048] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 64] + - 
Exact: [30592, 8192, 1, 1024] + - Exact: [50304, 16384, 1, 1024] + - Exact: [4608, 4096, 1, 1536] + - Exact: [2560, 2048, 1, 2560] + - Exact: [7680, 2048, 1, 2560] + - Exact: [50304, 4096, 1, 1024] + - Exact: [1920, 2048, 1, 2560] + - Exact: [1024, 1024, 64, 96] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 6144] + - Exact: [512, 512, 256, 64] + - Exact: [50304, 8192, 1, 1536] + - Exact: [6144, 8192, 1, 1536] + - Exact: [4096, 16384, 1, 1024] + - Exact: [30592, 1024, 1, 2048] + - Exact: [1024, 16384, 1, 4096] + - Exact: [512, 512, 40, 64] + - Exact: [6144, 1024, 1, 2048] + - Exact: [4608, 8192, 1, 1536] + - Exact: [30592, 2048, 1, 1024] + - Exact: [3072, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 1024, 32, 64] + - Exact: [3072, 4096, 1, 1024] + - Exact: [30528, 8192, 1, 1024] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 480] + - Exact: [1024, 4096, 1, 480] + - Exact: [1024, 6912, 1, 1024] + - Exact: [1024, 6912, 1, 480] + - Exact: [128, 55296, 1, 256] + - Exact: [256, 55296, 1, 512] + - Exact: [256, 6912, 1, 512] + - Exact: [512, 3456, 1, 1024] + - Exact: [512, 3456, 1, 13] + - Exact: [512, 4096, 1, 1024] + - Exact: [512, 4096, 1, 13] + - Exact: [512, 55296, 1, 13] + - Exact: [512, 6912, 1, 1024] + - Exact: [512, 6912, 1, 13] + - Exact: [30528, 640, 1, 1024] + - Exact: [30528, 1280, 1, 1024] + - Exact: [30528, 1600, 1, 1024] + - Exact: [1024, 10240, 1, 1024] + - Exact: [4096, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 10496, 1, 4096] + - Exact: [30528, 1640, 1, 1024] + - Exact: [4096, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [30528, 160, 1, 1024] + - Exact: [30528, 240, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 4096] + - Exact: [512, 512, 192, 64] + - Exact: [1024, 10224, 1, 1024] + 
- Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [4096, 10224, 1, 1024] + - Exact: [3072, 10224, 1, 1024] + - Exact: [3072, 10240, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [4096, 10192, 1, 1024] + - Exact: [3072, 10192, 1, 1024] + - Exact: [3072, 10200, 1, 1024] + - Exact: [1024, 10184, 1, 1024] + - Exact: [3072, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 4096] + - Exact: [4096, 10208, 1, 1024] + - Exact: [2048, 10224, 1, 1024] + - Exact: [2048, 10240, 1, 1024] + - Exact: [1024, 10120, 1, 1024] + - Exact: [2048, 10192, 1, 1024] + - Exact: [1024, 10152, 1, 1024] + - Exact: [3072, 10080, 1, 1024] + - Exact: [1024, 2048, 1, 49] + - Exact: [4608, 512, 1, 49] + - Exact: [256, 256, 25, 12544] + - Exact: [256, 256, 49, 3200] + - Exact: [256, 256, 25, 6272] + - Exact: [256, 256, 49, 6400] + - Exact: [512, 512, 49, 1152] + - Exact: [512, 512, 25, 2048] + - Exact: [512, 512, 49, 2304] + - Exact: [512, 512, 25, 4096] + - Exact: [128, 128, 2048, 64] + - Exact: [30528, 2560, 1, 1024] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 1024] + - Exact: [1024, 12288, 1, 4096] + - Exact: [30528, 1920, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [128, 128, 81, 12544] + - Exact: [128, 128, 121, 9216] + - Exact: [128, 128, 169, 6400] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 49, 2304] + - Exact: [256, 256, 64, 2304] + - Exact: [256, 256, 81, 4096] + - Exact: [256, 256, 121, 2304] + - Exact: [256, 256, 169, 2304] + - Exact: [512, 512, 81, 1024] + - Exact: [512, 512, 121, 1024] + - Exact: [512, 512, 169, 1024] + - Exact: [512, 512, 36, 1024] + - Exact: [512, 512, 49, 1024] + - Exact: [512, 512, 64, 1024] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 768] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [512, 512, 48, 64] 
+ - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 1024] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3072, 3072, 1, 3072] + - Exact: [3072, 512, 1, 3072] + - Exact: [4096, 512, 1, 4096] + - Exact: [512, 3072, 1, 3072] + - Exact: [512, 4096, 1, 4096] + - Exact: [512, 8192, 1, 8192] + - Exact: [8192, 512, 1, 8192] + - Exact: [256, 256, 36, 432] + - Exact: [256, 256, 36, 456] + - Exact: [256, 256, 36, 504] + - Exact: [256, 256, 49, 1120] + - Exact: [256, 256, 36, 442] + - Exact: [256, 256, 49, 950] + - Exact: [256, 256, 64, 616] + - Exact: [256, 256, 64, 660] + - Exact: [256, 256, 36, 408] + - Exact: [256, 256, 49, 1008] + - Exact: [256, 256, 36, 462] + - Exact: [256, 256, 36, 468] + - Exact: [256, 256, 36, 494] + - Exact: [512, 512, 64, 48] + - Exact: [256, 256, 64, 140] + - Exact: [512, 512, 64, 56] + - Exact: [512, 512, 49, 90] + - Exact: [512, 512, 49, 60] + - Exact: [256, 256, 49, 864] + - Exact: [256, 256, 64, 224] + - Exact: [256, 256, 64, 176] + - Exact: [256, 256, 64, 154] + - Exact: [512, 512, 49, 80] + - Exact: [256, 256, 49, 1200] + - Exact: [256, 256, 64, 704] + - Exact: [256, 256, 64, 768] + - Exact: [256, 256, 49, 1160] + - Exact: [256, 256, 49, 320] + - Exact: [512, 512, 49, 70] + - Exact: [256, 256, 49, 1240] + - Exact: [256, 256, 36, 384] + - Exact: [1024, 2048, 1, 888] + - Exact: [1024, 2048, 1, 713] + - Exact: [1024, 2048, 1, 660] + - Exact: [1024, 2048, 1, 726] + - Exact: [1024, 2048, 1, 672] + - Exact: [1024, 2048, 1, 850] + - Exact: [1024, 2048, 1, 805] + - Exact: [1024, 2048, 1, 864] + - Exact: [1024, 2048, 1, 768] + - Exact: [1024, 2048, 1, 950] + - Exact: [256, 128, 49, 1152] + - Exact: [256, 128, 121, 120] + - Exact: [256, 128, 169, 120] + - Exact: [256, 128, 36, 120] + - Exact: [256, 128, 49, 120] + - Exact: [256, 128, 64, 120] + - Exact: [256, 128, 36, 12000] + - Exact: [256, 128, 49, 1216] + - Exact: [256, 128, 121, 18] + - Exact: 
[256, 128, 169, 18] + - Exact: [256, 128, 36, 18] + - Exact: [256, 128, 49, 18] + - Exact: [256, 128, 64, 18] + - Exact: [256, 128, 36, 1800] + - Exact: [256, 128, 121, 19] + - Exact: [256, 128, 169, 19] + - Exact: [256, 128, 36, 19] + - Exact: [256, 128, 49, 19] + - Exact: [256, 128, 64, 19] + - Exact: [256, 128, 36, 1900] + - Exact: [256, 128, 49, 480] + - Exact: [256, 128, 81, 480] + - Exact: [256, 128, 64, 5880] + - Exact: [256, 128, 49, 72] + - Exact: [256, 128, 81, 72] + - Exact: [256, 128, 49, 76] + - Exact: [256, 128, 81, 76] + - Exact: [256, 128, 49, 7680] + - Exact: [256, 128, 64, 882] + - Exact: [256, 128, 64, 931] + - Exact: [256, 256, 49, 1152] + - Exact: [256, 256, 36, 12000] + - Exact: [256, 256, 49, 1216] + - Exact: [256, 256, 36, 1800] + - Exact: [256, 256, 36, 1900] + - Exact: [256, 256, 64, 5880] + - Exact: [256, 256, 49, 7680] + - Exact: [256, 256, 64, 882] + - Exact: [256, 256, 64, 931] + - Exact: [512, 256, 81, 1080] + - Exact: [512, 256, 25, 12000] + - Exact: [512, 256, 81, 162] + - Exact: [512, 256, 81, 171] + - Exact: [512, 256, 25, 1800] + - Exact: [512, 256, 25, 1900] + - Exact: [512, 256, 121, 1920] + - Exact: [512, 256, 169, 1920] + - Exact: [512, 256, 49, 1920] + - Exact: [512, 256, 121, 288] + - Exact: [512, 256, 169, 288] + - Exact: [512, 256, 49, 288] + - Exact: [512, 256, 25, 3000] + - Exact: [512, 256, 81, 3000] + - Exact: [512, 256, 121, 304] + - Exact: [512, 256, 169, 304] + - Exact: [512, 256, 49, 304] + - Exact: [512, 256, 25, 450] + - Exact: [512, 256, 81, 450] + - Exact: [512, 256, 25, 475] + - Exact: [512, 256, 81, 475] + - Exact: [512, 256, 121, 480] + - Exact: [512, 256, 169, 480] + - Exact: [512, 256, 49, 5880] + - Exact: [512, 256, 121, 72] + - Exact: [512, 256, 169, 72] + - Exact: [512, 256, 121, 76] + - Exact: [512, 256, 169, 76] + - Exact: [512, 256, 49, 882] + - Exact: [512, 256, 49, 931] + - Exact: [2304, 512, 1, 100] + - Exact: [2304, 512, 1, 361] + - Exact: [4608, 510, 1, 100] + - Exact: [4608, 510, 1, 361] + - 
Exact: [340, 256, 49, 1152] + - Exact: [340, 256, 36, 120] + - Exact: [340, 256, 49, 120] + - Exact: [340, 256, 64, 120] + - Exact: [340, 256, 36, 12000] + - Exact: [340, 256, 49, 1216] + - Exact: [340, 256, 36, 18] + - Exact: [340, 256, 49, 18] + - Exact: [340, 256, 64, 18] + - Exact: [340, 256, 36, 1800] + - Exact: [340, 256, 36, 19] + - Exact: [340, 256, 49, 19] + - Exact: [340, 256, 64, 19] + - Exact: [340, 256, 36, 1900] + - Exact: [340, 256, 64, 5880] + - Exact: [340, 256, 49, 7680] + - Exact: [340, 256, 64, 882] + - Exact: [340, 256, 64, 931] + - Exact: [510, 256, 49, 120] + - Exact: [510, 256, 64, 120] + - Exact: [510, 256, 49, 18] + - Exact: [510, 256, 64, 18] + - Exact: [510, 256, 49, 19] + - Exact: [510, 256, 64, 19] + - Exact: [510, 256, 36, 480] + - Exact: [510, 256, 36, 72] + - Exact: [510, 256, 36, 76] + - Exact: [510, 512, 36, 1080] + - Exact: [510, 512, 36, 162] + - Exact: [510, 512, 36, 171] + - Exact: [510, 512, 49, 1920] + - Exact: [510, 512, 64, 1920] + - Exact: [510, 512, 49, 288] + - Exact: [510, 512, 64, 288] + - Exact: [510, 512, 36, 3000] + - Exact: [510, 512, 49, 304] + - Exact: [510, 512, 64, 304] + - Exact: [510, 512, 36, 450] + - Exact: [510, 512, 36, 475] + - Exact: [510, 512, 49, 480] + - Exact: [510, 512, 64, 480] + - Exact: [510, 512, 49, 72] + - Exact: [510, 512, 64, 72] + - Exact: [510, 512, 49, 76] + - Exact: [510, 512, 64, 76] + - Exact: [1024, 1024, 160, 96] + - Exact: [2880, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 960] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [25216, 16384, 1, 1920] + - Exact: [1024, 1024, 40, 96] + - Exact: [2880, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 960] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [25216, 4096, 1, 1920] + - Exact: [1024, 1024, 80, 96] + - Exact: [2880, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 960] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [25216, 8192, 1, 1920] + - Exact: 
[1024, 1024, 96, 96] + - Exact: [1728, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 576] + - Exact: [2304, 16384, 1, 2304] + - Exact: [12672, 16384, 1, 2304] + - Exact: [1024, 1024, 24, 96] + - Exact: [1728, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 576] + - Exact: [2304, 4096, 1, 2304] + - Exact: [12672, 4096, 1, 2304] + - Exact: [1024, 1024, 48, 96] + - Exact: [1728, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 576] + - Exact: [2304, 8192, 1, 2304] + - Exact: [12672, 8192, 1, 2304] + - Exact: [1024, 1024, 16, 96] + - Exact: [1152, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 384] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [6400, 4096, 1, 3072] + - Exact: [1024, 1024, 32, 96] + - Exact: [1152, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 384] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [6400, 8192, 1, 3072] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [29000, 199, 1, 2048] + - Exact: [29000, 221, 1, 2048] + - Exact: [29000, 224, 1, 2048] + - Exact: [29000, 229, 1, 2048] + - Exact: [29000, 234, 1, 2048] + - Exact: [29000, 242, 1, 2048] + - Exact: [29000, 246, 1, 2048] + - Exact: [29000, 247, 1, 2048] + - Exact: [29000, 256, 1, 2048] + - Exact: [29000, 262, 1, 2048] + - Exact: [29000, 264, 1, 2048] + - Exact: [29000, 265, 1, 2048] + - Exact: [29000, 274, 1, 2048] + - Exact: [29000, 277, 1, 2048] + - Exact: [29000, 279, 1, 2048] + - Exact: [29000, 288, 1, 2048] + - Exact: [29000, 296, 1, 2048] + - Exact: [29000, 315, 1, 2048] + - Exact: [29000, 335, 1, 2048] + - Exact: [4096, 4096, 1, 2048] + - Exact: [29000, 2283, 1, 1024] + - Exact: [29000, 2296, 1, 1024] + - Exact: [29000, 2306, 1, 1024] + - Exact: [29000, 2309, 1, 1024] + - Exact: [29000, 2318, 1, 1024] + - Exact: [29000, 2320, 1, 1024] + - Exact: [29000, 2324, 1, 1024] + - Exact: [29000, 2325, 1, 1024] + - Exact: [29000, 2329, 1, 1024] + - Exact: [29000, 2338, 1, 1024] + - Exact: [29000, 2345, 1, 1024] + - Exact: [29000, 
2350, 1, 1024] + - Exact: [29000, 2362, 1, 1024] + - Exact: [29000, 2366, 1, 1024] + - Exact: [29000, 2368, 1, 1024] + - Exact: [29000, 2374, 1, 1024] + - Exact: [29000, 2390, 1, 1024] + - Exact: [512, 512, 320, 64] + - Exact: [29000, 561, 1, 1024] + - Exact: [29000, 574, 1, 1024] + - Exact: [29000, 600, 1, 1024] + - Exact: [29000, 608, 1, 1024] + - Exact: [29000, 615, 1, 1024] + - Exact: [29000, 622, 1, 1024] + - Exact: [29000, 625, 1, 1024] + - Exact: [29000, 626, 1, 1024] + - Exact: [29000, 628, 1, 1024] + - Exact: [29000, 636, 1, 1024] + - Exact: [29000, 651, 1, 1024] + - Exact: [29000, 658, 1, 1024] + - Exact: [29000, 669, 1, 1024] + - Exact: [29000, 670, 1, 1024] + - Exact: [29000, 672, 1, 1024] + - Exact: [29000, 684, 1, 1024] + - Exact: [29000, 716, 1, 1024] + - Exact: [29000, 730, 1, 1024] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 4096] + - Exact: [3072, 32768, 1, 1024] + - Exact: [4096, 32768, 1, 1024] + - Exact: [50304, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [128, 128, 49, 12800] + - Exact: [128, 128, 25, 
25088] + - Exact: [128, 128, 49, 25600] + - Exact: [128, 128, 25, 50176] + - Exact: [128, 128, 36, 12544] + - Exact: [128, 128, 49, 9216] + - Exact: [1024, 1024, 1, 12544] + - Exact: [1024, 1000, 1, 12544] + - Exact: [128, 128, 36, 12000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [5888, 128, 1, 1] + - Exact: [1856, 256, 1, 1] + - Exact: [256, 1856, 1, 32] + - Exact: [128, 3584, 1, 1280] + - Exact: [2944, 128, 1, 32] + - Exact: [64, 6784, 1, 1] + - Exact: [64, 5056, 1, 3328] + - Exact: [704, 1024, 1, 1] + - Exact: [256, 1856, 1, 1280] + - Exact: [256, 1408, 1, 1] + - Exact: [1024, 1024, 1, 1280] + - Exact: [704, 1408, 1, 3328] + - Exact: [1408, 704, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [2944, 256, 1, 1] + - Exact: [2944, 256, 1, 32] + - Exact: [128, 4288, 1, 3328] + - Exact: [5056, 128, 1, 256] + - Exact: [1856, 704, 1, 1280] + - Exact: [2368, 256, 1, 32] + - Exact: [5056, 64, 1, 32] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [1024, 1024, 1, 3328] + - Exact: [5888, 64, 1, 256] + - Exact: [2944, 448, 1, 256] + - Exact: [5056, 64, 1, 3328] + - Exact: [1024, 448, 1, 32] + - Exact: [128, 2944, 1, 3328] + - Exact: [256, 1856, 1, 1] + - Exact: [256, 3584, 1, 3328] + - Exact: 
[256, 4288, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [128, 5888, 1, 32] + - Exact: [128, 5888, 1, 1280] + - Exact: [3584, 256, 1, 256] + - Exact: [1856, 256, 1, 256] + - Exact: [1024, 704, 1, 1] + - Exact: [448, 1408, 1, 3328] + - Exact: [1024, 704, 1, 32] + - Exact: [448, 2944, 1, 256] + - Exact: [5888, 128, 1, 3328] + - Exact: [2944, 448, 1, 1] + - Exact: [5056, 64, 1, 1280] + - Exact: [704, 704, 1, 32] + - Exact: [256, 4288, 1, 256] + - Exact: [5056, 128, 1, 1] + - Exact: [704, 448, 1, 1280] + - Exact: [1024, 704, 1, 1280] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [128, 6784, 1, 32] + - Exact: [128, 6784, 1, 3328] + - Exact: [4288, 128, 1, 1] + - Exact: [256, 2368, 1, 32] + - Exact: [448, 1024, 1, 256] + - Exact: [256, 1408, 1, 32] + - Exact: [256, 3584, 1, 32] + - Exact: [128, 4288, 1, 32] + - Exact: [448, 1856, 1, 1] + - Exact: [448, 1856, 1, 32] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [704, 1856, 1, 32] + - Exact: [704, 1408, 1, 32] + - Exact: [5888, 128, 1, 32] + - Exact: [128, 4288, 1, 1280] + - Exact: [1856, 704, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [704, 1856, 1, 3328] + - Exact: [2944, 128, 1, 1280] + - Exact: [1408, 448, 1, 1280] + - Exact: [128, 2368, 1, 1] + - Exact: [128, 2944, 1, 1280] + - Exact: [1024, 448, 1, 1] + - Exact: [256, 2944, 1, 256] + - Exact: [704, 448, 1, 32] + - Exact: [704, 1024, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [256, 2368, 1, 1] + - Exact: [5888, 64, 1, 3328] + - Exact: [704, 448, 1, 3328] + - Exact: [4288, 256, 1, 1] + - Exact: [1856, 448, 1, 3328] + - Exact: [4288, 256, 1, 1280] + - Exact: [448, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1] + - Exact: [2368, 448, 1, 32] + - Exact: [1408, 704, 1, 1] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [256, 2944, 1, 1] + - Exact: [6784, 64, 1, 1] + - Exact: [6784, 64, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [64, 
5056, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [5888, 64, 1, 1] + - Exact: [256, 3584, 1, 1] + - Exact: [704, 1856, 1, 1280] + - Exact: [448, 1024, 1, 3328] + - Exact: [128, 5056, 1, 32] + - Exact: [128, 5056, 1, 1280] + - Exact: [5888, 64, 1, 32] + - Exact: [2368, 256, 1, 1] + - Exact: [5888, 64, 1, 1280] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1] + - Exact: [1408, 448, 1, 32] + - Exact: [5056, 128, 1, 1280] + - Exact: [1856, 704, 1, 256] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [448, 704, 1, 1] + - Exact: [448, 704, 1, 32] + - Exact: [448, 704, 1, 3328] + - Exact: [64, 5888, 1, 1] + - Exact: [2368, 128, 1, 32] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 128, 1, 3328] + - Exact: [4288, 256, 1, 32] + - Exact: [448, 1408, 1, 1] + - Exact: [1408, 256, 1, 256] + - Exact: [256, 4288, 1, 32] + - Exact: [1408, 256, 1, 1280] + - Exact: [448, 1408, 1, 256] + - Exact: [128, 2944, 1, 32] + - Exact: [1856, 448, 1, 1] + - Exact: [704, 704, 1, 1] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [3584, 256, 1, 3328] + - Exact: [448, 2368, 1, 1] + - Exact: [128, 6784, 1, 1] + - Exact: [256, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [704, 704, 1, 256] + - Exact: [448, 1024, 1, 32] + - Exact: [256, 2368, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [704, 1856, 1, 1] + - Exact: [704, 448, 1, 256] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 5056, 1, 1] + - Exact: [256, 2368, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 1408, 1, 1] + - Exact: [4288, 128, 1, 3328] + - Exact: [128, 6784, 1, 1280] + - Exact: [3584, 256, 1, 32] + - Exact: [1408, 256, 1, 32] + - Exact: [5888, 128, 1, 256] + - Exact: [128, 5056, 1, 3328] + - Exact: [1024, 448, 1, 3328] + - Exact: [3584, 128, 1, 1] + - Exact: [128, 2368, 1, 256] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 256] + - Exact: [1024, 448, 1, 1280] + - 
Exact: [128, 5888, 1, 1] + - Exact: [64, 5056, 1, 1] + - Exact: [1856, 256, 1, 32] + - Exact: [64, 5056, 1, 32] + - Exact: [1408, 704, 1, 32] + - Exact: [1408, 704, 1, 1280] + - Exact: [1024, 1024, 1, 32] + - Exact: [5056, 128, 1, 3328] + - Exact: [128, 4288, 1, 1] + - Exact: [2944, 128, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [2944, 128, 1, 256] + - Exact: [6784, 128, 1, 1] + - Exact: [1408, 256, 1, 3328] + - Exact: [2944, 256, 1, 256] + - Exact: [6784, 128, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [2944, 448, 1, 1280] + - Exact: [704, 448, 1, 1] + - Exact: [256, 1408, 1, 3328] + - Exact: [2944, 128, 1, 1] + - Exact: [704, 1024, 1, 32] + - Exact: [3584, 256, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 1280] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 3328] + - Exact: [448, 2944, 1, 1] + - Exact: [448, 1856, 1, 1280] + - Exact: [2368, 448, 1, 1] + - Exact: [448, 2944, 1, 32] + - Exact: [448, 2944, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [448, 2944, 1, 3328] + - Exact: [2368, 128, 1, 1280] + - Exact: [128, 3584, 1, 3328] + - Exact: [256, 4288, 1, 3328] + - Exact: [1856, 704, 1, 32] + - Exact: [2944, 448, 1, 32] + - Exact: [5056, 128, 1, 32] + - Exact: [6784, 128, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [1856, 704, 1, 1] + - Exact: [256, 1856, 1, 3328] + - Exact: [4288, 128, 1, 1280] + - Exact: [128, 4288, 1, 256] + - Exact: [6784, 128, 1, 32] + - Exact: [1408, 448, 1, 1] + - Exact: [64, 5056, 1, 1280] + - Exact: [448, 1408, 1, 32] + - Exact: [128, 5056, 1, 256] + - Exact: [1024, 1024, 1, 1] + - Exact: [256, 1408, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [6784, 64, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [448, 1408, 1, 1280] + - Exact: [2368, 128, 1, 1] + - Exact: [5056, 64, 1, 256] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 6784, 1, 32] + - Exact: [256, 4288, 1, 1] + - Exact: [128, 2944, 1, 256] + - Exact: [3584, 128, 1, 32] + - 
Exact: [3584, 128, 1, 3328] + - Exact: [704, 704, 1, 3328] + - Exact: [128, 2944, 1, 1] + - Exact: [704, 1408, 1, 1280] + - Exact: [6784, 64, 1, 32] + - Exact: [64, 6784, 1, 1280] + - Exact: [704, 1408, 1, 256] + - Exact: [4288, 128, 1, 32] + - Exact: [448, 704, 1, 256] + - Exact: [1856, 256, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [1024, 1024, 1, 256] + - Exact: [256, 2944, 1, 32] + - Exact: [704, 1024, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [128, 2368, 1, 32] + - Exact: [704, 1856, 1, 256] + - Exact: [1856, 448, 1, 32] + - Exact: [1408, 448, 1, 256] + - Exact: [448, 1024, 1, 1] + - Exact: [1024, 704, 1, 256] + - Exact: [64, 5888, 1, 32] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [1408, 256, 1, 1] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 3584, 1, 1] + - Exact: [128, 3584, 1, 32] + - Exact: [448, 2368, 1, 32] + - Exact: [448, 2368, 1, 3328] + - Exact: [1024, 704, 1, 3328] + - Exact: [2048, 400, 1, 512] + - Exact: [2560, 128, 1, 2560] + - Exact: [1024, 700, 1, 512] + - Exact: [4096, 128, 1, 4096] + - Exact: [3072, 128, 1, 1024] + - Exact: [7680, 64, 1, 2560] + - Exact: [7680, 128, 1, 2560] + - Exact: [1024, 1024, 1, 1024] + - Exact: [2049, 512, 1, 2048] + - Exact: [1023, 512, 1, 1024] + - Exact: [1024, 512, 1, 1025] + - Exact: [1024, 1024, 1, 1023] + - Exact: [1024, 1025, 1, 1024] + - Exact: [1024, 1023, 1, 1024] + - Exact: [2048, 511, 1, 2048] + - Exact: [2047, 512, 1, 2048] + - Exact: [1025, 1024, 1, 1024] + - Exact: [1024, 1024, 1, 1025] + - Exact: [1025, 512, 1, 1024] + - Exact: [1024, 512, 1, 1023] + - Exact: [2048, 513, 1, 2048] + - Exact: [1024, 511, 1, 1024] + - Exact: [2048, 512, 1, 2047] + - Exact: [1024, 513, 1, 1024] + - Exact: [2048, 512, 1, 2049] + - Exact: [1023, 1024, 1, 1024] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 
1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3328, 1, 1024] + - Exact: [256, 3584, 1, 1024] + - Exact: [512, 1600, 1, 512] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 1600] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1600] + - Exact: [2048, 215, 1, 512] + - Exact: [2048, 215, 1, 768] + - Exact: [2048, 256, 1, 512] + - Exact: [2048, 256, 1, 768] + - Exact: [2048, 512, 1, 67] + - Exact: [2048, 512, 1, 74] + - Exact: [2048, 512, 1, 100] + - Exact: [2048, 512, 1, 2048] + - Exact: [1024, 512, 1, 4096] + - Exact: [30522, 77, 1, 1024] + - Exact: [1024, 780, 1, 1024] + - Exact: [1024, 800, 1, 1024] + - Exact: [1024, 820, 1, 1024] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [1024, 1024, 1, 4096] + - Exact: [480, 1024, 1, 1024] + - Exact: [480, 2048, 1, 2048] + - Exact: [1024, 480, 1, 1024] + - Exact: [2048, 480, 1, 2048] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 512, 128, 512] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 1024, 64, 1024] + - Exact: [960, 1024, 1, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 480] + - Exact: [128, 3456, 1, 256] + - Exact: [128, 4096, 1, 256] + - Exact: [128, 6912, 1, 256] + - Exact: [256, 3456, 1, 512] + - Exact: [256, 4096, 1, 512] + - Exact: [512, 864, 1, 1024] + - Exact: [512, 864, 1, 13] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [1024, 512, 1, 196] + - Exact: [2048, 512, 1, 49] + - Exact: [2304, 256, 1, 196] + - Exact: [512, 1024, 1, 196] + - Exact: [512, 2048, 1, 49] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [128, 128, 64, 6400] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - 
Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [512, 1024, 1, 1024] + - Exact: [512, 2048, 1, 2048] + - Exact: [128, 128, 49, 1120] + - Exact: [128, 128, 49, 1064] + - Exact: [128, 128, 49, 1040] + - Exact: [128, 128, 64, 600] + - Exact: [128, 128, 64, 616] + - Exact: [128, 128, 49, 950] + - Exact: [128, 128, 49, 972] + - Exact: [128, 128, 64, 560] + - Exact: [128, 128, 49, 1008] + - Exact: [128, 128, 64, 532] + - Exact: [128, 128, 49, 1080] + - Exact: [128, 128, 64, 588] + - Exact: [128, 128, 49, 1160] + - Exact: [128, 128, 49, 988] + - Exact: [128, 128, 49, 936] + - Exact: [512, 1024, 1, 3800] + - Exact: [512, 1024, 1, 3400] + - Exact: [512, 1024, 1, 3456] + - Exact: [512, 1024, 1, 3072] + - Exact: [2048, 512, 1, 950] + - Exact: [512, 1024, 1, 3552] + - Exact: [512, 1024, 1, 3220] + - Exact: [2048, 512, 1, 850] + - Exact: [512, 2048, 1, 864] + - Exact: [512, 2048, 1, 768] + - Exact: [2048, 512, 1, 805] + - Exact: [512, 1024, 1, 2852] + - Exact: [512, 2048, 1, 888] + - Exact: [2048, 512, 1, 864] + - Exact: [2048, 512, 1, 768] + - Exact: [2048, 512, 1, 888] + - Exact: [2048, 256, 1, 950] + - Exact: [2048, 512, 1, 713] + - Exact: [512, 1024, 1, 2688] + - Exact: [512, 1024, 1, 2640] + - Exact: [512, 1024, 1, 2904] + - Exact: [1024, 512, 1, 950] + - Exact: [512, 2048, 1, 672] + - Exact: [512, 2048, 1, 660] + - Exact: [512, 2048, 1, 1008] + - Exact: [2048, 256, 1, 850] + - Exact: [2048, 512, 1, 726] + - Exact: [1024, 512, 1, 850] + - Exact: [2048, 512, 1, 660] + - Exact: [2048, 512, 1, 672] + - Exact: [512, 2048, 1, 840] + - Exact: [2048, 512, 1, 1008] + - Exact: [512, 2048, 1, 792] + - Exact: [1024, 512, 1, 805] + - Exact: [512, 2048, 1, 1050] + - Exact: [2048, 512, 1, 748] + - Exact: [2048, 256, 1, 864] + - Exact: [1024, 512, 1, 768] + - Exact: [1024, 512, 1, 864] + - Exact: [2048, 512, 1, 875] + - Exact: [2048, 512, 1, 840] + - Exact: [2048, 512, 1, 792] + - Exact: [512, 2048, 1, 736] + - Exact: [2048, 256, 1, 888] + - Exact: [512, 2048, 1, 
704] + - Exact: [512, 2048, 1, 588] + - Exact: [1024, 512, 1, 888] + - Exact: [512, 2048, 1, 816] + - Exact: [1024, 512, 1, 713] + - Exact: [2048, 512, 1, 736] + - Exact: [2048, 512, 1, 588] + - Exact: [2048, 512, 1, 704] + - Exact: [1024, 512, 1, 660] + - Exact: [2048, 256, 1, 660] + - Exact: [2048, 256, 1, 672] + - Exact: [1024, 512, 1, 672] + - Exact: [1024, 512, 1, 726] + - Exact: [512, 2048, 1, 630] + - Exact: [512, 2048, 1, 600] + - Exact: [2048, 256, 1, 805] + - Exact: [2048, 256, 1, 713] + - Exact: [2048, 256, 1, 726] + - Exact: [320, 1024, 1, 1024] + - Exact: [1024, 1000, 1, 1024] + - Exact: [320, 1000, 1, 1024] + - Exact: [128, 128, 49, 1280] + - Exact: [128, 128, 49, 1360] + - Exact: [128, 128, 49, 1200] + - Exact: [128, 128, 49, 1240] + - Exact: [2304, 256, 1, 704] + - Exact: [2304, 256, 1, 736] + - Exact: [2304, 256, 1, 792] + - Exact: [2304, 256, 1, 748] + - Exact: [2304, 256, 1, 726] + - Exact: [2304, 256, 1, 713] + - Exact: [2304, 256, 1, 768] + - Exact: [512, 2048, 1, 759] + - Exact: [512, 2048, 1, 925] + - Exact: [2304, 256, 1, 805] + - Exact: [512, 2048, 1, 900] + - Exact: [512, 2048, 1, 875] + - Exact: [512, 2048, 1, 748] + - Exact: [512, 2048, 1, 726] + - Exact: [512, 2048, 1, 713] + - Exact: [512, 2048, 1, 805] + - Exact: [512, 2048, 1, 850] + - Exact: [512, 2048, 1, 950] + - Exact: [128, 128, 49, 1152] + - Exact: [128, 128, 49, 1216] + - Exact: [128, 128, 36, 1800] + - Exact: [128, 128, 36, 1900] + - Exact: [128, 128, 64, 5880] + - Exact: [128, 128, 49, 7680] + - Exact: [128, 128, 64, 882] + - Exact: [128, 128, 64, 931] + - Exact: [128, 64, 121, 1152] + - Exact: [128, 64, 81, 12000] + - Exact: [128, 64, 121, 1216] + - Exact: [128, 64, 81, 1800] + - Exact: [128, 64, 81, 1900] + - Exact: [128, 64, 49, 20280] + - Exact: [128, 64, 49, 3042] + - Exact: [128, 64, 49, 3211] + - Exact: [128, 64, 169, 5880] + - Exact: [128, 64, 121, 7680] + - Exact: [128, 64, 169, 882] + - Exact: [128, 64, 169, 931] + - Exact: [256, 128, 25, 1080] + - Exact: [256, 
128, 25, 162] + - Exact: [256, 128, 25, 171] + - Exact: [1152, 256, 1, 1] + - Exact: [1152, 256, 1, 1444] + - Exact: [1152, 256, 1, 25] + - Exact: [1152, 256, 1, 9] + - Exact: [2304, 256, 1, 1444] + - Exact: [2304, 340, 1, 1] + - Exact: [2304, 340, 1, 1444] + - Exact: [2304, 340, 1, 9] + - Exact: [2304, 510, 1, 25] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 512, 80, 512] + - Exact: [29000, 109, 1, 2560] + - Exact: [29000, 121, 1, 2560] + - Exact: [29000, 65, 1, 2560] + - Exact: [29000, 66, 1, 2560] + - Exact: [29000, 67, 1, 2560] + - Exact: [29000, 69, 1, 2560] + - Exact: [29000, 70, 1, 2560] + - Exact: [29000, 71, 1, 2560] + - Exact: [29000, 73, 1, 2560] + - Exact: [29000, 74, 1, 2560] + - Exact: [29000, 75, 1, 2560] + - Exact: [29000, 77, 1, 2560] + - Exact: [29000, 78, 1, 2560] + - Exact: [29000, 80, 1, 2560] + - Exact: [29000, 81, 1, 2560] + - Exact: [29000, 82, 1, 2560] + - Exact: [29000, 83, 1, 2560] + - Exact: [29000, 84, 1, 2560] + - Exact: [29000, 88, 1, 2560] + - Exact: [29000, 89, 1, 2560] + - Exact: [29000, 90, 1, 2560] + - Exact: [29000, 92, 1, 2560] + - Exact: [29000, 95, 1, 2560] + - Exact: [29000, 98, 1, 2560] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - 
StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 36, 50176] + - Exact: [64, 64, 49, 36864] + - Exact: [64, 64, 64, 25600] + - Exact: [256, 256, 1, 60800] + - Exact: [256, 256, 1, 54400] + - Exact: [256, 256, 1, 51520] + - Exact: [256, 256, 1, 55296] + - Exact: [256, 256, 1, 56832] + - Exact: [256, 256, 1, 45632] + - Exact: [256, 256, 1, 49152] + - Exact: [256, 512, 1, 13600] + - Exact: [256, 256, 1, 43008] + - Exact: [256, 512, 1, 15200] + - Exact: [256, 512, 1, 12880] + - Exact: [256, 512, 1, 13824] + - Exact: [512, 256, 1, 13824] + - Exact: [256, 512, 1, 14208] + - Exact: [512, 256, 1, 14208] + - Exact: [512, 256, 1, 15200] + - Exact: [256, 512, 1, 12288] + - Exact: [512, 256, 1, 12288] + - Exact: [128, 64, 25, 43320] + - Exact: [64, 64, 64, 20280] + - Exact: [64, 64, 49, 27000] + - Exact: [64, 64, 36, 43320] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [32, 5056, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [2368, 64, 1, 1] + - Exact: [1408, 128, 1, 32] + - Exact: [32, 2944, 1, 3328] + - Exact: [2368, 32, 
1, 256] + - Exact: [1024, 128, 1, 32] + - Exact: [32, 4288, 1, 1280] + - Exact: [32, 5056, 1, 32] + - Exact: [5888, 32, 1, 32] + - Exact: [64, 2368, 1, 1280] + - Exact: [128, 704, 1, 32] + - Exact: [32, 4288, 1, 3328] + - Exact: [1408, 64, 1, 1] + - Exact: [1856, 64, 1, 256] + - Exact: [1024, 256, 1, 256] + - Exact: [1856, 128, 1, 32] + - Exact: [1856, 128, 1, 1280] + - Exact: [4288, 32, 1, 3328] + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 1856, 1, 256] + - Exact: [3584, 64, 1, 32] + - Exact: [1408, 128, 1, 3328] + - Exact: [32, 6784, 1, 3328] + - Exact: [32, 3584, 1, 256] + - Exact: [704, 256, 1, 32] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [256, 704, 1, 256] + - Exact: [5056, 32, 1, 3328] + - Exact: [2944, 32, 1, 1280] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 1280] + - Exact: [448, 448, 1, 256] + - Exact: [1024, 256, 1, 1] + - Exact: [1856, 64, 1, 32] + - Exact: [4288, 64, 1, 256] + - Exact: [1408, 64, 1, 256] + - Exact: [6784, 32, 1, 32] + - Exact: [448, 448, 1, 1280] + - Exact: [32, 5888, 1, 256] + - Exact: [1024, 128, 1, 256] + - Exact: [1856, 64, 1, 1280] + - Exact: [128, 1408, 1, 1] + - Exact: [32, 2368, 1, 1280] + - Exact: [448, 256, 1, 256] + - Exact: [2944, 32, 1, 32] + - Exact: [448, 448, 1, 32] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 2944, 1, 1] + - Exact: [64, 2944, 1, 32] + - Exact: [64, 2944, 1, 1280] + - Exact: [32, 3584, 1, 1280] + - Exact: [32, 2944, 1, 32] + - Exact: [32, 6784, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [704, 128, 1, 1280] + - Exact: [32, 3584, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [64, 4288, 1, 1] + - Exact: [3584, 32, 1, 32] + - Exact: [3584, 64, 1, 1] + - Exact: [32, 4288, 1, 32] + - Exact: [64, 1408, 1, 1] + - Exact: [256, 1024, 1, 256] + - Exact: [1408, 128, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [64, 3584, 1, 1] + - Exact: [1024, 128, 1, 1280] + - Exact: [2368, 32, 1, 32] + - Exact: [128, 1408, 1, 256] + - Exact: [256, 448, 1, 3328] + - 
Exact: [2368, 64, 1, 256] + - Exact: [32, 2368, 1, 3328] + - Exact: [128, 1856, 1, 1] + - Exact: [128, 1856, 1, 32] + - Exact: [3584, 32, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [32, 2944, 1, 1280] + - Exact: [4288, 32, 1, 32] + - Exact: [1856, 64, 1, 1] + - Exact: [128, 1024, 1, 3328] + - Exact: [1408, 128, 1, 1] + - Exact: [5056, 32, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [3584, 32, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [704, 256, 1, 1280] + - Exact: [1856, 128, 1, 1] + - Exact: [256, 704, 1, 1] + - Exact: [1024, 128, 1, 1] + - Exact: [1856, 128, 1, 256] + - Exact: [1024, 256, 1, 1280] + - Exact: [64, 2368, 1, 32] + - Exact: [32, 2368, 1, 256] + - Exact: [32, 6784, 1, 1280] + - Exact: [32, 6784, 1, 32] + - Exact: [64, 3584, 1, 3328] + - Exact: [32, 5888, 1, 1280] + - Exact: [448, 256, 1, 1] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [2368, 32, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1024, 1, 32] + - Exact: [32, 2368, 1, 32] + - Exact: [64, 1856, 1, 1280] + - Exact: [32, 3584, 1, 32] + - Exact: [704, 256, 1, 1] + - Exact: [1024, 256, 1, 3328] + - Exact: [128, 1856, 1, 1280] + - Exact: [448, 256, 1, 32] + - Exact: [64, 4288, 1, 32] + - Exact: [128, 704, 1, 1] + - Exact: [4288, 64, 1, 1280] + - Exact: [448, 448, 1, 1] + - Exact: [32, 5888, 1, 32] + - Exact: [1024, 128, 1, 3328] + - Exact: [4288, 64, 1, 32] + - Exact: [2368, 64, 1, 32] + - Exact: [64, 1408, 1, 32] + - Exact: [32, 2944, 1, 256] + - Exact: [2944, 64, 1, 1] + - Exact: [2944, 64, 1, 32] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 2368, 1, 256] + - Exact: [1408, 64, 1, 3328] + - Exact: [6784, 32, 1, 1280] + - Exact: [2944, 64, 1, 1280] + - Exact: [2944, 32, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [5888, 32, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 32] + - Exact: [64, 1856, 1, 3328] + - Exact: [5888, 32, 1, 1280] + - Exact: [256, 
704, 1, 32] + - Exact: [256, 704, 1, 1280] + - Exact: [1408, 64, 1, 32] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [2368, 64, 1, 3328] + - Exact: [32, 5056, 1, 3328] + - Exact: [64, 1856, 1, 1] + - Exact: [704, 128, 1, 32] + - Exact: [4288, 64, 1, 1] + - Exact: [5056, 32, 1, 1280] + - Exact: [128, 1024, 1, 1] + - Exact: [256, 1024, 1, 1] + - Exact: [1408, 64, 1, 1280] + - Exact: [1024, 256, 1, 32] + - Exact: [2368, 32, 1, 1280] + - Exact: [704, 128, 1, 1] + - Exact: [256, 448, 1, 256] + - Exact: [32, 4288, 1, 256] + - Exact: [128, 1408, 1, 32] + - Exact: [704, 128, 1, 3328] + - Exact: [64, 4288, 1, 256] + - Exact: [4288, 32, 1, 1280] + - Exact: [32, 5056, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [256, 1024, 1, 32] + - Exact: [256, 1024, 1, 1280] + - Exact: [6784, 32, 1, 256] + - Exact: [64, 2368, 1, 1] + - Exact: [1408, 128, 1, 256] + - Exact: [5888, 32, 1, 3328] + - Exact: [64, 2368, 1, 3328] + - Exact: [256, 704, 1, 3328] + - Exact: [128, 1408, 1, 3328] + - Exact: [2944, 32, 1, 3328] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 448, 1, 1] + - Exact: [256, 448, 1, 32] + - Exact: [64, 3584, 1, 32] + - Exact: [64, 3584, 1, 1280] + - Exact: [4288, 32, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [128, 1856, 1, 256] + - Exact: [3584, 32, 1, 3328] + - Exact: [6784, 32, 1, 3328] + - Exact: [2944, 64, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [5056, 32, 1, 32] + - Exact: [32, 5888, 1, 3328] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 196] + - Exact: [256, 1024, 1, 196] + - Exact: [1760, 64, 1, 1760] + - Exact: [2560, 32, 1, 2560] + - Exact: [4608, 32, 1, 1536] + - Exact: [3072, 64, 1, 1024] + - Exact: [2048, 128, 1, 2048] + - Exact: [4096, 64, 1, 4096] + - Exact: [7680, 32, 1, 2560] + - Exact: [2560, 64, 1, 2560] + - Exact: [1760, 128, 1, 1760] + - Exact: [3072, 32, 1, 1024] + - Exact: [6144, 32, 1, 2560] + - 
Exact: [4096, 32, 1, 4096] + - Exact: [2048, 64, 1, 2048] + - Exact: [8448, 32, 1, 2816] + - Exact: [512, 512, 1, 512] + - Exact: [511, 512, 1, 512] + - Exact: [512, 512, 1, 511] + - Exact: [512, 513, 1, 512] + - Exact: [512, 511, 1, 512] + - Exact: [513, 512, 1, 512] + - Exact: [512, 512, 1, 513] + - Exact: [512, 512, 1, 64] + - Exact: [33, 33, 1600, 32] + - Exact: [256, 684, 1, 1024] + - Exact: [1024, 200, 1, 560] + - Exact: [2048, 114, 1, 512] + - Exact: [2048, 114, 1, 768] + - Exact: [32, 32, 4608, 64] + - Exact: [32, 35, 4608, 64] + - Exact: [34, 34, 4736, 64] + - Exact: [35, 35, 4608, 64] + - Exact: [33, 33, 1920, 64] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [128, 864, 1, 256] + - Exact: [256, 864, 1, 512] + - Exact: [1152, 128, 1, 784] + - Exact: [256, 512, 1, 784] + - Exact: [512, 256, 1, 784] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 256, 1, 3800] + - Exact: [1024, 256, 1, 3400] + - Exact: [256, 1024, 1, 3400] + - Exact: [1024, 256, 1, 3220] + - Exact: [256, 1024, 1, 3220] + - Exact: [1024, 256, 1, 3456] + - Exact: [256, 1024, 1, 3456] + - Exact: [1024, 256, 1, 3072] + - Exact: [256, 1024, 1, 3072] + - Exact: [1024, 256, 1, 3552] + - Exact: [256, 1024, 1, 3552] + - Exact: [256, 1024, 1, 2852] + - Exact: [1024, 256, 1, 2852] + - Exact: [256, 512, 1, 10752] + - Exact: [256, 1024, 1, 3800] + - Exact: [256, 512, 1, 10560] + - Exact: [256, 1024, 1, 2992] + - Exact: [256, 1024, 1, 2688] + - Exact: [1024, 256, 1, 2688] + - Exact: [256, 1024, 1, 2904] + - Exact: [1024, 256, 1, 2904] + - Exact: [256, 1024, 1, 2640] + - Exact: [1024, 256, 1, 2640] + - Exact: [1024, 256, 1, 4032] + - Exact: [1024, 256, 1, 2992] + - Exact: [256, 1024, 1, 3360] + - Exact: [1024, 256, 1, 3360] + - Exact: [1024, 256, 1, 3500] + - Exact: [256, 1024, 1, 3500] + - Exact: [1024, 256, 1, 3168] + - 
Exact: [256, 1024, 1, 3168] + - Exact: [256, 1024, 1, 3036] + - Exact: [1024, 256, 1, 4200] + - Exact: [1024, 256, 1, 3600] + - Exact: [256, 1024, 1, 3600] + - Exact: [256, 1024, 1, 2944] + - Exact: [1024, 256, 1, 2944] + - Exact: [1024, 256, 1, 3700] + - Exact: [256, 1024, 1, 2352] + - Exact: [1024, 256, 1, 2352] + - Exact: [1024, 256, 1, 2816] + - Exact: [256, 1024, 1, 3700] + - Exact: [256, 1024, 1, 2816] + - Exact: [256, 512, 1, 11408] + - Exact: [1024, 256, 1, 3036] + - Exact: [1024, 256, 1, 3264] + - Exact: [256, 1024, 1, 3264] + - Exact: [1024, 256, 1, 3864] + - Exact: [256, 1024, 1, 4032] + - Exact: [1024, 256, 1, 3128] + - Exact: [256, 1024, 1, 3128] + - Exact: [256, 1024, 1, 3200] + - Exact: [256, 512, 1, 11616] + - Exact: [1024, 256, 1, 3200] + - Exact: [1024, 256, 1, 4000] + - Exact: [256, 1024, 1, 2520] + - Exact: [1024, 256, 1, 2520] + - Exact: [256, 1024, 1, 2976] + - Exact: [256, 1024, 1, 2400] + - Exact: [1024, 256, 1, 2400] + - Exact: [1024, 256, 1, 3696] + - Exact: [1024, 256, 1, 3900] + - Exact: [1024, 256, 1, 3772] + - Exact: [256, 1024, 1, 3696] + - Exact: [256, 1024, 1, 2728] + - Exact: [1024, 256, 1, 2728] + - Exact: [1024, 256, 1, 2480] + - Exact: [256, 1024, 1, 2480] + - Exact: [1024, 256, 1, 2880] + - Exact: [512, 256, 1, 3220] + - Exact: [256, 1024, 1, 2880] + - Exact: [256, 1024, 1, 4200] + - Exact: [1024, 256, 1, 3648] + - Exact: [1024, 256, 1, 3312] + - Exact: [256, 1024, 1, 3648] + - Exact: [1024, 256, 1, 3300] + - Exact: [1024, 256, 1, 3528] + - Exact: [256, 1024, 1, 2604] + - Exact: [1024, 256, 1, 2604] + - Exact: [512, 256, 1, 11408] + - Exact: [256, 1024, 1, 3312] + - Exact: [256, 1024, 1, 3300] + - Exact: [512, 256, 1, 3072] + - Exact: [256, 1024, 1, 3528] + - Exact: [1024, 256, 1, 2976] + - Exact: [1024, 256, 1, 2760] + - Exact: [512, 256, 1, 3800] + - Exact: [256, 1024, 1, 2760] + - Exact: [1024, 256, 1, 2160] + - Exact: [256, 1024, 1, 2160] + - Exact: [512, 256, 1, 11616] + - Exact: [512, 256, 1, 2852] + - Exact: [256, 1024, 
1, 3864] + - Exact: [512, 256, 1, 2640] + - Exact: [256, 1024, 1, 4000] + - Exact: [512, 256, 1, 2904] + - Exact: [256, 1024, 1, 3900] + - Exact: [512, 256, 1, 2688] + - Exact: [256, 1024, 1, 3772] + - Exact: [512, 256, 1, 3400] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 3552] + - Exact: [128, 64, 25, 6498] + - Exact: [128, 64, 25, 6859] + - Exact: [64, 64, 64, 3042] + - Exact: [64, 64, 64, 3211] + - Exact: [64, 64, 49, 4050] + - Exact: [64, 64, 49, 4275] + - Exact: [64, 64, 36, 6498] + - Exact: [64, 64, 36, 6859] + - Exact: [1152, 128, 1, 1444] + - Exact: [512, 256, 1, 361] + - Exact: [576, 128, 1, 1444] + - Exact: [29000, 35, 1, 2560] + - Exact: [29000, 36, 1, 2560] + - Exact: [29000, 39, 1, 2560] + - Exact: [29000, 40, 1, 2560] + - Exact: [29000, 42, 1, 2560] + - Exact: [29000, 43, 1, 2560] + - Exact: [29000, 44, 1, 2560] + - Exact: [29000, 46, 1, 2560] + - Exact: [29000, 48, 1, 2560] + - Exact: [29000, 49, 1, 2560] + - Exact: [29000, 50, 1, 2560] + - Exact: [29000, 51, 1, 2560] + - Exact: [29000, 53, 1, 2560] + - Exact: [29000, 54, 1, 2560] + - Exact: [29000, 55, 1, 2560] + - Exact: [29000, 56, 1, 2560] + - Exact: [29000, 57, 1, 2560] + - Exact: [29000, 58, 1, 2560] + - Exact: [29000, 59, 1, 2560] + - Exact: [29000, 61, 1, 2560] + - Exact: [29000, 63, 1, 2560] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + 
JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 128, 1, 13600] + - Exact: [256, 128, 1, 12880] + - Exact: [128, 512, 1, 15200] + - Exact: [512, 128, 1, 15200] + - Exact: [128, 512, 1, 11408] + - Exact: [256, 128, 1, 13824] + - Exact: [128, 512, 1, 11616] + - Exact: [256, 128, 1, 14208] + - Exact: [128, 512, 1, 14208] + - Exact: [256, 128, 1, 15200] + - Exact: [512, 128, 1, 11408] + - Exact: [512, 128, 1, 16800] + - Exact: [128, 512, 1, 11264] + - Exact: [512, 128, 1, 11616] + - Exact: [512, 128, 1, 16128] + - Exact: [512, 128, 1, 11968] + - Exact: [128, 512, 1, 11968] + - Exact: [512, 128, 1, 12288] + - Exact: [128, 512, 1, 12288] + - Exact: [128, 512, 1, 12672] + - Exact: [512, 128, 1, 11776] + - Exact: [512, 128, 1, 12144] + - Exact: [512, 128, 1, 11264] + - Exact: [128, 512, 1, 12144] + - Exact: [512, 128, 1, 12672] + - Exact: [128, 512, 1, 12512] + - Exact: [128, 512, 1, 11776] + - Exact: [256, 128, 1, 12288] + - Exact: [40, 40, 1, 1909283] + - Exact: [40, 40, 1, 3818566] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [5888, 1, 1, 3328] + - Exact: [5056, 1, 1, 3328] + - Exact: [6784, 1, 1, 1280] + - Exact: [2944, 1, 1, 3328] + - Exact: [3584, 1, 1, 1280] + - 
Exact: [6784, 1, 1, 256] + - Exact: [4288, 1, 1, 1280] + - Exact: [5056, 1, 1, 1280] + - Exact: [3584, 1, 1, 256] + - Exact: [6784, 1, 1, 3328] + - Exact: [1408, 1, 1, 1280] + - Exact: [1408, 32, 1, 3328] + - Exact: [4288, 1, 1, 256] + - Exact: [2368, 1, 1, 256] + - Exact: [1856, 32, 1, 32] + - Exact: [5056, 1, 1, 256] + - Exact: [5056, 1, 1, 1] + - Exact: [1408, 1, 1, 256] + - Exact: [1408, 1, 1, 1] + - Exact: [4288, 1, 1, 3328] + - Exact: [2368, 1, 1, 1280] + - Exact: [1856, 1, 1, 1] + - Exact: [1856, 32, 1, 256] + - Exact: [1408, 32, 1, 32] + - Exact: [1856, 32, 1, 1280] + - Exact: [1408, 1, 1, 3328] + - Exact: [5888, 1, 1, 256] + - Exact: [5888, 1, 1, 1] + - Exact: [1856, 32, 1, 3328] + - Exact: [2368, 1, 1, 3328] + - Exact: [6784, 1, 1, 1] + - Exact: [5888, 1, 1, 1280] + - Exact: [2944, 1, 1, 256] + - Exact: [2944, 1, 1, 1] + - Exact: [1408, 32, 1, 1280] + - Exact: [1856, 1, 1, 1280] + - Exact: [3584, 1, 1, 1] + - Exact: [2944, 1, 1, 1280] + - Exact: [3584, 1, 1, 3328] + - Exact: [1856, 1, 1, 3328] + - Exact: [4288, 1, 1, 1] + - Exact: [1856, 1, 1, 256] + - Exact: [1408, 32, 1, 256] + - Exact: [2368, 1, 1, 1] + - Exact: [1760, 32, 1, 1760] + - Exact: [3072, 16, 1, 1024] + - Exact: [2560, 16, 1, 2560] + - Exact: [2048, 32, 1, 2048] + - Exact: [1760, 16, 1, 1760] + - Exact: [7680, 16, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [2048, 16, 1, 2048] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2560] + - Exact: [32768, 1, 1, 256] + - Exact: [1600, 1, 1, 1024] + - Exact: [3456, 1, 1, 256] + - Exact: [4096, 1, 1, 256] + - Exact: [6912, 1, 1, 256] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2560] + - Exact: [29000, 27, 1, 2560] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] 
+ - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1, 4288, 1, 1280] + - Exact: [32, 1408, 1, 32] + - Exact: [1, 1408, 1, 3328] + - Exact: [1, 2368, 1, 1280] + - Exact: [1, 5888, 1, 3328] + - Exact: [1, 1856, 1, 256] + - Exact: [1, 3584, 1, 3328] + - Exact: [1, 6784, 1, 3328] + - Exact: [1, 2368, 1, 256] + - Exact: [32, 1856, 1, 3328] + - Exact: [1, 2944, 1, 1280] + - Exact: [1, 1856, 1, 3328] + - Exact: [1, 1408, 1, 1] + - Exact: [1, 6784, 1, 256] + - Exact: [1, 6784, 1, 1] + - Exact: [1, 4288, 1, 3328] + - Exact: [1, 2368, 1, 3328] + - Exact: [1, 5888, 1, 1280] + - Exact: [1, 2944, 1, 256] + - Exact: [1, 6784, 1, 1280] + - Exact: [1, 5056, 1, 1] + - Exact: [32, 1856, 1, 32] + - Exact: [32, 1408, 1, 256] + - Exact: [1, 5888, 1, 1] + - Exact: [1, 2944, 1, 3328] + - Exact: [1, 3584, 1, 1] + - Exact: [1, 1408, 1, 256] + - Exact: [1, 1856, 1, 1] + - Exact: [1, 5056, 1, 1280] + - Exact: [1, 5888, 1, 256] + - Exact: [32, 1856, 1, 1280] + - Exact: [1, 2368, 1, 1] + - Exact: [1, 1408, 1, 1280] + - Exact: [1, 5056, 1, 256] + - Exact: [1, 3584, 1, 1280] + - Exact: [1, 4288, 1, 256] + - Exact: [1, 4288, 1, 1] + - Exact: [1, 2944, 1, 1] + - Exact: [32, 1408, 1, 3328] + - Exact: [1, 5056, 1, 3328] + - Exact: [32, 1856, 1, 256] + - Exact: [1, 1856, 1, 1280] + - Exact: [1, 3584, 1, 256] + - Exact: [32, 1408, 1, 1280] + - Exact: [2, 2048, 1, 1024] + - Exact: [32, 1600, 1, 512] + - Exact: [1, 4096, 1, 256] + - Exact: [1, 6912, 1, 256] + - Exact: [2, 2048, 1, 
768] + - Exact: [2, 4608, 1, 768] + - Exact: [2, 4608, 1, 1024] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [147, 64, 1, 12544] + - Exact: [256, 128, 1, 10752] + - Exact: [256, 128, 1, 10560] + - Exact: [256, 128, 1, 11408] + - Exact: [256, 12, 1, 11408] + - Exact: [256, 128, 1, 11616] + - Exact: [256, 12, 1, 11616] + - Exact: [256, 12, 1, 12288] + - Exact: [576, 64, 1, 5625] + - Exact: [147, 64, 1, 22500] + - Exact: [11, 11, 1, 1909283] + - Exact: [11, 11, 1, 3818566] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] 
+ BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [448, 1, 1, 256] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [448, 64, 1, 1] + - Exact: [64, 1024, 1, 1280] + - Exact: [1024, 1, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [448, 128, 1, 256] + - Exact: [1, 1024, 1, 3328] + - Exact: [704, 64, 1, 32] + - Exact: [32, 448, 1, 3328] + - Exact: [448, 1, 1, 1] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 128, 1, 1] + - Exact: [256, 128, 1, 1] + - Exact: [256, 32, 1, 3328] + - Exact: [1, 1, 1, 3328] + - Exact: [32, 448, 1, 1280] + - Exact: [32, 448, 1, 32] + - Exact: [64, 1024, 1, 32] + - Exact: [128, 1, 1, 1] + - Exact: [1024, 32, 1, 3328] + - Exact: [448, 1, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [128, 256, 1, 1280] + - Exact: [256, 256, 1, 32] + - Exact: [1024, 1, 1, 256] + - Exact: [128, 32, 1, 32] + - Exact: [448, 64, 1, 256] + - Exact: [128, 256, 1, 3328] + - Exact: [1, 64, 1, 3328] + - Exact: [64, 1024, 1, 1] + - Exact: [64, 1024, 1, 3328] + - Exact: [32, 704, 1, 3328] + - Exact: [32, 1024, 1, 3328] + - Exact: [64, 1, 1, 256] + - Exact: [1024, 64, 1, 32] + - Exact: [1024, 64, 1, 3328] + - Exact: [32, 1024, 1, 256] + - Exact: [64, 1, 1, 1] + - Exact: [256, 1, 1, 256] + - Exact: [256, 128, 1, 3328] + - Exact: [64, 64, 1, 1] + - Exact: [32, 704, 1, 1280] + - Exact: [256, 1, 1, 1280] + - Exact: [128, 32, 1, 1280] + - Exact: [128, 256, 1, 1] + - Exact: [1, 256, 1, 256] + - Exact: [1, 256, 1, 1] + - Exact: [1024, 1, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [1024, 32, 1, 1280] + - Exact: [256, 256, 1, 3328] + - Exact: [704, 32, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [32, 32, 1, 32] + - Exact: [1024, 32, 1, 32] + - Exact: [128, 64, 1, 32] + - Exact: [64, 1, 1, 1280] + - Exact: [448, 32, 1, 1280] + - Exact: [704, 32, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [64, 448, 1, 1280] + - Exact: [64, 256, 
1, 1] + - Exact: [256, 256, 1, 256] + - Exact: [448, 1, 1, 3328] + - Exact: [256, 1, 1, 1] + - Exact: [32, 1024, 1, 1280] + - Exact: [1, 256, 1, 3328] + - Exact: [256, 32, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [256, 64, 1, 256] + - Exact: [1, 1, 1, 1] + - Exact: [32, 1024, 1, 32] + - Exact: [128, 256, 1, 256] + - Exact: [704, 64, 1, 256] + - Exact: [704, 1, 1, 1] + - Exact: [128, 448, 1, 1280] + - Exact: [448, 32, 1, 32] + - Exact: [704, 64, 1, 1] + - Exact: [704, 32, 1, 256] + - Exact: [32, 704, 1, 32] + - Exact: [128, 64, 1, 256] + - Exact: [448, 32, 1, 3328] + - Exact: [64, 704, 1, 32] + - Exact: [64, 704, 1, 3328] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 448, 1, 32] + - Exact: [64, 256, 1, 256] + - Exact: [64, 704, 1, 1] + - Exact: [1, 1024, 1, 1] + - Exact: [256, 1, 1, 3328] + - Exact: [32, 64, 1, 32] + - Exact: [256, 256, 1, 1] + - Exact: [32, 256, 1, 32] + - Exact: [128, 1, 1, 256] + - Exact: [32, 64, 1, 3328] + - Exact: [1, 128, 1, 3328] + - Exact: [32, 256, 1, 256] + - Exact: [1, 448, 1, 1] + - Exact: [1, 704, 1, 3328] + - Exact: [64, 1, 1, 3328] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 32, 1, 1280] + - Exact: [128, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 32, 1, 32] + - Exact: [1, 448, 1, 3328] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 704, 1, 1280] + - Exact: [64, 32, 1, 3328] + - Exact: [64, 448, 1, 1] + - Exact: [128, 128, 1, 1280] + - Exact: [64, 128, 1, 256] + - Exact: [64, 448, 1, 32] + - Exact: [128, 64, 1, 3328] + - Exact: [32, 64, 1, 1280] + - Exact: [448, 32, 1, 256] + - Exact: [1024, 32, 1, 256] + - Exact: [1, 128, 1, 256] + - Exact: [32, 256, 1, 1280] + - Exact: [32, 128, 1, 3328] + - Exact: [32, 128, 1, 32] + - Exact: [1, 128, 1, 1] + - Exact: [128, 64, 1, 1] + - Exact: [32, 448, 1, 256] + - Exact: [1, 704, 1, 256] + - Exact: [32, 256, 1, 3328] + - Exact: [256, 32, 1, 32] + - Exact: [64, 256, 1, 3328] + - Exact: [1, 704, 1, 1] + - Exact: [128, 448, 1, 1] + - Exact: [64, 128, 1, 32] + - Exact: 
[704, 1, 1, 1280] + - Exact: [1024, 1, 1, 1] + - Exact: [256, 128, 1, 32] + - Exact: [448, 128, 1, 1] + - Exact: [704, 32, 1, 32] + - Exact: [128, 32, 1, 256] + - Exact: [64, 32, 1, 1280] + - Exact: [448, 128, 1, 32] + - Exact: [128, 448, 1, 256] + - Exact: [32, 32, 1, 256] + - Exact: [256, 64, 1, 32] + - Exact: [1, 1024, 1, 1280] + - Exact: [32, 32, 1, 3328] + - Exact: [1, 256, 1, 1280] + - Exact: [1, 128, 1, 1280] + - Exact: [1, 64, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [32, 704, 1, 256] + - Exact: [1, 64, 1, 1] + - Exact: [704, 64, 1, 1280] + - Exact: [1, 704, 1, 1280] + - Exact: [128, 128, 1, 32] + - Exact: [1024, 64, 1, 1] + - Exact: [704, 1, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [64, 64, 1, 32] + - Exact: [1, 1, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [1, 448, 1, 1280] + - Exact: [64, 256, 1, 32] + - Exact: [32, 128, 1, 1280] + - Exact: [128, 128, 1, 3328] + - Exact: [64, 448, 1, 3328] + - Exact: [32, 64, 1, 256] + - Exact: [128, 256, 1, 32] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 64, 1, 256] + - Exact: [448, 64, 1, 32] + - Exact: [64, 128, 1, 1280] + - Exact: [1, 1024, 1, 256] + - Exact: [128, 1, 1, 3328] + - Exact: [128, 128, 1, 1] + - Exact: [32, 128, 1, 256] + - Exact: [1, 64, 1, 1280] + - Exact: [448, 128, 1, 1280] + - Exact: [256, 64, 1, 1] + - Exact: [256, 256, 1, 1280] + - Exact: [704, 1, 1, 3328] + - Exact: [128, 32, 1, 3328] + - Exact: [32, 32, 1, 1280] + - Exact: [1, 1, 1, 256] + - Exact: [1, 448, 1, 256] + - Exact: [256, 64, 1, 3328] + - Exact: [64, 32, 1, 256] + - Exact: [128, 1, 1, 1280] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 256, 1, 3136] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + - Exact: [14, 14, 1, 64] + - Exact: [15, 14, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [17, 15, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [21, 17, 1, 64] + - Exact: [21, 21, 1, 64] + - Exact: [24, 24, 1, 64] + - Exact: 
[30, 30, 1, 64] + - Exact: [30, 31, 1, 64] + - Exact: [31, 31, 1, 64] + - Exact: [32, 32, 1, 64] + - Exact: [32, 35, 1, 64] + - Exact: [34, 24, 1, 64] + - Exact: [34, 34, 1, 64] + - Exact: [35, 35, 1, 64] + - Exact: [27, 27, 1, 64] + - Exact: [27, 33, 1, 64] + - Exact: [33, 33, 1, 64] + - Exact: [2, 4, 1, 1024] + - Exact: [2, 32, 1, 1024] + - Exact: [64, 512, 1, 512] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 32, 1, 1024] + - Exact: [3, 3, 512, 64] + - Exact: [5, 5, 512, 64] + - Exact: [5, 5, 960, 64] + - Exact: [9, 9, 512, 64] + - Exact: [27, 27, 32768, 128] + - Exact: [64, 512, 1, 1024] + - Exact: [64, 960, 1, 1024] + - Exact: [14, 14, 10880, 64] + - Exact: [15, 14, 10880, 64] + - Exact: [15, 15, 7680, 64] + - Exact: [15, 15, 10880, 64] + - Exact: [17, 15, 7680, 64] + - Exact: [17, 17, 7680, 64] + - Exact: [21, 17, 6144, 64] + - Exact: [21, 21, 6144, 64] + - Exact: [24, 24, 4736, 64] + - Exact: [30, 30, 2048, 64] + - Exact: [30, 31, 2048, 64] + - Exact: [31, 31, 2048, 64] + - Exact: [34, 24, 4736, 64] + - Exact: [27, 27, 1920, 64] + - Exact: [27, 33, 1920, 64] + - Exact: [2, 8, 1, 1024] + - Exact: [1024, 77, 1, 1024] + - Exact: [2, 10, 1, 1024] + - Exact: [1024, 10, 1, 1024] + - Exact: [2, 39, 1, 1024] + - Exact: [1024, 39, 1, 1024] + - Exact: [2, 40, 1, 1024] + - Exact: [1024, 40, 1, 1024] + - Exact: [2, 41, 1, 1024] + - Exact: [1024, 41, 1, 1024] + - Exact: [2, 5, 1, 1024] + - Exact: [1024, 5, 1, 1024] + - Exact: [2, 6, 1, 1024] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 8, 1, 1024] + - Exact: [2, 9, 1, 1024] + - Exact: [1024, 9, 1, 1024] + - Exact: [4, 4, 32768, 64] + - Exact: [4, 4, 38400, 64] + - Exact: [17, 17, 6144, 64] + - Exact: [128, 128, 1, 64] + - Exact: [64, 128, 1, 128] + - Exact: [2, 1024, 1, 1024] + - Exact: [5, 5, 1, 64] + - Exact: [33, 33, 1, 32] + - Exact: [1024, 16, 1, 1024] + - Exact: [2, 4, 1, 2560] + - Exact: [2, 16, 1, 1024] + - Exact: [2, 2, 1, 2048] + - Exact: [1024, 1, 1, 1024] + - Exact: [512, 1, 
1, 2048] + - Exact: [200, 1, 1, 1024] + - Exact: [960, 1, 1, 2048] + - Exact: [1024, 64, 1, 1024] + - Exact: [864, 1, 1, 256] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 12, 1, 1024] + - Exact: [2, 64, 1, 1024] + - Exact: [2, 80, 1, 1024] + - Exact: [2, 82, 1, 1024] + - Exact: [2, 12, 1, 1024] + - Exact: [2, 1, 1, 1024] + - Exact: [24, 24, 6816, 64] + - Exact: [256, 128, 1, 3136] + - Exact: [576, 64, 1, 3136] + - Exact: [768, 16, 1, 768] + - Exact: [768, 12, 1, 768] + - Exact: [768, 4, 1, 768] + - Exact: [64, 1024, 1, 1024] + - Exact: [26, 26, 6272, 64] + - Exact: [2, 128, 1, 1024] + - Exact: [2, 96, 1, 1024] + - Exact: [256, 80, 1, 784] + - Exact: [256, 12, 1, 3800] + - Exact: [256, 3, 1, 3800] + - Exact: [256, 12, 1, 950] + - Exact: [256, 3, 1, 950] + - Exact: [256, 12, 1, 3220] + - Exact: [256, 3, 1, 3220] + - Exact: [256, 12, 1, 3072] + - Exact: [256, 3, 1, 3072] + - Exact: [256, 12, 1, 850] + - Exact: [256, 3, 1, 850] + - Exact: [256, 12, 1, 2852] + - Exact: [256, 3, 1, 2852] + - Exact: [256, 12, 1, 805] + - Exact: [256, 3, 1, 805] + - Exact: [256, 3, 1, 864] + - Exact: [256, 3, 1, 768] + - Exact: [256, 12, 1, 864] + - Exact: [256, 12, 1, 768] + - Exact: [256, 12, 1, 2904] + - Exact: [256, 3, 1, 2904] + - Exact: [256, 3, 1, 713] + - Exact: [256, 12, 1, 888] + - Exact: [256, 3, 1, 888] + - Exact: [256, 12, 1, 713] + - Exact: [256, 3, 1, 660] + - Exact: [256, 3, 1, 672] + - Exact: [256, 12, 1, 660] + - Exact: [256, 3, 1, 726] + - Exact: [256, 12, 1, 672] + - Exact: [256, 3, 1, 247] + - Exact: [256, 12, 1, 726] + - Exact: [256, 3, 1, 216] + - Exact: [256, 3, 1, 3400] + - Exact: [256, 3, 1, 221] + - Exact: [256, 12, 1, 3552] + - Exact: [256, 3, 1, 3456] + - Exact: [256, 3, 1, 204] + - Exact: [256, 12, 1, 3400] + - Exact: [256, 12, 1, 3456] + - Exact: [256, 12, 1, 221] + - Exact: [256, 3, 1, 3552] + - Exact: [256, 3, 1, 228] + - Exact: [256, 3, 1, 234] + - Exact: [256, 12, 1, 234] + - Exact: [256, 12, 1, 228] + - Exact: [256, 3, 
1, 252] + - Exact: [256, 12, 1, 252] + - Exact: [256, 12, 1, 247] + - Exact: [128, 256, 1, 1444] + - Exact: [256, 128, 1, 25] + - Exact: [256, 128, 1, 9] + - Exact: [256, 256, 1, 1444] + - Exact: [512, 128, 1, 100] + - Exact: [64, 128, 1, 1444] + - Exact: [81, 1024, 1, 1024] + - Exact: [81, 1000, 1, 1024] + - Exact: [1024, 20, 1, 1024] + - Exact: [2, 8, 1, 2048] + - Exact: [2, 20, 1, 1024] + - Exact: [2, 2, 1, 2560] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml new file mode 100644 index 000000000..b2ed430cb --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml @@ -0,0 +1,287 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - 
WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [8] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 5056, 1, 256] + - Exact: [64, 6784, 1, 3328] + - Exact: [64, 5056, 1, 3328] + - Exact: [64, 6784, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [64, 5888, 1, 256] + - Exact: [1024, 1024, 1, 1024] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 1408, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [64, 2368, 1, 3328] + - Exact: [64, 3584, 1, 1280] + - Exact: [64, 2944, 1, 256] + - Exact: [64, 1408, 1, 1280] + - Exact: [64, 2368, 1, 1280] + - Exact: [64, 3584, 1, 3328] + - Exact: [64, 1856, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [64, 4288, 1, 256] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 1280] + - Exact: [64, 2944, 1, 1280] + - Exact: [64, 1408, 1, 256] + - Exact: [64, 
2368, 1, 256] + - Exact: [64, 3584, 1, 256] + - Exact: [64, 4288, 1, 3328] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - VectorWidth: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1, 2944, 1, 3328] + - Exact: [1, 2368, 1, 1280] + - Exact: [1, 1408, 1, 1280] + - Exact: [1, 2368, 1, 3328] + - Exact: [1, 3584, 1, 1280] + - Exact: [1, 2944, 1, 1] + - Exact: [1, 1408, 1, 3328] + - Exact: [1, 2944, 1, 256] + - Exact: [1, 5056, 1, 256] + - Exact: [1, 1856, 1, 256] + - Exact: [1, 1856, 1, 1280] + - Exact: [1, 4288, 1, 1280] + - Exact: [1, 1408, 1, 1] + - Exact: [1, 1408, 1, 256] + - Exact: [1, 2368, 1, 256] + - Exact: [1, 4288, 1, 1] + - Exact: [1, 1856, 1, 3328] + - Exact: [1, 4288, 1, 3328] + - Exact: [1, 2368, 1, 1] + - Exact: [1, 3584, 1, 3328] + - Exact: [1, 5056, 1, 1] + - Exact: [1, 3584, 1, 256] + - Exact: [1, 5056, 1, 1280] + - Exact: [1, 3584, 1, 1] + - Exact: [1, 2944, 1, 1280] + - Exact: [1, 1856, 1, 1] + - Exact: [1, 4288, 1, 256] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 
4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [2] + - LocalDotLayout: [2] + - InnerUnroll: [2] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 448, 1, 3328] + - Exact: [1, 64, 1, 1280] + - Exact: [64, 128, 1, 256] + - Exact: [128, 64, 1, 1280] + - Exact: [1, 128, 1, 1] + - Exact: [64, 1024, 1, 3328] + - Exact: [1, 64, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [1, 256, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [1, 1, 1, 3328] + - Exact: [1, 64, 1, 1] + - Exact: [1, 128, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 256, 1, 1280] + - Exact: [1, 1024, 1, 256] + - Exact: [1, 704, 1, 3328] + - Exact: [1, 256, 1, 1] + - Exact: [128, 64, 1, 256] + - Exact: [64, 128, 1, 3328] + - Exact: [64, 256, 1, 3328] + - Exact: [1, 1, 1, 256] + - Exact: [1, 704, 1, 1] + - Exact: [1, 704, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [1, 448, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [1, 1, 1, 1] + - Exact: [256, 64, 1, 256] + - Exact: [1, 448, 1, 3328] + - Exact: [1, 128, 1, 256] + - Exact: [1, 1024, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [1, 1024, 1, 1280] + - Exact: [1, 1, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [64, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [1, 448, 1, 256] + - Exact: [1, 1024, 1, 1] + - Exact: [64, 704, 1, 3328] + - Exact: [1, 256, 1, 256] + - Exact: [1, 704, 1, 256] + - Exact: [1, 128, 1, 3328] + - Exact: [64, 64, 1, 3328] + - Exact: [1, 448, 1, 1] + - Exact: [1, 64, 1, 256] + - Exact: [64, 704, 1, 1280] + - Exact: [1, 256, 1, 3328] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff 
--git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml new file mode 100644 index 000000000..a0e2fcc2e --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml @@ -0,0 +1,5203 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 4096, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 1024] + - Exact: [4096, 2048, 1, 1024] + - Exact: [768, 4096, 1, 2] + - Exact: [768, 4096, 1, 768] + - Exact: [3072, 
4096, 1, 768] + - Exact: [768, 2048, 1, 2] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [3072, 1024, 1, 768] + - Exact: [3072, 512, 1, 768] + - Exact: [1024, 3072, 1, 1024] + - Exact: [3072, 2048, 1, 1024] + - Exact: [3072, 3072, 1, 1024] + - Exact: [3072, 512, 1, 1024] + - Exact: [3072, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 2] + - Exact: [1024, 3072, 1, 2] + - Exact: [1024, 4096, 1, 2] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [6144, 6000, 1, 2560] + - Exact: [2368, 6784, 1, 128] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 3584, 1, 256] + - Exact: [4288, 448, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [512, 6000, 1, 2560] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 
256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [6784, 2944, 1, 128] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [256, 193600, 1, 64] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [2048, 7000, 1, 2048] + - Exact: [5056, 704, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 4288, 1, 1280] + - Exact: [2368, 2368, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [1856, 5056, 1, 128] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [512, 6000, 1, 2816] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1024, 12544, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [512, 
48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 1024, 1, 128] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [1024, 6000, 1, 1536] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [512, 3000, 1, 1536] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [4288, 1408, 1, 128] + - Exact: [1856, 5888, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 3000, 1, 2560] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5888, 1856, 1, 128] + - Exact: [448, 6784, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [1408, 3584, 1, 3328] + - Exact: 
[2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [4288, 2368, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [704, 6784, 1, 128] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [3584, 6784, 1, 128] + - Exact: [2368, 6784, 1, 1280] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 3000, 1, 2816] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1500, 1, 2816] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [4608, 6000, 1, 1536] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - 
Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [4288, 4288, 1, 256] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [3025, 256, 64, 64] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5888, 3584, 1, 128] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1024, 1408, 1, 128] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [1856, 2368, 1, 128] + - Exact: [6784, 2368, 1, 3328] + - Exact: [1408, 6784, 1, 128] + 
- Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [8448, 6000, 1, 2816] + - Exact: [3584, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5124, 9124, 1, 2048] + - Exact: [2944, 5056, 1, 3328] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [3072, 1500, 1, 1024] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [512, 6000, 1, 1536] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3072, 6000, 1, 1024] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: [2944, 4288, 1, 128] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 1280] + - Exact: [1024, 3000, 1, 2048] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 1856, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2944, 2944, 1, 128] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 
1, 256] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 5888, 1, 256] + - Exact: [6784, 1408, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [1408, 5056, 1, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [1024, 24000, 1, 1536] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [1408, 2944, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [5124, 1500, 1, 2048] + - Exact: [4288, 5056, 1, 
3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [2368, 5056, 1, 128] + - Exact: [1408, 1856, 1, 3328] + - Exact: [5888, 5056, 1, 3328] + - Exact: [7680, 6000, 1, 2560] + - Exact: [6784, 1408, 1, 1280] + - Exact: [512, 3000, 1, 2560] + - Exact: [704, 2944, 1, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1024, 1500, 1, 1536] + - Exact: [1408, 1408, 1, 1280] + - Exact: [3072, 3000, 1, 1024] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [704, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2560] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [1408, 5888, 1, 128] + - Exact: [512, 3136, 1, 2048] + - Exact: [1408, 1024, 1, 256] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5056, 3584, 1, 1280] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4608, 1500, 1, 1536] + - Exact: [1024, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5888, 2944, 1, 128] + - Exact: [2944, 1024, 1, 128] + - Exact: [5124, 700, 1, 2048] + - Exact: [6784, 5056, 1, 128] + - Exact: [256, 12544, 1, 1024] + - Exact: [5888, 1408, 1, 3328] + - Exact: [2368, 1856, 1, 256] + - Exact: [256, 5056, 1, 256] + - Exact: 
[5056, 5056, 1, 128] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 128] + - Exact: [1024, 6000, 1, 2560] + - Exact: [1024, 5056, 1, 256] + - Exact: [5056, 5888, 1, 128] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [1024, 2368, 1, 128] + - Exact: [3072, 48000, 1, 1024] + - Exact: [1024, 5888, 1, 128] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [8448, 3000, 1, 2816] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [7680, 1500, 1, 2560] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2944, 4288, 1, 3328] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4608, 24000, 1, 1536] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [7680, 3000, 1, 2560] + - Exact: [4224, 1500, 1, 176] + - Exact: [5124, 700, 1, 2560] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 
3328] + - Exact: [5124, 1500, 1, 2560] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [448, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 128] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [448, 5888, 1, 1280] + - Exact: [512, 3000, 1, 2048] + - Exact: [5056, 448, 1, 128] + - Exact: [4288, 704, 1, 1280] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [3584, 4288, 1, 128] + - Exact: [1024, 6000, 1, 2816] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [1024, 1500, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [704, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [1024, 3000, 1, 1536] + - Exact: [5056, 1024, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [128, 50176, 1, 512] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4288, 1024, 1, 128] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 
512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + - Exact: [4096, 512, 1, 1024] + - Exact: [2048, 768, 1, 512] + - Exact: [4096, 512, 1, 2048] + - Exact: [4096, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 1024] + - Exact: [2048, 1024, 1, 512] + - Exact: [4096, 1024, 1, 4096] + - Exact: [2048, 1024, 1, 1024] + - Exact: [4096, 384, 1, 2048] + - Exact: [1225, 192, 64, 384] + - Exact: [289, 128, 64, 1024] + - Exact: [4096, 384, 1, 1536] + - Exact: [289, 192, 64, 1024] + - Exact: [4096, 384, 1, 1280] + - Exact: [4096, 448, 1, 1280] + - Exact: [289, 256, 64, 1024] + - Exact: [4096, 448, 1, 2048] + - Exact: [289, 384, 64, 1024] + - Exact: [1024, 3594, 1, 4096] + - Exact: [4096, 3103, 1, 1024] + - Exact: [4096, 3136, 1, 1024] + - Exact: [1024, 3141, 1, 4096] + - Exact: [4096, 3559, 1, 1024] + - Exact: [4096, 3368, 1, 1024] + - Exact: [1024, 3335, 1, 4096] + - Exact: [1024, 3510, 1, 4096] + - Exact: [4096, 3209, 1, 1024] + - Exact: [4096, 3322, 1, 1024] + - Exact: [1024, 3400, 1, 4096] + - Exact: [1024, 3995, 1, 4096] + - Exact: [1024, 3503, 1, 4096] + - Exact: [4096, 3594, 1, 1024] + - Exact: [4096, 3473, 1, 1024] + - Exact: [4096, 3522, 1, 1024] + - Exact: [1024, 3103, 1, 4096] + - Exact: [1024, 3214, 1, 4096] + - Exact: [4096, 3449, 1, 1024] + - Exact: [1024, 3136, 1, 4096] + - Exact: [1024, 3955, 1, 33708] + - Exact: [1024, 3780, 1, 4096] + - Exact: [1024, 3906, 1, 33708] + - Exact: [1024, 3386, 1, 4096] + - Exact: [4096, 3396, 1, 1024] + - Exact: [1024, 3183, 1, 4096] + - Exact: [1024, 3098, 1, 4096] + - Exact: [1024, 3548, 1, 4096] + - Exact: [1024, 3224, 1, 4096] + - Exact: [4096, 3469, 1, 1024] + - Exact: [1024, 3582, 1, 4096] + - Exact: [1024, 2977, 1, 4096] + - Exact: [1024, 3939, 1, 1024] + - Exact: [4096, 3176, 1, 1024] + - Exact: [1024, 3559, 1, 4096] + - Exact: [1024, 3478, 1, 4096] + - Exact: [4096, 3343, 1, 1024] + - Exact: [4096, 3440, 1, 1024] + - Exact: [1024, 
3996, 1, 33708] + - Exact: [1024, 4012, 1, 4096] + - Exact: [1024, 3322, 1, 4096] + - Exact: [1024, 3990, 1, 33708] + - Exact: [1024, 3314, 1, 4096] + - Exact: [4096, 3513, 1, 1024] + - Exact: [1024, 3562, 1, 4096] + - Exact: [1024, 3443, 1, 4096] + - Exact: [1024, 3554, 1, 4096] + - Exact: [1024, 3063, 1, 4096] + - Exact: [4096, 3460, 1, 1024] + - Exact: [1024, 3209, 1, 4096] + - Exact: [1024, 3147, 1, 4096] + - Exact: [4096, 3387, 1, 1024] + - Exact: [4096, 3436, 1, 1024] + - Exact: [1024, 3341, 1, 4096] + - Exact: [1024, 3516, 1, 4096] + - Exact: [4096, 3277, 1, 1024] + - Exact: [1024, 3454, 1, 4096] + - Exact: [1024, 3969, 1, 4096] + - Exact: [1024, 3999, 1, 4096] + - Exact: [1024, 4032, 1, 4096] + - Exact: [4096, 3541, 1, 1024] + - Exact: [4096, 3334, 1, 1024] + - Exact: [1024, 3365, 1, 4096] + - Exact: [1024, 3527, 1, 4096] + - Exact: [1024, 3190, 1, 4096] + - Exact: [4096, 3906, 1, 1024] + - Exact: [1024, 3593, 1, 4096] + - Exact: [1024, 3336, 1, 4096] + - Exact: [4096, 3504, 1, 1024] + - Exact: [4096, 3977, 1, 1024] + - Exact: [1024, 3906, 1, 4096] + - Exact: [4096, 3415, 1, 1024] + - Exact: [1024, 3295, 1, 4096] + - Exact: [4096, 3321, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [1024, 3408, 1, 4096] + - Exact: [1024, 3522, 1, 4096] + - Exact: [4096, 3751, 1, 1024] + - Exact: [4096, 3378, 1, 1024] + - Exact: [1024, 3925, 1, 33708] + - Exact: [1024, 3990, 1, 1024] + - Exact: [1024, 3290, 1, 4096] + - Exact: [4096, 3500, 1, 1024] + - Exact: [4096, 3565, 1, 1024] + - Exact: [1024, 3484, 1, 4096] + - Exact: [4096, 3395, 1, 1024] + - Exact: [1024, 3681, 1, 1024] + - Exact: [1024, 3584, 1, 1024] + - Exact: [4096, 3093, 1, 1024] + - Exact: [1024, 4050, 1, 1024] + - Exact: [1024, 3301, 1, 4096] + - Exact: [1024, 3581, 1, 4096] + - Exact: [4096, 3374, 1, 1024] + - Exact: [1024, 3449, 1, 4096] + - Exact: [4096, 3215, 1, 1024] + - Exact: [4096, 3312, 1, 1024] + - Exact: [4096, 3479, 1, 1024] + - Exact: [4096, 3544, 1, 1024] + - Exact: [1024, 3263, 1, 4096] + 
- Exact: [4096, 3455, 1, 1024] + - Exact: [1024, 3379, 1, 4096] + - Exact: [1024, 3490, 1, 4096] + - Exact: [1024, 3368, 1, 4096] + - Exact: [4096, 3186, 1, 1024] + - Exact: [1024, 3428, 1, 4096] + - Exact: [4096, 3561, 1, 1024] + - Exact: [4096, 3418, 1, 1024] + - Exact: [1024, 3064, 1, 4096] + - Exact: [4096, 3259, 1, 1024] + - Exact: [4096, 3308, 1, 1024] + - Exact: [1024, 3533, 1, 4096] + - Exact: [1024, 3344, 1, 4096] + - Exact: [1024, 4030, 1, 1024] + - Exact: [4096, 3459, 1, 1024] + - Exact: [1024, 3572, 1, 4096] + - Exact: [1024, 3925, 1, 1024] + - Exact: [4096, 3435, 1, 1024] + - Exact: [1024, 3956, 1, 4096] + - Exact: [1024, 3463, 1, 4096] + - Exact: [4096, 3182, 1, 1024] + - Exact: [4096, 3976, 1, 1024] + - Exact: [1024, 3417, 1, 4096] + - Exact: [1024, 3528, 1, 4096] + - Exact: [4096, 3446, 1, 1024] + - Exact: [1024, 3543, 1, 4096] + - Exact: [4096, 3287, 1, 1024] + - Exact: [1024, 3499, 1, 4096] + - Exact: [1024, 3231, 1, 4096] + - Exact: [4096, 3519, 1, 1024] + - Exact: [4096, 3552, 1, 1024] + - Exact: [1024, 3458, 1, 4096] + - Exact: [1024, 3374, 1, 4096] + - Exact: [1024, 3396, 1, 4096] + - Exact: [1024, 2967, 1, 4096] + - Exact: [4096, 3482, 1, 1024] + - Exact: [1024, 3226, 1, 4096] + - Exact: [4096, 3377, 1, 1024] + - Exact: [4096, 3426, 1, 1024] + - Exact: [4096, 2935, 1, 1024] + - Exact: [1024, 3439, 1, 4096] + - Exact: [4096, 3267, 1, 1024] + - Exact: [4096, 3499, 1, 1024] + - Exact: [4096, 3356, 1, 1024] + - Exact: [4096, 3939, 1, 1024] + - Exact: [1024, 3526, 1, 4096] + - Exact: [1024, 3859, 1, 33708] + - Exact: [1024, 3385, 1, 4096] + - Exact: [1024, 3496, 1, 4096] + - Exact: [4096, 3141, 1, 1024] + - Exact: [4096, 3510, 1, 1024] + - Exact: [1024, 3434, 1, 4096] + - Exact: [4096, 3969, 1, 1024] + - Exact: [1024, 3121, 1, 4096] + - Exact: [1024, 3232, 1, 4096] + - Exact: [1024, 4030, 1, 33708] + - Exact: [1024, 3780, 1, 33708] + - Exact: [1024, 3969, 1, 1024] + - Exact: [4096, 3527, 1, 1024] + - Exact: [4096, 3336, 1, 1024] + - Exact: [4096, 
3290, 1, 1024] + - Exact: [1024, 3469, 1, 4096] + - Exact: [4096, 3490, 1, 1024] + - Exact: [4096, 3064, 1, 1024] + - Exact: [4096, 3582, 1, 1024] + - Exact: [1024, 3956, 1, 1024] + - Exact: [4096, 3417, 1, 1024] + - Exact: [1024, 2736, 1, 4096] + - Exact: [1024, 3205, 1, 4096] + - Exact: [1024, 3143, 1, 4096] + - Exact: [1024, 4020, 1, 4096] + - Exact: [1024, 3318, 1, 4096] + - Exact: [4096, 3364, 1, 1024] + - Exact: [1024, 3353, 1, 4096] + - Exact: [1024, 3464, 1, 4096] + - Exact: [4096, 3205, 1, 1024] + - Exact: [4096, 3318, 1, 1024] + - Exact: [1024, 3402, 1, 4096] + - Exact: [4096, 3181, 1, 1024] + - Exact: [4096, 3550, 1, 1024] + - Exact: [4096, 3445, 1, 1024] + - Exact: [1024, 3138, 1, 4096] + - Exact: [4096, 3079, 1, 1024] + - Exact: [4096, 3144, 1, 1024] + - Exact: [4096, 3860, 1, 1024] + - Exact: [1024, 3515, 1, 4096] + - Exact: [4096, 3408, 1, 1024] + - Exact: [1024, 3181, 1, 4096] + - Exact: [4096, 3298, 1, 1024] + - Exact: [4096, 3585, 1, 1024] + - Exact: [1024, 3550, 1, 4096] + - Exact: [1024, 4020, 1, 1024] + - Exact: [4096, 3481, 1, 1024] + - Exact: [4096, 3530, 1, 1024] + - Exact: [4096, 3425, 1, 1024] + - Exact: [4096, 4026, 1, 1024] + - Exact: [1024, 3860, 1, 1024] + - Exact: [4096, 3975, 1, 1024] + - Exact: [1024, 3286, 1, 4096] + - Exact: [1024, 3176, 1, 4096] + - Exact: [1024, 3894, 1, 4096] + - Exact: [4096, 3355, 1, 1024] + - Exact: [4096, 3404, 1, 1024] + - Exact: [1024, 3501, 1, 4096] + - Exact: [4096, 3245, 1, 1024] + - Exact: [1024, 3431, 1, 4096] + - Exact: [1024, 4000, 1, 1024] + - Exact: [4096, 3509, 1, 1024] + - Exact: [4096, 3558, 1, 1024] + - Exact: [1024, 3535, 1, 4096] + - Exact: [1024, 3414, 1, 4096] + - Exact: [1024, 3445, 1, 4096] + - Exact: [1024, 3436, 1, 4096] + - Exact: [4096, 3472, 1, 1024] + - Exact: [1024, 3211, 1, 4096] + - Exact: [4096, 3383, 1, 1024] + - Exact: [4096, 3448, 1, 1024] + - Exact: [1024, 3343, 1, 4096] + - Exact: [1024, 3518, 1, 4096] + - Exact: [4096, 3289, 1, 1024] + - Exact: [1024, 3440, 1, 4096] + - 
Exact: [1024, 4032, 1, 33708] + - Exact: [4096, 3489, 1, 1024] + - Exact: [4096, 3346, 1, 1024] + - Exact: [1024, 3534, 1, 4096] + - Exact: [1024, 3079, 1, 4096] + - Exact: [1024, 3955, 1, 4096] + - Exact: [4096, 3236, 1, 1024] + - Exact: [1024, 3545, 1, 4096] + - Exact: [1024, 3144, 1, 4096] + - Exact: [4096, 3780, 1, 1024] + - Exact: [4096, 3163, 1, 1024] + - Exact: [4096, 3468, 1, 1024] + - Exact: [1024, 3539, 1, 4096] + - Exact: [1024, 3541, 1, 4096] + - Exact: [4096, 3363, 1, 1024] + - Exact: [1024, 3475, 1, 4096] + - Exact: [4096, 3110, 1, 1024] + - Exact: [1024, 3509, 1, 4096] + - Exact: [1024, 3413, 1, 4096] + - Exact: [1024, 3975, 1, 1024] + - Exact: [4096, 3549, 1, 1024] + - Exact: [4096, 3342, 1, 1024] + - Exact: [1024, 2985, 1, 4096] + - Exact: [1024, 3876, 1, 33708] + - Exact: [4096, 3280, 1, 1024] + - Exact: [4096, 3191, 1, 1024] + - Exact: [4096, 3512, 1, 1024] + - Exact: [1024, 3560, 1, 4096] + - Exact: [4096, 2499, 1, 1024] + - Exact: [1024, 3248, 1, 4096] + - Exact: [4096, 3423, 1, 1024] + - Exact: [4096, 3297, 1, 1024] + - Exact: [4096, 3154, 1, 1024] + - Exact: [1024, 3303, 1, 4096] + - Exact: [1024, 3222, 1, 4096] + - Exact: [1024, 3978, 1, 1024] + - Exact: [4096, 3529, 1, 1024] + - Exact: [4096, 3386, 1, 1024] + - Exact: [1024, 3451, 1, 4096] + - Exact: [4096, 3562, 1, 1024] + - Exact: [4096, 3276, 1, 1024] + - Exact: [1024, 3894, 1, 33708] + - Exact: [4096, 3540, 1, 1024] + - Exact: [1024, 3416, 1, 4096] + - Exact: [1024, 4005, 1, 33708] + - Exact: [1024, 3942, 1, 4096] + - Exact: [4096, 3403, 1, 1024] + - Exact: [4096, 3381, 1, 1024] + - Exact: [1024, 3492, 1, 4096] + - Exact: [4096, 3101, 1, 1024] + - Exact: [1024, 3430, 1, 4096] + - Exact: [1024, 3977, 1, 4096] + - Exact: [1024, 3640, 1, 4096] + - Exact: [4096, 3557, 1, 1024] + - Exact: [4096, 3414, 1, 1024] + - Exact: [1024, 3391, 1, 4096] + - Exact: [1024, 3356, 1, 4096] + - Exact: [4096, 3320, 1, 1024] + - Exact: [4096, 2765, 1, 1024] + - Exact: [1024, 3411, 1, 4096] + - Exact: [1024, 
3978, 1, 4096] + - Exact: [4096, 3487, 1, 1024] + - Exact: [4096, 3520, 1, 1024] + - Exact: [4096, 3942, 1, 1024] + - Exact: [4096, 3431, 1, 1024] + - Exact: [1024, 3271, 1, 4096] + - Exact: [4096, 4020, 1, 1024] + - Exact: [1024, 3481, 1, 4096] + - Exact: [1024, 3419, 1, 4096] + - Exact: [1024, 4059, 1, 4096] + - Exact: [4096, 3345, 1, 1024] + - Exact: [4096, 3394, 1, 1024] + - Exact: [1024, 3298, 1, 4096] + - Exact: [4096, 3235, 1, 1024] + - Exact: [1024, 3681, 1, 33708] + - Exact: [1024, 3362, 1, 4096] + - Exact: [4096, 3467, 1, 1024] + - Exact: [1024, 3349, 1, 4096] + - Exact: [1024, 3460, 1, 4096] + - Exact: [4096, 3214, 1, 1024] + - Exact: [1024, 3398, 1, 4096] + - Exact: [4096, 3478, 1, 1024] + - Exact: [1024, 4050, 1, 33708] + - Exact: [1024, 3244, 1, 4096] + - Exact: [4096, 3341, 1, 1024] + - Exact: [4096, 3454, 1, 1024] + - Exact: [1024, 3166, 1, 4096] + - Exact: [1024, 3425, 1, 4096] + - Exact: [4096, 3295, 1, 1024] + - Exact: [4096, 3072, 1, 1024] + - Exact: [4096, 3822, 1, 1024] + - Exact: [1024, 3681, 1, 4096] + - Exact: [1024, 4050, 1, 4096] + - Exact: [4096, 3495, 1, 1024] + - Exact: [4096, 3560, 1, 1024] + - Exact: [1024, 3524, 1, 4096] + - Exact: [1024, 3942, 1, 33708] + - Exact: [1024, 3304, 1, 4096] + - Exact: [1024, 3387, 1, 4096] + - Exact: [1024, 3498, 1, 4096] + - Exact: [4096, 3458, 1, 1024] + - Exact: [4096, 2967, 1, 1024] + - Exact: [4096, 3385, 1, 1024] + - Exact: [4096, 3434, 1, 1024] + - Exact: [1024, 3519, 1, 4096] + - Exact: [1024, 3511, 1, 4096] + - Exact: [1024, 3288, 1, 4096] + - Exact: [1024, 2918, 1, 4096] + - Exact: [4096, 3573, 1, 1024] + - Exact: [1024, 3822, 1, 33708] + - Exact: [4096, 3539, 1, 1024] + - Exact: [4096, 3332, 1, 1024] + - Exact: [4096, 3286, 1, 1024] + - Exact: [1024, 4026, 1, 4096] + - Exact: [1024, 3277, 1, 4096] + - Exact: [1024, 3471, 1, 4096] + - Exact: [4096, 3518, 1, 1024] + - Exact: [1024, 3393, 1, 4096] + - Exact: [4096, 3413, 1, 1024] + - Exact: [4096, 3303, 1, 1024] + - Exact: [1024, 3207, 1, 4096] 
+ - Exact: [1024, 3894, 1, 1024] + - Exact: [1024, 3977, 1, 1024] + - Exact: [4096, 3535, 1, 1024] + - Exact: [4096, 3376, 1, 1024] + - Exact: [1024, 3355, 1, 4096] + - Exact: [1024, 3466, 1, 4096] + - Exact: [4096, 3266, 1, 1024] + - Exact: [1024, 3404, 1, 4096] + - Exact: [1024, 3999, 1, 1024] + - Exact: [4096, 3498, 1, 1024] + - Exact: [1024, 4032, 1, 1024] + - Exact: [1024, 3410, 1, 4096] + - Exact: [4096, 3393, 1, 1024] + - Exact: [1024, 3140, 1, 4096] + - Exact: [1024, 3910, 1, 33708] + - Exact: [1024, 3334, 1, 4096] + - Exact: [4096, 3140, 1, 1024] + - Exact: [1024, 4005, 1, 4096] + - Exact: [1024, 3579, 1, 4096] + - Exact: [4096, 3372, 1, 1024] + - Exact: [1024, 3245, 1, 4096] + - Exact: [4096, 3956, 1, 1024] + - Exact: [4096, 3213, 1, 1024] + - Exact: [1024, 3361, 1, 4096] + - Exact: [1024, 3536, 1, 4096] + - Exact: [4096, 3477, 1, 1024] + - Exact: [4096, 3526, 1, 1024] + - Exact: [1024, 4005, 1, 1024] + - Exact: [1024, 3530, 1, 4096] + - Exact: [1024, 3944, 1, 4096] + - Exact: [4096, 3453, 1, 1024] + - Exact: [4096, 3184, 1, 1024] + - Exact: [4096, 3579, 1, 1024] + - Exact: [4096, 3351, 1, 1024] + - Exact: [4096, 3416, 1, 1024] + - Exact: [1024, 3822, 1, 4096] + - Exact: [1024, 3796, 1, 4096] + - Exact: [4096, 3257, 1, 1024] + - Exact: [4096, 3306, 1, 1024] + - Exact: [1024, 3505, 1, 4096] + - Exact: [1024, 3315, 1, 4096] + - Exact: [1024, 3486, 1, 4096] + - Exact: [4096, 3457, 1, 1024] + - Exact: [4096, 3870, 1, 1024] + - Exact: [1024, 3447, 1, 4096] + - Exact: [1024, 3558, 1, 4096] + - Exact: [4096, 3433, 1, 1024] + - Exact: [4096, 3180, 1, 1024] + - Exact: [1024, 3213, 1, 4096] + - Exact: [1024, 3900, 1, 4096] + - Exact: [4096, 3444, 1, 1024] + - Exact: [1024, 3504, 1, 4096] + - Exact: [4096, 4059, 1, 1024] + - Exact: [1024, 3442, 1, 4096] + - Exact: [4096, 3517, 1, 1024] + - Exact: [1024, 3566, 1, 4096] + - Exact: [4096, 3248, 1, 1024] + - Exact: [1024, 3547, 1, 4096] + - Exact: [1024, 3340, 1, 4096] + - Exact: [4096, 3480, 1, 1024] + - Exact: [4096, 
3424, 1, 1024] + - Exact: [1024, 3906, 1, 1024] + - Exact: [4096, 3265, 1, 1024] + - Exact: [1024, 3384, 1, 4096] + - Exact: [1024, 3494, 1, 4096] + - Exact: [1024, 3236, 1, 4096] + - Exact: [4096, 3497, 1, 1024] + - Exact: [4096, 3354, 1, 1024] + - Exact: [4096, 3055, 1, 1024] + - Exact: [4096, 3244, 1, 1024] + - Exact: [4096, 3139, 1, 1024] + - Exact: [4096, 3508, 1, 1024] + - Exact: [4096, 4050, 1, 1024] + - Exact: [1024, 3472, 1, 4096] + - Exact: [1024, 3861, 1, 1024] + - Exact: [1024, 3910, 1, 1024] + - Exact: [4096, 3371, 1, 1024] + - Exact: [1024, 3751, 1, 4096] + - Exact: [4096, 3325, 1, 1024] + - Exact: [1024, 3321, 1, 4096] + - Exact: [1024, 3944, 1, 1024] + - Exact: [4096, 3525, 1, 1024] + - Exact: [4096, 3382, 1, 1024] + - Exact: [1024, 3453, 1, 4096] + - Exact: [4096, 3564, 1, 1024] + - Exact: [4096, 3288, 1, 1024] + - Exact: [1024, 3925, 1, 4096] + - Exact: [1024, 3057, 1, 4096] + - Exact: [4096, 3488, 1, 1024] + - Exact: [4096, 3046, 1, 1024] + - Exact: [1024, 3189, 1, 4096] + - Exact: [4096, 3399, 1, 1024] + - Exact: [1024, 3383, 1, 4096] + - Exact: [1024, 3415, 1, 4096] + - Exact: [1024, 3388, 1, 4096] + - Exact: [1024, 3376, 1, 4096] + - Exact: [1024, 3473, 1, 4096] + - Exact: [4096, 3162, 1, 1024] + - Exact: [1024, 3448, 1, 4096] + - Exact: [4096, 3362, 1, 1024] + - Exact: [1024, 3262, 1, 4096] + - Exact: [1024, 3184, 1, 4096] + - Exact: [1024, 3378, 1, 4096] + - Exact: [4096, 3548, 1, 1024] + - Exact: [4096, 2977, 1, 1024] + - Exact: [4096, 3443, 1, 1024] + - Exact: [1024, 3289, 1, 4096] + - Exact: [1024, 3483, 1, 4096] + - Exact: [4096, 3190, 1, 1024] + - Exact: [1024, 3421, 1, 4096] + - Exact: [1024, 3514, 1, 4096] + - Exact: [1024, 3532, 1, 4096] + - Exact: [1024, 3565, 1, 4096] + - Exact: [4096, 3422, 1, 1024] + - Exact: [4096, 3263, 1, 1024] + - Exact: [4096, 3296, 1, 1024] + - Exact: [4096, 3640, 1, 1024] + - Exact: [4096, 3463, 1, 1024] + - Exact: [4096, 3528, 1, 1024] + - Exact: [1024, 3351, 1, 4096] + - Exact: [1024, 3462, 1, 4096] + - 
Exact: [4096, 3226, 1, 1024] + - Exact: [4096, 3439, 1, 1024] + - Exact: [4096, 3121, 1, 1024] + - Exact: [1024, 4059, 1, 33708] + - Exact: [1024, 3311, 1, 4096] + - Exact: [1024, 3230, 1, 4096] + - Exact: [4096, 3353, 1, 1024] + - Exact: [4096, 3402, 1, 1024] + - Exact: [1024, 3427, 1, 4096] + - Exact: [1024, 3346, 1, 4096] + - Exact: [1024, 3126, 1, 4096] + - Exact: [1024, 3796, 1, 1024] + - Exact: [1024, 3990, 1, 4096] + - Exact: [1024, 3257, 1, 4096] + - Exact: [4096, 3996, 1, 1024] + - Exact: [1024, 3306, 1, 4096] + - Exact: [1024, 3389, 1, 4096] + - Exact: [1024, 3500, 1, 4096] + - Exact: [1024, 3999, 1, 33708] + - Exact: [4096, 3486, 1, 1024] + - Exact: [1024, 3438, 1, 4096] + - Exact: [4096, 3616, 1, 1024] + - Exact: [1024, 3955, 1, 1024] + - Exact: [4096, 3430, 1, 1024] + - Exact: [4096, 3271, 1, 1024] + - Exact: [1024, 3364, 1, 4096] + - Exact: [1024, 3497, 1, 4096] + - Exact: [4096, 3503, 1, 1024] + - Exact: [4096, 3344, 1, 1024] + - Exact: [1024, 3457, 1, 4096] + - Exact: [4096, 3466, 1, 1024] + - Exact: [1024, 3976, 1, 33708] + - Exact: [1024, 3395, 1, 4096] + - Exact: [4096, 3361, 1, 1024] + - Exact: [1024, 3751, 1, 33708] + - Exact: [1024, 3822, 1, 1024] + - Exact: [4096, 3315, 1, 1024] + - Exact: [1024, 3163, 1, 4096] + - Exact: [4096, 3547, 1, 1024] + - Exact: [4096, 3340, 1, 1024] + - Exact: [1024, 3296, 1, 4096] + - Exact: [1024, 3468, 1, 4096] + - Exact: [4096, 3294, 1, 1024] + - Exact: [1024, 3406, 1, 4096] + - Exact: [1024, 3860, 1, 33708] + - Exact: [1024, 3584, 1, 4096] + - Exact: [4096, 3189, 1, 1024] + - Exact: [4096, 3494, 1, 1024] + - Exact: [1024, 3093, 1, 4096] + - Exact: [4096, 3421, 1, 1024] + - Exact: [1024, 3479, 1, 4096] + - Exact: [1024, 3433, 1, 4096] + - Exact: [4096, 3311, 1, 1024] + - Exact: [1024, 3381, 1, 4096] + - Exact: [1024, 3996, 1, 4096] + - Exact: [4096, 3384, 1, 1024] + - Exact: [1024, 3247, 1, 4096] + - Exact: [1024, 3169, 1, 4096] + - Exact: [1024, 3088, 1, 4096] + - Exact: [1024, 3363, 1, 4096] + - Exact: [1024, 
3538, 1, 4096] + - Exact: [1024, 3996, 1, 1024] + - Exact: [4096, 3169, 1, 1024] + - Exact: [4096, 3538, 1, 1024] + - Exact: [4096, 3401, 1, 1024] + - Exact: [4096, 3581, 1, 1024] + - Exact: [1024, 3180, 1, 4096] + - Exact: [1024, 3870, 1, 1024] + - Exact: [4096, 3555, 1, 1024] + - Exact: [4096, 3412, 1, 1024] + - Exact: [4096, 3302, 1, 1024] + - Exact: [1024, 3561, 1, 4096] + - Exact: [1024, 3302, 1, 4096] + - Exact: [1024, 3976, 1, 4096] + - Exact: [4096, 3485, 1, 1024] + - Exact: [4096, 3534, 1, 1024] + - Exact: [1024, 3110, 1, 4096] + - Exact: [1024, 3401, 1, 4096] + - Exact: [4096, 3216, 1, 1024] + - Exact: [1024, 4020, 1, 33708] + - Exact: [1024, 3215, 1, 4096] + - Exact: [4096, 3566, 1, 1024] + - Exact: [1024, 3137, 1, 4096] + - Exact: [4096, 3359, 1, 1024] + - Exact: [4096, 3392, 1, 1024] + - Exact: [1024, 3506, 1, 4096] + - Exact: [4096, 3233, 1, 1024] + - Exact: [1024, 3444, 1, 4096] + - Exact: [1024, 3975, 1, 4096] + - Exact: [1024, 3870, 1, 33708] + - Exact: [4096, 3465, 1, 1024] + - Exact: [1024, 3523, 1, 4096] + - Exact: [4096, 3990, 1, 1024] + - Exact: [1024, 3549, 1, 4096] + - Exact: [1024, 3342, 1, 4096] + - Exact: [4096, 3476, 1, 1024] + - Exact: [1024, 3418, 1, 4096] + - Exact: [1024, 3859, 1, 1024] + - Exact: [4096, 3339, 1, 1024] + - Exact: [4096, 3452, 1, 1024] + - Exact: [4096, 3293, 1, 1024] + - Exact: [1024, 3369, 1, 4096] + - Exact: [1024, 3544, 1, 4096] + - Exact: [4096, 3493, 1, 1024] + - Exact: [4096, 3350, 1, 1024] + - Exact: [4096, 3256, 1, 1024] + - Exact: [1024, 3870, 1, 4096] + - Exact: [4096, 4012, 1, 1024] + - Exact: [1024, 3280, 1, 4096] + - Exact: [4096, 3456, 1, 1024] + - Exact: [1024, 3555, 1, 4096] + - Exact: [4096, 3014, 1, 1024] + - Exact: [1024, 3474, 1, 4096] + - Exact: [4096, 3367, 1, 1024] + - Exact: [4096, 3432, 1, 1024] + - Exact: [4096, 3273, 1, 1024] + - Exact: [4096, 3130, 1, 1024] + - Exact: [1024, 2984, 1, 4096] + - Exact: [1024, 3995, 1, 1024] + - Exact: [1024, 3517, 1, 4096] + - Exact: [1024, 3455, 1, 4096] + 
- Exact: [1024, 3939, 1, 4096] + - Exact: [4096, 3147, 1, 1024] + - Exact: [4096, 3516, 1, 1024] + - Exact: [1024, 3876, 1, 4096] + - Exact: [1024, 3191, 1, 4096] + - Exact: [4096, 3411, 1, 1024] + - Exact: [1024, 3337, 1, 4096] + - Exact: [1024, 3512, 1, 4096] + - Exact: [4096, 3301, 1, 1024] + - Exact: [1024, 3450, 1, 4096] + - Exact: [4096, 3533, 1, 1024] + - Exact: [4096, 3390, 1, 1024] + - Exact: [4096, 3231, 1, 1024] + - Exact: [1024, 2499, 1, 4096] + - Exact: [1024, 3186, 1, 4096] + - Exact: [1024, 3380, 1, 4096] + - Exact: [4096, 3496, 1, 1024] + - Exact: [1024, 3956, 1, 33708] + - Exact: [1024, 3976, 1, 1024] + - Exact: [4096, 2736, 1, 1024] + - Exact: [1024, 3291, 1, 4096] + - Exact: [1024, 3944, 1, 33708] + - Exact: [1024, 3485, 1, 4096] + - Exact: [4096, 3138, 1, 1024] + - Exact: [1024, 3423, 1, 4096] + - Exact: [1024, 3491, 1, 4096] + - Exact: [1024, 3860, 1, 4096] + - Exact: [4096, 3211, 1, 1024] + - Exact: [1024, 3221, 1, 4096] + - Exact: [1024, 2917, 1, 4096] + - Exact: [4096, 3475, 1, 1024] + - Exact: [4096, 3524, 1, 1024] + - Exact: [4096, 2985, 1, 1024] + - Exact: [1024, 3480, 1, 4096] + - Exact: [4096, 3222, 1, 1024] + - Exact: [4096, 3451, 1, 1024] + - Exact: [1024, 3969, 1, 33708] + - Exact: [1024, 3640, 1, 1024] + - Exact: [1024, 3297, 1, 4096] + - Exact: [4096, 3944, 1, 1024] + - Exact: [1024, 3216, 1, 4096] + - Exact: [4096, 3349, 1, 1024] + - Exact: [4096, 3398, 1, 1024] + - Exact: [1024, 3154, 1, 4096] + - Exact: [1024, 3978, 1, 33708] + - Exact: [1024, 3348, 1, 4096] + - Exact: [4096, 3304, 1, 1024] + - Exact: [4096, 4030, 1, 1024] + - Exact: [1024, 4026, 1, 1024] + - Exact: [4096, 3471, 1, 1024] + - Exact: [1024, 3259, 1, 4096] + - Exact: [1024, 3308, 1, 4096] + - Exact: [4096, 3391, 1, 1024] + - Exact: [1024, 3312, 1, 4096] + - Exact: [1024, 3502, 1, 4096] + - Exact: [1024, 3968, 1, 33708] + - Exact: [1024, 3424, 1, 4096] + - Exact: [4096, 4032, 1, 1024] + - Exact: [1024, 3900, 1, 1024] + - Exact: [4096, 3442, 1, 1024] + - Exact: 
[1024, 3366, 1, 4096] + - Exact: [4096, 3999, 1, 1024] + - Exact: [1024, 3477, 1, 4096] + - Exact: [1024, 2505, 1, 4096] + - Exact: [4096, 3515, 1, 1024] + - Exact: [1024, 3564, 1, 4096] + - Exact: [4096, 3057, 1, 1024] + - Exact: [1024, 3339, 1, 4096] + - Exact: [4096, 3262, 1, 1024] + - Exact: [1024, 4030, 1, 4096] + - Exact: [1024, 3265, 1, 4096] + - Exact: [1024, 3459, 1, 4096] + - Exact: [4096, 3462, 1, 1024] + - Exact: [1024, 3513, 1, 4096] + - Exact: [1024, 3397, 1, 4096] + - Exact: [4096, 3572, 1, 1024] + - Exact: [4096, 3389, 1, 1024] + - Exact: [4096, 3438, 1, 1024] + - Exact: [1024, 3640, 1, 33708] + - Exact: [1024, 3995, 1, 33708] + - Exact: [1024, 3165, 1, 4096] + - Exact: [4096, 3543, 1, 1024] + - Exact: [4096, 3352, 1, 1024] + - Exact: [1024, 3359, 1, 4096] + - Exact: [1024, 3470, 1, 4096] + - Exact: [1024, 3392, 1, 4096] + - Exact: [4096, 3137, 1, 1024] + - Exact: [4096, 3506, 1, 1024] + - Exact: [1024, 3095, 1, 4096] + - Exact: [1024, 3859, 1, 4096] + - Exact: [4096, 3369, 1, 1024] + - Exact: [1024, 3435, 1, 4096] + - Exact: [1024, 3354, 1, 4096] + - Exact: [1024, 3055, 1, 4096] + - Exact: [4096, 3523, 1, 1024] + - Exact: [4096, 3380, 1, 1024] + - Exact: [1024, 3233, 1, 4096] + - Exact: [4096, 3221, 1, 1024] + - Exact: [4096, 3270, 1, 1024] + - Exact: [4096, 3593, 1, 1024] + - Exact: [1024, 3358, 1, 4096] + - Exact: [1024, 3540, 1, 4096] + - Exact: [4096, 3502, 1, 1024] + - Exact: [4096, 2505, 1, 1024] + - Exact: [4096, 3397, 1, 1024] + - Exact: [1024, 3300, 1, 4096] + - Exact: [4096, 3095, 1, 1024] + - Exact: [1024, 3182, 1, 4096] + - Exact: [1024, 3299, 1, 4096] + - Exact: [1024, 3276, 1, 4096] + - Exact: [1024, 3360, 1, 4096] + - Exact: [4096, 3360, 1, 1024] + - Exact: [4096, 2918, 1, 1024] + - Exact: [1024, 3939, 1, 33708] + - Exact: [4096, 3314, 1, 1024] + - Exact: [1024, 3319, 1, 4096] + - Exact: [1024, 3942, 1, 1024] + - Exact: [1024, 3465, 1, 4096] + - Exact: [4096, 3546, 1, 1024] + - Exact: [1024, 3403, 1, 4096] + - Exact: [1024, 3948, 1, 
1024] + - Exact: [4096, 3441, 1, 1024] + - Exact: [1024, 3139, 1, 4096] + - Exact: [1024, 3563, 1, 4096] + - Exact: [1024, 3508, 1, 4096] + - Exact: [1024, 3975, 1, 33708] + - Exact: [1024, 3446, 1, 4096] + - Exact: [1024, 3529, 1, 4096] + - Exact: [4096, 3461, 1, 1024] + - Exact: [1024, 3574, 1, 4096] + - Exact: [1024, 3101, 1, 4096] + - Exact: [1024, 3927, 1, 1024] + - Exact: [4096, 3224, 1, 1024] + - Exact: [4096, 3437, 1, 1024] + - Exact: [4096, 3900, 1, 1024] + - Exact: [1024, 3495, 1, 4096] + - Exact: [1024, 3977, 1, 33708] + - Exact: [1024, 3328, 1, 4096] + - Exact: [4096, 3168, 1, 1024] + - Exact: [1024, 4026, 1, 33708] + - Exact: [1024, 3292, 1, 4096] + - Exact: [1024, 3294, 1, 4096] + - Exact: [4096, 3335, 1, 1024] + - Exact: [4096, 3400, 1, 1024] + - Exact: [1024, 3287, 1, 4096] + - Exact: [1024, 3910, 1, 4096] + - Exact: [1024, 3780, 1, 1024] + - Exact: [4096, 3098, 1, 1024] + - Exact: [1024, 3584, 1, 33708] + - Exact: [1024, 3371, 1, 4096] + - Exact: [1024, 3546, 1, 4096] + - Exact: [1024, 4012, 1, 1024] + - Exact: [4096, 3505, 1, 1024] + - Exact: [4096, 3554, 1, 1024] + - Exact: [4096, 3063, 1, 1024] + - Exact: [1024, 3900, 1, 33708] + - Exact: [1024, 3345, 1, 4096] + - Exact: [1024, 3357, 1, 4096] + - Exact: [1024, 3282, 1, 4096] + - Exact: [4096, 3484, 1, 1024] + - Exact: [1024, 3557, 1, 4096] + - Exact: [1024, 3476, 1, 4096] + - Exact: [1024, 3751, 1, 1024] + - Exact: [4096, 3379, 1, 1024] + - Exact: [4096, 3428, 1, 1024] + - Exact: [4096, 3126, 1, 1024] + - Exact: [1024, 3325, 1, 4096] + - Exact: [4096, 3501, 1, 1024] + - Exact: [4096, 3358, 1, 1024] + - Exact: [1024, 3441, 1, 4096] + - Exact: [1024, 3552, 1, 4096] + - Exact: [4096, 3232, 1, 1024] + - Exact: [1024, 3412, 1, 4096] + - Exact: [1024, 3372, 1, 4096] + - Exact: [1024, 3585, 1, 4096] + - Exact: [4096, 3143, 1, 1024] + - Exact: [4096, 3464, 1, 1024] + - Exact: [1024, 3145, 1, 4096] + - Exact: [4096, 3375, 1, 1024] + - Exact: [4096, 2917, 1, 1024] + - Exact: [4096, 3978, 1, 1024] + - 
Exact: [1024, 2765, 1, 4096] + - Exact: [1024, 3452, 1, 4096] + - Exact: [4096, 3584, 1, 1024] + - Exact: [4096, 3545, 1, 1024] + - Exact: [1024, 3352, 1, 4096] + - Exact: [4096, 3292, 1, 1024] + - Exact: [1024, 3525, 1, 4096] + - Exact: [1024, 3266, 1, 4096] + - Exact: [1024, 3382, 1, 4096] + - Exact: [4096, 3492, 1, 1024] + - Exact: [4096, 3419, 1, 1024] + - Exact: [1024, 3796, 1, 33708] + - Exact: [1024, 3293, 1, 4096] + - Exact: [4096, 3796, 1, 1024] + - Exact: [1024, 3487, 1, 4096] + - Exact: [4096, 3166, 1, 1024] + - Exact: [1024, 3409, 1, 4096] + - Exact: [1024, 3520, 1, 4096] + - Exact: [1024, 3573, 1, 4096] + - Exact: [4096, 3366, 1, 1024] + - Exact: [4096, 3720, 1, 1024] + - Exact: [4096, 3207, 1, 1024] + - Exact: [4096, 3272, 1, 1024] + - Exact: [1024, 3390, 1, 4096] + - Exact: [4096, 3183, 1, 1024] + - Exact: [4096, 3536, 1, 1024] + - Exact: [4096, 3563, 1, 1024] + - Exact: [1024, 3482, 1, 4096] + - Exact: [4096, 3447, 1, 1024] + - Exact: [4096, 3955, 1, 1024] + - Exact: [4096, 4005, 1, 1024] + - Exact: [1024, 3493, 1, 4096] + - Exact: [4096, 3410, 1, 1024] + - Exact: [1024, 3422, 1, 4096] + - Exact: [1024, 3350, 1, 4096] + - Exact: [4096, 3300, 1, 1024] + - Exact: [4096, 3910, 1, 1024] + - Exact: [1024, 3489, 1, 4096] + - Exact: [4096, 3483, 1, 1024] + - Exact: [4096, 3532, 1, 1024] + - Exact: [4096, 3230, 1, 1024] + - Exact: [4096, 3427, 1, 1024] + - Exact: [1024, 3377, 1, 4096] + - Exact: [1024, 3488, 1, 4096] + - Exact: [1024, 3616, 1, 4096] + - Exact: [1024, 3426, 1, 4096] + - Exact: [4096, 3357, 1, 1024] + - Exact: [4096, 3406, 1, 1024] + - Exact: [1024, 3046, 1, 4096] + - Exact: [1024, 3272, 1, 4096] + - Exact: [1024, 3256, 1, 4096] + - Exact: [4096, 3247, 1, 1024] + - Exact: [4096, 3088, 1, 1024] + - Exact: [1024, 3531, 1, 4096] + - Exact: [4096, 3511, 1, 1024] + - Exact: [1024, 3720, 1, 33708] + - Exact: [1024, 3267, 1, 4096] + - Exact: [1024, 3270, 1, 4096] + - Exact: [1024, 3461, 1, 4096] + - Exact: [4096, 3474, 1, 1024] + - Exact: [4096, 
2984, 1, 1024] + - Exact: [1024, 3399, 1, 4096] + - Exact: [4096, 3574, 1, 1024] + - Exact: [1024, 3876, 1, 1024] + - Exact: [4096, 3337, 1, 1024] + - Exact: [4096, 3450, 1, 1024] + - Exact: [1024, 3720, 1, 1024] + - Exact: [1024, 4059, 1, 1024] + - Exact: [4096, 3291, 1, 1024] + - Exact: [4096, 3995, 1, 1024] + - Exact: [4096, 3491, 1, 1024] + - Exact: [4096, 3348, 1, 1024] + - Exact: [4096, 3925, 1, 1024] + - Exact: [4096, 3894, 1, 1024] + - Exact: [1024, 3456, 1, 4096] + - Exact: [1024, 3394, 1, 4096] + - Exact: [4096, 3165, 1, 1024] + - Exact: [4096, 3470, 1, 1024] + - Exact: [1024, 3014, 1, 4096] + - Exact: [1024, 3375, 1, 4096] + - Exact: [4096, 3859, 1, 1024] + - Exact: [4096, 3365, 1, 1024] + - Exact: [1024, 3162, 1, 4096] + - Exact: [1024, 3840, 1, 33708] + - Exact: [1024, 3437, 1, 4096] + - Exact: [4096, 3319, 1, 1024] + - Exact: [1024, 3320, 1, 4096] + - Exact: [4096, 3328, 1, 1024] + - Exact: [1024, 3235, 1, 4096] + - Exact: [4096, 3282, 1, 1024] + - Exact: [1024, 3367, 1, 4096] + - Exact: [1024, 3542, 1, 4096] + - Exact: [4096, 3145, 1, 1024] + - Exact: [4096, 3514, 1, 1024] + - Exact: [1024, 3432, 1, 4096] + - Exact: [4096, 3409, 1, 1024] + - Exact: [1024, 4012, 1, 33708] + - Exact: [4096, 3876, 1, 1024] + - Exact: [4096, 3299, 1, 1024] + - Exact: [1024, 3168, 1, 4096] + - Exact: [4096, 3681, 1, 1024] + - Exact: [4096, 3531, 1, 1024] + - Exact: [4096, 3388, 1, 1024] + - Exact: [1024, 3720, 1, 4096] + - Exact: [1024, 3332, 1, 4096] + - Exact: [1024, 3273, 1, 4096] + - Exact: [1024, 2935, 1, 4096] + - Exact: [1024, 3467, 1, 4096] + - Exact: [4096, 3542, 1, 1024] + - Exact: [1024, 3130, 1, 4096] + - Exact: [1024, 3405, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [4096, 3405, 1, 1024] + - Exact: [1024, 10080, 1, 1024] + - Exact: [36548, 1216, 1, 1024] + - Exact: [1024, 2592, 1, 1024] + - Exact: [1024, 1568, 1, 1024] + - Exact: [1024, 4445, 1, 1024] + - Exact: [1024, 6272, 1, 1024] + - Exact: [36548, 3584, 1, 1024] + - Exact: [1024, 1827, 1, 1024] 
+ - Exact: [1024, 3220, 1, 1024] + - Exact: [1024, 1856, 1, 1024] + - Exact: [1024, 1760, 1, 1024] + - Exact: [36548, 4235, 1, 1024] + - Exact: [1024, 1984, 1, 1024] + - Exact: [1024, 14720, 1, 1024] + - Exact: [1024, 1152, 1, 1024] + - Exact: [36548, 14976, 1, 1024] + - Exact: [36548, 1152, 1, 1024] + - Exact: [1024, 3392, 1, 1024] + - Exact: [1024, 1408, 1, 1024] + - Exact: [1024, 2080, 1, 1024] + - Exact: [1024, 1824, 1, 1024] + - Exact: [36548, 2432, 1, 1024] + - Exact: [36548, 1827, 1, 1024] + - Exact: [1024, 10176, 1, 1024] + - Exact: [1024, 1952, 1, 1024] + - Exact: [1024, 17024, 1, 1024] + - Exact: [1024, 1472, 1, 1024] + - Exact: [36548, 4459, 1, 1024] + - Exact: [1024, 3712, 1, 1024] + - Exact: [36548, 12928, 1, 1024] + - Exact: [1024, 1632, 1, 1024] + - Exact: [1024, 1696, 1, 1024] + - Exact: [36548, 1764, 1, 1024] + - Exact: [1024, 2944, 1, 1024] + - Exact: [36548, 14080, 1, 1024] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 13440, 1, 1024] + - Exact: [36548, 9120, 1, 1024] + - Exact: [1024, 3008, 1, 1024] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2208, 1, 1024] + - Exact: [1024, 1920, 1, 1024] + - Exact: [36548, 2496, 1, 1024] + - Exact: [1024, 2016, 1, 1024] + - Exact: [1024, 1184, 1, 1024] + - Exact: [1024, 1664, 1, 1024] + - Exact: [1024, 11424, 1, 1024] + - Exact: [1024, 1216, 1, 1024] + - Exact: [36548, 3185, 1, 1024] + - Exact: [36548, 9216, 1, 1024] + - Exact: [1024, 3200, 1, 1024] + - Exact: [1024, 2656, 1, 1024] + - Exact: [1024, 2368, 1, 1024] + - Exact: [1024, 4459, 1, 1024] + - Exact: [1024, 3808, 1, 1024] + - Exact: [1024, 2336, 1, 1024] + - Exact: [1024, 2304, 1, 1024] + - Exact: [1024, 1560, 1, 1024] + - Exact: [1024, 2496, 1, 1024] + - Exact: [1024, 1504, 1, 1024] + - Exact: [1024, 3232, 1, 1024] + - Exact: [36548, 1015, 1, 1024] + - Exact: [1024, 2000, 1, 1024] + - Exact: [36548, 243, 1, 1024] + - Exact: [1024, 13184, 1, 1024] + - Exact: [1024, 2688, 1, 1024] + - Exact: [36548, 950, 1, 1024] + - Exact: [1024, 1764, 1, 
1024] + - Exact: [1024, 1376, 1, 1024] + - Exact: [36548, 774, 1, 1024] + - Exact: [1024, 4256, 1, 1024] + - Exact: [36548, 3712, 1, 1024] + - Exact: [1024, 3360, 1, 1024] + - Exact: [1024, 2784, 1, 1024] + - Exact: [1024, 4992, 1, 1024] + - Exact: [36548, 1102, 1, 1024] + - Exact: [1024, 1536, 1, 1024] + - Exact: [1024, 2720, 1, 1024] + - Exact: [1024, 2752, 1, 1024] + - Exact: [1024, 2816, 1, 1024] + - Exact: [1024, 2624, 1, 1024] + - Exact: [1024, 2144, 1, 1024] + - Exact: [36548, 1131, 1, 1024] + - Exact: [1024, 3296, 1, 1024] + - Exact: [36548, 4992, 1, 1024] + - Exact: [1024, 1344, 1, 1024] + - Exact: [36548, 2401, 1, 1024] + - Exact: [1024, 15744, 1, 1024] + - Exact: [1024, 15232, 1, 1024] + - Exact: [1024, 1888, 1, 1024] + - Exact: [1024, 1792, 1, 1024] + - Exact: [36548, 1073, 1, 1024] + - Exact: [36548, 15488, 1, 1024] + - Exact: [1024, 2464, 1, 1024] + - Exact: [1024, 2272, 1, 1024] + - Exact: [1024, 2432, 1, 1024] + - Exact: [1024, 3936, 1, 1024] + - Exact: [36548, 13824, 1, 1024] + - Exact: [1024, 2401, 1, 1024] + - Exact: [1024, 2176, 1, 1024] + - Exact: [1024, 2240, 1, 1024] + - Exact: [1024, 1728, 1, 1024] + - Exact: [1024, 2528, 1, 1024] + - Exact: [1024, 2400, 1, 1024] + - Exact: [1024, 1440, 1, 1024] + - Exact: [1024, 2912, 1, 1024] + - Exact: [1024, 2880, 1, 1024] + - Exact: [1024, 4064, 1, 1024] + - Exact: [1024, 4655, 1, 1024] + - Exact: [36548, 6272, 1, 1024] + - Exact: [768, 2048, 1, 3072] + - Exact: [768, 4096, 1, 3072] + - Exact: [6272, 256, 1, 528] + - Exact: [3136, 2048, 1, 1024] + - Exact: [50176, 128, 1, 256] + - Exact: [12544, 1024, 1, 256] + - Exact: [12544, 256, 1, 1024] + - Exact: [3136, 512, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [289, 384, 32, 1024] + - Exact: [4096, 512, 1, 4096] + - Exact: [50176, 512, 1, 256] + - Exact: [12544, 1024, 1, 512] + - Exact: [12544, 256, 1, 512] + - Exact: [784, 128, 32, 256] + - Exact: [4096, 512, 1, 9216] + - Exact: [3136, 512, 1, 2048] + - Exact: [1225, 192, 32, 384] + - Exact: 
[8192, 320, 1, 1280] + - Exact: [8192, 320, 1, 2048] + - Exact: [8192, 384, 1, 1280] + - Exact: [8192, 384, 1, 2048] + - Exact: [8192, 448, 1, 2048] + - Exact: [8192, 448, 1, 1280] + - Exact: [256, 6400, 1, 4096] + - Exact: [512, 3433, 1, 2048] + - Exact: [512, 3439, 1, 2048] + - Exact: [512, 3461, 1, 2048] + - Exact: [512, 3479, 1, 2048] + - Exact: [512, 3494, 1, 2048] + - Exact: [512, 3520, 1, 2048] + - Exact: [512, 3530, 1, 2048] + - Exact: [512, 3541, 1, 2048] + - Exact: [512, 3564, 1, 2048] + - Exact: [512, 3776, 1, 2048] + - Exact: [512, 3859, 1, 512] + - Exact: [512, 3925, 1, 2048] + - Exact: [512, 3944, 1, 2048] + - Exact: [512, 3955, 1, 2048] + - Exact: [512, 3969, 1, 2048] + - Exact: [512, 3976, 1, 2048] + - Exact: [2048, 1232, 1, 512] + - Exact: [2048, 3165, 1, 512] + - Exact: [512, 2387, 1, 512] + - Exact: [512, 2418, 1, 512] + - Exact: [512, 2418, 1, 2048] + - Exact: [512, 2496, 1, 512] + - Exact: [512, 2496, 1, 2048] + - Exact: [512, 2790, 1, 2048] + - Exact: [512, 2864, 1, 2048] + - Exact: [512, 3092, 1, 2048] + - Exact: [512, 3113, 1, 2048] + - Exact: [512, 3137, 1, 2048] + - Exact: [512, 3165, 1, 2048] + - Exact: [512, 3166, 1, 2048] + - Exact: [512, 3194, 1, 2048] + - Exact: [512, 3219, 1, 2048] + - Exact: [512, 3222, 1, 2048] + - Exact: [512, 3234, 1, 2048] + - Exact: [512, 3237, 1, 2048] + - Exact: [512, 3242, 1, 2048] + - Exact: [512, 3246, 1, 2048] + - Exact: [512, 3249, 1, 2048] + - Exact: [512, 3251, 1, 2048] + - Exact: [512, 3257, 1, 2048] + - Exact: [512, 3262, 1, 2048] + - Exact: [512, 3268, 1, 2048] + - Exact: [512, 3282, 1, 2048] + - Exact: [512, 3286, 1, 2048] + - Exact: [512, 3287, 1, 2048] + - Exact: [512, 3293, 1, 2048] + - Exact: [512, 3297, 1, 2048] + - Exact: [512, 3307, 1, 2048] + - Exact: [512, 3314, 1, 2048] + - Exact: [512, 3315, 1, 2048] + - Exact: [512, 3319, 1, 2048] + - Exact: [512, 3322, 1, 2048] + - Exact: [512, 3323, 1, 2048] + - Exact: [512, 3324, 1, 2048] + - Exact: [512, 3325, 1, 2048] + - Exact: [512, 3327, 1, 
2048] + - Exact: [512, 3329, 1, 2048] + - Exact: [512, 3332, 1, 2048] + - Exact: [512, 3336, 1, 2048] + - Exact: [512, 3339, 1, 2048] + - Exact: [512, 3342, 1, 2048] + - Exact: [512, 3344, 1, 2048] + - Exact: [512, 3358, 1, 2048] + - Exact: [512, 3360, 1, 2048] + - Exact: [512, 3364, 1, 2048] + - Exact: [512, 3365, 1, 2048] + - Exact: [512, 3369, 1, 2048] + - Exact: [512, 3370, 1, 2048] + - Exact: [512, 3371, 1, 2048] + - Exact: [512, 3374, 1, 2048] + - Exact: [512, 3376, 1, 2048] + - Exact: [512, 3377, 1, 2048] + - Exact: [512, 3378, 1, 2048] + - Exact: [512, 3381, 1, 2048] + - Exact: [512, 3382, 1, 2048] + - Exact: [512, 3383, 1, 2048] + - Exact: [512, 3384, 1, 2048] + - Exact: [512, 3385, 1, 2048] + - Exact: [512, 3386, 1, 2048] + - Exact: [512, 3388, 1, 2048] + - Exact: [512, 3390, 1, 2048] + - Exact: [512, 3391, 1, 2048] + - Exact: [512, 3396, 1, 2048] + - Exact: [512, 3399, 1, 2048] + - Exact: [512, 3402, 1, 2048] + - Exact: [512, 3410, 1, 2048] + - Exact: [512, 3412, 1, 2048] + - Exact: [512, 3414, 1, 2048] + - Exact: [512, 3415, 1, 2048] + - Exact: [512, 3418, 1, 2048] + - Exact: [512, 3420, 1, 2048] + - Exact: [512, 3422, 1, 2048] + - Exact: [512, 3425, 1, 2048] + - Exact: [512, 3426, 1, 2048] + - Exact: [512, 3427, 1, 2048] + - Exact: [512, 3428, 1, 2048] + - Exact: [512, 3430, 1, 2048] + - Exact: [512, 3431, 1, 2048] + - Exact: [512, 3432, 1, 2048] + - Exact: [512, 3438, 1, 2048] + - Exact: [512, 3440, 1, 2048] + - Exact: [512, 3443, 1, 2048] + - Exact: [512, 3445, 1, 2048] + - Exact: [512, 3447, 1, 2048] + - Exact: [512, 3448, 1, 2048] + - Exact: [512, 3450, 1, 2048] + - Exact: [512, 3451, 1, 2048] + - Exact: [512, 3452, 1, 2048] + - Exact: [512, 3453, 1, 2048] + - Exact: [512, 3455, 1, 2048] + - Exact: [512, 3456, 1, 2048] + - Exact: [512, 3457, 1, 2048] + - Exact: [512, 3458, 1, 2048] + - Exact: [512, 3459, 1, 2048] + - Exact: [512, 3460, 1, 2048] + - Exact: [512, 3462, 1, 2048] + - Exact: [512, 3466, 1, 2048] + - Exact: [512, 3467, 1, 2048] + - 
Exact: [512, 3468, 1, 2048] + - Exact: [512, 3470, 1, 2048] + - Exact: [512, 3471, 1, 2048] + - Exact: [512, 3472, 1, 2048] + - Exact: [512, 3475, 1, 2048] + - Exact: [512, 3476, 1, 2048] + - Exact: [512, 3477, 1, 2048] + - Exact: [512, 3478, 1, 2048] + - Exact: [512, 3480, 1, 2048] + - Exact: [512, 3481, 1, 2048] + - Exact: [512, 3483, 1, 2048] + - Exact: [512, 3484, 1, 2048] + - Exact: [512, 3487, 1, 2048] + - Exact: [512, 3489, 1, 2048] + - Exact: [512, 3490, 1, 2048] + - Exact: [512, 3491, 1, 2048] + - Exact: [512, 3493, 1, 2048] + - Exact: [512, 3495, 1, 2048] + - Exact: [512, 3497, 1, 2048] + - Exact: [512, 3498, 1, 2048] + - Exact: [512, 3499, 1, 2048] + - Exact: [512, 3501, 1, 2048] + - Exact: [512, 3503, 1, 2048] + - Exact: [512, 3505, 1, 2048] + - Exact: [512, 3507, 1, 2048] + - Exact: [512, 3508, 1, 2048] + - Exact: [512, 3509, 1, 2048] + - Exact: [512, 3510, 1, 2048] + - Exact: [512, 3511, 1, 2048] + - Exact: [512, 3513, 1, 2048] + - Exact: [512, 3514, 1, 2048] + - Exact: [512, 3515, 1, 2048] + - Exact: [512, 3517, 1, 2048] + - Exact: [512, 3518, 1, 2048] + - Exact: [512, 3519, 1, 2048] + - Exact: [512, 3523, 1, 2048] + - Exact: [512, 3528, 1, 2048] + - Exact: [512, 3529, 1, 2048] + - Exact: [512, 3531, 1, 2048] + - Exact: [512, 3532, 1, 2048] + - Exact: [512, 3533, 1, 2048] + - Exact: [512, 3534, 1, 2048] + - Exact: [512, 3538, 1, 2048] + - Exact: [512, 3539, 1, 2048] + - Exact: [512, 3540, 1, 2048] + - Exact: [512, 3547, 1, 2048] + - Exact: [512, 3548, 1, 2048] + - Exact: [512, 3552, 1, 2048] + - Exact: [512, 3575, 1, 2048] + - Exact: [512, 3598, 1, 2048] + - Exact: [512, 3599, 1, 2048] + - Exact: [512, 3608, 1, 2048] + - Exact: [512, 3776, 1, 512] + - Exact: [512, 3780, 1, 512] + - Exact: [512, 3780, 1, 2048] + - Exact: [512, 3780, 1, 33708] + - Exact: [512, 3796, 1, 512] + - Exact: [512, 3796, 1, 2048] + - Exact: [512, 3796, 1, 33708] + - Exact: [512, 3822, 1, 512] + - Exact: [512, 3822, 1, 2048] + - Exact: [512, 3822, 1, 33708] + - Exact: [512, 
3835, 1, 512] + - Exact: [512, 3835, 1, 2048] + - Exact: [512, 3840, 1, 512] + - Exact: [512, 3840, 1, 2048] + - Exact: [512, 3840, 1, 33708] + - Exact: [512, 3859, 1, 2048] + - Exact: [512, 3859, 1, 33708] + - Exact: [512, 3864, 1, 512] + - Exact: [512, 3864, 1, 2048] + - Exact: [512, 3870, 1, 512] + - Exact: [512, 3870, 1, 2048] + - Exact: [512, 3870, 1, 33708] + - Exact: [512, 3876, 1, 512] + - Exact: [512, 3876, 1, 2048] + - Exact: [512, 3876, 1, 33708] + - Exact: [512, 3906, 1, 512] + - Exact: [512, 3906, 1, 2048] + - Exact: [512, 3906, 1, 33708] + - Exact: [512, 3910, 1, 512] + - Exact: [512, 3910, 1, 2048] + - Exact: [512, 3910, 1, 33708] + - Exact: [512, 3925, 1, 512] + - Exact: [512, 3925, 1, 33708] + - Exact: [512, 3927, 1, 512] + - Exact: [512, 3942, 1, 512] + - Exact: [512, 3942, 1, 2048] + - Exact: [512, 3942, 1, 33708] + - Exact: [512, 3944, 1, 512] + - Exact: [512, 3944, 1, 33708] + - Exact: [512, 3955, 1, 512] + - Exact: [512, 3955, 1, 33708] + - Exact: [512, 3968, 1, 512] + - Exact: [512, 3968, 1, 2048] + - Exact: [512, 3968, 1, 33708] + - Exact: [512, 3969, 1, 512] + - Exact: [512, 3969, 1, 33708] + - Exact: [512, 3976, 1, 512] + - Exact: [512, 3976, 1, 33708] + - Exact: [512, 3977, 1, 512] + - Exact: [512, 3977, 1, 2048] + - Exact: [512, 3977, 1, 33708] + - Exact: [512, 3978, 1, 512] + - Exact: [512, 3978, 1, 2048] + - Exact: [512, 3978, 1, 33708] + - Exact: [512, 3990, 1, 512] + - Exact: [512, 3990, 1, 2048] + - Exact: [512, 3990, 1, 33708] + - Exact: [512, 3995, 1, 512] + - Exact: [512, 3995, 1, 2048] + - Exact: [512, 3995, 1, 33708] + - Exact: [512, 3996, 1, 512] + - Exact: [512, 3996, 1, 2048] + - Exact: [512, 3996, 1, 33708] + - Exact: [512, 3999, 1, 512] + - Exact: [512, 3999, 1, 2048] + - Exact: [512, 3999, 1, 33708] + - Exact: [512, 4005, 1, 512] + - Exact: [512, 4005, 1, 2048] + - Exact: [512, 4005, 1, 33708] + - Exact: [512, 4012, 1, 512] + - Exact: [512, 4012, 1, 2048] + - Exact: [512, 4012, 1, 33708] + - Exact: [512, 4020, 1, 512] + - 
Exact: [512, 4020, 1, 2048] + - Exact: [512, 4020, 1, 33708] + - Exact: [512, 4026, 1, 512] + - Exact: [512, 4026, 1, 2048] + - Exact: [512, 4026, 1, 33708] + - Exact: [512, 4030, 1, 512] + - Exact: [512, 4030, 1, 2048] + - Exact: [512, 4030, 1, 33708] + - Exact: [512, 4032, 1, 512] + - Exact: [512, 4032, 1, 2048] + - Exact: [512, 4032, 1, 33708] + - Exact: [512, 4050, 1, 512] + - Exact: [512, 4059, 1, 512] + - Exact: [2048, 644, 1, 512] + - Exact: [2048, 668, 1, 512] + - Exact: [2048, 714, 1, 512] + - Exact: [2048, 720, 1, 512] + - Exact: [2048, 722, 1, 512] + - Exact: [2048, 781, 1, 512] + - Exact: [2048, 848, 1, 512] + - Exact: [2048, 872, 1, 512] + - Exact: [2048, 936, 1, 512] + - Exact: [2048, 980, 1, 512] + - Exact: [2048, 1139, 1, 512] + - Exact: [2048, 1184, 1, 512] + - Exact: [2048, 1186, 1, 512] + - Exact: [2048, 1279, 1, 512] + - Exact: [2048, 1290, 1, 512] + - Exact: [2048, 1327, 1, 512] + - Exact: [2048, 1331, 1, 512] + - Exact: [2048, 1341, 1, 512] + - Exact: [2048, 1350, 1, 512] + - Exact: [2048, 1359, 1, 512] + - Exact: [2048, 1391, 1, 512] + - Exact: [2048, 1424, 1, 512] + - Exact: [2048, 1458, 1, 512] + - Exact: [2048, 1462, 1, 512] + - Exact: [2048, 1467, 1, 512] + - Exact: [2048, 1472, 1, 512] + - Exact: [2048, 1520, 1, 512] + - Exact: [2048, 1596, 1, 512] + - Exact: [2048, 1599, 1, 512] + - Exact: [2048, 1615, 1, 512] + - Exact: [2048, 1680, 1, 512] + - Exact: [2048, 1709, 1, 512] + - Exact: [2048, 1902, 1, 512] + - Exact: [2048, 1917, 1, 512] + - Exact: [2048, 2076, 1, 512] + - Exact: [2048, 2195, 1, 512] + - Exact: [2048, 2205, 1, 512] + - Exact: [2048, 2418, 1, 512] + - Exact: [2048, 2496, 1, 512] + - Exact: [2048, 2790, 1, 512] + - Exact: [2048, 2864, 1, 512] + - Exact: [2048, 3092, 1, 512] + - Exact: [2048, 3113, 1, 512] + - Exact: [2048, 3137, 1, 512] + - Exact: [2048, 3166, 1, 512] + - Exact: [2048, 3194, 1, 512] + - Exact: [2048, 3219, 1, 512] + - Exact: [2048, 3222, 1, 512] + - Exact: [2048, 3234, 1, 512] + - Exact: [2048, 3237, 1, 
512] + - Exact: [2048, 3242, 1, 512] + - Exact: [2048, 3246, 1, 512] + - Exact: [2048, 3249, 1, 512] + - Exact: [2048, 3251, 1, 512] + - Exact: [2048, 3257, 1, 512] + - Exact: [2048, 3262, 1, 512] + - Exact: [2048, 3268, 1, 512] + - Exact: [2048, 3282, 1, 512] + - Exact: [2048, 3286, 1, 512] + - Exact: [2048, 3287, 1, 512] + - Exact: [2048, 3293, 1, 512] + - Exact: [2048, 3297, 1, 512] + - Exact: [2048, 3307, 1, 512] + - Exact: [2048, 3314, 1, 512] + - Exact: [2048, 3315, 1, 512] + - Exact: [2048, 3319, 1, 512] + - Exact: [2048, 3322, 1, 512] + - Exact: [2048, 3323, 1, 512] + - Exact: [2048, 3324, 1, 512] + - Exact: [2048, 3325, 1, 512] + - Exact: [2048, 3327, 1, 512] + - Exact: [2048, 3329, 1, 512] + - Exact: [2048, 3332, 1, 512] + - Exact: [2048, 3336, 1, 512] + - Exact: [2048, 3339, 1, 512] + - Exact: [2048, 3342, 1, 512] + - Exact: [2048, 3344, 1, 512] + - Exact: [2048, 3358, 1, 512] + - Exact: [2048, 3360, 1, 512] + - Exact: [2048, 3364, 1, 512] + - Exact: [2048, 3365, 1, 512] + - Exact: [2048, 3369, 1, 512] + - Exact: [2048, 3370, 1, 512] + - Exact: [2048, 3371, 1, 512] + - Exact: [2048, 3374, 1, 512] + - Exact: [2048, 3376, 1, 512] + - Exact: [2048, 3377, 1, 512] + - Exact: [2048, 3378, 1, 512] + - Exact: [2048, 3381, 1, 512] + - Exact: [2048, 3382, 1, 512] + - Exact: [2048, 3383, 1, 512] + - Exact: [2048, 3384, 1, 512] + - Exact: [2048, 3385, 1, 512] + - Exact: [2048, 3386, 1, 512] + - Exact: [2048, 3388, 1, 512] + - Exact: [2048, 3390, 1, 512] + - Exact: [2048, 3391, 1, 512] + - Exact: [2048, 3396, 1, 512] + - Exact: [2048, 3399, 1, 512] + - Exact: [2048, 3402, 1, 512] + - Exact: [2048, 3410, 1, 512] + - Exact: [2048, 3412, 1, 512] + - Exact: [2048, 3414, 1, 512] + - Exact: [2048, 3415, 1, 512] + - Exact: [2048, 3418, 1, 512] + - Exact: [2048, 3420, 1, 512] + - Exact: [2048, 3422, 1, 512] + - Exact: [2048, 3425, 1, 512] + - Exact: [2048, 3426, 1, 512] + - Exact: [2048, 3427, 1, 512] + - Exact: [2048, 3428, 1, 512] + - Exact: [2048, 3430, 1, 512] + - Exact: 
[2048, 3431, 1, 512] + - Exact: [2048, 3432, 1, 512] + - Exact: [2048, 3433, 1, 512] + - Exact: [2048, 3438, 1, 512] + - Exact: [2048, 3439, 1, 512] + - Exact: [2048, 3440, 1, 512] + - Exact: [2048, 3443, 1, 512] + - Exact: [2048, 3445, 1, 512] + - Exact: [2048, 3447, 1, 512] + - Exact: [2048, 3448, 1, 512] + - Exact: [2048, 3450, 1, 512] + - Exact: [2048, 3451, 1, 512] + - Exact: [2048, 3452, 1, 512] + - Exact: [2048, 3453, 1, 512] + - Exact: [2048, 3455, 1, 512] + - Exact: [2048, 3456, 1, 512] + - Exact: [2048, 3457, 1, 512] + - Exact: [2048, 3458, 1, 512] + - Exact: [2048, 3459, 1, 512] + - Exact: [2048, 3460, 1, 512] + - Exact: [2048, 3461, 1, 512] + - Exact: [2048, 3462, 1, 512] + - Exact: [2048, 3466, 1, 512] + - Exact: [2048, 3467, 1, 512] + - Exact: [2048, 3468, 1, 512] + - Exact: [2048, 3470, 1, 512] + - Exact: [2048, 3471, 1, 512] + - Exact: [2048, 3472, 1, 512] + - Exact: [2048, 3475, 1, 512] + - Exact: [2048, 3476, 1, 512] + - Exact: [2048, 3477, 1, 512] + - Exact: [2048, 3478, 1, 512] + - Exact: [2048, 3479, 1, 512] + - Exact: [2048, 3480, 1, 512] + - Exact: [2048, 3481, 1, 512] + - Exact: [2048, 3483, 1, 512] + - Exact: [2048, 3484, 1, 512] + - Exact: [2048, 3487, 1, 512] + - Exact: [2048, 3489, 1, 512] + - Exact: [2048, 3490, 1, 512] + - Exact: [2048, 3491, 1, 512] + - Exact: [2048, 3493, 1, 512] + - Exact: [2048, 3494, 1, 512] + - Exact: [2048, 3495, 1, 512] + - Exact: [2048, 3497, 1, 512] + - Exact: [2048, 3498, 1, 512] + - Exact: [2048, 3501, 1, 512] + - Exact: [2048, 3503, 1, 512] + - Exact: [2048, 3505, 1, 512] + - Exact: [2048, 3507, 1, 512] + - Exact: [2048, 3508, 1, 512] + - Exact: [2048, 3509, 1, 512] + - Exact: [2048, 3510, 1, 512] + - Exact: [2048, 3511, 1, 512] + - Exact: [2048, 3513, 1, 512] + - Exact: [2048, 3514, 1, 512] + - Exact: [2048, 3515, 1, 512] + - Exact: [2048, 3517, 1, 512] + - Exact: [2048, 3518, 1, 512] + - Exact: [2048, 3519, 1, 512] + - Exact: [2048, 3520, 1, 512] + - Exact: [2048, 3523, 1, 512] + - Exact: [2048, 3528, 1, 
512] + - Exact: [2048, 3529, 1, 512] + - Exact: [2048, 3530, 1, 512] + - Exact: [2048, 3531, 1, 512] + - Exact: [2048, 3532, 1, 512] + - Exact: [2048, 3533, 1, 512] + - Exact: [2048, 3534, 1, 512] + - Exact: [2048, 3538, 1, 512] + - Exact: [2048, 3539, 1, 512] + - Exact: [2048, 3540, 1, 512] + - Exact: [2048, 3541, 1, 512] + - Exact: [2048, 3547, 1, 512] + - Exact: [2048, 3548, 1, 512] + - Exact: [2048, 3552, 1, 512] + - Exact: [2048, 3564, 1, 512] + - Exact: [2048, 3575, 1, 512] + - Exact: [2048, 3598, 1, 512] + - Exact: [2048, 3599, 1, 512] + - Exact: [2048, 3608, 1, 512] + - Exact: [2048, 3776, 1, 512] + - Exact: [2048, 3780, 1, 512] + - Exact: [2048, 3796, 1, 512] + - Exact: [2048, 3822, 1, 512] + - Exact: [2048, 3835, 1, 512] + - Exact: [2048, 3840, 1, 512] + - Exact: [2048, 3859, 1, 512] + - Exact: [2048, 3864, 1, 512] + - Exact: [2048, 3870, 1, 512] + - Exact: [2048, 3876, 1, 512] + - Exact: [2048, 3906, 1, 512] + - Exact: [2048, 3910, 1, 512] + - Exact: [2048, 3925, 1, 512] + - Exact: [2048, 3942, 1, 512] + - Exact: [2048, 3944, 1, 512] + - Exact: [2048, 3955, 1, 512] + - Exact: [2048, 3968, 1, 512] + - Exact: [2048, 3969, 1, 512] + - Exact: [2048, 3976, 1, 512] + - Exact: [2048, 3977, 1, 512] + - Exact: [2048, 3978, 1, 512] + - Exact: [2048, 3990, 1, 512] + - Exact: [2048, 3995, 1, 512] + - Exact: [2048, 3996, 1, 512] + - Exact: [2048, 3999, 1, 512] + - Exact: [2048, 4005, 1, 512] + - Exact: [2048, 4012, 1, 512] + - Exact: [2048, 4020, 1, 512] + - Exact: [2048, 4026, 1, 512] + - Exact: [2048, 4030, 1, 512] + - Exact: [2048, 4032, 1, 512] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 3968, 1, 42720] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 7200, 1, 42720] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 
4096] + - Exact: [1024, 9520, 1, 42720] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 2048, 1, 30528] + - Exact: [1024, 4096, 1, 30528] + - Exact: [1024, 10240, 1, 256] + - Exact: [1024, 10496, 1, 256] + - Exact: [1024, 11008, 1, 256] + - Exact: [1024, 11264, 1, 256] + - Exact: [1024, 11520, 1, 256] + - Exact: [1024, 12288, 1, 256] + - Exact: [1024, 13312, 1, 256] + - Exact: [1024, 13568, 1, 256] + - Exact: [1024, 14336, 1, 256] + - Exact: [1024, 14592, 1, 256] + - Exact: [1024, 14848, 1, 256] + - Exact: [1024, 15104, 1, 256] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1600, 1, 1] + - Exact: [1024, 16128, 1, 256] + - Exact: [1024, 17152, 1, 256] + - Exact: [1024, 1792, 1, 256] + - Exact: [1024, 18944, 1, 256] + - Exact: [1024, 19712, 1, 256] + - Exact: [1024, 19968, 1, 256] + - Exact: [1024, 20480, 1, 256] + - Exact: [1024, 2048, 1, 256] + - Exact: [1024, 20992, 1, 256] + - Exact: [1024, 21504, 1, 256] + - Exact: [1024, 22016, 1, 256] + - Exact: [1024, 23552, 1, 256] + - Exact: [1024, 2560, 1, 256] + - Exact: [1024, 28672, 1, 256] + - Exact: [1024, 3072, 1, 256] + - Exact: [1024, 3328, 1, 256] + - Exact: [1024, 33536, 1, 256] + - Exact: [1024, 3840, 1, 256] + - Exact: [1024, 40448, 1, 256] + - Exact: [1024, 4096, 1, 256] + - Exact: [1024, 4608, 1, 256] + - Exact: [1024, 4864, 1, 256] + - Exact: [1024, 5120, 1, 256] + - Exact: [1024, 5632, 1, 256] + - Exact: [1024, 6144, 1, 256] + - Exact: [1024, 6400, 1, 256] + - Exact: [1024, 7168, 1, 256] + - Exact: [1024, 7424, 1, 256] + - Exact: [1024, 7680, 1, 256] + - Exact: [1024, 7936, 1, 256] + - Exact: [1024, 8192, 1, 256] + - Exact: [1024, 8448, 1, 256] + - Exact: [1024, 8704, 1, 256] + - Exact: [1024, 8960, 1, 256] + - Exact: [1024, 9728, 1, 256] 
+ - Exact: [1024, 9984, 1, 256] + - Exact: [2048, 1024, 1, 1] + - Exact: [2048, 1024, 1, 256] + - Exact: [256, 8976, 1, 10240] + - Exact: [256, 8976, 1, 10496] + - Exact: [256, 8976, 1, 11008] + - Exact: [256, 8976, 1, 11520] + - Exact: [256, 8976, 1, 12288] + - Exact: [256, 8976, 1, 14336] + - Exact: [256, 8976, 1, 14848] + - Exact: [256, 8976, 1, 15104] + - Exact: [256, 8976, 1, 1536] + - Exact: [256, 8976, 1, 15872] + - Exact: [256, 8976, 1, 17152] + - Exact: [256, 8976, 1, 19712] + - Exact: [256, 8976, 1, 19968] + - Exact: [256, 8976, 1, 20480] + - Exact: [256, 8976, 1, 2048] + - Exact: [256, 8976, 1, 20992] + - Exact: [256, 8976, 1, 22016] + - Exact: [256, 8976, 1, 2304] + - Exact: [256, 8976, 1, 2560] + - Exact: [256, 8976, 1, 26112] + - Exact: [256, 8976, 1, 2816] + - Exact: [256, 8976, 1, 3072] + - Exact: [256, 8976, 1, 33536] + - Exact: [256, 8976, 1, 4352] + - Exact: [256, 8976, 1, 44505] + - Exact: [256, 8976, 1, 4864] + - Exact: [256, 8976, 1, 5376] + - Exact: [256, 8976, 1, 5632] + - Exact: [256, 8976, 1, 5888] + - Exact: [256, 8976, 1, 6144] + - Exact: [256, 8976, 1, 6656] + - Exact: [256, 8976, 1, 7168] + - Exact: [256, 8976, 1, 7424] + - Exact: [256, 8976, 1, 8192] + - Exact: [256, 8976, 1, 8448] + - Exact: [256, 8976, 1, 8960] + - Exact: [256, 8976, 1, 9472] + - Exact: [256, 8976, 1, 9728] + - Exact: [256, 8976, 1, 9984] + - Exact: [3200, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 1] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 3072, 1, 3072] + - Exact: [1024, 2048, 1, 3072] + - Exact: [30528, 4096, 1, 1024] + - Exact: [30528, 2048, 1, 1024] + - Exact: [512, 32768, 1, 256] + - Exact: [256, 32768, 1, 128] + - Exact: [1024, 32768, 1, 512] + - Exact: [1024, 32768, 1, 1024] + - Exact: [479, 32768, 1, 1024] + - Exact: [289, 128, 64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 256, 64, 
1024] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [256, 6912, 1, 4] + - Exact: [512, 4096, 1, 256] + - Exact: [1024, 4096, 1, 512] + - Exact: [480, 4096, 1, 1024] + - Exact: [512, 6912, 1, 256] + - Exact: [1024, 6912, 1, 512] + - Exact: [1024, 6912, 1, 1024] + - Exact: [480, 6912, 1, 1024] + - Exact: [256, 55296, 1, 128] + - Exact: [512, 55296, 1, 256] + - Exact: [1920, 2048, 1, 2048] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3840, 4096, 1, 4096] + - Exact: [7680, 8192, 1, 8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 3072, 1, 3072] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8192, 1, 8192] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [1152, 1152, 1, 384] + - Exact: [1536, 1536, 1, 384] + - Exact: [1920, 1920, 1, 384] + - Exact: [2304, 2304, 1, 384] + - Exact: [2688, 2688, 1, 384] + - Exact: [3072, 3072, 1, 384] + - Exact: [3456, 3456, 1, 384] + - Exact: [3840, 3840, 1, 384] + - Exact: [4224, 4224, 1, 384] + - Exact: [4608, 4608, 1, 384] + - Exact: [4992, 4992, 1, 384] + - Exact: [5376, 5376, 1, 384] + - Exact: [5760, 5760, 1, 384] + - Exact: [6144, 6144, 1, 384] + - Exact: [6528, 6528, 1, 384] + - Exact: [6912, 6912, 1, 384] + - Exact: [7296, 7296, 1, 384] + - Exact: [7680, 7680, 1, 384] + - Exact: [8064, 8064, 1, 384] + - Exact: [8448, 8448, 1, 384] + - Exact: [8832, 8832, 1, 384] + - Exact: [9216, 9216, 1, 384] + - Exact: [9600, 9600, 
1, 384] + - Exact: [9984, 9984, 1, 384] + - Exact: [10368, 10368, 1, 384] + - Exact: [10752, 10752, 1, 384] + - Exact: [11136, 11136, 1, 384] + - Exact: [11520, 11520, 1, 384] + - Exact: [11904, 11904, 1, 384] + - Exact: [12288, 12288, 1, 384] + - Exact: [12672, 12672, 1, 384] + - Exact: [13056, 13056, 1, 384] + - Exact: [13440, 13440, 1, 384] + - Exact: [13824, 13824, 1, 384] + - Exact: [14208, 14208, 1, 384] + - Exact: [14592, 14592, 1, 384] + - Exact: [14976, 14976, 1, 384] + - Exact: [15360, 15360, 1, 384] + - Exact: [15744, 15744, 1, 384] + - Exact: [16128, 16128, 1, 384] + - Exact: [16512, 16512, 1, 384] + - Exact: [16896, 16896, 1, 384] + - Exact: [17280, 17280, 1, 384] + - Exact: [17664, 17664, 1, 384] + - Exact: [18048, 18048, 1, 384] + - Exact: [18432, 18432, 1, 384] + - Exact: [18816, 18816, 1, 384] + - Exact: [19200, 19200, 1, 384] + - Exact: [19584, 19584, 1, 384] + - Exact: [19968, 19968, 1, 384] + - Exact: [20352, 20352, 1, 384] + - Exact: [20736, 20736, 1, 384] + - Exact: [21120, 21120, 1, 384] + - Exact: [21504, 21504, 1, 384] + - Exact: [21888, 21888, 1, 384] + - Exact: [22272, 22272, 1, 384] + - Exact: [22656, 22656, 1, 384] + - Exact: [23040, 23040, 1, 384] + - Exact: [8192, 1024, 1, 1024] + - Exact: [8192, 4096, 1, 1024] + - Exact: [16384, 16384, 1, 16384] + - Exact: [1444, 256, 120, 128] + - Exact: [1444, 256, 139, 128] + - Exact: [1444, 256, 160, 128] + - Exact: [1444, 256, 18, 128] + - Exact: [1444, 256, 19, 128] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 139, 256] + - Exact: [1444, 256, 160, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 256, 120, 512] + - Exact: [361, 256, 139, 512] + - Exact: [361, 256, 160, 512] + - Exact: [361, 256, 18, 512] + - Exact: [361, 256, 19, 512] + - Exact: [173280, 128, 1, 64] + - Exact: [200716, 128, 1, 64] + - Exact: [231040, 128, 1, 64] + - Exact: [25992, 128, 1, 64] + - Exact: [27436, 128, 1, 64] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 
4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [1024, 1280, 1, 2] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [1024, 4992, 1, 2] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [1024, 5120, 1, 2] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [1024, 5248, 1, 2] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [1024, 2560, 1, 2] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 1152, 1, 2] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 8192, 1, 33712] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [1024, 9600, 1, 33712] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10080, 1, 42720] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 6528, 1, 42720] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7104, 1, 42720] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [480, 32768, 1, 1024] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1024, 1, 30592] + - Exact: [2048, 1024, 1, 6144] + - Exact: [2048, 1024, 1, 8192] + - Exact: [8192, 1024, 1, 2048] + - Exact: [1024, 8192, 1, 30592] + - Exact: [1024, 8192, 1, 3072] + - Exact: [512, 512, 256, 64] + - Exact: [1024, 2048, 1, 30592] + - Exact: [1024, 4096, 1, 30592] + - Exact: [512, 
512, 128, 64] + - Exact: [2560, 2048, 1, 1920] + - Exact: [2560, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 7680] + - Exact: [640, 2048, 1, 2560] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 4608] + - Exact: [1536, 4096, 1, 50304] + - Exact: [1536, 4096, 1, 6144] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1536, 8192, 1, 4608] + - Exact: [1536, 8192, 1, 50304] + - Exact: [1536, 8192, 1, 6144] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 16384, 1, 3072] + - Exact: [1024, 16384, 1, 4096] + - Exact: [1024, 16384, 1, 50304] + - Exact: [4096, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [1024, 2048, 1, 50304] + - Exact: [1024, 1024, 32, 64] + - Exact: [1024, 4096, 1, 50304] + - Exact: [1024, 1024, 64, 64] + - Exact: [1024, 8192, 1, 50304] + - Exact: [1024, 1024, 128, 64] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 8192, 1, 30528] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 512] + - Exact: [256, 6912, 1, 128] + - Exact: [480, 3456, 1, 1024] + - Exact: [512, 3456, 1, 256] + - Exact: [1024, 1280, 1, 30528] + - Exact: [1024, 1600, 1, 30528] + - Exact: [1024, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [4096, 10240, 1, 1024] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 1640, 1, 30528] + - Exact: [1024, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 4096] + - Exact: [4096, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [1024, 6144, 1, 4096] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [512, 512, 192, 64] + - Exact: [256, 6912, 1, 1] + - Exact: [3136, 128, 64, 64] + - Exact: [3136, 256, 64, 128] + - Exact: [784, 512, 64, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 1024, 64, 512] + - Exact: [784, 256, 64, 512] + - Exact: [784, 512, 64, 512] + - Exact: [196, 
512, 64, 1024] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 64] + - Exact: [3136, 256, 32, 128] + - Exact: [784, 512, 32, 256] + - Exact: [3136, 128, 32, 256] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 1024, 32, 512] + - Exact: [784, 256, 32, 512] + - Exact: [784, 512, 32, 512] + - Exact: [196, 512, 32, 1024] + - Exact: [196, 1024, 32, 1024] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [1024, 10224, 1, 3072] + - Exact: [4096, 10224, 1, 1024] + - Exact: [1024, 10240, 1, 3072] + - Exact: [1024, 10192, 1, 3072] + - Exact: [4096, 10192, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [1024, 10200, 1, 3072] + - Exact: [1024, 10184, 1, 1024] + - Exact: [4096, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 3072] + - Exact: [1024, 10208, 1, 4096] + - Exact: [1024, 10224, 1, 2048] + - Exact: [1024, 10240, 1, 2048] + - Exact: [1024, 10120, 1, 1024] + - Exact: [1024, 10192, 1, 2048] + - Exact: [1024, 10152, 1, 1024] + - Exact: [1024, 10080, 1, 3072] + - Exact: [100352, 512, 1, 256] + - Exact: [12544, 2048, 1, 1024] + - Exact: [200704, 512, 1, 256] + - Exact: [25088, 1024, 1, 512] + - Exact: [50176, 1024, 1, 512] + - Exact: [6272, 2048, 1, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 256, 1024] + - Exact: [196, 512, 128, 1024] + - Exact: [196, 512, 256, 1024] + - Exact: [3136, 128, 128, 256] + - Exact: [3136, 128, 256, 256] + - Exact: [784, 256, 128, 512] + - Exact: [784, 256, 256, 512] + - Exact: [128, 128, 2048, 64] + - Exact: [1024, 2560, 1, 30528] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 4096] + - Exact: [1024, 12288, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [1024, 1920, 1, 30528] + - Exact: [128, 128, 192, 64] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 2] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - 
Exact: [768, 4608, 1, 768] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 2] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [1024, 4608, 1, 1024] + - Exact: [3072, 256, 2, 1024] + - Exact: [2852, 256, 2, 1024] + - Exact: [3220, 256, 2, 1024] + - Exact: [850, 2048, 2, 512] + - Exact: [768, 2048, 2, 512] + - Exact: [2904, 256, 2, 1024] + - Exact: [805, 2048, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [2992, 256, 2, 1024] + - Exact: [3400, 256, 2, 1024] + - Exact: [4032, 256, 2, 1024] + - Exact: [15200, 128, 2, 512] + - Exact: [12288, 128, 2, 512] + - Exact: [888, 2048, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [3456, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2688, 256, 2, 1024] + - Exact: [13824, 128, 2, 512] + - Exact: [3036, 256, 2, 1024] + - Exact: [3168, 256, 2, 1024] + - Exact: [3360, 256, 2, 1024] + - Exact: [3552, 256, 2, 1024] + - Exact: [11616, 128, 2, 512] + - Exact: [4200, 256, 2, 1024] + - Exact: [840, 2048, 2, 512] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [3264, 256, 2, 1024] + - Exact: [713, 2048, 2, 512] + - Exact: [13600, 256, 2, 512] + - Exact: [12880, 256, 2, 512] + - Exact: [12288, 256, 2, 512] + - Exact: [2816, 256, 2, 1024] + - Exact: [850, 2048, 1, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [672, 2048, 2, 512] + - Exact: [13440, 128, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [3500, 256, 2, 1024] + - Exact: [13824, 256, 2, 512] + - Exact: [15200, 256, 2, 512] + - Exact: [3700, 256, 2, 1024] + - Exact: [748, 2048, 2, 512] + - Exact: [3600, 256, 2, 1024] + - Exact: [4032, 1024, 2, 256] + - Exact: [16128, 128, 2, 512] + - Exact: [15200, 128, 1, 512] + - Exact: [13600, 128, 1, 512] + - Exact: [2904, 1024, 2, 256] + - Exact: [2992, 1024, 2, 256] + - Exact: [1536, 2048, 1, 1024] + - Exact: [24576, 128, 1, 256] + - Exact: [24576, 512, 1, 256] + 
- Exact: [25760, 128, 1, 256] + - Exact: [25760, 512, 1, 256] + - Exact: [6144, 256, 1, 512] + - Exact: [6440, 256, 1, 512] + - Exact: [3036, 1024, 2, 256] + - Exact: [13600, 512, 1, 128] + - Exact: [9408, 512, 2, 128] + - Exact: [56000, 256, 2, 64] + - Exact: [2852, 1024, 2, 256] + - Exact: [2816, 1024, 2, 256] + - Exact: [60800, 256, 1, 64] + - Exact: [2944, 1024, 2, 256] + - Exact: [11776, 512, 2, 128] + - Exact: [11616, 512, 2, 128] + - Exact: [4200, 1024, 2, 256] + - Exact: [54400, 256, 1, 64] + - Exact: [15200, 512, 1, 128] + - Exact: [2688, 1024, 2, 256] + - Exact: [12672, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [46464, 256, 2, 64] + - Exact: [2400, 256, 2, 1024] + - Exact: [2520, 256, 2, 1024] + - Exact: [2400, 1024, 2, 256] + - Exact: [10752, 128, 2, 512] + - Exact: [45632, 256, 2, 64] + - Exact: [2520, 1024, 2, 256] + - Exact: [53760, 256, 2, 64] + - Exact: [2352, 256, 2, 1024] + - Exact: [47872, 256, 2, 64] + - Exact: [47104, 256, 2, 64] + - Exact: [50688, 256, 2, 64] + - Exact: [45056, 256, 2, 64] + - Exact: [13440, 512, 2, 128] + - Exact: [2352, 1024, 2, 256] + - Exact: [11264, 512, 2, 128] + - Exact: [10560, 128, 2, 512] + - Exact: [16128, 512, 2, 128] + - Exact: [37632, 256, 2, 64] + - Exact: [51520, 256, 2, 64] + - Exact: [14000, 512, 2, 128] + - Exact: [10560, 512, 2, 128] + - Exact: [64512, 256, 2, 64] + - Exact: [54400, 256, 2, 64] + - Exact: [3264, 1024, 2, 256] + - Exact: [10752, 512, 2, 128] + - Exact: [3168, 1024, 2, 256] + - Exact: [950, 2048, 1, 512] + - Exact: [55296, 256, 2, 256] + - Exact: [51520, 256, 2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [60800, 256, 2, 256] + - Exact: [54400, 256, 2, 256] + - Exact: [3700, 1024, 2, 256] + - Exact: [60800, 256, 2, 64] + - Exact: [3800, 1024, 1, 256] + - Exact: [3400, 1024, 1, 256] + - Exact: [3072, 1024, 2, 256] + - Exact: [3600, 1024, 2, 256] + - Exact: [12288, 512, 2, 128] + - Exact: [49152, 256, 2, 256] + - Exact: [12880, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - 
Exact: [42240, 256, 2, 64] + - Exact: [1008, 2048, 2, 512] + - Exact: [3360, 1024, 2, 256] + - Exact: [14208, 512, 2, 128] + - Exact: [56832, 256, 2, 64] + - Exact: [43008, 256, 2, 64] + - Exact: [13600, 512, 2, 128] + - Exact: [3500, 1024, 2, 256] + - Exact: [2640, 1024, 2, 256] + - Exact: [13824, 512, 2, 128] + - Exact: [3800, 256, 2, 1024] + - Exact: [55296, 256, 2, 64] + - Exact: [2640, 256, 2, 1024] + - Exact: [15200, 512, 2, 128] + - Exact: [3552, 1024, 2, 256] + - Exact: [3220, 1024, 2, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [49152, 256, 2, 64] + - Exact: [3400, 1024, 2, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [3800, 1024, 2, 256] + - Exact: [1610, 2048, 1, 1024] + - Exact: [6912, 256, 1, 512] + - Exact: [6800, 256, 1, 512] + - Exact: [27648, 128, 1, 256] + - Exact: [27200, 128, 1, 256] + - Exact: [30400, 128, 1, 256] + - Exact: [7600, 256, 1, 512] + - Exact: [6144, 1024, 1, 512] + - Exact: [6912, 1024, 1, 512] + - Exact: [6440, 1024, 1, 512] + - Exact: [27648, 512, 1, 256] + - Exact: [1728, 2048, 1, 1024] + - Exact: [27200, 512, 1, 256] + - Exact: [6800, 1024, 1, 512] + - Exact: [1700, 2048, 1, 1024] + - Exact: [7600, 1024, 1, 512] + - Exact: [30400, 512, 1, 256] + - Exact: [1900, 2048, 1, 1024] + - Exact: [12544, 1024, 1, 1024] + - Exact: [1024, 1024, 160, 96] + - Exact: [1920, 16384, 1, 25216] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [960, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 2880] + - Exact: [1024, 1024, 40, 96] + - Exact: [1920, 4096, 1, 25216] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [960, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 2880] + - Exact: [1024, 1024, 80, 96] + - Exact: [1920, 8192, 1, 25216] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [960, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 2880] + - Exact: [1024, 1024, 96, 96] + - Exact: [2304, 16384, 1, 12672] + - Exact: [2304, 16384, 1, 2304] + - Exact: [576, 16384, 1, 
2304] + - Exact: [2304, 16384, 1, 1728] + - Exact: [1024, 1024, 24, 96] + - Exact: [2304, 4096, 1, 12672] + - Exact: [2304, 4096, 1, 2304] + - Exact: [576, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 1728] + - Exact: [1024, 1024, 48, 96] + - Exact: [2304, 8192, 1, 12672] + - Exact: [2304, 8192, 1, 2304] + - Exact: [576, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 1728] + - Exact: [1024, 1024, 16, 96] + - Exact: [3072, 4096, 1, 6400] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [384, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1152] + - Exact: [1024, 1024, 32, 96] + - Exact: [3072, 8192, 1, 6400] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [384, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1152] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 4096, 1, 2048] + - Exact: [1024, 2283, 1, 29000] + - Exact: [1024, 2296, 1, 29000] + - Exact: [1024, 2306, 1, 29000] + - Exact: [1024, 2309, 1, 29000] + - Exact: [1024, 2318, 1, 29000] + - Exact: [1024, 2320, 1, 29000] + - Exact: [1024, 2324, 1, 29000] + - Exact: [1024, 2325, 1, 29000] + - Exact: [1024, 2329, 1, 29000] + - Exact: [1024, 2338, 1, 29000] + - Exact: [1024, 2345, 1, 29000] + - Exact: [1024, 2350, 1, 29000] + - Exact: [1024, 2362, 1, 29000] + - Exact: [1024, 2366, 1, 29000] + - Exact: [1024, 2368, 1, 29000] + - Exact: [1024, 2374, 1, 29000] + - Exact: [1024, 2390, 1, 29000] + - Exact: [512, 512, 320, 64] + - Exact: [512, 512, 80, 64] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 3072] + - Exact: [1024, 32768, 1, 4096] + - Exact: [1024, 32768, 1, 50304] + - Exact: [4096, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # 
, 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [768, 320, 1, 30522] + - Exact: [768, 640, 1, 30522] + - Exact: [768, 1280, 1, 30522] + - Exact: [1024, 780, 1, 30522] + - Exact: [1024, 308, 1, 30522] + - Exact: [1024, 800, 1, 30522] + - Exact: [1024, 820, 1, 30522] + - Exact: [1024, 385, 1, 30522] + - Exact: [1024, 462, 1, 30522] + - Exact: [1024, 640, 1, 30528] + - Exact: [2048, 199, 1, 29000] + - Exact: [2048, 221, 1, 29000] + - Exact: [2048, 224, 1, 29000] + - Exact: [2048, 229, 1, 29000] + - Exact: [2048, 234, 1, 29000] + - Exact: [2048, 242, 1, 29000] + - Exact: [2048, 246, 1, 29000] + - Exact: [2048, 247, 1, 29000] + - Exact: [2048, 256, 1, 29000] + - Exact: [2048, 262, 1, 29000] + - Exact: [2048, 264, 1, 29000] + - Exact: [2048, 265, 1, 29000] + - Exact: [2048, 274, 1, 29000] + - Exact: [2048, 277, 1, 29000] + - Exact: [2048, 279, 1, 29000] + - Exact: [2048, 288, 1, 29000] + - Exact: [2048, 296, 1, 29000] + - Exact: [2048, 315, 1, 29000] + - Exact: [2048, 335, 1, 29000] + - Exact: [1024, 561, 1, 29000] + - Exact: [1024, 574, 1, 29000] + - Exact: [1024, 600, 1, 29000] + - Exact: [1024, 608, 1, 29000] + - Exact: [1024, 615, 1, 29000] + - Exact: [1024, 622, 1, 29000] + - Exact: [1024, 625, 1, 29000] + - Exact: [1024, 626, 1, 29000] + - Exact: [1024, 628, 1, 29000] + - Exact: [1024, 636, 1, 29000] + - Exact: [1024, 651, 1, 29000] + - Exact: [1024, 658, 1, 29000] + - Exact: [1024, 669, 1, 29000] + - Exact: [1024, 670, 1, 29000] + - 
Exact: [1024, 672, 1, 29000] + - Exact: [1024, 684, 1, 29000] + - Exact: [1024, 716, 1, 29000] + - Exact: [1024, 730, 1, 29000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1600, 512, 1, 1024] + - Exact: [1024, 512, 1, 1] + - Exact: [1024, 512, 1, 64] + - Exact: [2048, 512, 1, 1] + - Exact: [768, 640, 1, 768] + - Exact: [768, 1024, 1, 2] + - Exact: [768, 1024, 1, 768] + - Exact: [768, 1280, 1, 768] + - Exact: [768, 512, 1, 2] + - Exact: [768, 512, 1, 768] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 2] + - Exact: [64, 64, 768, 64] + - Exact: [64, 64, 96, 64] + - Exact: [704, 1024, 1, 128] + - Exact: [1024, 1024, 1, 3328] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - 
Exact: [3072, 128, 1, 1024] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [64, 193600, 1, 64] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [512, 1500, 1, 2816] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [448, 1024, 1, 256] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [2944, 448, 1, 128] + - Exact: [64, 193600, 1, 256] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [512, 1500, 1, 2048] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 5888, 1, 1280] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 448, 1, 128] + - Exact: [704, 1408, 1, 128] + 
- Exact: [4288, 128, 1, 1280] + - Exact: [128, 2944, 1, 128] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [3025, 64, 64, 64] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [512, 1500, 1, 1536] + - Exact: [128, 3584, 1, 1280] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [1024, 1024, 1, 128] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [448, 1024, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 
3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [448, 1856, 1, 3328] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [128, 2368, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [1024, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 2944, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 3328] + - Exact: [64, 5056, 1, 128] + - Exact: [64, 6784, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1408, 448, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [128, 3584, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [5888, 64, 1, 128] + - Exact: [5056, 
128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [512, 1500, 1, 2560] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 256, 256] + - Exact: [1024, 512, 1, 2048] + - Exact: [4096, 256, 1, 2048] + - Exact: [2048, 256, 1, 4096] + - Exact: [512, 768, 1, 2048] + - Exact: [2048, 256, 1, 1024] + - Exact: [2048, 200, 1, 512] + - Exact: [4096, 200, 1, 1024] + - Exact: [2048, 200, 1, 4096] + - Exact: [2048, 512, 1, 1024] + - Exact: [1024, 1024, 1, 512] + - Exact: [2048, 512, 1, 4096] + - Exact: [1024, 1024, 1, 4096] + - Exact: [4096, 200, 1, 2048] + - Exact: [2048, 200, 1, 1024] + - Exact: [1024, 768, 1, 512] + - Exact: [2048, 200, 1, 2048] + - Exact: [2048, 256, 1, 2048] + - Exact: [512, 768, 1, 512] + - Exact: [4096, 256, 1, 4096] + - Exact: [1024, 512, 1, 512] + - Exact: [1024, 1024, 1, 2048] + - Exact: [4096, 256, 1, 1024] + - Exact: [512, 768, 1, 1024] + - Exact: [1024, 512, 1, 4096] + - Exact: [4096, 200, 1, 4096] + - Exact: [2048, 256, 1, 512] + - Exact: [1024, 1024, 1, 1024] + - Exact: [4096, 192, 1, 2048] + - Exact: [5329, 64, 64, 160] + - Exact: [1225, 64, 64, 384] + - Exact: [4096, 320, 1, 1280] + - Exact: [4096, 192, 1, 1280] + - Exact: [1225, 96, 64, 384] + - Exact: [4096, 320, 1, 2048] + - Exact: [4096, 256, 1, 1536] + - Exact: [64, 147, 432, 148] + - Exact: [64, 123, 528, 123] + - Exact: [64, 111, 576, 112] + - Exact: [64, 77, 816, 77] + - Exact: [64, 92, 688, 92] + - Exact: [64, 159, 400, 159] + - Exact: [64, 85, 752, 84] + - Exact: [64, 122, 528, 123] + - Exact: [64, 93, 688, 92] + - Exact: [64, 102, 624, 99] + - Exact: [64, 133, 480, 133] + - Exact: [64, 232, 272, 232] + - Exact: [64, 162, 400, 159] + - Exact: [64, 78, 816, 78] + - Exact: [64, 99, 624, 99] + - Exact: [64, 101, 624, 102] + - Exact: [64, 111, 576, 111] + - Exact: 
[64, 134, 480, 134] + - Exact: [64, 135, 480, 132] + - Exact: [64, 134, 480, 132] + - Exact: [64, 134, 480, 135] + - Exact: [64, 162, 400, 162] + - Exact: [64, 102, 624, 102] + - Exact: [64, 135, 480, 133] + - Exact: [64, 148, 432, 143] + - Exact: [64, 100, 624, 100] + - Exact: [64, 65, 992, 65] + - Exact: [64, 122, 528, 122] + - Exact: [64, 228, 272, 228] + - Exact: [64, 112, 576, 111] + - Exact: [64, 143, 432, 143] + - Exact: [64, 135, 480, 135] + - Exact: [64, 232, 272, 228] + - Exact: [64, 193, 320, 193] + - Exact: [64, 71, 896, 71] + - Exact: [64, 84, 752, 84] + - Exact: [64, 132, 480, 132] + - Exact: [64, 85, 752, 85] + - Exact: [64, 102, 624, 100] + - Exact: [64, 78, 816, 77] + - Exact: [64, 112, 576, 112] + - Exact: [64, 148, 432, 148] + - Exact: [64, 159, 400, 160] + - Exact: [64, 102, 624, 101] + - Exact: [64, 101, 624, 101] + - Exact: [64, 160, 400, 160] + - Exact: [64, 93, 688, 93] + - Exact: [64, 147, 432, 147] + - Exact: [64, 100, 624, 102] + - Exact: [64, 177, 352, 177] + - Exact: [500, 1024, 1, 512] + - Exact: [512, 1024, 1, 512] + - Exact: [200, 2048, 1, 512] + - Exact: [512, 2000, 1, 1024] + - Exact: [512, 2048, 1, 512] + - Exact: [200, 2000, 1, 100] + - Exact: [200, 2000, 1, 1024] + - Exact: [500, 1024, 1, 2048] + - Exact: [512, 2048, 1, 100] + - Exact: [512, 2048, 1, 2000] + - Exact: [200, 2000, 1, 10] + - Exact: [500, 2048, 1, 1024] + - Exact: [500, 2000, 1, 10] + - Exact: [500, 2048, 1, 100] + - Exact: [512, 1024, 1, 500] + - Exact: [200, 2000, 1, 2000] + - Exact: [500, 2048, 1, 2000] + - Exact: [512, 2048, 1, 1024] + - Exact: [512, 1024, 1, 100] + - Exact: [256, 2000, 1, 10] + - Exact: [512, 2000, 1, 100] + - Exact: [512, 2000, 1, 2048] + - Exact: [500, 1024, 1, 500] + - Exact: [256, 2000, 1, 100] + - Exact: [512, 1024, 1, 2048] + - Exact: [500, 2048, 1, 2048] + - Exact: [200, 2048, 1, 10] + - Exact: [500, 2000, 1, 512] + - Exact: [500, 1024, 1, 1024] + - Exact: [200, 2000, 1, 500] + - Exact: [256, 2048, 1, 100] + - Exact: [500, 2000, 1, 
1024] + - Exact: [256, 2048, 1, 1024] + - Exact: [200, 2048, 1, 1024] + - Exact: [512, 2048, 1, 500] + - Exact: [512, 2000, 1, 10] + - Exact: [500, 1024, 1, 2000] + - Exact: [512, 2000, 1, 512] + - Exact: [500, 2000, 1, 2000] + - Exact: [500, 1024, 1, 10] + - Exact: [256, 2048, 1, 10] + - Exact: [256, 2048, 1, 500] + - Exact: [256, 2048, 1, 2048] + - Exact: [256, 2000, 1, 512] + - Exact: [512, 1024, 1, 2000] + - Exact: [256, 2000, 1, 2000] + - Exact: [256, 2048, 1, 2000] + - Exact: [200, 2048, 1, 100] + - Exact: [200, 2000, 1, 2048] + - Exact: [500, 2048, 1, 512] + - Exact: [500, 2000, 1, 500] + - Exact: [200, 2048, 1, 2048] + - Exact: [200, 2048, 1, 500] + - Exact: [512, 2000, 1, 500] + - Exact: [200, 2048, 1, 2000] + - Exact: [500, 1024, 1, 100] + - Exact: [512, 1024, 1, 10] + - Exact: [512, 1024, 1, 1024] + - Exact: [500, 2048, 1, 10] + - Exact: [200, 2000, 1, 512] + - Exact: [256, 2000, 1, 500] + - Exact: [256, 2048, 1, 512] + - Exact: [256, 2000, 1, 2048] + - Exact: [500, 2048, 1, 500] + - Exact: [256, 2000, 1, 1024] + - Exact: [500, 2000, 1, 2048] + - Exact: [512, 2000, 1, 2000] + - Exact: [512, 2048, 1, 2048] + - Exact: [512, 2048, 1, 10] + - Exact: [500, 2000, 1, 100] + - Exact: [1024, 1131, 1, 1024] + - Exact: [1024, 1102, 1, 1024] + - Exact: [1024, 774, 1, 1024] + - Exact: [4096, 128, 1, 2048] + - Exact: [4096, 128, 1, 3072] + - Exact: [1024, 1120, 1, 1024] + - Exact: [1024, 1015, 1, 1024] + - Exact: [1024, 992, 1, 1024] + - Exact: [1024, 950, 1, 1024] + - Exact: [1024, 1088, 1, 1024] + - Exact: [64, 128, 96, 128] + - Exact: [768, 1024, 1, 3072] + - Exact: [768, 512, 1, 3072] + - Exact: [64, 256, 192, 256] + - Exact: [64, 128, 384, 128] + - Exact: [64, 256, 96, 256] + - Exact: [6272, 112, 1, 512] + - Exact: [2048, 320, 1, 1280] + - Exact: [5329, 64, 1, 448] + - Exact: [784, 64, 32, 192] + - Exact: [6272, 64, 1, 480] + - Exact: [6272, 64, 1, 512] + - Exact: [6272, 160, 1, 528] + - Exact: [289, 160, 32, 768] + - Exact: [5329, 64, 32, 160] + - Exact: [5329, 
96, 1, 576] + - Exact: [1225, 64, 32, 288] + - Exact: [289, 192, 32, 768] + - Exact: [2048, 448, 1, 1280] + - Exact: [3136, 64, 32, 64] + - Exact: [6272, 128, 1, 528] + - Exact: [6272, 96, 1, 480] + - Exact: [2048, 448, 1, 2048] + - Exact: [784, 96, 32, 192] + - Exact: [1001, 512, 1, 4096] + - Exact: [2048, 192, 1, 1280] + - Exact: [1225, 64, 32, 256] + - Exact: [2048, 256, 1, 1536] + - Exact: [6272, 128, 1, 512] + - Exact: [1568, 384, 1, 832] + - Exact: [1568, 256, 1, 832] + - Exact: [1568, 192, 1, 832] + - Exact: [289, 192, 32, 1024] + - Exact: [1225, 64, 32, 384] + - Exact: [2048, 320, 1, 2048] + - Exact: [2048, 384, 1, 1536] + - Exact: [5041, 96, 1, 576] + - Exact: [6272, 192, 1, 480] + - Exact: [5041, 192, 1, 720] + - Exact: [289, 128, 32, 768] + - Exact: [12544, 64, 1, 147] + - Exact: [6272, 160, 1, 512] + - Exact: [1225, 64, 32, 192] + - Exact: [784, 64, 32, 256] + - Exact: [6272, 144, 1, 512] + - Exact: [8192, 192, 1, 1280] + - Exact: [8192, 192, 1, 2048] + - Exact: [65, 6400, 1, 1024] + - Exact: [512, 1290, 1, 2048] + - Exact: [512, 2205, 1, 2048] + - Exact: [64, 512, 16, 512] + - Exact: [512, 600, 1, 2048] + - Exact: [512, 644, 1, 512] + - Exact: [512, 644, 1, 2048] + - Exact: [512, 668, 1, 2048] + - Exact: [512, 714, 1, 512] + - Exact: [512, 714, 1, 2048] + - Exact: [512, 720, 1, 512] + - Exact: [512, 720, 1, 2048] + - Exact: [512, 722, 1, 2048] + - Exact: [512, 781, 1, 512] + - Exact: [512, 781, 1, 2048] + - Exact: [512, 848, 1, 2048] + - Exact: [512, 872, 1, 2048] + - Exact: [512, 936, 1, 512] + - Exact: [512, 936, 1, 2048] + - Exact: [512, 980, 1, 512] + - Exact: [512, 980, 1, 2048] + - Exact: [512, 1139, 1, 2048] + - Exact: [512, 1184, 1, 2048] + - Exact: [512, 1186, 1, 2048] + - Exact: [512, 1232, 1, 512] + - Exact: [512, 1232, 1, 2048] + - Exact: [512, 1279, 1, 2048] + - Exact: [512, 1290, 1, 512] + - Exact: [512, 1327, 1, 2048] + - Exact: [512, 1331, 1, 2048] + - Exact: [512, 1341, 1, 2048] + - Exact: [512, 1350, 1, 512] + - Exact: [512, 1350, 1, 
2048] + - Exact: [512, 1359, 1, 2048] + - Exact: [512, 1391, 1, 2048] + - Exact: [512, 1424, 1, 512] + - Exact: [512, 1424, 1, 2048] + - Exact: [512, 1458, 1, 512] + - Exact: [512, 1458, 1, 2048] + - Exact: [512, 1462, 1, 512] + - Exact: [512, 1462, 1, 2048] + - Exact: [512, 1467, 1, 2048] + - Exact: [512, 1472, 1, 2048] + - Exact: [512, 1520, 1, 512] + - Exact: [512, 1520, 1, 2048] + - Exact: [512, 1596, 1, 512] + - Exact: [512, 1596, 1, 2048] + - Exact: [512, 1599, 1, 512] + - Exact: [512, 1599, 1, 2048] + - Exact: [512, 1615, 1, 512] + - Exact: [512, 1615, 1, 2048] + - Exact: [512, 1680, 1, 512] + - Exact: [512, 1680, 1, 2048] + - Exact: [512, 1709, 1, 2048] + - Exact: [512, 1890, 1, 512] + - Exact: [512, 1902, 1, 2048] + - Exact: [512, 1917, 1, 512] + - Exact: [512, 1917, 1, 2048] + - Exact: [512, 2076, 1, 2048] + - Exact: [512, 2195, 1, 2048] + - Exact: [512, 2205, 1, 512] + - Exact: [2048, 198, 1, 512] + - Exact: [2048, 207, 1, 512] + - Exact: [2048, 208, 1, 512] + - Exact: [2048, 245, 1, 512] + - Exact: [2048, 246, 1, 512] + - Exact: [2048, 264, 1, 512] + - Exact: [2048, 401, 1, 512] + - Exact: [2048, 439, 1, 512] + - Exact: [2048, 443, 1, 512] + - Exact: [2048, 446, 1, 512] + - Exact: [2048, 465, 1, 512] + - Exact: [2048, 468, 1, 512] + - Exact: [2048, 493, 1, 512] + - Exact: [2048, 495, 1, 512] + - Exact: [2048, 511, 1, 512] + - Exact: [2048, 512, 1, 512] + - Exact: [2048, 540, 1, 512] + - Exact: [2048, 550, 1, 512] + - Exact: [2048, 560, 1, 512] + - Exact: [2048, 600, 1, 512] + - Exact: [64, 64, 496, 64] + - Exact: [64, 65, 496, 64] + - Exact: [64, 65, 496, 65] + - Exact: [64, 70, 216, 70] + - Exact: [64, 71, 216, 71] + - Exact: [64, 78, 248, 77] + - Exact: [64, 80, 152, 80] + - Exact: [64, 93, 344, 93] + - Exact: [64, 102, 312, 102] + - Exact: [64, 122, 264, 122] + - Exact: [64, 122, 264, 123] + - Exact: [64, 123, 264, 123] + - Exact: [64, 512, 96, 512] + - Exact: [64, 512, 128, 512] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: 
[2048, 512, 1, 2048] + - Exact: [512, 1600, 1, 32] + - Exact: [512, 1600, 1, 512] + - Exact: [560, 1600, 1, 1024] + - Exact: [1024, 512, 1, 3072] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 256] + - Exact: [1225, 64, 64, 288] + - Exact: [5329, 80, 64, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 2048] + - Exact: [64, 448, 32, 2048] + - Exact: [5329, 80, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [256, 4096, 1, 4] + - Exact: [960, 1024, 1, 1024] + - Exact: [768, 768, 1, 768] + - Exact: [768, 768, 1, 384] + - Exact: [100, 128, 120, 512] + - Exact: [100, 128, 139, 512] + - Exact: [100, 128, 160, 512] + - Exact: [22500, 64, 1, 147] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 616, 1, 1024] + - Exact: [64, 128, 128, 128] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 2] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 780, 1, 1024] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 800, 1, 1024] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 820, 1, 1024] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 960, 1, 64] + - Exact: [64, 512, 256, 512] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: 
[1024, 864, 1, 512] + - Exact: [256, 3456, 1, 128] + - Exact: [256, 4096, 1, 128] + - Exact: [480, 864, 1, 1024] + - Exact: [512, 864, 1, 256] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [256, 4096, 1, 1] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [950, 512, 2, 2048] + - Exact: [3400, 256, 1, 1024] + - Exact: [3800, 256, 1, 1024] + - Exact: [850, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [950, 256, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [51520, 64, 2, 256] + - Exact: [46464, 64, 2, 256] + - Exact: [49152, 64, 2, 256] + - Exact: [1900, 512, 1, 1024] + - Exact: [1700, 512, 1, 1024] + - Exact: [1610, 512, 1, 1024] + - Exact: [1536, 512, 1, 1024] + - Exact: [1728, 512, 1, 1024] + - Exact: [1024, 1024, 1, 320] + - Exact: [51520, 64, 2, 64] + - Exact: [55296, 64, 2, 64] + - Exact: [49152, 64, 2, 64] + - Exact: [54400, 64, 2, 64] + - Exact: [42240, 64, 2, 256] + - Exact: [672, 512, 2, 2048] + - Exact: [54400, 64, 2, 256] + - Exact: [56832, 64, 2, 256] + - Exact: [55296, 64, 2, 256] + - Exact: [60800, 64, 2, 64] + - Exact: [660, 512, 2, 2048] + - Exact: [768, 512, 2, 2048] + - Exact: [43008, 64, 2, 256] + - Exact: [864, 256, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [768, 256, 2, 2048] + - Exact: [45632, 64, 2, 256] + - Exact: [713, 512, 2, 2048] + - Exact: [805, 256, 2, 2048] + - Exact: [60800, 64, 2, 256] + - Exact: [850, 256, 2, 2048] + - Exact: [1024, 1024, 1, 81] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: 
[64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 80, 1, 30522] + - Exact: [1024, 120, 1, 30522] + - Exact: [1024, 77, 1, 30522] + - Exact: [1024, 200, 1, 30522] + - Exact: [1024, 160, 1, 30522] + - Exact: [1024, 180, 1, 30522] + - Exact: [1024, 160, 1, 30528] + - Exact: [1024, 240, 1, 30528] + - Exact: [2560, 109, 1, 29000] + - Exact: [2560, 121, 1, 29000] + - Exact: [2560, 65, 1, 29000] + - Exact: [2560, 66, 1, 29000] + - Exact: [2560, 67, 1, 29000] + - Exact: [2560, 69, 1, 29000] + - Exact: [2560, 70, 1, 29000] + - Exact: [2560, 71, 1, 29000] + - Exact: [2560, 73, 1, 29000] + - Exact: [2560, 74, 1, 29000] + - Exact: [2560, 75, 1, 29000] + - Exact: [2560, 77, 1, 29000] + - Exact: [2560, 78, 1, 29000] + - Exact: [2560, 80, 1, 29000] + - Exact: [2560, 81, 1, 29000] + - Exact: [2560, 82, 1, 29000] + - Exact: [2560, 83, 1, 29000] + - Exact: [2560, 84, 1, 29000] + - Exact: [2560, 88, 1, 29000] + - Exact: [2560, 89, 1, 29000] + - Exact: [2560, 90, 1, 29000] + - Exact: [2560, 92, 1, 29000] + - Exact: [2560, 95, 1, 29000] + - Exact: [2560, 98, 1, 29000] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - 
KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 200, 1, 32] + - Exact: [1024, 200, 1, 1] + - Exact: [512, 200, 1, 1] + - Exact: [768, 320, 1, 768] + - Exact: [768, 160, 1, 768] + - Exact: [1024, 120, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [2368, 64, 1, 3328] + - Exact: [64, 3584, 1, 1280] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1024, 256, 1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 3584, 1, 3328] + - Exact: [1760, 128, 1, 1760] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [35, 8457, 1, 1760] + - Exact: [64, 2944, 1, 128] + - Exact: [8448, 32, 1, 2816] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [2048, 128, 1, 2048] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: 
[35, 8457, 1, 2560] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1024, 128, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [1408, 128, 1, 128] + - Exact: [128, 1024, 1, 1280] + - Exact: [2944, 64, 1, 128] + - Exact: [1024, 128, 1, 3328] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [64, 4288, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [64, 4288, 1, 128] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [128, 1500, 1, 1280] + - Exact: [35, 8457, 1, 4096] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [35, 8457, 1, 2048] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [4096, 64, 1, 4096] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [1024, 256, 1, 128] + - Exact: [256, 704, 1, 128] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: 
[3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [2560, 32, 1, 2560] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [176, 1500, 1, 1408] + - Exact: [1856, 128, 1, 256] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 1024] + - Exact: [512, 200, 1, 512] + - Exact: [1024, 200, 1, 1024] + - Exact: [512, 256, 1, 1024] + - Exact: [1024, 256, 1, 2048] + - Exact: [1024, 200, 1, 4096] + - Exact: [1024, 200, 1, 512] + - Exact: [512, 200, 1, 1024] + - Exact: [512, 256, 1, 512] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 200, 1, 2048] + - Exact: [1024, 256, 1, 512] + - Exact: [512, 200, 1, 2048] + - Exact: [64, 32, 1984, 32] + - Exact: [64, 38, 1680, 38] + - Exact: [64, 59, 1088, 59] + - Exact: [64, 54, 1184, 54] + - Exact: [64, 49, 1296, 49] + - Exact: [64, 45, 1424, 45] + - Exact: [64, 35, 1808, 35] + - Exact: [64, 41, 1552, 41] + - Exact: [512, 512, 1, 1024] + - Exact: [512, 512, 1, 2000] + - Exact: [100, 1024, 1, 2048] + - Exact: [100, 2000, 1, 1024] + - Exact: [128, 2000, 1, 100] + - Exact: [64, 2000, 1, 1024] + - Exact: [100, 1024, 1, 1024] + - Exact: [128, 1024, 1, 512] + - Exact: [512, 500, 1, 2000] + - Exact: [500, 512, 1, 100] + - Exact: [100, 1024, 1, 500] + - Exact: [128, 2000, 1, 512] + - Exact: [256, 1024, 1, 100] + - Exact: [200, 500, 1, 1024] + - Exact: [100, 2000, 1, 512] + - Exact: [200, 512, 1, 100] + - Exact: [64, 2048, 1, 10] + - Exact: [64, 2048, 1, 500] + - Exact: [512, 512, 1, 512] + - Exact: [500, 500, 1, 2000] + - 
Exact: [256, 500, 1, 10] + - Exact: [512, 500, 1, 512] + - Exact: [128, 1024, 1, 2000] + - Exact: [100, 2000, 1, 2048] + - Exact: [256, 512, 1, 10] + - Exact: [64, 2000, 1, 2048] + - Exact: [64, 2048, 1, 512] + - Exact: [64, 2000, 1, 10] + - Exact: [128, 1024, 1, 500] + - Exact: [200, 512, 1, 1024] + - Exact: [128, 2048, 1, 10] + - Exact: [64, 2048, 1, 100] + - Exact: [64, 2000, 1, 100] + - Exact: [200, 500, 1, 100] + - Exact: [500, 500, 1, 500] + - Exact: [128, 2048, 1, 512] + - Exact: [100, 2048, 1, 500] + - Exact: [500, 500, 1, 2048] + - Exact: [128, 2000, 1, 2000] + - Exact: [256, 500, 1, 1024] + - Exact: [64, 2048, 1, 2000] + - Exact: [100, 2048, 1, 1024] + - Exact: [128, 1024, 1, 100] + - Exact: [256, 1024, 1, 2048] + - Exact: [500, 512, 1, 512] + - Exact: [256, 500, 1, 2000] + - Exact: [256, 512, 1, 100] + - Exact: [128, 2000, 1, 500] + - Exact: [200, 512, 1, 2048] + - Exact: [64, 2048, 1, 2048] + - Exact: [200, 1024, 1, 2048] + - Exact: [512, 512, 1, 10] + - Exact: [512, 500, 1, 10] + - Exact: [200, 512, 1, 10] + - Exact: [500, 500, 1, 1024] + - Exact: [256, 1024, 1, 512] + - Exact: [256, 500, 1, 512] + - Exact: [200, 500, 1, 2048] + - Exact: [100, 2000, 1, 10] + - Exact: [100, 2048, 1, 2048] + - Exact: [128, 1024, 1, 2048] + - Exact: [100, 2000, 1, 500] + - Exact: [100, 2048, 1, 100] + - Exact: [100, 1024, 1, 10] + - Exact: [100, 1024, 1, 2000] + - Exact: [256, 512, 1, 500] + - Exact: [100, 2000, 1, 100] + - Exact: [128, 1024, 1, 10] + - Exact: [100, 2048, 1, 10] + - Exact: [512, 500, 1, 100] + - Exact: [128, 2000, 1, 1024] + - Exact: [200, 1024, 1, 500] + - Exact: [256, 512, 1, 2000] + - Exact: [256, 1024, 1, 2000] + - Exact: [200, 512, 1, 500] + - Exact: [64, 2000, 1, 512] + - Exact: [200, 1024, 1, 100] + - Exact: [200, 1024, 1, 1024] + - Exact: [500, 512, 1, 2000] + - Exact: [200, 500, 1, 512] + - Exact: [256, 512, 1, 512] + - Exact: [512, 512, 1, 500] + - Exact: [100, 1024, 1, 512] + - Exact: [128, 1024, 1, 1024] + - Exact: [200, 512, 1, 2000] + - 
Exact: [256, 1024, 1, 500] + - Exact: [200, 1024, 1, 512] + - Exact: [256, 500, 1, 500] + - Exact: [256, 500, 1, 2048] + - Exact: [512, 500, 1, 1024] + - Exact: [256, 512, 1, 1024] + - Exact: [128, 2048, 1, 1024] + - Exact: [500, 512, 1, 500] + - Exact: [200, 500, 1, 500] + - Exact: [64, 2000, 1, 2000] + - Exact: [128, 2000, 1, 2048] + - Exact: [256, 1024, 1, 10] + - Exact: [256, 1024, 1, 1024] + - Exact: [500, 500, 1, 10] + - Exact: [256, 500, 1, 100] + - Exact: [256, 512, 1, 2048] + - Exact: [200, 1024, 1, 2000] + - Exact: [100, 2048, 1, 512] + - Exact: [512, 500, 1, 2048] + - Exact: [128, 2048, 1, 2000] + - Exact: [500, 512, 1, 2048] + - Exact: [200, 500, 1, 2000] + - Exact: [500, 512, 1, 1024] + - Exact: [100, 1024, 1, 100] + - Exact: [64, 2000, 1, 500] + - Exact: [128, 2048, 1, 2048] + - Exact: [128, 2000, 1, 10] + - Exact: [500, 512, 1, 10] + - Exact: [200, 512, 1, 512] + - Exact: [512, 500, 1, 500] + - Exact: [512, 512, 1, 100] + - Exact: [500, 500, 1, 512] + - Exact: [128, 2048, 1, 500] + - Exact: [200, 500, 1, 10] + - Exact: [100, 2048, 1, 2000] + - Exact: [200, 1024, 1, 10] + - Exact: [64, 2048, 1, 1024] + - Exact: [100, 2000, 1, 2000] + - Exact: [500, 500, 1, 100] + - Exact: [128, 2048, 1, 100] + - Exact: [4096, 64, 1, 2048] + - Exact: [4096, 91, 1, 2048] + - Exact: [4096, 86, 1, 3072] + - Exact: [4096, 49, 1, 2048] + - Exact: [4096, 91, 1, 3072] + - Exact: [4096, 64, 1, 3072] + - Exact: [4096, 63, 1, 3072] + - Exact: [4096, 96, 1, 2048] + - Exact: [4096, 32, 1, 2048] + - Exact: [4096, 49, 1, 3072] + - Exact: [1024, 96, 1, 1024] + - Exact: [4096, 86, 1, 2048] + - Exact: [4096, 96, 1, 3072] + - Exact: [4096, 35, 1, 3072] + - Exact: [4096, 50, 1, 2048] + - Exact: [36548, 32, 1, 1024] + - Exact: [4096, 32, 1, 3072] + - Exact: [1024, 243, 1, 1024] + - Exact: [4096, 50, 1, 3072] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 216, 1, 1024] + - Exact: [4096, 35, 1, 2048] + - Exact: [4096, 63, 1, 2048] + - Exact: [289, 256, 1, 1568] + - Exact: [3025, 64, 1, 
363] + - Exact: [784, 32, 32, 192] + - Exact: [289, 256, 1, 2016] + - Exact: [21609, 32, 1, 288] + - Exact: [1225, 192, 1, 1728] + - Exact: [784, 96, 1, 800] + - Exact: [1225, 64, 1, 1200] + - Exact: [729, 192, 1, 1600] + - Exact: [6272, 32, 1, 528] + - Exact: [1568, 160, 1, 832] + - Exact: [289, 256, 1, 1792] + - Exact: [784, 32, 32, 256] + - Exact: [6272, 32, 1, 512] + - Exact: [289, 384, 1, 3456] + - Exact: [289, 384, 1, 2592] + - Exact: [1225, 32, 32, 192] + - Exact: [1568, 128, 1, 832] + - Exact: [1225, 48, 32, 288] + - Exact: [1001, 128, 1, 2048] + - Exact: [2048, 174, 1, 512] + - Exact: [2048, 189, 1, 512] + - Exact: [64, 35, 904, 35] + - Exact: [64, 103, 16, 103] + - Exact: [64, 104, 16, 103] + - Exact: [64, 123, 16, 112] + - Exact: [64, 123, 16, 123] + - Exact: [512, 540, 1, 512] + - Exact: [512, 540, 1, 2048] + - Exact: [512, 550, 1, 512] + - Exact: [512, 550, 1, 2048] + - Exact: [512, 560, 1, 512] + - Exact: [512, 560, 1, 2048] + - Exact: [2048, 160, 1, 512] + - Exact: [2048, 184, 1, 512] + - Exact: [512, 160, 1, 2048] + - Exact: [512, 174, 1, 2048] + - Exact: [512, 182, 1, 512] + - Exact: [512, 184, 1, 512] + - Exact: [512, 184, 1, 2048] + - Exact: [512, 189, 1, 512] + - Exact: [512, 189, 1, 2048] + - Exact: [512, 198, 1, 2048] + - Exact: [512, 206, 1, 512] + - Exact: [512, 207, 1, 2048] + - Exact: [512, 208, 1, 512] + - Exact: [512, 208, 1, 2048] + - Exact: [512, 224, 1, 512] + - Exact: [512, 245, 1, 2048] + - Exact: [512, 246, 1, 512] + - Exact: [512, 246, 1, 2048] + - Exact: [512, 264, 1, 512] + - Exact: [512, 264, 1, 2048] + - Exact: [512, 401, 1, 2048] + - Exact: [512, 439, 1, 2048] + - Exact: [512, 443, 1, 2048] + - Exact: [512, 446, 1, 2048] + - Exact: [512, 455, 1, 512] + - Exact: [512, 465, 1, 512] + - Exact: [512, 465, 1, 2048] + - Exact: [512, 468, 1, 512] + - Exact: [512, 468, 1, 2048] + - Exact: [512, 476, 1, 512] + - Exact: [512, 493, 1, 512] + - Exact: [512, 493, 1, 2048] + - Exact: [512, 495, 1, 2048] + - Exact: [512, 511, 1, 2048] + - 
Exact: [512, 512, 1, 2048] + - Exact: [64, 59, 512, 59] + - Exact: [64, 59, 544, 59] + - Exact: [256, 1024, 1, 1] + - Exact: [257, 1024, 1, 4096] + - Exact: [512, 215, 1, 2048] + - Exact: [512, 256, 1, 2048] + - Exact: [560, 200, 1, 1024] + - Exact: [768, 215, 1, 2048] + - Exact: [768, 256, 1, 2048] + - Exact: [32, 33, 1600, 33] + - Exact: [512, 512, 1, 64] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - Exact: [1225, 48, 64, 288] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 64, 2048] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [384, 384, 1, 384] + - Exact: [100, 128, 18, 512] + - Exact: [100, 128, 19, 512] + - Exact: [1444, 128, 1, 576] + - Exact: [361, 512, 1, 2304] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 32] + - Exact: [64, 35, 4608, 35] + - Exact: [256, 864, 1, 128] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [3136, 64, 1, 576] + - Exact: [784, 128, 1, 1152] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [1024, 128, 1, 2] + - Exact: [1024, 96, 1, 2] + - Exact: [1909283, 40, 1, 40] + - Exact: [3818566, 40, 1, 40] + - Exact: [2560, 35, 1, 29000] + - Exact: [2560, 36, 1, 29000] + - Exact: [2560, 39, 1, 29000] + - Exact: [2560, 40, 1, 29000] + - Exact: [2560, 42, 1, 29000] + - Exact: [2560, 43, 1, 29000] + - Exact: [2560, 44, 1, 29000] + - Exact: [2560, 46, 1, 29000] + - Exact: [2560, 48, 1, 29000] + - Exact: [2560, 49, 1, 29000] + - Exact: [2560, 50, 1, 29000] + - Exact: [2560, 51, 1, 29000] + - Exact: [2560, 53, 1, 29000] + - Exact: [2560, 54, 1, 29000] + - Exact: 
[2560, 55, 1, 29000] + - Exact: [2560, 56, 1, 29000] + - Exact: [2560, 57, 1, 29000] + - Exact: [2560, 58, 1, 29000] + - Exact: [2560, 59, 1, 29000] + - Exact: [2560, 61, 1, 29000] + - Exact: [2560, 63, 1, 29000] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [3584, 4, 1, 1280] + - Exact: [2560, 16, 1, 2560] + - Exact: [2944, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [1760, 16, 1, 1760] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [8448, 4, 1, 2816] + - Exact: [1856, 4, 1, 1280] + - Exact: [4608, 1, 1, 1536] + - Exact: [7680, 4, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [3072, 2, 1, 1024] + - Exact: [2368, 4, 1, 256] + - Exact: [7680, 1, 1, 2560] + - Exact: [4608, 2, 1, 1536] + - Exact: [4608, 4, 1, 1536] + - Exact: [3072, 1, 1, 128] + - Exact: [2048, 32, 1, 2048] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 1280] + - Exact: [2048, 16, 1, 2048] + - Exact: [5888, 4, 1, 128] + - Exact: [8448, 1, 1, 2816] + - Exact: [1408, 4, 1, 256] + - Exact: [6144, 4, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [5056, 4, 1, 1280] + - Exact: [3072, 16, 1, 1024] + - Exact: [1408, 4, 1, 3328] + - Exact: [6144, 1, 1, 2560] + - Exact: [6144, 16, 1, 2560] + - Exact: 
[4096, 16, 1, 4096] + - Exact: [1408, 4, 1, 128] + - Exact: [1856, 4, 1, 256] + - Exact: [6784, 4, 1, 128] + - Exact: [2944, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [3072, 4, 1, 1024] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [7680, 2, 1, 2560] + - Exact: [7680, 16, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [8448, 2, 1, 2816] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [3584, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 2, 1, 2560] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [4096, 29, 1, 2048] + - Exact: [4096, 25, 1, 2048] + - Exact: [4096, 29, 1, 3072] + - Exact: [4096, 24, 1, 2048] + - Exact: [36548, 1, 1, 1024] + - Exact: [4096, 27, 1, 2048] + - Exact: [4096, 1, 1, 2048] + - Exact: [4096, 24, 1, 3072] + - Exact: [4096, 27, 1, 3072] + - Exact: [36548, 25, 1, 1024] + - Exact: [4096, 1, 1, 3072] + - Exact: [4096, 25, 1, 3072] + - Exact: [36548, 24, 1, 1024] + - Exact: [6272, 16, 1, 480] + - Exact: [1568, 32, 1, 832] + - Exact: [1568, 48, 1, 832] + - Exact: [6272, 24, 1, 512] + - Exact: [2048, 1, 1, 512] + - Exact: [2048, 2, 1, 2] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2] + - Exact: [2560, 4, 1, 2560] + - Exact: [12288, 12, 2, 256] + - Exact: [12288, 3, 2, 256] + - Exact: [51520, 12, 2, 256] + - Exact: [51520, 3, 2, 256] + - Exact: [15200, 12, 2, 256] + - Exact: [15200, 3, 2, 256] + - Exact: [3456, 3, 2, 256] + - Exact: [13600, 12, 2, 256] + - Exact: [12880, 3, 2, 256] + - Exact: [3400, 3, 2, 256] + - Exact: [12880, 12, 2, 256] + - Exact: [13824, 12, 2, 256] + - Exact: [13824, 3, 2, 256] + - Exact: [13600, 3, 2, 256] + - Exact: [3456, 12, 2, 256] + 
- Exact: [3800, 3, 2, 256] + - Exact: [3400, 12, 2, 256] + - Exact: [3800, 12, 2, 256] + - Exact: [55296, 3, 2, 256] + - Exact: [3220, 3, 2, 256] + - Exact: [3072, 3, 2, 256] + - Exact: [3220, 12, 2, 256] + - Exact: [3072, 12, 2, 256] + - Exact: [54400, 3, 2, 256] + - Exact: [60800, 12, 2, 256] + - Exact: [60800, 3, 2, 256] + - Exact: [1909283, 11, 1, 11] + - Exact: [3818566, 11, 1, 11] + - Exact: [2048, 8, 1, 2] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2] + - Exact: [2560, 2, 1, 2560] + - Exact: [2560, 27, 1, 29000] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [35, 1500, 1, 2560] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 1280] + - Exact: [35, 1500, 1, 2048] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 256] + 
- Exact: [4, 4288, 1, 3328] + - Exact: [4, 2368, 1, 128] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 1408, 1, 128] + - Exact: [16, 2000, 1, 2048] + - Exact: [2, 2048, 1, 2000] + - Exact: [32, 2000, 1, 2048] + - Exact: [10, 2000, 1, 1024] + - Exact: [2, 2000, 1, 100] + - Exact: [10, 2000, 1, 512] + - Exact: [32, 2000, 1, 500] + - Exact: [32, 2000, 1, 1024] + - Exact: [4, 2048, 1, 500] + - Exact: [16, 2000, 1, 500] + - Exact: [4, 2048, 1, 100] + - Exact: [16, 2000, 1, 100] + - Exact: [4, 2000, 1, 10] + - Exact: [10, 2000, 1, 10] + - Exact: [2, 2048, 1, 512] + - Exact: [10, 2048, 1, 100] + - Exact: [8, 2048, 1, 100] + - Exact: [2, 2048, 1, 1024] + - Exact: [16, 2000, 1, 1024] + - Exact: [10, 2000, 1, 2000] + - Exact: [8, 2000, 1, 500] + - Exact: [16, 2000, 1, 2000] + - Exact: [10, 2048, 1, 2048] + - Exact: [8, 2000, 1, 512] + - Exact: [2, 2000, 1, 2048] + - Exact: [16, 2048, 1, 500] + - Exact: [8, 2048, 1, 1024] + - Exact: [2, 2000, 1, 500] + - Exact: [32, 2048, 1, 100] + - Exact: [10, 2048, 1, 500] + - Exact: [4, 2000, 1, 2048] + - Exact: [8, 2000, 1, 1024] + - Exact: [32, 2048, 1, 512] + - Exact: [32, 2048, 1, 1024] + - Exact: [32, 2048, 1, 500] + - Exact: [10, 2048, 1, 1024] + - Exact: [8, 2048, 1, 2048] + - Exact: [16, 2048, 1, 2048] + - Exact: [8, 2000, 1, 10] + - Exact: [4, 2000, 1, 2000] + - Exact: [8, 2048, 1, 512] + - Exact: [8, 2000, 1, 2048] + - Exact: [32, 2048, 1, 2000] + - Exact: [16, 2000, 1, 10] + - Exact: [8, 2048, 1, 2000] + - Exact: [4, 2048, 1, 2048] + - Exact: [10, 2048, 1, 2000] + - Exact: [8, 2000, 1, 100] + - Exact: [2, 2000, 1, 2000] + - Exact: [16, 2048, 1, 1024] + - Exact: [32, 2000, 1, 2000] + - Exact: [32, 2048, 1, 2048] + - Exact: [2, 2048, 1, 10] + 
- Exact: [4, 2048, 1, 512] + - Exact: [4, 2048, 1, 10] + - Exact: [16, 2048, 1, 100] + - Exact: [4, 2000, 1, 500] + - Exact: [10, 2000, 1, 500] + - Exact: [32, 2000, 1, 512] + - Exact: [2, 2000, 1, 1024] + - Exact: [2, 2000, 1, 512] + - Exact: [4, 2048, 1, 1024] + - Exact: [8, 2048, 1, 500] + - Exact: [4, 2048, 1, 2000] + - Exact: [8, 2000, 1, 2000] + - Exact: [4, 2000, 1, 1024] + - Exact: [32, 2000, 1, 100] + - Exact: [2, 2048, 1, 100] + - Exact: [8, 2048, 1, 10] + - Exact: [2, 2048, 1, 2048] + - Exact: [10, 2000, 1, 2048] + - Exact: [16, 2048, 1, 2000] + - Exact: [10, 2048, 1, 512] + - Exact: [16, 2048, 1, 512] + - Exact: [2, 2000, 1, 10] + - Exact: [4, 2000, 1, 100] + - Exact: [16, 2000, 1, 512] + - Exact: [32, 2048, 1, 10] + - Exact: [10, 2048, 1, 10] + - Exact: [4, 2000, 1, 512] + - Exact: [16, 2048, 1, 10] + - Exact: [32, 2000, 1, 10] + - Exact: [10, 2000, 1, 100] + - Exact: [2, 2048, 1, 500] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 4, 1, 500000] + - Exact: [1024, 20, 1, 30522] + - Exact: [49, 
512, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 512, 1, 1] + - Exact: [1024, 32, 1, 2] + - Exact: [1024, 32, 1, 1024] + - Exact: [768, 32, 1, 768] + - Exact: [768, 32, 1, 2] + - Exact: [768, 64, 1, 768] + - Exact: [768, 64, 1, 2] + - Exact: [1024, 20, 1, 1024] + - Exact: [1024, 80, 1, 1024] + - Exact: [32, 200, 1, 1] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 4, 1, 2] + - Exact: [768, 16, 1, 768] + - Exact: [768, 16, 1, 2] + - Exact: [768, 8, 1, 768] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 6, 1, 2] + - Exact: [1024, 8, 1, 1024] + - Exact: [4, 704, 1, 1280] + - Exact: [512, 4, 1, 512] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [64, 1024, 1, 3328] + - Exact: [128, 1, 1, 1408] + - Exact: [1024, 64, 1, 1280] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 256, 1, 128] + - Exact: [256, 256, 1, 3328] + - Exact: [64, 1024, 1, 128] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [64, 64, 1, 3328] + - Exact: [512, 1, 1, 
512] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 4, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 256] + - Exact: [4, 704, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [4, 448, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1, 1, 1216] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [448, 4, 1, 1280] + - Exact: [1024, 2, 1, 512] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [256, 256, 1, 256] + - Exact: [4, 1024, 1, 3328] + - Exact: [4, 704, 1, 128] + - Exact: [64, 128, 1, 128] + - Exact: [704, 4, 1, 128] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 64, 1, 1280] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 4, 1, 128] + - Exact: [256, 64, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [512, 2, 1, 512] + - Exact: [64, 128, 1, 256] + - Exact: [1024, 4, 1, 1280] + - Exact: [35, 700, 1, 2048] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [4, 256, 1, 128] + - Exact: [512, 16, 1, 512] + - Exact: 
[1024, 64, 1, 256] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [128, 4, 1, 128] + - Exact: [128, 1, 1, 1024] + - Exact: [4, 128, 1, 3328] + - Exact: [256, 256, 1, 128] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [256, 4, 1, 128] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [4, 448, 1, 1280] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [35, 700, 1, 2560] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [1024, 4, 1, 512] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [1024, 1, 1, 512] + - Exact: [128, 128, 1, 256] + - Exact: [64, 12, 5040, 12] + - Exact: [64, 17, 3632, 17] + - Exact: [64, 19, 3264, 19] + - Exact: [64, 9, 6544, 9] + - Exact: [64, 7, 8192, 7] + - Exact: [64, 16, 3840, 16] + - Exact: [64, 8, 7280, 8] + - Exact: [64, 27, 2336, 27] + - Exact: [64, 11, 5456, 11] + - Exact: [64, 21, 2976, 21] + - Exact: [64, 10, 5952, 10] + - Exact: [64, 14, 4368, 14] + - Exact: [64, 25, 2512, 25] + - Exact: [64, 13, 4672, 13] + - Exact: [64, 15, 4096, 15] + - Exact: [64, 29, 2176, 29] + - Exact: [64, 18, 3440, 18] + - Exact: [64, 23, 2720, 23] + - Exact: [8, 500, 1, 512] + - Exact: [32, 512, 1, 512] + - Exact: [8, 512, 1, 500] + - Exact: [8, 500, 1, 1024] + - Exact: [64, 1024, 1, 100] + - Exact: [64, 1024, 1, 500] + - Exact: [64, 1024, 1, 1024] + - Exact: [2, 500, 1, 2048] + - Exact: [16, 
512, 1, 10] + - Exact: [8, 512, 1, 10] + - Exact: [16, 500, 1, 2048] + - Exact: [10, 100, 1, 500] + - Exact: [16, 100, 1, 10] + - Exact: [2, 100, 1, 2000] + - Exact: [256, 100, 1, 2048] + - Exact: [2, 512, 1, 512] + - Exact: [2, 100, 1, 10] + - Exact: [200, 100, 1, 100] + - Exact: [500, 100, 1, 100] + - Exact: [4, 100, 1, 10] + - Exact: [32, 100, 1, 512] + - Exact: [16, 1024, 1, 512] + - Exact: [4, 1024, 1, 1024] + - Exact: [4, 512, 1, 10] + - Exact: [128, 100, 1, 10] + - Exact: [4, 512, 1, 2048] + - Exact: [10, 1024, 1, 2000] + - Exact: [256, 100, 1, 100] + - Exact: [64, 1024, 1, 2048] + - Exact: [16, 1024, 1, 100] + - Exact: [32, 1024, 1, 1024] + - Exact: [8, 100, 1, 500] + - Exact: [10, 512, 1, 512] + - Exact: [8, 500, 1, 10] + - Exact: [16, 1024, 1, 10] + - Exact: [16, 512, 1, 2048] + - Exact: [128, 512, 1, 2048] + - Exact: [128, 512, 1, 100] + - Exact: [64, 500, 1, 2048] + - Exact: [500, 100, 1, 10] + - Exact: [64, 100, 1, 2048] + - Exact: [64, 100, 1, 10] + - Exact: [16, 512, 1, 500] + - Exact: [200, 100, 1, 2000] + - Exact: [2, 100, 1, 512] + - Exact: [32, 512, 1, 100] + - Exact: [16, 512, 1, 1024] + - Exact: [4, 1024, 1, 512] + - Exact: [2, 500, 1, 500] + - Exact: [32, 100, 1, 100] + - Exact: [100, 500, 1, 2000] + - Exact: [10, 512, 1, 10] + - Exact: [100, 500, 1, 2048] + - Exact: [2, 100, 1, 1024] + - Exact: [32, 512, 1, 1024] + - Exact: [256, 100, 1, 1024] + - Exact: [128, 100, 1, 100] + - Exact: [32, 512, 1, 10] + - Exact: [128, 100, 1, 1024] + - Exact: [16, 500, 1, 2000] + - Exact: [64, 500, 1, 500] + - Exact: [128, 512, 1, 1024] + - Exact: [128, 512, 1, 2000] + - Exact: [2, 512, 1, 10] + - Exact: [10, 512, 1, 500] + - Exact: [4, 1024, 1, 2000] + - Exact: [256, 100, 1, 2000] + - Exact: [100, 100, 1, 10] + - Exact: [128, 512, 1, 10] + - Exact: [256, 100, 1, 500] + - Exact: [64, 100, 1, 512] + - Exact: [64, 512, 1, 500] + - Exact: [8, 100, 1, 512] + - Exact: [32, 100, 1, 500] + - Exact: [32, 500, 1, 2048] + - Exact: [128, 500, 1, 2000] + - Exact: [8, 
1024, 1, 10] + - Exact: [2, 500, 1, 100] + - Exact: [10, 500, 1, 512] + - Exact: [32, 500, 1, 500] + - Exact: [100, 500, 1, 100] + - Exact: [10, 1024, 1, 512] + - Exact: [512, 100, 1, 512] + - Exact: [4, 500, 1, 500] + - Exact: [64, 100, 1, 1024] + - Exact: [2, 500, 1, 2000] + - Exact: [32, 512, 1, 2048] + - Exact: [10, 100, 1, 2000] + - Exact: [4, 100, 1, 512] + - Exact: [2, 512, 1, 2048] + - Exact: [100, 100, 1, 2000] + - Exact: [10, 500, 1, 500] + - Exact: [2, 100, 1, 2048] + - Exact: [32, 100, 1, 2048] + - Exact: [16, 100, 1, 1024] + - Exact: [2, 500, 1, 10] + - Exact: [500, 100, 1, 2048] + - Exact: [16, 1024, 1, 2000] + - Exact: [10, 1024, 1, 1024] + - Exact: [500, 100, 1, 512] + - Exact: [32, 512, 1, 500] + - Exact: [100, 500, 1, 512] + - Exact: [8, 500, 1, 2000] + - Exact: [4, 100, 1, 1024] + - Exact: [2, 500, 1, 1024] + - Exact: [100, 500, 1, 1024] + - Exact: [32, 100, 1, 1024] + - Exact: [64, 100, 1, 2000] + - Exact: [64, 500, 1, 10] + - Exact: [64, 500, 1, 512] + - Exact: [10, 100, 1, 1024] + - Exact: [16, 512, 1, 100] + - Exact: [4, 100, 1, 2000] + - Exact: [2, 512, 1, 1024] + - Exact: [64, 512, 1, 1024] + - Exact: [512, 100, 1, 2048] + - Exact: [32, 100, 1, 2000] + - Exact: [4, 512, 1, 500] + - Exact: [4, 500, 1, 1024] + - Exact: [32, 100, 1, 10] + - Exact: [10, 1024, 1, 2048] + - Exact: [8, 500, 1, 100] + - Exact: [200, 100, 1, 1024] + - Exact: [16, 100, 1, 100] + - Exact: [8, 1024, 1, 2000] + - Exact: [4, 512, 1, 100] + - Exact: [16, 500, 1, 100] + - Exact: [8, 1024, 1, 2048] + - Exact: [16, 1024, 1, 2048] + - Exact: [64, 512, 1, 100] + - Exact: [2, 100, 1, 500] + - Exact: [2, 500, 1, 512] + - Exact: [128, 500, 1, 1024] + - Exact: [10, 100, 1, 10] + - Exact: [64, 1024, 1, 10] + - Exact: [500, 100, 1, 500] + - Exact: [2, 512, 1, 100] + - Exact: [16, 100, 1, 500] + - Exact: [128, 100, 1, 500] + - Exact: [512, 100, 1, 1024] + - Exact: [16, 100, 1, 2000] + - Exact: [10, 512, 1, 100] + - Exact: [8, 512, 1, 100] + - Exact: [128, 100, 1, 2000] + - Exact: [2, 
1024, 1, 2000] + - Exact: [100, 512, 1, 512] + - Exact: [32, 1024, 1, 2000] + - Exact: [128, 500, 1, 100] + - Exact: [100, 100, 1, 100] + - Exact: [8, 512, 1, 1024] + - Exact: [200, 100, 1, 500] + - Exact: [2, 1024, 1, 2048] + - Exact: [512, 100, 1, 2000] + - Exact: [16, 512, 1, 2000] + - Exact: [64, 500, 1, 1024] + - Exact: [10, 512, 1, 1024] + - Exact: [512, 100, 1, 100] + - Exact: [8, 100, 1, 1024] + - Exact: [10, 100, 1, 100] + - Exact: [10, 500, 1, 2000] + - Exact: [500, 100, 1, 2000] + - Exact: [100, 512, 1, 2000] + - Exact: [64, 1024, 1, 512] + - Exact: [32, 500, 1, 100] + - Exact: [10, 100, 1, 2048] + - Exact: [64, 100, 1, 100] + - Exact: [2, 1024, 1, 100] + - Exact: [64, 500, 1, 2000] + - Exact: [8, 512, 1, 512] + - Exact: [8, 512, 1, 2048] + - Exact: [100, 100, 1, 1024] + - Exact: [8, 100, 1, 2000] + - Exact: [2, 1024, 1, 1024] + - Exact: [16, 512, 1, 512] + - Exact: [32, 500, 1, 512] + - Exact: [32, 500, 1, 1024] + - Exact: [32, 500, 1, 10] + - Exact: [4, 1024, 1, 500] + - Exact: [256, 100, 1, 512] + - Exact: [8, 1024, 1, 500] + - Exact: [4, 1024, 1, 100] + - Exact: [100, 500, 1, 500] + - Exact: [2, 1024, 1, 500] + - Exact: [64, 100, 1, 500] + - Exact: [2, 512, 1, 500] + - Exact: [10, 1024, 1, 500] + - Exact: [128, 500, 1, 512] + - Exact: [10, 500, 1, 2048] + - Exact: [128, 512, 1, 512] + - Exact: [64, 512, 1, 10] + - Exact: [32, 500, 1, 2000] + - Exact: [100, 100, 1, 2048] + - Exact: [200, 100, 1, 512] + - Exact: [200, 100, 1, 2048] + - Exact: [8, 100, 1, 10] + - Exact: [100, 100, 1, 500] + - Exact: [100, 500, 1, 10] + - Exact: [10, 500, 1, 1024] + - Exact: [256, 100, 1, 10] + - Exact: [10, 512, 1, 2048] + - Exact: [2, 1024, 1, 512] + - Exact: [4, 500, 1, 2048] + - Exact: [100, 512, 1, 100] + - Exact: [16, 500, 1, 512] + - Exact: [10, 1024, 1, 100] + - Exact: [8, 1024, 1, 100] + - Exact: [64, 1024, 1, 2000] + - Exact: [10, 100, 1, 512] + - Exact: [4, 500, 1, 2000] + - Exact: [4, 100, 1, 100] + - Exact: [32, 1024, 1, 512] + - Exact: [8, 512, 1, 2000] + - 
Exact: [100, 100, 1, 512] + - Exact: [2, 512, 1, 2000] + - Exact: [16, 500, 1, 10] + - Exact: [10, 500, 1, 100] + - Exact: [4, 100, 1, 500] + - Exact: [64, 500, 1, 100] + - Exact: [2, 100, 1, 100] + - Exact: [10, 512, 1, 2000] + - Exact: [8, 500, 1, 500] + - Exact: [4, 500, 1, 512] + - Exact: [10, 500, 1, 10] + - Exact: [64, 512, 1, 2000] + - Exact: [32, 512, 1, 2000] + - Exact: [128, 500, 1, 2048] + - Exact: [4, 512, 1, 512] + - Exact: [16, 500, 1, 1024] + - Exact: [10, 1024, 1, 10] + - Exact: [16, 500, 1, 500] + - Exact: [500, 100, 1, 1024] + - Exact: [16, 100, 1, 512] + - Exact: [64, 512, 1, 2048] + - Exact: [32, 1024, 1, 10] + - Exact: [8, 1024, 1, 512] + - Exact: [4, 1024, 1, 2048] + - Exact: [128, 500, 1, 500] + - Exact: [100, 512, 1, 1024] + - Exact: [16, 1024, 1, 500] + - Exact: [128, 100, 1, 2048] + - Exact: [100, 512, 1, 500] + - Exact: [8, 1024, 1, 1024] + - Exact: [4, 500, 1, 10] + - Exact: [128, 500, 1, 10] + - Exact: [32, 1024, 1, 100] + - Exact: [8, 500, 1, 2048] + - Exact: [16, 1024, 1, 1024] + - Exact: [200, 100, 1, 10] + - Exact: [512, 100, 1, 500] + - Exact: [4, 500, 1, 100] + - Exact: [8, 100, 1, 2048] + - Exact: [512, 100, 1, 10] + - Exact: [4, 512, 1, 1024] + - Exact: [32, 1024, 1, 2048] + - Exact: [128, 100, 1, 512] + - Exact: [32, 1024, 1, 500] + - Exact: [4, 1024, 1, 10] + - Exact: [100, 512, 1, 10] + - Exact: [8, 100, 1, 100] + - Exact: [128, 512, 1, 500] + - Exact: [16, 100, 1, 2048] + - Exact: [2, 1024, 1, 10] + - Exact: [4, 100, 1, 2048] + - Exact: [4, 512, 1, 2000] + - Exact: [1024, 29, 1, 1024] + - Exact: [1024, 1, 1, 21] + - Exact: [1024, 49, 1, 1024] + - Exact: [1024, 35, 1, 1024] + - Exact: [1024, 24, 1, 1024] + - Exact: [1024, 21, 1, 1024] + - Exact: [1024, 1, 1, 14] + - Exact: [1024, 91, 1, 1024] + - Exact: [1024, 14, 1, 1024] + - Exact: [1024, 25, 1, 1024] + - Exact: [1024, 27, 1, 1024] + - Exact: [1024, 50, 1, 1024] + - Exact: [1024, 64, 1, 1024] + - Exact: [1024, 13, 1, 1024] + - Exact: [1024, 63, 1, 1024] + - Exact: [1024, 
86, 1, 1024] + - Exact: [1024, 1, 1, 13] + - Exact: [289, 192, 1, 1344] + - Exact: [196, 128, 1, 800] + - Exact: [64, 512, 1, 1344] + - Exact: [289, 224, 1, 1568] + - Exact: [64, 256, 1, 1536] + - Exact: [289, 160, 1, 1120] + - Exact: [64, 256, 1, 1152] + - Exact: [289, 224, 1, 1344] + - Exact: [289, 192, 1, 896] + - Exact: [784, 16, 32, 192] + - Exact: [49, 128, 1, 1200] + - Exact: [289, 128, 1, 896] + - Exact: [1001, 32, 1, 1024] + - Exact: [64, 448, 1, 1152] + - Exact: [1001, 32, 1, 2048] + - Exact: [289, 192, 1, 1120] + - Exact: [64, 320, 1, 1728] + - Exact: [289, 96, 1, 864] + - Exact: [196, 64, 1, 800] + - Exact: [784, 32, 1, 400] + - Exact: [64, 320, 1, 2880] + - Exact: [1001, 32, 1, 1536] + - Exact: [64, 384, 1, 1152] + - Exact: [64, 192, 1, 1728] + - Exact: [1001, 64, 1, 1536] + - Exact: [1001, 64, 1, 2048] + - Exact: [1024, 64, 1, 4096] + - Exact: [64, 10, 448, 10] + - Exact: [64, 18, 648, 18] + - Exact: [64, 18, 1720, 18] + - Exact: [64, 19, 1632, 19] + - Exact: [64, 21, 1472, 21] + - Exact: [64, 23, 64, 23] + - Exact: [64, 26, 56, 26] + - Exact: [1024, 1, 1, 2] + - Exact: [1024, 1, 1, 1024] + - Exact: [64, 27, 56, 26] + - Exact: [64, 17, 1, 17] + - Exact: [64, 30, 1, 30] + - Exact: [64, 31, 1, 30] + - Exact: [64, 31, 1, 31] + - Exact: [64, 14, 1, 14] + - Exact: [64, 14, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 17] + - Exact: [100, 512, 1, 2048] + - Exact: [1024, 1, 1, 1600] + - Exact: [1024, 1, 1, 200] + - Exact: [1, 200, 1, 1] + - Exact: [1, 512, 1, 1] + - Exact: [67, 512, 1, 2048] + - Exact: [74, 512, 1, 2048] + - Exact: [64, 3, 512, 3] + - Exact: [64, 5, 512, 5] + - Exact: [64, 9, 512, 9] + - Exact: [64, 512, 1, 512] + - Exact: [25, 128, 120, 256] + - Exact: [25, 128, 139, 256] + - Exact: [25, 128, 160, 256] + - Exact: [25, 128, 18, 256] + - Exact: [25, 128, 19, 256] + - Exact: [9, 128, 120, 256] + - Exact: [9, 128, 139, 256] + - Exact: [9, 128, 160, 256] + - Exact: [9, 128, 18, 256] + - Exact: [9, 128, 19, 256] + - Exact: [1, 256, 1, 
1152] + - Exact: [100, 512, 1, 2304] + - Exact: [25, 256, 1, 1152] + - Exact: [9, 256, 1, 1152] + - Exact: [1024, 77, 1, 1024] + - Exact: [1024, 10, 1, 2] + - Exact: [1024, 10, 1, 1024] + - Exact: [1024, 39, 1, 2] + - Exact: [1024, 39, 1, 1024] + - Exact: [1024, 40, 1, 2] + - Exact: [1024, 40, 1, 1024] + - Exact: [1024, 41, 1, 2] + - Exact: [1024, 41, 1, 1024] + - Exact: [1024, 5, 1, 2] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 8, 1, 2] + - Exact: [1024, 9, 1, 2] + - Exact: [1024, 9, 1, 1024] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 14, 10880, 15] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 15, 7680, 17] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 17, 6144, 21] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 24, 4736, 34] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 31, 2048, 30] + - Exact: [64, 31, 2048, 31] + - Exact: [128, 128, 1, 64] + - Exact: [64, 5, 1, 5] + - Exact: [32, 33, 1, 33] + - Exact: [64, 5, 960, 5] + - Exact: [74, 960, 1, 2048] + - Exact: [128, 27, 32768, 27] + - Exact: [1024, 16, 1, 1024] + - Exact: [1024, 16, 1, 2] + - Exact: [1024, 64, 1, 2] + - Exact: [1024, 80, 1, 2] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 82, 1, 2] + - Exact: [1024, 12, 1, 1024] + - Exact: [1024, 12, 1, 2] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [196, 256, 1, 2304] + - Exact: [850, 3, 2, 256] + - Exact: [850, 12, 2, 256] + - Exact: [805, 12, 2, 256] + - Exact: [805, 3, 2, 256] + - Exact: [768, 3, 2, 256] + - Exact: [768, 12, 2, 256] + - Exact: [864, 12, 2, 256] + - Exact: [864, 3, 2, 256] + - Exact: [247, 3, 2, 256] + - Exact: [216, 3, 2, 256] + - Exact: [950, 3, 2, 256] + - Exact: [187, 12, 2, 256] + - Exact: [176, 12, 2, 256] + - Exact: [247, 12, 2, 256] + - Exact: [187, 3, 2, 256] + - Exact: [228, 12, 2, 256] + - Exact: [221, 12, 2, 256] + - Exact: [176, 3, 2, 256] + - 
Exact: [950, 12, 2, 256] + - Exact: [192, 12, 2, 256] + - Exact: [228, 3, 2, 256] + - Exact: [221, 3, 2, 256] + - Exact: [192, 3, 2, 256] + - Exact: [216, 12, 2, 256] + - Exact: [2, 6, 1, 1024] + - Exact: [1024, 20, 1, 2] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml new file mode 100644 index 000000000..906d9e851 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml @@ -0,0 +1,4924 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - 
ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2048, 2048, 1, 512] + - Exact: [1600, 1024, 1, 512] + - Exact: [4096, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2048] + - Exact: [3072, 768, 1, 4096] + - Exact: [3072, 1024, 1, 2048] + - Exact: [3072, 1024, 1, 3072] + - Exact: [3072, 1024, 1, 512] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [448, 5056, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [5888, 2944, 1, 128] + - Exact: 
[1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [1024, 4288, 1, 128] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [6784, 5888, 1, 3328] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [3584, 2944, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - 
Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [2048, 7133, 1, 2048] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 1408, 1, 128] + - Exact: [4288, 5056, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 128] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5888, 1856, 1, 128] + - Exact: [2368, 1024, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [1856, 4288, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [2944, 4288, 1, 3328] + - 
Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [1856, 2368, 1, 128] + - Exact: [4096, 7133, 1, 4096] + - Exact: [3584, 704, 1, 1280] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [5056, 5056, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [2368, 5056, 1, 128] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [5888, 3584, 1, 128] + - Exact: [1856, 1856, 1, 128] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - 
Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [3072, 7435, 1, 1024] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [2944, 2944, 1, 128] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 
3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [1024, 2368, 1, 128] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] 
+ - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 4288, 1, 3328] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2560, 7133, 1, 2560] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [704, 4288, 1, 128] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1408, 5888, 1, 128] + - Exact: [1408, 1024, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 5056, 1, 128] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + 
- Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [1760, 7133, 1, 1760] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 6784, 1, 128] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [2368, 6784, 1, 128] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [1856, 5056, 1, 128] + - Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] 
+ - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [7680, 5481, 1, 2560] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5888, 1408, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [4288, 1024, 1, 128] + - Exact: [196, 256, 256, 1024] + - Exact: [784, 512, 256, 128] + - Exact: [784, 128, 128, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 256, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [3136, 256, 128, 64] + - Exact: [784, 512, 128, 128] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [5329, 160, 64, 64] + - Exact: [1225, 384, 64, 192] + - Exact: [289, 1024, 64, 256] + - Exact: [1225, 384, 64, 64] + - Exact: [1225, 384, 64, 96] + - Exact: [289, 1024, 64, 384] + - Exact: [289, 1024, 64, 192] + - Exact: [289, 1024, 64, 128] + - Exact: [4096, 1024, 1, 2984] + - Exact: [1024, 4096, 1, 3437] + - Exact: [1024, 4096, 1, 3235] + - Exact: [4096, 1024, 1, 4032] + - Exact: [1024, 4096, 1, 
3334] + - Exact: [4096, 1024, 1, 3288] + - Exact: [1024, 4096, 1, 3515] + - Exact: [4096, 1024, 1, 3437] + - Exact: [1024, 4096, 1, 3259] + - Exact: [1024, 4096, 1, 3384] + - Exact: [4096, 1024, 1, 3458] + - Exact: [1024, 4096, 1, 3412] + - Exact: [1024, 4096, 1, 3529] + - Exact: [1024, 4096, 1, 4032] + - Exact: [4096, 1024, 1, 3999] + - Exact: [1024, 4096, 1, 3079] + - Exact: [1024, 4096, 1, 3876] + - Exact: [1024, 4096, 1, 3450] + - Exact: [1024, 4096, 1, 3256] + - Exact: [4096, 1024, 1, 3403] + - Exact: [1024, 4096, 1, 3359] + - Exact: [4096, 1024, 1, 3549] + - Exact: [4096, 1024, 1, 3176] + - Exact: [1024, 4096, 1, 3504] + - Exact: [4096, 1024, 1, 3314] + - Exact: [4096, 1024, 1, 3183] + - Exact: [1024, 4096, 1, 3209] + - Exact: [1024, 4096, 1, 3720] + - Exact: [1024, 4096, 1, 3859] + - Exact: [1024, 33708, 1, 4059] + - Exact: [4096, 1024, 1, 3477] + - Exact: [4096, 1024, 1, 3233] + - Exact: [4096, 1024, 1, 3409] + - Exact: [4096, 1024, 1, 3564] + - Exact: [4096, 1024, 1, 3190] + - Exact: [1024, 4096, 1, 3288] + - Exact: [4096, 1024, 1, 3451] + - Exact: [1024, 4096, 1, 3348] + - Exact: [1024, 4096, 1, 3465] + - Exact: [1024, 33708, 1, 4032] + - Exact: [1024, 33708, 1, 3840] + - Exact: [4096, 1024, 1, 3391] + - Exact: [1024, 4096, 1, 3530] + - Exact: [4096, 1024, 1, 3209] + - Exact: [1024, 4096, 1, 3457] + - Exact: [1024, 4096, 1, 3386] + - Exact: [4096, 1024, 1, 3350] + - Exact: [1024, 4096, 1, 3184] + - Exact: [1024, 4096, 1, 3093] + - Exact: [1024, 4096, 1, 3400] + - Exact: [1024, 4096, 1, 3214] + - Exact: [4096, 1024, 1, 3406] + - Exact: [1024, 4096, 1, 3565] + - Exact: [4096, 1024, 1, 3536] + - Exact: [1024, 4096, 1, 3183] + - Exact: [1024, 4096, 1, 3462] + - Exact: [4096, 1024, 1, 3130] + - Exact: [4096, 1024, 1, 3381] + - Exact: [4096, 1024, 1, 3298] + - Exact: [1024, 4096, 1, 3292] + - Exact: [4096, 1024, 1, 3289] + - Exact: [1024, 4096, 1, 3379] + - Exact: [1024, 4096, 1, 3990] + - Exact: [1024, 4096, 1, 3540] + - Exact: [4096, 1024, 1, 3412] + - Exact: 
[1024, 4096, 1, 3555] + - Exact: [1024, 4096, 1, 3518] + - Exact: [4096, 1024, 1, 3189] + - Exact: [1024, 4096, 1, 3298] + - Exact: [4096, 1024, 1, 3072] + - Exact: [1024, 4096, 1, 3393] + - Exact: [1024, 4096, 1, 3207] + - Exact: [4096, 1024, 1, 3487] + - Exact: [4096, 1024, 1, 3431] + - Exact: [4096, 1024, 1, 3378] + - Exact: [4096, 1024, 1, 3529] + - Exact: [4096, 1024, 1, 3460] + - Exact: [1024, 4096, 1, 3336] + - Exact: [1024, 4096, 1, 3501] + - Exact: [1024, 4096, 1, 3584] + - Exact: [4096, 1024, 1, 2499] + - Exact: [4096, 1024, 1, 3352] + - Exact: [1024, 4096, 1, 3543] + - Exact: [1024, 4096, 1, 3476] + - Exact: [1024, 33708, 1, 3822] + - Exact: [1024, 4096, 1, 3436] + - Exact: [1024, 4096, 1, 3594] + - Exact: [4096, 1024, 1, 3514] + - Exact: [1024, 4096, 1, 3064] + - Exact: [4096, 1024, 1, 3371] + - Exact: [4096, 1024, 1, 3558] + - Exact: [4096, 1024, 1, 3517] + - Exact: [4096, 1024, 1, 3144] + - Exact: [1024, 4096, 1, 3312] + - Exact: [4096, 1024, 1, 3079] + - Exact: [1024, 4096, 1, 3415] + - Exact: [1024, 4096, 1, 3221] + - Exact: [1024, 4096, 1, 3978] + - Exact: [4096, 1024, 1, 3876] + - Exact: [1024, 4096, 1, 3528] + - Exact: [1024, 4096, 1, 3181] + - Exact: [4096, 1024, 1, 3445] + - Exact: [4096, 1024, 1, 3450] + - Exact: [4096, 1024, 1, 3377] + - Exact: [1024, 4096, 1, 3532] + - Exact: [1024, 33708, 1, 3944] + - Exact: [4096, 1024, 1, 3483] + - Exact: [1024, 4096, 1, 3358] + - Exact: [4096, 1024, 1, 3464] + - Exact: [4096, 1024, 1, 3282] + - Exact: [4096, 1024, 1, 3256] + - Exact: [1024, 4096, 1, 3057] + - Exact: [4096, 1024, 1, 3481] + - Exact: [4096, 1024, 1, 3340] + - Exact: [1024, 4096, 1, 3273] + - Exact: [4096, 1024, 1, 3392] + - Exact: [4096, 1024, 1, 3337] + - Exact: [4096, 1024, 1, 3359] + - Exact: [4096, 1024, 1, 3498] + - Exact: [4096, 1024, 1, 3169] + - Exact: [1024, 33708, 1, 3859] + - Exact: [1024, 4096, 1, 3103] + - Exact: [4096, 1024, 1, 3900] + - Exact: [1024, 4096, 1, 3442] + - Exact: [1024, 4096, 1, 3248] + - Exact: [1024, 4096, 1, 
3351] + - Exact: [4096, 1024, 1, 3593] + - Exact: [1024, 4096, 1, 3780] + - Exact: [1024, 33708, 1, 3681] + - Exact: [4096, 1024, 1, 3374] + - Exact: [1024, 4096, 1, 3557] + - Exact: [4096, 1024, 1, 3906] + - Exact: [4096, 1024, 1, 3504] + - Exact: [1024, 4096, 1, 3270] + - Exact: [4096, 1024, 1, 3098] + - Exact: [4096, 1024, 1, 3216] + - Exact: [1024, 4096, 1, 3550] + - Exact: [4096, 1024, 1, 3449] + - Exact: [1024, 4096, 1, 3403] + - Exact: [1024, 4096, 1, 3523] + - Exact: [1024, 4096, 1, 3486] + - Exact: [1024, 4096, 1, 3564] + - Exact: [1024, 33708, 1, 4005] + - Exact: [4096, 1024, 1, 3296] + - Exact: [1024, 4096, 1, 3263] + - Exact: [1024, 4096, 1, 3130] + - Exact: [1024, 4096, 1, 3295] + - Exact: [1024, 33708, 1, 3925] + - Exact: [1024, 4096, 1, 3378] + - Exact: [4096, 1024, 1, 3720] + - Exact: [4096, 1024, 1, 3399] + - Exact: [4096, 1024, 1, 3543] + - Exact: [4096, 1024, 1, 3497] + - Exact: [4096, 1024, 1, 3594] + - Exact: [1024, 4096, 1, 3144] + - Exact: [1024, 4096, 1, 3975] + - Exact: [4096, 1024, 1, 3205] + - Exact: [1024, 33708, 1, 3995] + - Exact: [1024, 4096, 1, 3392] + - Exact: [1024, 4096, 1, 3055] + - Exact: [1024, 4096, 1, 4026] + - Exact: [4096, 1024, 1, 3557] + - Exact: [4096, 1024, 1, 3515] + - Exact: [4096, 1024, 1, 3486] + - Exact: [4096, 1024, 1, 3457] + - Exact: [1024, 4096, 1, 3511] + - Exact: [4096, 1024, 1, 3138] + - Exact: [1024, 4096, 1, 3339] + - Exact: [1024, 4096, 1, 3939] + - Exact: [4096, 1024, 1, 3500] + - Exact: [4096, 1024, 1, 3395] + - Exact: [4096, 1024, 1, 4020] + - Exact: [4096, 1024, 1, 3942] + - Exact: [4096, 1024, 1, 3349] + - Exact: [1024, 4096, 1, 3322] + - Exact: [4096, 1024, 1, 3452] + - Exact: [1024, 4096, 1, 3417] + - Exact: [1024, 4096, 1, 3526] + - Exact: [4096, 1024, 1, 3485] + - Exact: [4096, 1024, 1, 3303] + - Exact: [4096, 1024, 1, 3344] + - Exact: [1024, 4096, 1, 3479] + - Exact: [4096, 1024, 1, 3300] + - Exact: [1024, 4096, 1, 3439] + - Exact: [4096, 1024, 1, 3280] + - Exact: [1024, 4096, 1, 3245] + - 
Exact: [1024, 4096, 1, 3328] + - Exact: [4096, 1024, 1, 3418] + - Exact: [1024, 4096, 1, 3493] + - Exact: [1024, 4096, 1, 3500] + - Exact: [1024, 4096, 1, 3166] + - Exact: [4096, 1024, 1, 3126] + - Exact: [1024, 4096, 1, 3277] + - Exact: [1024, 4096, 1, 3315] + - Exact: [1024, 4096, 1, 3414] + - Exact: [4096, 1024, 1, 3531] + - Exact: [4096, 1024, 1, 3484] + - Exact: [1024, 4096, 1, 3180] + - Exact: [4096, 1024, 1, 3360] + - Exact: [1024, 33708, 1, 3990] + - Exact: [4096, 1024, 1, 3466] + - Exact: [1024, 4096, 1, 3428] + - Exact: [1024, 4096, 1, 3137] + - Exact: [4096, 1024, 1, 4059] + - Exact: [1024, 4096, 1, 3353] + - Exact: [1024, 4096, 1, 3942] + - Exact: [4096, 1024, 1, 3506] + - Exact: [4096, 1024, 1, 3508] + - Exact: [4096, 1024, 1, 3956] + - Exact: [1024, 4096, 1, 3272] + - Exact: [1024, 4096, 1, 3443] + - Exact: [1024, 4096, 1, 3375] + - Exact: [1024, 4096, 1, 3525] + - Exact: [4096, 1024, 1, 3472] + - Exact: [1024, 4096, 1, 3520] + - Exact: [4096, 1024, 1, 3322] + - Exact: [4096, 1024, 1, 3387] + - Exact: [1024, 33708, 1, 3939] + - Exact: [4096, 1024, 1, 3345] + - Exact: [4096, 1024, 1, 2967] + - Exact: [1024, 4096, 1, 3453] + - Exact: [1024, 4096, 1, 3640] + - Exact: [4096, 1024, 1, 3291] + - Exact: [1024, 4096, 1, 3350] + - Exact: [4096, 1024, 1, 3417] + - Exact: [1024, 4096, 1, 3467] + - Exact: [1024, 4096, 1, 3491] + - Exact: [1024, 4096, 1, 3822] + - Exact: [4096, 1024, 1, 3292] + - Exact: [1024, 4096, 1, 3231] + - Exact: [1024, 4096, 1, 3364] + - Exact: [1024, 4096, 1, 3995] + - Exact: [1024, 4096, 1, 3545] + - Exact: [1024, 4096, 1, 3186] + - Exact: [4096, 1024, 1, 3432] + - Exact: [4096, 1024, 1, 3367] + - Exact: [4096, 1024, 1, 3503] + - Exact: [1024, 4096, 1, 3095] + - Exact: [4096, 1024, 1, 3465] + - Exact: [1024, 4096, 1, 3402] + - Exact: [4096, 1024, 1, 3140] + - Exact: [4096, 1024, 1, 3424] + - Exact: [4096, 1024, 1, 3257] + - Exact: [4096, 1024, 1, 2917] + - Exact: [1024, 33708, 1, 3640] + - Exact: [1024, 4096, 1, 3456] + - Exact: [1024, 
4096, 1, 3014] + - Exact: [4096, 1024, 1, 3372] + - Exact: [1024, 4096, 1, 3294] + - Exact: [4096, 1024, 1, 3446] + - Exact: [1024, 4096, 1, 3389] + - Exact: [4096, 1024, 1, 3259] + - Exact: [4096, 1024, 1, 3544] + - Exact: [4096, 1024, 1, 3479] + - Exact: [4096, 1024, 1, 3542] + - Exact: [4096, 1024, 1, 3321] + - Exact: [1024, 4096, 1, 3147] + - Exact: [1024, 4096, 1, 3944] + - Exact: [4096, 1024, 1, 3870] + - Exact: [1024, 4096, 1, 3308] + - Exact: [4096, 1024, 1, 3401] + - Exact: [1024, 4096, 1, 3395] + - Exact: [1024, 4096, 1, 3563] + - Exact: [1024, 33708, 1, 3870] + - Exact: [4096, 1024, 1, 3494] + - Exact: [1024, 4096, 1, 3271] + - Exact: [1024, 33708, 1, 3910] + - Exact: [1024, 4096, 1, 3287] + - Exact: [1024, 33708, 1, 3860] + - Exact: [4096, 1024, 1, 3341] + - Exact: [1024, 4096, 1, 3136] + - Exact: [4096, 1024, 1, 3439] + - Exact: [1024, 4096, 1, 3751] + - Exact: [1024, 4096, 1, 3301] + - Exact: [4096, 1024, 1, 3468] + - Exact: [1024, 4096, 1, 3416] + - Exact: [4096, 1024, 1, 3163] + - Exact: [1024, 4096, 1, 3230] + - Exact: [1024, 4096, 1, 3581] + - Exact: [4096, 1024, 1, 3463] + - Exact: [1024, 4096, 1, 3478] + - Exact: [4096, 1024, 1, 3262] + - Exact: [1024, 4096, 1, 3438] + - Exact: [1024, 4096, 1, 3244] + - Exact: [1024, 4096, 1, 3445] + - Exact: [4096, 1024, 1, 3328] + - Exact: [1024, 4096, 1, 3492] + - Exact: [4096, 1024, 1, 3211] + - Exact: [1024, 4096, 1, 3910] + - Exact: [1024, 4096, 1, 3314] + - Exact: [4096, 1024, 1, 3859] + - Exact: [4096, 1024, 1, 3383] + - Exact: [1024, 4096, 1, 3409] + - Exact: [1024, 4096, 1, 4020] + - Exact: [4096, 1024, 1, 3530] + - Exact: [4096, 1024, 1, 3411] + - Exact: [1024, 4096, 1, 3566] + - Exact: [4096, 1024, 1, 3493] + - Exact: [4096, 1024, 1, 3184] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1024, 4096, 1, 3431] + - Exact: [4096, 1024, 1, 3306] + - Exact: [1024, 4096, 1, 3352] + - Exact: [4096, 1024, 1, 3295] + - Exact: [1024, 4096, 1, 3517] + - Exact: [4096, 1024, 1, 3426] + - Exact: [4096, 1024, 1, 3385] + 
- Exact: [4096, 1024, 1, 3572] + - Exact: [4096, 1024, 1, 3459] + - Exact: [1024, 4096, 1, 3374] + - Exact: [4096, 1024, 1, 3166] + - Exact: [4096, 1024, 1, 3093] + - Exact: [4096, 1024, 1, 3523] + - Exact: [4096, 1024, 1, 3413] + - Exact: [1024, 4096, 1, 3996] + - Exact: [1024, 4096, 1, 3452] + - Exact: [4096, 1024, 1, 3232] + - Exact: [4096, 1024, 1, 3400] + - Exact: [4096, 1024, 1, 3334] + - Exact: [1024, 4096, 1, 3345] + - Exact: [1024, 4096, 1, 3538] + - Exact: [1024, 4096, 1, 3466] + - Exact: [4096, 1024, 1, 3315] + - Exact: [4096, 1024, 1, 3214] + - Exact: [1024, 33708, 1, 3900] + - Exact: [1024, 4096, 1, 3367] + - Exact: [1024, 4096, 1, 2917] + - Exact: [1024, 4096, 1, 3544] + - Exact: [4096, 1024, 1, 3414] + - Exact: [4096, 1024, 1, 3565] + - Exact: [1024, 4096, 1, 3512] + - Exact: [1024, 4096, 1, 3191] + - Exact: [1024, 4096, 1, 3289] + - Exact: [4096, 1024, 1, 3290] + - Exact: [1024, 4096, 1, 3211] + - Exact: [1024, 33708, 1, 3969] + - Exact: [4096, 1024, 1, 3566] + - Exact: [1024, 4096, 1, 3459] + - Exact: [1024, 4096, 1, 3372] + - Exact: [4096, 1024, 1, 3339] + - Exact: [4096, 1024, 1, 3425] + - Exact: [4096, 1024, 1, 3388] + - Exact: [1024, 4096, 1, 3531] + - Exact: [4096, 1024, 1, 3286] + - Exact: [4096, 1024, 1, 3462] + - Exact: [1024, 4096, 1, 3388] + - Exact: [4096, 1024, 1, 3165] + - Exact: [4096, 1024, 1, 3304] + - Exact: [1024, 4096, 1, 2736] + - Exact: [4096, 1024, 1, 3397] + - Exact: [1024, 4096, 1, 3311] + - Exact: [1024, 4096, 1, 3394] + - Exact: [4096, 1024, 1, 2736] + - Exact: [1024, 4096, 1, 3559] + - Exact: [4096, 1024, 1, 3180] + - Exact: [1024, 4096, 1, 3480] + - Exact: [4096, 1024, 1, 3318] + - Exact: [4096, 1024, 1, 3213] + - Exact: [1024, 4096, 1, 3286] + - Exact: [4096, 1024, 1, 3471] + - Exact: [1024, 4096, 1, 3381] + - Exact: [4096, 1024, 1, 3502] + - Exact: [1024, 4096, 1, 3552] + - Exact: [4096, 1024, 1, 3519] + - Exact: [1024, 4096, 1, 3300] + - Exact: [1024, 4096, 1, 3419] + - Exact: [4096, 1024, 1, 4030] + - Exact: [4096, 
1024, 1, 3976] + - Exact: [1024, 4096, 1, 3473] + - Exact: [4096, 1024, 1, 3428] + - Exact: [1024, 4096, 1, 3433] + - Exact: [4096, 1024, 1, 3534] + - Exact: [4096, 1024, 1, 3461] + - Exact: [4096, 1024, 1, 3681] + - Exact: [4096, 1024, 1, 3495] + - Exact: [4096, 1024, 1, 3351] + - Exact: [1024, 4096, 1, 4059] + - Exact: [4096, 1024, 1, 3990] + - Exact: [1024, 4096, 1, 3325] + - Exact: [1024, 4096, 1, 3408] + - Exact: [4096, 1024, 1, 3394] + - Exact: [1024, 4096, 1, 3573] + - Exact: [4096, 1024, 1, 3386] + - Exact: [4096, 1024, 1, 3540] + - Exact: [1024, 4096, 1, 3182] + - Exact: [1024, 4096, 1, 3430] + - Exact: [1024, 4096, 1, 3236] + - Exact: [4096, 1024, 1, 2977] + - Exact: [1024, 4096, 1, 3355] + - Exact: [4096, 1024, 1, 3139] + - Exact: [4096, 1024, 1, 3516] + - Exact: [4096, 1024, 1, 3368] + - Exact: [4096, 1024, 1, 3559] + - Exact: [1024, 4096, 1, 3506] + - Exact: [1024, 4096, 1, 3145] + - Exact: [1024, 4096, 1, 3369] + - Exact: [4096, 1024, 1, 3522] + - Exact: [1024, 33708, 1, 3894] + - Exact: [4096, 1024, 1, 3336] + - Exact: [1024, 4096, 1, 3382] + - Exact: [4096, 1024, 1, 3533] + - Exact: [4096, 1024, 1, 4050] + - Exact: [4096, 1024, 1, 3480] + - Exact: [1024, 4096, 1, 3344] + - Exact: [1024, 4096, 1, 3509] + - Exact: [1024, 4096, 1, 3956] + - Exact: [4096, 1024, 1, 3616] + - Exact: [1024, 4096, 1, 3366] + - Exact: [4096, 1024, 1, 2935] + - Exact: [4096, 1024, 1, 3393] + - Exact: [4096, 1024, 1, 3547] + - Exact: [1024, 4096, 1, 3499] + - Exact: [4096, 1024, 1, 3357] + - Exact: [4096, 1024, 1, 3272] + - Exact: [4096, 1024, 1, 3207] + - Exact: [4096, 1024, 1, 3894] + - Exact: [1024, 4096, 1, 3444] + - Exact: [4096, 1024, 1, 3561] + - Exact: [4096, 1024, 1, 3376] + - Exact: [1024, 4096, 1, 3458] + - Exact: [4096, 1024, 1, 3231] + - Exact: [1024, 4096, 1, 3505] + - Exact: [4096, 1024, 1, 3277] + - Exact: [1024, 4096, 1, 3391] + - Exact: [1024, 4096, 1, 3536] + - Exact: [1024, 4096, 1, 3063] + - Exact: [1024, 4096, 1, 3189] + - Exact: [1024, 4096, 1, 2505] + - 
Exact: [4096, 1024, 1, 3454] + - Exact: [1024, 4096, 1, 3405] + - Exact: [1024, 33708, 1, 4050] + - Exact: [4096, 1024, 1, 3520] + - Exact: [1024, 4096, 1, 3487] + - Exact: [1024, 4096, 1, 3558] + - Exact: [4096, 1024, 1, 3297] + - Exact: [1024, 4096, 1, 3483] + - Exact: [1024, 33708, 1, 3751] + - Exact: [4096, 1024, 1, 3380] + - Exact: [1024, 4096, 1, 3380] + - Exact: [1024, 4096, 1, 3396] + - Exact: [1024, 4096, 1, 3497] + - Exact: [1024, 4096, 1, 3502] + - Exact: [1024, 4096, 1, 3138] + - Exact: [4096, 1024, 1, 3939] + - Exact: [1024, 4096, 1, 3303] + - Exact: [1024, 4096, 1, 3418] + - Exact: [1024, 4096, 1, 3224] + - Exact: [4096, 1024, 1, 3978] + - Exact: [1024, 4096, 1, 3472] + - Exact: [4096, 1024, 1, 3353] + - Exact: [4096, 1024, 1, 3362] + - Exact: [1024, 33708, 1, 3978] + - Exact: [1024, 4096, 1, 3432] + - Exact: [1024, 4096, 1, 3139] + - Exact: [1024, 4096, 1, 3341] + - Exact: [1024, 4096, 1, 3494] + - Exact: [1024, 4096, 1, 3969] + - Exact: [1024, 4096, 1, 3163] + - Exact: [4096, 1024, 1, 3405] + - Exact: [4096, 1024, 1, 3453] + - Exact: [1024, 4096, 1, 3411] + - Exact: [1024, 4096, 1, 3527] + - Exact: [4096, 1024, 1, 3474] + - Exact: [1024, 4096, 1, 3572] + - Exact: [4096, 1024, 1, 3293] + - Exact: [4096, 1024, 1, 3247] + - Exact: [1024, 4096, 1, 3425] + - Exact: [1024, 4096, 1, 3354] + - Exact: [4096, 1024, 1, 3382] + - Exact: [4096, 1024, 1, 3236] + - Exact: [1024, 4096, 1, 3519] + - Exact: [4096, 1024, 1, 3354] + - Exact: [4096, 1024, 1, 3501] + - Exact: [4096, 1024, 1, 3266] + - Exact: [1024, 4096, 1, 3368] + - Exact: [1024, 4096, 1, 4030] + - Exact: [1024, 4096, 1, 3533] + - Exact: [4096, 1024, 1, 3332] + - Exact: [4096, 1024, 1, 3584] + - Exact: [1024, 4096, 1, 3616] + - Exact: [4096, 1024, 1, 3265] + - Exact: [4096, 1024, 1, 3361] + - Exact: [4096, 1024, 1, 3467] + - Exact: [1024, 4096, 1, 3454] + - Exact: [1024, 4096, 1, 3101] + - Exact: [1024, 4096, 1, 3508] + - Exact: [4096, 1024, 1, 3267] + - Exact: [4096, 1024, 1, 3419] + - Exact: [4096, 
1024, 1, 3822] + - Exact: [1024, 4096, 1, 3266] + - Exact: [4096, 1024, 1, 3440] + - Exact: [1024, 4096, 1, 3361] + - Exact: [1024, 4096, 1, 3546] + - Exact: [4096, 1024, 1, 3473] + - Exact: [4096, 1024, 1, 3546] + - Exact: [1024, 4096, 1, 3088] + - Exact: [1024, 4096, 1, 3535] + - Exact: [1024, 4096, 1, 3447] + - Exact: [1024, 4096, 1, 3560] + - Exact: [1024, 4096, 1, 3422] + - Exact: [1024, 4096, 1, 3469] + - Exact: [4096, 1024, 1, 3488] + - Exact: [1024, 4096, 1, 3110] + - Exact: [1024, 4096, 1, 3265] + - Exact: [1024, 4096, 1, 3291] + - Exact: [1024, 4096, 1, 3390] + - Exact: [4096, 1024, 1, 3046] + - Exact: [1024, 4096, 1, 3539] + - Exact: [4096, 1024, 1, 3221] + - Exact: [4096, 1024, 1, 3433] + - Exact: [4096, 1024, 1, 3364] + - Exact: [4096, 1024, 1, 3470] + - Exact: [1024, 4096, 1, 3404] + - Exact: [1024, 33708, 1, 3968] + - Exact: [4096, 1024, 1, 3088] + - Exact: [1024, 4096, 1, 3247] + - Exact: [1024, 33708, 1, 3996] + - Exact: [4096, 1024, 1, 3482] + - Exact: [4096, 1024, 1, 3995] + - Exact: [1024, 4096, 1, 3280] + - Exact: [4096, 1024, 1, 3271] + - Exact: [4096, 1024, 1, 3545] + - Exact: [4096, 1024, 1, 3476] + - Exact: [4096, 1024, 1, 3496] + - Exact: [4096, 1024, 1, 3191] + - Exact: [4096, 1024, 1, 3311] + - Exact: [1024, 4096, 1, 3302] + - Exact: [1024, 4096, 1, 3681] + - Exact: [4096, 1024, 1, 3582] + - Exact: [4096, 1024, 1, 3421] + - Exact: [4096, 1024, 1, 3560] + - Exact: [1024, 4096, 1, 3495] + - Exact: [4096, 1024, 1, 3186] + - Exact: [4096, 1024, 1, 3925] + - Exact: [1024, 4096, 1, 3435] + - Exact: [4096, 1024, 1, 3434] + - Exact: [1024, 33708, 1, 4012] + - Exact: [1024, 4096, 1, 3340] + - Exact: [4096, 1024, 1, 3489] + - Exact: [1024, 4096, 1, 3162] + - Exact: [4096, 1024, 1, 3436] + - Exact: [4096, 1024, 1, 3574] + - Exact: [4096, 1024, 1, 3469] + - Exact: [1024, 4096, 1, 3410] + - Exact: [1024, 4096, 1, 3216] + - Exact: [4096, 1024, 1, 3095] + - Exact: [4096, 1024, 1, 3448] + - Exact: [1024, 4096, 1, 3176] + - Exact: [4096, 1024, 1, 2918] + 
- Exact: [1024, 4096, 1, 3424] + - Exact: [4096, 1024, 1, 3402] + - Exact: [4096, 1024, 1, 3145] + - Exact: [1024, 33708, 1, 3976] + - Exact: [4096, 1024, 1, 3518] + - Exact: [4096, 1024, 1, 3110] + - Exact: [4096, 1024, 1, 3325] + - Exact: [1024, 33708, 1, 3999] + - Exact: [4096, 1024, 1, 2985] + - Exact: [1024, 4096, 1, 3371] + - Exact: [4096, 1024, 1, 3342] + - Exact: [4096, 1024, 1, 3141] + - Exact: [4096, 1024, 1, 3532] + - Exact: [1024, 4096, 1, 3169] + - Exact: [1024, 4096, 1, 3514] + - Exact: [4096, 1024, 1, 3780] + - Exact: [1024, 4096, 1, 3098] + - Exact: [1024, 4096, 1, 3449] + - Exact: [1024, 4096, 1, 3222] + - Exact: [1024, 4096, 1, 3346] + - Exact: [4096, 1024, 1, 3064] + - Exact: [4096, 1024, 1, 3511] + - Exact: [4096, 1024, 1, 3384] + - Exact: [4096, 1024, 1, 3356] + - Exact: [1024, 4096, 1, 3796] + - Exact: [4096, 1024, 1, 3427] + - Exact: [4096, 1024, 1, 3390] + - Exact: [4096, 1024, 1, 3573] + - Exact: [4096, 1024, 1, 3456] + - Exact: [1024, 4096, 1, 3360] + - Exact: [1024, 33708, 1, 3977] + - Exact: [1024, 4096, 1, 2918] + - Exact: [4096, 1024, 1, 3975] + - Exact: [4096, 1024, 1, 3525] + - Exact: [4096, 1024, 1, 3398] + - Exact: [4096, 1024, 1, 3640] + - Exact: [4096, 1024, 1, 3014] + - Exact: [1024, 4096, 1, 3446] + - Exact: [1024, 33708, 1, 3796] + - Exact: [4096, 1024, 1, 3101] + - Exact: [4096, 1024, 1, 3563] + - Exact: [4096, 1024, 1, 3539] + - Exact: [4096, 1024, 1, 3182] + - Exact: [1024, 4096, 1, 3468] + - Exact: [4096, 1024, 1, 3312] + - Exact: [4096, 1024, 1, 3215] + - Exact: [4096, 1024, 1, 3910] + - Exact: [1024, 33708, 1, 3780] + - Exact: [1024, 4096, 1, 3290] + - Exact: [1024, 4096, 1, 4012] + - Exact: [1024, 4096, 1, 3385] + - Exact: [1024, 33708, 1, 3975] + - Exact: [4096, 1024, 1, 3996] + - Exact: [4096, 1024, 1, 2765] + - Exact: [4096, 1024, 1, 3538] + - Exact: [4096, 1024, 1, 3415] + - Exact: [1024, 4096, 1, 3554] + - Exact: [4096, 1024, 1, 3513] + - Exact: [1024, 4096, 1, 3304] + - Exact: [4096, 1024, 1, 3294] + - Exact: 
[4096, 1024, 1, 3396] + - Exact: [1024, 4096, 1, 3213] + - Exact: [4096, 1024, 1, 3137] + - Exact: [4096, 1024, 1, 3552] + - Exact: [1024, 4096, 1, 3461] + - Exact: [4096, 1024, 1, 3263] + - Exact: [4096, 1024, 1, 3430] + - Exact: [4096, 1024, 1, 3389] + - Exact: [4096, 1024, 1, 3528] + - Exact: [1024, 4096, 1, 3463] + - Exact: [4096, 1024, 1, 3526] + - Exact: [4096, 1024, 1, 3154] + - Exact: [4096, 1024, 1, 3499] + - Exact: [4096, 1024, 1, 3955] + - Exact: [1024, 4096, 1, 3297] + - Exact: [1024, 4096, 1, 3233] + - Exact: [1024, 4096, 1, 3226] + - Exact: [4096, 1024, 1, 3404] + - Exact: [4096, 1024, 1, 3355] + - Exact: [1024, 4096, 1, 3542] + - Exact: [4096, 1024, 1, 3181] + - Exact: [1024, 4096, 1, 3474] + - Exact: [4096, 1024, 1, 3319] + - Exact: [1024, 4096, 1, 3434] + - Exact: [1024, 4096, 1, 3860] + - Exact: [1024, 4096, 1, 3343] + - Exact: [1024, 4096, 1, 3488] + - Exact: [1024, 4096, 1, 3046] + - Exact: [1024, 4096, 1, 3141] + - Exact: [1024, 4096, 1, 3516] + - Exact: [4096, 1024, 1, 3147] + - Exact: [1024, 4096, 1, 3421] + - Exact: [4096, 1024, 1, 3944] + - Exact: [1024, 4096, 1, 3574] + - Exact: [1024, 4096, 1, 3977] + - Exact: [1024, 4096, 1, 2985] + - Exact: [1024, 4096, 1, 3427] + - Exact: [1024, 4096, 1, 3482] + - Exact: [1024, 4096, 1, 3332] + - Exact: [4096, 1024, 1, 3308] + - Exact: [1024, 4096, 1, 3513] + - Exact: [1024, 4096, 1, 3154] + - Exact: [1024, 4096, 1, 3955] + - Exact: [1024, 4096, 1, 2967] + - Exact: [1024, 33708, 1, 3942] + - Exact: [1024, 4096, 1, 3319] + - Exact: [4096, 1024, 1, 3860] + - Exact: [1024, 4096, 1, 3548] + - Exact: [4096, 1024, 1, 3977] + - Exact: [4096, 1024, 1, 3535] + - Exact: [1024, 4096, 1, 3541] + - Exact: [1024, 33708, 1, 3584] + - Exact: [1024, 4096, 1, 3168] + - Exact: [1024, 4096, 1, 3448] + - Exact: [4096, 1024, 1, 3343] + - Exact: [1024, 4096, 1, 3357] + - Exact: [4096, 1024, 1, 3510] + - Exact: [4096, 1024, 1, 3369] + - Exact: [4096, 1024, 1, 3379] + - Exact: [1024, 4096, 1, 3276] + - Exact: [1024, 4096, 1, 
3363] + - Exact: [4096, 1024, 1, 3055] + - Exact: [1024, 4096, 1, 3524] + - Exact: [4096, 1024, 1, 3057] + - Exact: [1024, 33708, 1, 3720] + - Exact: [1024, 4096, 1, 3383] + - Exact: [1024, 4096, 1, 3522] + - Exact: [1024, 33708, 1, 3956] + - Exact: [1024, 4096, 1, 3481] + - Exact: [4096, 1024, 1, 3562] + - Exact: [4096, 1024, 1, 3299] + - Exact: [1024, 4096, 1, 3262] + - Exact: [1024, 33708, 1, 4026] + - Exact: [4096, 1024, 1, 3168] + - Exact: [1024, 4096, 1, 3999] + - Exact: [1024, 4096, 1, 3549] + - Exact: [4096, 1024, 1, 3375] + - Exact: [1024, 4096, 1, 3496] + - Exact: [1024, 4096, 1, 3190] + - Exact: [4096, 1024, 1, 3273] + - Exact: [1024, 4096, 1, 3406] + - Exact: [4096, 1024, 1, 4005] + - Exact: [4096, 1024, 1, 3555] + - Exact: [4096, 1024, 1, 2505] + - Exact: [1024, 4096, 1, 3460] + - Exact: [1024, 4096, 1, 3579] + - Exact: [1024, 33708, 1, 4030] + - Exact: [1024, 4096, 1, 3510] + - Exact: [1024, 4096, 1, 3282] + - Exact: [1024, 4096, 1, 3377] + - Exact: [1024, 4096, 1, 2935] + - Exact: [1024, 4096, 1, 3498] + - Exact: [1024, 4096, 1, 3593] + - Exact: [4096, 1024, 1, 3226] + - Exact: [1024, 4096, 1, 2499] + - Exact: [1024, 4096, 1, 3296] + - Exact: [1024, 4096, 1, 3455] + - Exact: [1024, 4096, 1, 3399] + - Exact: [1024, 4096, 1, 3205] + - Exact: [4096, 1024, 1, 4026] + - Exact: [1024, 4096, 1, 3484] + - Exact: [4096, 1024, 1, 3302] + - Exact: [1024, 4096, 1, 3485] + - Exact: [1024, 4096, 1, 3126] + - Exact: [1024, 4096, 1, 4050] + - Exact: [4096, 1024, 1, 3235] + - Exact: [1024, 33708, 1, 3955] + - Exact: [1024, 4096, 1, 3342] + - Exact: [1024, 4096, 1, 3397] + - Exact: [4096, 1024, 1, 3491] + - Exact: [1024, 4096, 1, 3503] + - Exact: [1024, 4096, 1, 3140] + - Exact: [4096, 1024, 1, 3121] + - Exact: [4096, 1024, 1, 3276] + - Exact: [1024, 4096, 1, 3321] + - Exact: [1024, 4096, 1, 3870] + - Exact: [4096, 1024, 1, 3475] + - Exact: [1024, 4096, 1, 2984] + - Exact: [4096, 1024, 1, 3363] + - Exact: [1024, 4096, 1, 3582] + - Exact: [4096, 1024, 1, 3509] + - 
Exact: [1024, 4096, 1, 3426] + - Exact: [4096, 1024, 1, 3136] + - Exact: [1024, 4096, 1, 3232] + - Exact: [4096, 1024, 1, 3103] + - Exact: [1024, 4096, 1, 3335] + - Exact: [1024, 4096, 1, 3900] + - Exact: [4096, 1024, 1, 3512] + - Exact: [4096, 1024, 1, 3222] + - Exact: [1024, 4096, 1, 3165] + - Exact: [4096, 1024, 1, 3408] + - Exact: [4096, 1024, 1, 3751] + - Exact: [1024, 4096, 1, 3318] + - Exact: [4096, 1024, 1, 3442] + - Exact: [1024, 4096, 1, 3413] + - Exact: [4096, 1024, 1, 3524] + - Exact: [1024, 4096, 1, 3976] + - Exact: [1024, 4096, 1, 3475] + - Exact: [1024, 4096, 1, 3534] + - Exact: [4096, 1024, 1, 3301] + - Exact: [4096, 1024, 1, 3248] + - Exact: [1024, 4096, 1, 2977] + - Exact: [4096, 1024, 1, 3346] + - Exact: [1024, 4096, 1, 3451] + - Exact: [1024, 4096, 1, 3257] + - Exact: [1024, 4096, 1, 3356] + - Exact: [4096, 1024, 1, 3348] + - Exact: [4096, 1024, 1, 3335] + - Exact: [4096, 1024, 1, 3505] + - Exact: [1024, 4096, 1, 3490] + - Exact: [4096, 1024, 1, 3447] + - Exact: [1024, 4096, 1, 3267] + - Exact: [4096, 1024, 1, 3230] + - Exact: [4096, 1024, 1, 3455] + - Exact: [1024, 4096, 1, 3925] + - Exact: [1024, 4096, 1, 3362] + - Exact: [4096, 1024, 1, 3969] + - Exact: [4096, 1024, 1, 3527] + - Exact: [1024, 4096, 1, 3585] + - Exact: [4096, 1024, 1, 3063] + - Exact: [4096, 1024, 1, 3435] + - Exact: [4096, 1024, 1, 3366] + - Exact: [4096, 1024, 1, 3581] + - Exact: [1024, 33708, 1, 3906] + - Exact: [1024, 4096, 1, 3464] + - Exact: [1024, 4096, 1, 3440] + - Exact: [4096, 1024, 1, 3143] + - Exact: [1024, 4096, 1, 3349] + - Exact: [4096, 1024, 1, 3416] + - Exact: [4096, 1024, 1, 3365] + - Exact: [1024, 4096, 1, 3470] + - Exact: [4096, 1024, 1, 3287] + - Exact: [1024, 4096, 1, 3441] + - Exact: [4096, 1024, 1, 3224] + - Exact: [1024, 4096, 1, 3387] + - Exact: [1024, 4096, 1, 3547] + - Exact: [4096, 1024, 1, 3478] + - Exact: [4096, 1024, 1, 3548] + - Exact: [1024, 33708, 1, 4020] + - Exact: [4096, 1024, 1, 3320] + - Exact: [1024, 4096, 1, 3906] + - Exact: [4096, 
1024, 1, 3796] + - Exact: [1024, 4096, 1, 3306] + - Exact: [1024, 4096, 1, 3401] + - Exact: [1024, 4096, 1, 3215] + - Exact: [4096, 1024, 1, 4012] + - Exact: [1024, 4096, 1, 2765] + - Exact: [4096, 1024, 1, 3554] + - Exact: [4096, 1024, 1, 3423] + - Exact: [1024, 4096, 1, 3562] + - Exact: [1024, 4096, 1, 3489] + - Exact: [4096, 1024, 1, 3358] + - Exact: [4096, 1024, 1, 3270] + - Exact: [1024, 4096, 1, 3293] + - Exact: [1024, 4096, 1, 3376] + - Exact: [4096, 1024, 1, 3245] + - Exact: [4096, 1024, 1, 3541] + - Exact: [4096, 1024, 1, 3443] + - Exact: [4096, 1024, 1, 3438] + - Exact: [4096, 1024, 1, 3244] + - Exact: [1024, 4096, 1, 3365] + - Exact: [1024, 4096, 1, 3299] + - Exact: [1024, 4096, 1, 3471] + - Exact: [1024, 4096, 1, 3398] + - Exact: [4096, 1024, 1, 3162] + - Exact: [1024, 4096, 1, 4005] + - Exact: [4096, 1024, 1, 3579] + - Exact: [1024, 4096, 1, 3121] + - Exact: [4096, 1024, 1, 3441] + - Exact: [4096, 1024, 1, 3422] + - Exact: [4096, 1024, 1, 3444] + - Exact: [1024, 4096, 1, 3337] + - Exact: [4096, 1024, 1, 3550] + - Exact: [1024, 4096, 1, 3477] + - Exact: [4096, 1024, 1, 3490] + - Exact: [4096, 1024, 1, 3585] + - Exact: [1024, 4096, 1, 3143] + - Exact: [1024, 33708, 1, 3876] + - Exact: [1024, 4096, 1, 3320] + - Exact: [1024, 4096, 1, 3423] + - Exact: [1024, 4096, 1, 3894] + - Exact: [4096, 1024, 1, 3410] + - Exact: [1024, 4096, 1, 3561] + - Exact: [4096, 1024, 1, 3492] + - Exact: [36548, 1024, 1, 3712] + - Exact: [4096, 2048, 1, 128] + - Exact: [4096, 3072, 1, 128] + - Exact: [768, 3072, 1, 4096] + - Exact: [768, 30522, 1, 1280] + - Exact: [768, 30522, 1, 320] + - Exact: [768, 30522, 1, 640] + - Exact: [256, 512, 36, 98] + - Exact: [256, 256, 64, 56] + - Exact: [512, 486, 36, 800] + - Exact: [512, 512, 36, 1568] + - Exact: [256, 384, 36, 4096] + - Exact: [128, 256, 64, 32] + - Exact: [128, 256, 64, 9] + - Exact: [256, 512, 36, 784] + - Exact: [256, 324, 36, 32] + - Exact: [512, 512, 36, 33] + - Exact: [192, 384, 64, 128] + - Exact: [512, 512, 64, 72] + - 
Exact: [512, 512, 36, 128] + - Exact: [192, 384, 64, 2304] + - Exact: [384, 256, 64, 450] + - Exact: [384, 256, 64, 2304] + - Exact: [512, 512, 64, 144] + - Exact: [256, 256, 36, 6272] + - Exact: [256, 384, 64, 2304] + - Exact: [512, 512, 36, 66] + - Exact: [128, 256, 64, 800] + - Exact: [192, 256, 36, 512] + - Exact: [256, 512, 64, 200] + - Exact: [256, 512, 64, 25] + - Exact: [128, 256, 36, 1568] + - Exact: [128, 256, 64, 288] + - Exact: [256, 384, 64, 1152] + - Exact: [160, 320, 64, 288] + - Exact: [128, 256, 36, 128] + - Exact: [512, 512, 36, 16] + - Exact: [384, 256, 36, 800] + - Exact: [192, 384, 36, 4096] + - Exact: [256, 384, 64, 576] + - Exact: [512, 512, 64, 14] + - Exact: [512, 512, 36, 8] + - Exact: [512, 486, 64, 128] + - Exact: [256, 256, 36, 128] + - Exact: [256, 256, 36, 32] + - Exact: [192, 256, 64, 288] + - Exact: [256, 256, 36, 16] + - Exact: [128, 256, 36, 3200] + - Exact: [160, 320, 64, 512] + - Exact: [160, 320, 36, 512] + - Exact: [256, 512, 36, 4] + - Exact: [256, 324, 64, 1568] + - Exact: [256, 256, 36, 3200] + - Exact: [256, 256, 36, 210] + - Exact: [192, 384, 64, 576] + - Exact: [512, 512, 64, 800] + - Exact: [256, 256, 64, 1152] + - Exact: [512, 486, 64, 512] + - Exact: [256, 512, 64, 1600] + - Exact: [512, 512, 64, 9] + - Exact: [256, 512, 36, 1568] + - Exact: [128, 256, 64, 3200] + - Exact: [256, 512, 64, 4] + - Exact: [256, 256, 64, 450] + - Exact: [256, 256, 64, 72] + - Exact: [128, 256, 36, 3136] + - Exact: [160, 320, 64, 242] + - Exact: [512, 512, 36, 512] + - Exact: [512, 512, 36, 256] + - Exact: [512, 512, 36, 1024] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 64, 896] + - Exact: [128, 256, 64, 242] + - Exact: [192, 384, 36, 1024] + - Exact: [128, 256, 64, 100] + - Exact: [384, 256, 64, 1152] + - Exact: [192, 384, 36, 128] + - Exact: [128, 256, 64, 1568] + - Exact: [128, 256, 64, 72] + - Exact: [256, 256, 36, 12544] + - Exact: [256, 256, 36, 105] + - Exact: [128, 256, 36, 392] + - Exact: [384, 256, 36, 1024] + - Exact: 
[128, 256, 64, 1152] + - Exact: [256, 324, 64, 32] + - Exact: [256, 384, 36, 800] + - Exact: [512, 512, 64, 4] + - Exact: [192, 320, 36, 128] + - Exact: [192, 384, 64, 242] + - Exact: [256, 486, 64, 32] + - Exact: [512, 512, 64, 64] + - Exact: [128, 256, 36, 512] + - Exact: [512, 512, 64, 576] + - Exact: [256, 256, 64, 9] + - Exact: [128, 256, 36, 12544] + - Exact: [256, 512, 36, 3136] + - Exact: [144, 288, 36, 512] + - Exact: [384, 384, 36, 800] + - Exact: [512, 512, 64, 1600] + - Exact: [512, 512, 36, 4] + - Exact: [192, 384, 64, 450] + - Exact: [256, 256, 36, 1024] + - Exact: [256, 512, 64, 400] + - Exact: [128, 256, 36, 6272] + - Exact: [256, 256, 36, 512] + - Exact: [256, 256, 64, 112] + - Exact: [512, 512, 64, 18] + - Exact: [256, 256, 64, 18] + - Exact: [256, 256, 64, 1568] + - Exact: [384, 256, 36, 4096] + - Exact: [256, 512, 64, 800] + - Exact: [256, 384, 36, 2048] + - Exact: [384, 384, 64, 2304] + - Exact: [160, 320, 64, 128] + - Exact: [512, 512, 36, 528] + - Exact: [160, 320, 36, 128] + - Exact: [256, 512, 36, 49] + - Exact: [384, 384, 64, 450] + - Exact: [256, 256, 64, 3200] + - Exact: [512, 512, 64, 8] + - Exact: [512, 512, 64, 288] + - Exact: [384, 384, 36, 1024] + - Exact: [128, 256, 36, 16] + - Exact: [256, 256, 64, 288] + - Exact: [256, 384, 36, 1024] + - Exact: [256, 324, 36, 3200] + - Exact: [192, 384, 64, 512] + - Exact: [128, 256, 64, 1600] + - Exact: [512, 512, 36, 32] + - Exact: [512, 512, 36, 3136] + - Exact: [128, 256, 64, 6400] + - Exact: [256, 256, 36, 2048] + - Exact: [256, 256, 64, 6400] + - Exact: [256, 256, 36, 1680] + - Exact: [192, 384, 36, 2048] + - Exact: [256, 256, 64, 144] + - Exact: [384, 384, 36, 4096] + - Exact: [160, 320, 64, 1152] + - Exact: [384, 256, 36, 2048] + - Exact: [256, 512, 36, 392] + - Exact: [256, 512, 64, 50] + - Exact: [384, 384, 36, 2048] + - Exact: [256, 384, 64, 450] + - Exact: [192, 320, 64, 128] + - Exact: [128, 256, 36, 32] + - Exact: [512, 512, 64, 256] + - Exact: [256, 512, 64, 32] + - Exact: [384, 
384, 64, 576] + - Exact: [512, 486, 36, 288] + - Exact: [144, 288, 64, 242] + - Exact: [384, 256, 64, 576] + - Exact: [512, 512, 36, 64] + - Exact: [448, 384, 64, 128] + - Exact: [144, 288, 64, 288] + - Exact: [512, 512, 64, 224] + - Exact: [384, 384, 64, 1152] + - Exact: [448, 384, 36, 128] + - Exact: [256, 486, 36, 128] + - Exact: [256, 256, 36, 800] + - Exact: [192, 384, 36, 800] + - Exact: [256, 256, 36, 256] + - Exact: [192, 384, 64, 1152] + - Exact: [128, 256, 64, 200] + - Exact: [512, 512, 64, 28] + - Exact: [144, 288, 64, 1152] + - Exact: [256, 256, 64, 576] + - Exact: [256, 256, 64, 2304] + - Exact: [192, 384, 36, 512] + - Exact: [256, 512, 36, 32] + - Exact: [512, 512, 64, 128] + - Exact: [512, 512, 64, 32] + - Exact: [128, 256, 36, 196] + - Exact: [196, 528, 32, 32] + - Exact: [196, 512, 32, 24] + - Exact: [1225, 192, 32, 32] + - Exact: [1001, 1536, 1, 32] + - Exact: [196, 480, 32, 64] + - Exact: [289, 1024, 32, 384] + - Exact: [784, 192, 32, 96] + - Exact: [50176, 256, 1, 128] + - Exact: [289, 1024, 32, 256] + - Exact: [289, 1024, 32, 192] + - Exact: [12544, 512, 1, 256] + - Exact: [1225, 1728, 1, 192] + - Exact: [196, 480, 32, 96] + - Exact: [196, 512, 32, 144] + - Exact: [289, 768, 32, 128] + - Exact: [5329, 576, 1, 96] + - Exact: [196, 528, 32, 128] + - Exact: [5329, 448, 1, 64] + - Exact: [784, 256, 32, 64] + - Exact: [784, 192, 32, 32] + - Exact: [21609, 288, 1, 32] + - Exact: [784, 256, 32, 32] + - Exact: [5041, 720, 1, 192] + - Exact: [196, 512, 32, 128] + - Exact: [289, 768, 32, 160] + - Exact: [1001, 4096, 1, 512] + - Exact: [1225, 192, 32, 64] + - Exact: [784, 192, 32, 16] + - Exact: [3136, 1024, 1, 2048] + - Exact: [784, 256, 32, 128] + - Exact: [196, 512, 32, 32] + - Exact: [1225, 384, 32, 96] + - Exact: [5041, 576, 1, 96] + - Exact: [5329, 160, 32, 64] + - Exact: [1225, 288, 32, 48] + - Exact: [4096, 9216, 1, 512] + - Exact: [196, 480, 32, 192] + - Exact: [3136, 1024, 1, 512] + - Exact: [784, 192, 32, 64] + - Exact: [289, 1024, 32, 128] + - 
Exact: [289, 768, 32, 192] + - Exact: [196, 512, 32, 112] + - Exact: [1001, 2048, 1, 32] + - Exact: [1225, 288, 32, 64] + - Exact: [1225, 384, 32, 192] + - Exact: [50176, 256, 1, 512] + - Exact: [196, 512, 32, 160] + - Exact: [4096, 4096, 1, 512] + - Exact: [1225, 256, 32, 64] + - Exact: [196, 480, 32, 16] + - Exact: [1225, 256, 32, 48] + - Exact: [1225, 1200, 1, 64] + - Exact: [1225, 384, 32, 64] + - Exact: [12544, 512, 1, 1024] + - Exact: [196, 512, 32, 64] + - Exact: [196, 528, 32, 256] + - Exact: [196, 528, 32, 160] + - Exact: [1225, 192, 32, 48] + - Exact: [1001, 2048, 1, 64] + - Exact: [289, 768, 128, 128] + - Exact: [1225, 192, 128, 64] + - Exact: [1225, 288, 128, 48] + - Exact: [289, 768, 128, 192] + - Exact: [289, 768, 128, 160] + - Exact: [1225, 256, 128, 48] + - Exact: [1225, 192, 128, 48] + - Exact: [1225, 288, 128, 64] + - Exact: [1225, 256, 128, 64] + - Exact: [1001, 2048, 1, 128] + - Exact: [1225, 192, 128, 32] + - Exact: [1001, 1536, 1, 64] + - Exact: [1024, 4096, 1, 64] + - Exact: [1024, 4096, 1, 6336] + - Exact: [512, 33708, 1, 3780] + - Exact: [512, 33708, 1, 3968] + - Exact: [512, 33708, 1, 4030] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 64, 256] + - Exact: [289, 768, 64, 128] + - Exact: [289, 768, 64, 160] + - Exact: [289, 768, 64, 192] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 64, 128] + - Exact: [1225, 192, 64, 32] + - Exact: [1225, 192, 64, 48] + - Exact: [1225, 192, 64, 64] + - Exact: [1225, 256, 64, 48] + - Exact: [1225, 256, 64, 64] + - Exact: [1225, 288, 64, 48] + - Exact: [1225, 288, 64, 64] + - Exact: [3136, 256, 64, 64] + - Exact: [256, 44505, 1, 8976] + - Exact: [512, 33708, 1, 3796] + - Exact: [512, 33708, 1, 3822] + - Exact: [512, 33708, 1, 3840] + - Exact: [512, 33708, 1, 3859] + - Exact: [512, 33708, 1, 3870] + - Exact: [512, 33708, 1, 3876] + - Exact: [512, 33708, 1, 3906] + - Exact: [512, 33708, 1, 3910] + - Exact: [512, 33708, 1, 3925] + - Exact: [512, 33708, 1, 3942] + - Exact: [512, 33708, 1, 3944] + 
- Exact: [512, 33708, 1, 3955] + - Exact: [512, 33708, 1, 3969] + - Exact: [512, 33708, 1, 3976] + - Exact: [512, 33708, 1, 3977] + - Exact: [512, 33708, 1, 3978] + - Exact: [512, 33708, 1, 3990] + - Exact: [512, 33708, 1, 3995] + - Exact: [512, 33708, 1, 3996] + - Exact: [512, 33708, 1, 3999] + - Exact: [512, 33708, 1, 4005] + - Exact: [512, 33708, 1, 4012] + - Exact: [512, 33708, 1, 4020] + - Exact: [512, 33708, 1, 4026] + - Exact: [512, 33708, 1, 4032] + - Exact: [1024, 3072, 1, 2048] + - Exact: [1024, 3072, 1, 3072] + - Exact: [1024, 30522, 1, 20] + - Exact: [1024, 30522, 1, 80] + - Exact: [1024, 30522, 1, 120] + - Exact: [1024, 4096, 1, 3840] + - Exact: [1024, 4096, 1, 3968] + - Exact: [1024, 4096, 1, 7200] + - Exact: [1024, 4096, 1, 8160] + - Exact: [1024, 4096, 1, 9520] + - Exact: [1024, 4096, 1, 10200] + - Exact: [1024, 42720, 1, 3968] + - Exact: [1024, 42720, 1, 7200] + - Exact: [1024, 42720, 1, 9520] + - Exact: [4096, 1024, 1, 3840] + - Exact: [4096, 1024, 1, 3968] + - Exact: [4096, 1024, 1, 7200] + - Exact: [4096, 1024, 1, 8160] + - Exact: [4096, 1024, 1, 9520] + - Exact: [4096, 1024, 1, 10200] + - Exact: [5760, 5760, 1, 5760] + - Exact: [7744, 7744, 1, 7744] + - Exact: [1152, 1152, 1, 384] + - Exact: [1536, 1536, 1, 384] + - Exact: [1920, 1920, 1, 384] + - Exact: [2304, 2304, 1, 384] + - Exact: [2688, 2688, 1, 384] + - Exact: [3072, 3072, 1, 384] + - Exact: [3456, 3456, 1, 384] + - Exact: [3840, 3840, 1, 384] + - Exact: [4224, 4224, 1, 384] + - Exact: [4608, 4608, 1, 384] + - Exact: [4992, 4992, 1, 384] + - Exact: [5376, 5376, 1, 384] + - Exact: [5760, 5760, 1, 384] + - Exact: [6144, 6144, 1, 384] + - Exact: [6528, 6528, 1, 384] + - Exact: [6912, 6912, 1, 384] + - Exact: [7296, 7296, 1, 384] + - Exact: [7680, 7680, 1, 384] + - Exact: [1536, 768, 1, 384] + - Exact: [1920, 960, 1, 384] + - Exact: [2304, 1152, 1, 384] + - Exact: [2688, 1344, 1, 384] + - Exact: [3072, 1536, 1, 384] + - Exact: [3456, 1728, 1, 384] + - Exact: [3840, 1920, 1, 384] + - Exact: 
[4224, 2112, 1, 384] + - Exact: [4608, 2304, 1, 384] + - Exact: [4992, 2496, 1, 384] + - Exact: [5376, 2688, 1, 384] + - Exact: [5760, 2880, 1, 384] + - Exact: [6144, 3072, 1, 384] + - Exact: [6528, 3264, 1, 384] + - Exact: [6912, 3456, 1, 384] + - Exact: [7296, 3648, 1, 384] + - Exact: [7680, 3840, 1, 384] + - Exact: [768, 1536, 1, 384] + - Exact: [1152, 2304, 1, 384] + - Exact: [1536, 3072, 1, 384] + - Exact: [1920, 3840, 1, 384] + - Exact: [2304, 4608, 1, 384] + - Exact: [2688, 5376, 1, 384] + - Exact: [3072, 6144, 1, 384] + - Exact: [3456, 6912, 1, 384] + - Exact: [3840, 7680, 1, 384] + - Exact: [4224, 8448, 1, 384] + - Exact: [4608, 9216, 1, 384] + - Exact: [4992, 9984, 1, 384] + - Exact: [5376, 10752, 1, 384] + - Exact: [5760, 11520, 1, 384] + - Exact: [6144, 12288, 1, 384] + - Exact: [6528, 13056, 1, 384] + - Exact: [6912, 13824, 1, 384] + - Exact: [7296, 14592, 1, 384] + - Exact: [7680, 15360, 1, 384] + - Exact: [2048, 2048, 1, 1024] + - Exact: [256, 10240, 1, 8976] + - Exact: [256, 10496, 1, 8976] + - Exact: [256, 11008, 1, 8976] + - Exact: [256, 11264, 1, 8976] + - Exact: [256, 11520, 1, 8976] + - Exact: [256, 11776, 1, 8976] + - Exact: [256, 12544, 1, 8976] + - Exact: [256, 12800, 1, 8976] + - Exact: [256, 13312, 1, 8976] + - Exact: [256, 13568, 1, 8976] + - Exact: [256, 14336, 1, 8976] + - Exact: [256, 14848, 1, 8976] + - Exact: [256, 15104, 1, 8976] + - Exact: [256, 15872, 1, 8976] + - Exact: [256, 16128, 1, 8976] + - Exact: [256, 17152, 1, 8976] + - Exact: [256, 17408, 1, 8976] + - Exact: [256, 18688, 1, 8976] + - Exact: [256, 19968, 1, 8976] + - Exact: [256, 20480, 1, 8976] + - Exact: [256, 20992, 1, 8976] + - Exact: [256, 21248, 1, 8976] + - Exact: [256, 22016, 1, 8976] + - Exact: [256, 26112, 1, 8976] + - Exact: [256, 32512, 1, 8976] + - Exact: [256, 33536, 1, 8976] + - Exact: [256, 4864, 1, 8976] + - Exact: [256, 5120, 1, 8976] + - Exact: [256, 5632, 1, 8976] + - Exact: [256, 5888, 1, 8976] + - Exact: [256, 6144, 1, 8976] + - Exact: [256, 7168, 1, 
8976] + - Exact: [256, 8192, 1, 8976] + - Exact: [256, 8960, 1, 8976] + - Exact: [256, 9728, 1, 8976] + - Exact: [256, 9984, 1, 8976] + - Exact: [3200, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [512, 3280, 1, 1600] + - Exact: [512, 3280, 1, 200] + - Exact: [768, 2048, 1, 256] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 2048, 1, 960] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 512] + - Exact: [1024, 4096, 1, 2048] + - Exact: [1024, 30528, 1, 2048] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 30528, 1, 4096] + - Exact: [9216, 128, 1, 128] + - Exact: [9600, 128, 1, 128] + - Exact: [9984, 128, 1, 128] + - Exact: [10368, 128, 1, 128] + - Exact: [10752, 128, 1, 128] + - Exact: [11136, 128, 1, 128] + - Exact: [11520, 128, 1, 128] + - Exact: [11904, 128, 1, 128] + - Exact: [12288, 128, 1, 128] + - Exact: [12672, 128, 1, 128] + - Exact: [13056, 128, 1, 128] + - Exact: [13440, 128, 1, 128] + - Exact: [13824, 128, 1, 128] + - Exact: [14208, 128, 1, 128] + - Exact: [14592, 128, 1, 128] + - Exact: [14976, 128, 1, 128] + - Exact: [15360, 128, 1, 128] + - Exact: [15744, 128, 1, 128] + - Exact: [16128, 128, 1, 128] + - Exact: [16512, 128, 1, 128] + - Exact: [16896, 128, 1, 128] + - Exact: [17280, 128, 1, 128] + - Exact: [17664, 128, 1, 128] + - Exact: [18048, 128, 1, 128] + - Exact: [18432, 128, 1, 128] + - Exact: [18816, 128, 1, 128] + - Exact: [19200, 128, 1, 128] + - Exact: [19584, 128, 1, 128] + - Exact: [19968, 128, 1, 128] + - Exact: [20352, 128, 1, 128] + - Exact: [20736, 128, 1, 128] + - Exact: [21120, 128, 1, 128] + - Exact: [21504, 128, 1, 128] + - Exact: [21888, 128, 1, 128] + - Exact: [22272, 128, 1, 128] + - Exact: [22656, 128, 1, 128] + - Exact: [23040, 128, 1, 128] + - Exact: [9216, 128, 1, 256] + - Exact: [9600, 128, 1, 256] + - Exact: [9984, 128, 1, 256] + - Exact: [10368, 128, 1, 256] + - Exact: [10752, 128, 1, 256] + - Exact: [11136, 128, 1, 256] + - Exact: [11520, 128, 1, 256] + - Exact: [11904, 128, 1, 256] + - 
Exact: [12288, 128, 1, 256] + - Exact: [12672, 128, 1, 256] + - Exact: [13056, 128, 1, 256] + - Exact: [13440, 128, 1, 256] + - Exact: [13824, 128, 1, 256] + - Exact: [14208, 128, 1, 256] + - Exact: [14592, 128, 1, 256] + - Exact: [14976, 128, 1, 256] + - Exact: [15360, 128, 1, 256] + - Exact: [15744, 128, 1, 256] + - Exact: [16128, 128, 1, 256] + - Exact: [16512, 128, 1, 256] + - Exact: [16896, 128, 1, 256] + - Exact: [17280, 128, 1, 256] + - Exact: [17664, 128, 1, 256] + - Exact: [18048, 128, 1, 256] + - Exact: [18432, 128, 1, 256] + - Exact: [18816, 128, 1, 256] + - Exact: [19200, 128, 1, 256] + - Exact: [19584, 128, 1, 256] + - Exact: [19968, 128, 1, 256] + - Exact: [20352, 128, 1, 256] + - Exact: [20736, 128, 1, 256] + - Exact: [21120, 128, 1, 256] + - Exact: [21504, 128, 1, 256] + - Exact: [21888, 128, 1, 256] + - Exact: [22272, 128, 1, 256] + - Exact: [22656, 128, 1, 256] + - Exact: [23040, 128, 1, 256] + - Exact: [8064, 8064, 1, 384] + - Exact: [8448, 8448, 1, 384] + - Exact: [8832, 8832, 1, 384] + - Exact: [9216, 9216, 1, 384] + - Exact: [9600, 9600, 1, 384] + - Exact: [9984, 9984, 1, 384] + - Exact: [10368, 10368, 1, 384] + - Exact: [10752, 10752, 1, 384] + - Exact: [11136, 11136, 1, 384] + - Exact: [11520, 11520, 1, 384] + - Exact: [11904, 11904, 1, 384] + - Exact: [12288, 12288, 1, 384] + - Exact: [12672, 12672, 1, 384] + - Exact: [13056, 13056, 1, 384] + - Exact: [13440, 13440, 1, 384] + - Exact: [13824, 13824, 1, 384] + - Exact: [14208, 14208, 1, 384] + - Exact: [14592, 14592, 1, 384] + - Exact: [14976, 14976, 1, 384] + - Exact: [15360, 15360, 1, 384] + - Exact: [15744, 15744, 1, 384] + - Exact: [16128, 16128, 1, 384] + - Exact: [16512, 16512, 1, 384] + - Exact: [16896, 16896, 1, 384] + - Exact: [17280, 17280, 1, 384] + - Exact: [17664, 17664, 1, 384] + - Exact: [18048, 18048, 1, 384] + - Exact: [18432, 18432, 1, 384] + - Exact: [18816, 18816, 1, 384] + - Exact: [19200, 19200, 1, 384] + - Exact: [19584, 19584, 1, 384] + - Exact: [19968, 19968, 1, 384] 
+ - Exact: [20352, 20352, 1, 384] + - Exact: [20736, 20736, 1, 384] + - Exact: [21120, 21120, 1, 384] + - Exact: [21504, 21504, 1, 384] + - Exact: [21888, 21888, 1, 384] + - Exact: [22272, 22272, 1, 384] + - Exact: [22656, 22656, 1, 384] + - Exact: [23040, 23040, 1, 384] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3072, 3072, 1, 3072] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [8064, 4032, 1, 384] + - Exact: [8448, 4224, 1, 384] + - Exact: [8832, 4416, 1, 384] + - Exact: [9216, 4608, 1, 384] + - Exact: [9600, 4800, 1, 384] + - Exact: [9984, 4992, 1, 384] + - Exact: [10368, 5184, 1, 384] + - Exact: [10752, 5376, 1, 384] + - Exact: [11136, 5568, 1, 384] + - Exact: [11520, 5760, 1, 384] + - Exact: [11904, 5952, 1, 384] + - Exact: [12288, 6144, 1, 384] + - Exact: [12672, 6336, 1, 384] + - Exact: [13056, 6528, 1, 384] + - Exact: [13440, 6720, 1, 384] + - Exact: [13824, 6912, 1, 384] + - Exact: [14208, 7104, 1, 384] + - Exact: [14592, 7296, 1, 384] + - Exact: [14976, 7488, 1, 384] + - Exact: [15360, 7680, 1, 384] + - Exact: [15744, 7872, 1, 384] + - Exact: [16128, 8064, 1, 384] + - Exact: [16512, 8256, 1, 384] + - Exact: [16896, 8448, 1, 384] + - Exact: [17280, 8640, 1, 384] + - Exact: [17664, 8832, 1, 384] + - Exact: [18048, 9024, 1, 384] + - Exact: [18432, 9216, 1, 384] + - Exact: [18816, 9408, 1, 384] + - Exact: [19200, 9600, 1, 384] + - Exact: [19584, 9792, 1, 384] + - Exact: [19968, 9984, 1, 384] + - Exact: [20352, 10176, 1, 384] + - Exact: [20736, 10368, 1, 384] + - Exact: [21120, 10560, 1, 384] + - Exact: 
[21504, 10752, 1, 384] + - Exact: [21888, 10944, 1, 384] + - Exact: [22272, 11136, 1, 384] + - Exact: [22656, 11328, 1, 384] + - Exact: [23040, 11520, 1, 384] + - Exact: [8064, 16128, 1, 384] + - Exact: [8448, 16896, 1, 384] + - Exact: [8832, 17664, 1, 384] + - Exact: [9216, 18432, 1, 384] + - Exact: [9600, 19200, 1, 384] + - Exact: [9984, 19968, 1, 384] + - Exact: [10368, 20736, 1, 384] + - Exact: [10752, 21504, 1, 384] + - Exact: [11136, 22272, 1, 384] + - Exact: [11520, 23040, 1, 384] + - Exact: [11904, 23808, 1, 384] + - Exact: [12288, 24576, 1, 384] + - Exact: [12672, 25344, 1, 384] + - Exact: [13056, 26112, 1, 384] + - Exact: [13440, 26880, 1, 384] + - Exact: [13824, 27648, 1, 384] + - Exact: [14208, 28416, 1, 384] + - Exact: [14592, 29184, 1, 384] + - Exact: [14976, 29952, 1, 384] + - Exact: [15360, 30720, 1, 384] + - Exact: [15744, 31488, 1, 384] + - Exact: [16128, 32256, 1, 384] + - Exact: [16512, 33024, 1, 384] + - Exact: [16896, 33792, 1, 384] + - Exact: [17280, 34560, 1, 384] + - Exact: [17664, 35328, 1, 384] + - Exact: [18048, 36096, 1, 384] + - Exact: [18432, 36864, 1, 384] + - Exact: [18816, 37632, 1, 384] + - Exact: [19200, 38400, 1, 384] + - Exact: [19584, 39168, 1, 384] + - Exact: [19968, 39936, 1, 384] + - Exact: [20352, 40704, 1, 384] + - Exact: [20736, 41472, 1, 384] + - Exact: [21120, 42240, 1, 384] + - Exact: [21504, 43008, 1, 384] + - Exact: [21888, 43776, 1, 384] + - Exact: [22272, 44544, 1, 384] + - Exact: [22656, 45312, 1, 384] + - Exact: [23040, 46080, 1, 384] + - Exact: [1152, 1536, 1, 384] + - Exact: [1920, 1536, 1, 384] + - Exact: [2304, 1536, 1, 384] + - Exact: [2688, 1536, 1, 384] + - Exact: [3456, 1536, 1, 384] + - Exact: [3840, 1536, 1, 384] + - Exact: [4224, 1536, 1, 384] + - Exact: [4608, 1536, 1, 384] + - Exact: [4992, 1536, 1, 384] + - Exact: [5376, 1536, 1, 384] + - Exact: [5760, 1536, 1, 384] + - Exact: [6144, 1536, 1, 384] + - Exact: [6528, 1536, 1, 384] + - Exact: [6912, 1536, 1, 384] + - Exact: [7296, 1536, 1, 384] + - 
Exact: [7680, 1536, 1, 384] + - Exact: [8064, 1536, 1, 384] + - Exact: [8448, 1536, 1, 384] + - Exact: [8832, 1536, 1, 384] + - Exact: [9216, 1536, 1, 384] + - Exact: [9600, 1536, 1, 384] + - Exact: [9984, 1536, 1, 384] + - Exact: [10368, 1536, 1, 384] + - Exact: [10752, 1536, 1, 384] + - Exact: [11136, 1536, 1, 384] + - Exact: [11520, 1536, 1, 384] + - Exact: [11904, 1536, 1, 384] + - Exact: [12288, 1536, 1, 384] + - Exact: [12672, 1536, 1, 384] + - Exact: [13056, 1536, 1, 384] + - Exact: [13440, 1536, 1, 384] + - Exact: [13824, 1536, 1, 384] + - Exact: [14208, 1536, 1, 384] + - Exact: [14592, 1536, 1, 384] + - Exact: [14976, 1536, 1, 384] + - Exact: [15360, 1536, 1, 384] + - Exact: [15744, 1536, 1, 384] + - Exact: [16128, 1536, 1, 384] + - Exact: [16512, 1536, 1, 384] + - Exact: [16896, 1536, 1, 384] + - Exact: [17280, 1536, 1, 384] + - Exact: [17664, 1536, 1, 384] + - Exact: [18048, 1536, 1, 384] + - Exact: [18432, 1536, 1, 384] + - Exact: [18816, 1536, 1, 384] + - Exact: [19200, 1536, 1, 384] + - Exact: [19584, 1536, 1, 384] + - Exact: [19968, 1536, 1, 384] + - Exact: [20352, 1536, 1, 384] + - Exact: [20736, 1536, 1, 384] + - Exact: [21120, 1536, 1, 384] + - Exact: [21504, 1536, 1, 384] + - Exact: [21888, 1536, 1, 384] + - Exact: [22272, 1536, 1, 384] + - Exact: [22656, 1536, 1, 384] + - Exact: [23040, 1536, 1, 384] + - Exact: [768, 1920, 1, 384] + - Exact: [1152, 1920, 1, 384] + - Exact: [1536, 1920, 1, 384] + - Exact: [2304, 1920, 1, 384] + - Exact: [2688, 1920, 1, 384] + - Exact: [3072, 1920, 1, 384] + - Exact: [3456, 1920, 1, 384] + - Exact: [4224, 1920, 1, 384] + - Exact: [4608, 1920, 1, 384] + - Exact: [4992, 1920, 1, 384] + - Exact: [5376, 1920, 1, 384] + - Exact: [5760, 1920, 1, 384] + - Exact: [6144, 1920, 1, 384] + - Exact: [6528, 1920, 1, 384] + - Exact: [6912, 1920, 1, 384] + - Exact: [7296, 1920, 1, 384] + - Exact: [7680, 1920, 1, 384] + - Exact: [8064, 1920, 1, 384] + - Exact: [8448, 1920, 1, 384] + - Exact: [8832, 1920, 1, 384] + - Exact: [9216, 
1920, 1, 384] + - Exact: [9600, 1920, 1, 384] + - Exact: [9984, 1920, 1, 384] + - Exact: [10368, 1920, 1, 384] + - Exact: [10752, 1920, 1, 384] + - Exact: [11136, 1920, 1, 384] + - Exact: [11520, 1920, 1, 384] + - Exact: [11904, 1920, 1, 384] + - Exact: [12288, 1920, 1, 384] + - Exact: [12672, 1920, 1, 384] + - Exact: [13056, 1920, 1, 384] + - Exact: [13440, 1920, 1, 384] + - Exact: [13824, 1920, 1, 384] + - Exact: [14208, 1920, 1, 384] + - Exact: [14592, 1920, 1, 384] + - Exact: [14976, 1920, 1, 384] + - Exact: [15360, 1920, 1, 384] + - Exact: [15744, 1920, 1, 384] + - Exact: [16128, 1920, 1, 384] + - Exact: [16512, 1920, 1, 384] + - Exact: [16896, 1920, 1, 384] + - Exact: [17280, 1920, 1, 384] + - Exact: [17664, 1920, 1, 384] + - Exact: [18048, 1920, 1, 384] + - Exact: [18432, 1920, 1, 384] + - Exact: [18816, 1920, 1, 384] + - Exact: [19200, 1920, 1, 384] + - Exact: [19584, 1920, 1, 384] + - Exact: [19968, 1920, 1, 384] + - Exact: [20352, 1920, 1, 384] + - Exact: [20736, 1920, 1, 384] + - Exact: [21120, 1920, 1, 384] + - Exact: [21504, 1920, 1, 384] + - Exact: [21888, 1920, 1, 384] + - Exact: [22272, 1920, 1, 384] + - Exact: [22656, 1920, 1, 384] + - Exact: [23040, 1920, 1, 384] + - Exact: [768, 2304, 1, 384] + - Exact: [1536, 2304, 1, 384] + - Exact: [1920, 2304, 1, 384] + - Exact: [2688, 2304, 1, 384] + - Exact: [3072, 2304, 1, 384] + - Exact: [3456, 2304, 1, 384] + - Exact: [3840, 2304, 1, 384] + - Exact: [4224, 2304, 1, 384] + - Exact: [4992, 2304, 1, 384] + - Exact: [5376, 2304, 1, 384] + - Exact: [5760, 2304, 1, 384] + - Exact: [6144, 2304, 1, 384] + - Exact: [6528, 2304, 1, 384] + - Exact: [6912, 2304, 1, 384] + - Exact: [7296, 2304, 1, 384] + - Exact: [7680, 2304, 1, 384] + - Exact: [8064, 2304, 1, 384] + - Exact: [8448, 2304, 1, 384] + - Exact: [8832, 2304, 1, 384] + - Exact: [9216, 2304, 1, 384] + - Exact: [9600, 2304, 1, 384] + - Exact: [9984, 2304, 1, 384] + - Exact: [10368, 2304, 1, 384] + - Exact: [10752, 2304, 1, 384] + - Exact: [11136, 2304, 1, 
384] + - Exact: [11520, 2304, 1, 384] + - Exact: [11904, 2304, 1, 384] + - Exact: [12288, 2304, 1, 384] + - Exact: [12672, 2304, 1, 384] + - Exact: [13056, 2304, 1, 384] + - Exact: [13440, 2304, 1, 384] + - Exact: [13824, 2304, 1, 384] + - Exact: [14208, 2304, 1, 384] + - Exact: [14592, 2304, 1, 384] + - Exact: [14976, 2304, 1, 384] + - Exact: [15360, 2304, 1, 384] + - Exact: [15744, 2304, 1, 384] + - Exact: [16128, 2304, 1, 384] + - Exact: [16512, 2304, 1, 384] + - Exact: [16896, 2304, 1, 384] + - Exact: [17280, 2304, 1, 384] + - Exact: [17664, 2304, 1, 384] + - Exact: [18048, 2304, 1, 384] + - Exact: [18432, 2304, 1, 384] + - Exact: [18816, 2304, 1, 384] + - Exact: [19200, 2304, 1, 384] + - Exact: [19584, 2304, 1, 384] + - Exact: [19968, 2304, 1, 384] + - Exact: [20352, 2304, 1, 384] + - Exact: [20736, 2304, 1, 384] + - Exact: [21120, 2304, 1, 384] + - Exact: [21504, 2304, 1, 384] + - Exact: [21888, 2304, 1, 384] + - Exact: [22272, 2304, 1, 384] + - Exact: [22656, 2304, 1, 384] + - Exact: [23040, 2304, 1, 384] + - Exact: [256, 32768, 1, 1] + - Exact: [289, 128, 64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [1444, 128, 120, 256] + - Exact: [1444, 128, 18, 256] + - Exact: [1444, 128, 19, 256] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 512, 120, 256] + - Exact: [361, 512, 18, 256] + - Exact: [361, 512, 19, 256] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 30522, 1, 77] + - Exact: [4096, 1024, 1, 512] + - Exact: [1024, 4096, 1, 1280] + - 
Exact: [1024, 30522, 1, 200] + - Exact: [4096, 1024, 1, 1280] + - Exact: [1024, 4096, 1, 4992] + - Exact: [1024, 30522, 1, 780] + - Exact: [4096, 1024, 1, 4992] + - Exact: [1024, 30522, 1, 308] + - Exact: [1024, 4096, 1, 5120] + - Exact: [1024, 30522, 1, 800] + - Exact: [4096, 1024, 1, 5120] + - Exact: [1024, 4096, 1, 5248] + - Exact: [1024, 30522, 1, 820] + - Exact: [4096, 1024, 1, 5248] + - Exact: [1024, 4096, 1, 2560] + - Exact: [1024, 30522, 1, 385] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 30522, 1, 462] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 30522, 1, 160] + - Exact: [4096, 1024, 1, 1024] + - Exact: [1024, 4096, 1, 1152] + - Exact: [1024, 30522, 1, 180] + - Exact: [4096, 1024, 1, 1152] + - Exact: [1024, 4096, 1, 8192] + - Exact: [1024, 4096, 1, 9600] + - Exact: [1024, 33712, 1, 8192] + - Exact: [1024, 33712, 1, 9600] + - Exact: [4096, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 9600] + - Exact: [1024, 4096, 1, 10064] + - Exact: [1024, 4096, 1, 10080] + - Exact: [1024, 4096, 1, 6528] + - Exact: [1024, 4096, 1, 7104] + - Exact: [1024, 4096, 1, 8064] + - Exact: [1024, 4096, 1, 9216] + - Exact: [1024, 42720, 1, 10080] + - Exact: [1024, 42720, 1, 6528] + - Exact: [1024, 42720, 1, 7104] + - Exact: [4096, 1024, 1, 10064] + - Exact: [4096, 1024, 1, 10080] + - Exact: [4096, 1024, 1, 6528] + - Exact: [4096, 1024, 1, 7104] + - Exact: [4096, 1024, 1, 8064] + - Exact: [4096, 1024, 1, 9216] + - Exact: [1024, 1600, 1, 1] + - Exact: [2048, 960, 1, 1] + - Exact: [2048, 2048, 1, 2] + - Exact: [2048, 30592, 1, 1024] + - Exact: [2048, 6144, 1, 1024] + - Exact: [2048, 8192, 1, 1024] + - Exact: [8192, 2048, 1, 1024] + - Exact: [1024, 30592, 1, 8192] + - Exact: [1024, 3072, 1, 8192] + - Exact: [1024, 30592, 1, 2048] + - Exact: [1024, 30592, 1, 4096] + - Exact: [1024, 3072, 1, 4096] + - Exact: [2560, 1920, 1, 2048] + - Exact: [2560, 2560, 1, 2048] + - Exact: [2560, 2560, 1, 4] + - Exact: [2560, 7680, 1, 2048] + - Exact: [640, 2560, 1, 2048] + - Exact: [1536, 1536, 
1, 4096] + - Exact: [1536, 4608, 1, 4096] + - Exact: [1536, 50304, 1, 4096] + - Exact: [1536, 6144, 1, 4096] + - Exact: [6144, 1536, 1, 4096] + - Exact: [1536, 1536, 1, 8192] + - Exact: [1536, 4608, 1, 8192] + - Exact: [1536, 50304, 1, 8192] + - Exact: [1536, 6144, 1, 8192] + - Exact: [6144, 1536, 1, 8192] + - Exact: [1024, 3072, 1, 16384] + - Exact: [1024, 4096, 1, 16384] + - Exact: [1024, 50304, 1, 16384] + - Exact: [4096, 1024, 1, 16384] + - Exact: [1024, 50304, 1, 2048] + - Exact: [1024, 50304, 1, 4096] + - Exact: [1024, 50304, 1, 8192] + - Exact: [1024, 30528, 1, 8192] + - Exact: [256, 6912, 1, 1] + - Exact: [30528, 1024, 1, 640] + - Exact: [30528, 1024, 1, 1280] + - Exact: [4096, 1024, 1, 10240] + - Exact: [1024, 4096, 1, 10240] + - Exact: [30528, 1024, 1, 1600] + - Exact: [1024, 4096, 1, 10496] + - Exact: [30528, 1024, 1, 1640] + - Exact: [4096, 1024, 1, 10496] + - Exact: [30528, 1024, 1, 160] + - Exact: [1024, 4096, 1, 6144] + - Exact: [30528, 1024, 1, 240] + - Exact: [4096, 1024, 1, 6144] + - Exact: [3136, 128, 64, 256] + - Exact: [784, 256, 64, 512] + - Exact: [3136, 256, 64, 128] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 512, 64, 1024] + - Exact: [784, 512, 64, 256] + - Exact: [784, 512, 64, 512] + - Exact: [196, 1024, 64, 512] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 256] + - Exact: [784, 256, 32, 512] + - Exact: [3136, 256, 32, 128] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 512, 32, 1024] + - Exact: [784, 512, 32, 256] + - Exact: [784, 512, 32, 512] + - Exact: [196, 1024, 32, 512] + - Exact: [196, 1024, 32, 1024] + - Exact: [1024, 4096, 1, 10224] + - Exact: [4096, 1024, 1, 10224] + - Exact: [1024, 3072, 1, 10224] + - Exact: [1024, 3072, 1, 10240] + - Exact: [4096, 1024, 1, 10192] + - Exact: [1024, 3072, 1, 10192] + - Exact: [1024, 4096, 1, 10192] + - Exact: [1024, 3072, 1, 10200] + - Exact: [4096, 1024, 1, 10208] + - Exact: [1024, 3072, 1, 10208] + - Exact: [1024, 4096, 1, 10208] + - Exact: [1024, 2048, 1, 10224] + - 
Exact: [1024, 2048, 1, 10240] + - Exact: [1024, 2048, 1, 10192] + - Exact: [1024, 3072, 1, 10080] + - Exact: [100352, 256, 1, 512] + - Exact: [12544, 1024, 1, 2048] + - Exact: [12544, 147, 1, 64] + - Exact: [200704, 256, 1, 512] + - Exact: [25088, 512, 1, 1024] + - Exact: [3136, 576, 1, 64] + - Exact: [50176, 512, 1, 1024] + - Exact: [6272, 1024, 1, 2048] + - Exact: [196, 1024, 128, 512] + - Exact: [196, 1024, 256, 512] + - Exact: [3136, 256, 128, 128] + - Exact: [3136, 256, 256, 128] + - Exact: [784, 512, 128, 256] + - Exact: [784, 512, 256, 256] + - Exact: [30528, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 12288] + - Exact: [30528, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [25600, 128, 25, 128] + - Exact: [12544, 128, 36, 128] + - Exact: [9216, 128, 49, 128] + - Exact: [6400, 128, 64, 128] + - Exact: [6400, 256, 25, 256] + - Exact: [4096, 256, 36, 256] + - Exact: [2304, 256, 49, 256] + - Exact: [2304, 256, 64, 256] + - Exact: [2304, 512, 25, 512] + - Exact: [1024, 512, 36, 512] + - Exact: [1024, 512, 49, 512] + - Exact: [1024, 512, 64, 512] + - Exact: [3072, 768, 1, 2048] + - Exact: [768, 3072, 1, 2048] + - Exact: [3072, 768, 1, 4608] + - Exact: [768, 3072, 1, 4608] + - Exact: [4096, 1024, 1, 4608] + - Exact: [1024, 4096, 1, 4608] + - Exact: [4880, 256, 49, 256] + - Exact: [3128, 256, 64, 256] + - Exact: [4680, 256, 49, 256] + - Exact: [5280, 256, 36, 256] + - Exact: [2640, 256, 64, 256] + - Exact: [5304, 256, 49, 256] + - Exact: [4524, 256, 49, 256] + - Exact: [2760, 256, 64, 256] + - Exact: [6440, 256, 36, 256] + - Exact: [5704, 256, 36, 256] + - Exact: [2666, 256, 64, 256] + - Exact: [2128, 256, 64, 256] + - Exact: [1160, 256, 49, 256] + - Exact: [4056, 256, 49, 256] + - Exact: [6144, 256, 36, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [6336, 256, 36, 256] + - Exact: [13600, 512, 2, 128] + - Exact: [15200, 512, 2, 128] + - Exact: [15200, 128, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [5632, 256, 36, 256] + - Exact: [12288, 128, 2, 
512] + - Exact: [12880, 128, 2, 512] + - Exact: [3220, 1024, 2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [782, 128, 64, 128] + - Exact: [13824, 512, 2, 128] + - Exact: [13824, 128, 2, 512] + - Exact: [10560, 128, 2, 512] + - Exact: [10752, 128, 2, 512] + - Exact: [13600, 512, 2, 256] + - Exact: [15200, 512, 2, 256] + - Exact: [850, 2048, 2, 512] + - Exact: [768, 2048, 2, 512] + - Exact: [12880, 512, 2, 128] + - Exact: [11616, 128, 2, 512] + - Exact: [14208, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [805, 2048, 2, 512] + - Exact: [6912, 256, 36, 256] + - Exact: [713, 2048, 2, 512] + - Exact: [13824, 512, 2, 256] + - Exact: [11616, 512, 2, 128] + - Exact: [12288, 512, 2, 128] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [10560, 512, 2, 128] + - Exact: [672, 2048, 2, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [9408, 128, 2, 512] + - Exact: [10752, 512, 2, 128] + - Exact: [726, 2048, 2, 512] + - Exact: [11968, 512, 2, 128] + - Exact: [1240, 256, 49, 256] + - Exact: [4032, 256, 2, 1024] + - Exact: [888, 2048, 2, 512] + - Exact: [12880, 512, 2, 256] + - Exact: [12288, 512, 2, 256] + - Exact: [13440, 128, 2, 512] + - Exact: [864, 2048, 2, 256] + - Exact: [12672, 128, 2, 512] + - Exact: [11264, 128, 2, 512] + - Exact: [11776, 128, 2, 512] + - Exact: [16128, 128, 2, 512] + - Exact: [4032, 1024, 2, 256] + - Exact: [14000, 128, 2, 512] + - Exact: [13440, 512, 2, 128] + - Exact: [805, 2048, 2, 256] + - Exact: [768, 2048, 2, 256] + - Exact: [3264, 1024, 2, 256] + - Exact: [1251, 256, 49, 256] + - Exact: [4200, 256, 2, 1024] + - Exact: [2352, 1024, 2, 256] + - Exact: [2400, 1024, 2, 256] + - Exact: [15200, 256, 2, 12] + - Exact: [12880, 256, 2, 12] + - Exact: [2520, 1024, 2, 256] + - Exact: [13600, 256, 2, 12] + - Exact: [15200, 256, 2, 3] + - Exact: [12880, 256, 2, 3] + - Exact: [4200, 1024, 2, 256] + - Exact: [12288, 256, 2, 12] + - Exact: [13824, 256, 2, 12] + - Exact: [13600, 256, 2, 3] + - 
Exact: [1900, 1024, 1, 2048] + - Exact: [7600, 512, 1, 256] + - Exact: [1610, 1024, 1, 2048] + - Exact: [6144, 512, 1, 256] + - Exact: [1900, 1024, 1, 512] + - Exact: [12544, 1024, 1, 1024] + - Exact: [3220, 256, 2, 12] + - Exact: [3220, 256, 2, 3] + - Exact: [3800, 256, 2, 3] + - Exact: [13824, 256, 2, 3] + - Exact: [12288, 256, 2, 3] + - Exact: [2688, 256, 2, 1024] + - Exact: [3072, 256, 2, 12] + - Exact: [3800, 256, 2, 12] + - Exact: [3072, 256, 2, 3] + - Exact: [2520, 256, 2, 1024] + - Exact: [16128, 512, 2, 128] + - Exact: [2400, 256, 2, 1024] + - Exact: [2352, 256, 2, 1024] + - Exact: [3036, 1024, 2, 256] + - Exact: [2944, 256, 2, 1024] + - Exact: [2992, 1024, 2, 256] + - Exact: [2816, 256, 2, 1024] + - Exact: [3036, 256, 2, 1024] + - Exact: [2904, 1024, 2, 256] + - Exact: [3456, 256, 2, 3] + - Exact: [3400, 256, 2, 3] + - Exact: [2816, 1024, 2, 256] + - Exact: [3456, 256, 2, 12] + - Exact: [2944, 1024, 2, 256] + - Exact: [3168, 256, 2, 1024] + - Exact: [850, 2048, 2, 256] + - Exact: [2992, 256, 2, 1024] + - Exact: [2852, 1024, 2, 256] + - Exact: [51520, 256, 2, 12] + - Exact: [3072, 256, 2, 1024] + - Exact: [2640, 1024, 2, 256] + - Exact: [2688, 1024, 2, 256] + - Exact: [2904, 256, 2, 1024] + - Exact: [3264, 256, 2, 1024] + - Exact: [54400, 256, 2, 12] + - Exact: [950, 2048, 2, 256] + - Exact: [55296, 256, 2, 3] + - Exact: [60800, 256, 2, 12] + - Exact: [51520, 256, 2, 3] + - Exact: [3700, 1024, 2, 256] + - Exact: [55296, 256, 2, 12] + - Exact: [2852, 256, 2, 1024] + - Exact: [3600, 1024, 2, 256] + - Exact: [3700, 256, 2, 1024] + - Exact: [60800, 256, 2, 3] + - Exact: [1269, 256, 49, 256] + - Exact: [1467, 256, 49, 256] + - Exact: [3500, 256, 2, 1024] + - Exact: [952, 256, 64, 256] + - Exact: [49152, 256, 2, 12] + - Exact: [1449, 256, 49, 256] + - Exact: [1278, 256, 49, 256] + - Exact: [3360, 256, 2, 1024] + - Exact: [736, 256, 64, 256] + - Exact: [1413, 256, 49, 256] + - Exact: [600, 256, 64, 256] + - Exact: [1341, 256, 49, 256] + - Exact: [1287, 256, 49, 
256] + - Exact: [1332, 256, 49, 256] + - Exact: [1359, 256, 49, 256] + - Exact: [1440, 256, 49, 256] + - Exact: [1395, 256, 49, 256] + - Exact: [1323, 256, 49, 256] + - Exact: [1404, 256, 49, 256] + - Exact: [1386, 256, 49, 256] + - Exact: [3168, 1024, 2, 256] + - Exact: [1350, 256, 49, 256] + - Exact: [1368, 256, 49, 256] + - Exact: [49152, 256, 2, 3] + - Exact: [3600, 256, 2, 1024] + - Exact: [3500, 1024, 2, 256] + - Exact: [3360, 1024, 2, 256] + - Exact: [3220, 256, 2, 1024] + - Exact: [690, 256, 64, 256] + - Exact: [54400, 256, 2, 3] + - Exact: [3072, 1024, 2, 256] + - Exact: [2640, 256, 2, 1024] + - Exact: [616, 256, 64, 256] + - Exact: [3008, 256, 64, 256] + - Exact: [896, 256, 64, 256] + - Exact: [768, 256, 64, 256] + - Exact: [660, 256, 64, 256] + - Exact: [3552, 256, 2, 1024] + - Exact: [3552, 1024, 2, 256] + - Exact: [800, 256, 64, 256] + - Exact: [1120, 256, 49, 256] + - Exact: [2408, 256, 64, 256] + - Exact: [3456, 256, 2, 1024] + - Exact: [672, 256, 64, 256] + - Exact: [782, 256, 64, 256] + - Exact: [884, 256, 64, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [1064, 256, 49, 256] + - Exact: [3400, 256, 2, 1024] + - Exact: [704, 256, 64, 256] + - Exact: [3400, 1024, 2, 256] + - Exact: [3264, 256, 64, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [3800, 256, 2, 1024] + - Exact: [6440, 512, 1, 256] + - Exact: [6912, 512, 1, 256] + - Exact: [6800, 512, 1, 256] + - Exact: [6800, 512, 1, 1024] + - Exact: [6440, 512, 1, 1024] + - Exact: [6912, 512, 1, 1024] + - Exact: [1728, 1024, 1, 512] + - Exact: [1536, 1024, 1, 512] + - Exact: [1610, 1024, 1, 512] + - Exact: [7600, 512, 1, 1024] + - Exact: [6144, 512, 1, 1024] + - Exact: [1700, 1024, 1, 512] + - Exact: [1728, 1024, 1, 2048] + - Exact: [1536, 1024, 1, 2048] + - Exact: [1700, 1024, 1, 2048] + - Exact: [1920, 25216, 1, 16384] + - Exact: [3840, 1920, 1, 16384] + - Exact: [1920, 3840, 1, 16384] + - Exact: [960, 1920, 1, 16384] + - Exact: [1920, 2880, 1, 16384] + - Exact: [1920, 25216, 1, 4096] + - Exact: 
[3840, 1920, 1, 4096] + - Exact: [1920, 3840, 1, 4096] + - Exact: [960, 1920, 1, 4096] + - Exact: [1920, 2880, 1, 4096] + - Exact: [1920, 25216, 1, 8192] + - Exact: [3840, 1920, 1, 8192] + - Exact: [1920, 3840, 1, 8192] + - Exact: [960, 1920, 1, 8192] + - Exact: [1920, 2880, 1, 8192] + - Exact: [2304, 12672, 1, 16384] + - Exact: [2304, 2304, 1, 16384] + - Exact: [576, 2304, 1, 16384] + - Exact: [2304, 1728, 1, 16384] + - Exact: [2304, 12672, 1, 4096] + - Exact: [2304, 2304, 1, 4096] + - Exact: [576, 2304, 1, 4096] + - Exact: [2304, 1728, 1, 4096] + - Exact: [2304, 12672, 1, 8192] + - Exact: [2304, 2304, 1, 8192] + - Exact: [576, 2304, 1, 8192] + - Exact: [2304, 1728, 1, 8192] + - Exact: [3072, 6400, 1, 4096] + - Exact: [1536, 3072, 1, 4096] + - Exact: [3072, 1536, 1, 4096] + - Exact: [384, 3072, 1, 4096] + - Exact: [3072, 1152, 1, 4096] + - Exact: [3072, 6400, 1, 8192] + - Exact: [1536, 3072, 1, 8192] + - Exact: [3072, 1536, 1, 8192] + - Exact: [384, 3072, 1, 8192] + - Exact: [3072, 1152, 1, 8192] + - Exact: [2048, 2048, 1, 4096] + - Exact: [2048, 2048, 1, 8] + - Exact: [2048, 29000, 1, 199] + - Exact: [2048, 29000, 1, 221] + - Exact: [2048, 29000, 1, 224] + - Exact: [2048, 29000, 1, 229] + - Exact: [2048, 29000, 1, 234] + - Exact: [2048, 29000, 1, 242] + - Exact: [2048, 29000, 1, 246] + - Exact: [2048, 29000, 1, 247] + - Exact: [2048, 29000, 1, 256] + - Exact: [2048, 29000, 1, 262] + - Exact: [2048, 29000, 1, 264] + - Exact: [2048, 29000, 1, 265] + - Exact: [2048, 29000, 1, 274] + - Exact: [2048, 29000, 1, 277] + - Exact: [2048, 29000, 1, 279] + - Exact: [2048, 29000, 1, 288] + - Exact: [2048, 29000, 1, 296] + - Exact: [2048, 29000, 1, 315] + - Exact: [2048, 29000, 1, 335] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 4096] + - Exact: [1024, 29000, 1, 2283] + - Exact: [1024, 29000, 1, 2296] + - Exact: [1024, 29000, 1, 2306] + - Exact: [1024, 29000, 1, 2309] + - Exact: [1024, 29000, 1, 2318] + - Exact: [1024, 29000, 1, 2320] + - Exact: [1024, 29000, 
1, 2324] + - Exact: [1024, 29000, 1, 2325] + - Exact: [1024, 29000, 1, 2329] + - Exact: [1024, 29000, 1, 2338] + - Exact: [1024, 29000, 1, 2345] + - Exact: [1024, 29000, 1, 2350] + - Exact: [1024, 29000, 1, 2362] + - Exact: [1024, 29000, 1, 2366] + - Exact: [1024, 29000, 1, 2368] + - Exact: [1024, 29000, 1, 2374] + - Exact: [1024, 29000, 1, 2390] + - Exact: [1024, 29000, 1, 561] + - Exact: [1024, 29000, 1, 574] + - Exact: [1024, 29000, 1, 600] + - Exact: [1024, 29000, 1, 608] + - Exact: [1024, 29000, 1, 615] + - Exact: [1024, 29000, 1, 622] + - Exact: [1024, 29000, 1, 625] + - Exact: [1024, 29000, 1, 626] + - Exact: [1024, 29000, 1, 628] + - Exact: [1024, 29000, 1, 636] + - Exact: [1024, 29000, 1, 651] + - Exact: [1024, 29000, 1, 658] + - Exact: [1024, 29000, 1, 669] + - Exact: [1024, 29000, 1, 670] + - Exact: [1024, 29000, 1, 672] + - Exact: [1024, 29000, 1, 684] + - Exact: [1024, 29000, 1, 716] + - Exact: [1024, 29000, 1, 730] + - Exact: [2560, 2560, 1, 1024] + - Exact: [2560, 2560, 1, 2] + - Exact: [2560, 29000, 1, 109] + - Exact: [2560, 29000, 1, 121] + - Exact: [2560, 29000, 1, 27] + - Exact: [2560, 29000, 1, 35] + - Exact: [2560, 29000, 1, 36] + - Exact: [2560, 29000, 1, 39] + - Exact: [2560, 29000, 1, 40] + - Exact: [2560, 29000, 1, 42] + - Exact: [2560, 29000, 1, 43] + - Exact: [2560, 29000, 1, 44] + - Exact: [2560, 29000, 1, 46] + - Exact: [2560, 29000, 1, 48] + - Exact: [2560, 29000, 1, 49] + - Exact: [2560, 29000, 1, 50] + - Exact: [2560, 29000, 1, 51] + - Exact: [2560, 29000, 1, 53] + - Exact: [2560, 29000, 1, 54] + - Exact: [2560, 29000, 1, 55] + - Exact: [2560, 29000, 1, 56] + - Exact: [2560, 29000, 1, 57] + - Exact: [2560, 29000, 1, 58] + - Exact: [2560, 29000, 1, 59] + - Exact: [2560, 29000, 1, 61] + - Exact: [2560, 29000, 1, 63] + - Exact: [2560, 29000, 1, 65] + - Exact: [2560, 29000, 1, 66] + - Exact: [2560, 29000, 1, 67] + - Exact: [2560, 29000, 1, 69] + - Exact: [2560, 29000, 1, 70] + - Exact: [2560, 29000, 1, 71] + - Exact: [2560, 29000, 1, 73] 
+ - Exact: [2560, 29000, 1, 74] + - Exact: [2560, 29000, 1, 75] + - Exact: [2560, 29000, 1, 77] + - Exact: [2560, 29000, 1, 78] + - Exact: [2560, 29000, 1, 80] + - Exact: [2560, 29000, 1, 81] + - Exact: [2560, 29000, 1, 82] + - Exact: [2560, 29000, 1, 83] + - Exact: [2560, 29000, 1, 84] + - Exact: [2560, 29000, 1, 88] + - Exact: [2560, 29000, 1, 89] + - Exact: [2560, 29000, 1, 90] + - Exact: [2560, 29000, 1, 92] + - Exact: [2560, 29000, 1, 95] + - Exact: [2560, 29000, 1, 98] + - Exact: [2560, 4096, 1, 1024] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 32768] + - Exact: [1024, 4096, 1, 32768] + - Exact: [1024, 50304, 1, 32768] + - Exact: [4096, 1024, 1, 32768] + - Exact: [1024, 128, 24, 1024] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 256, 1, 33536] + - Exact: [1024, 1024, 1, 9520] + - Exact: [1024, 1024, 1, 10200] + - Exact: [1024, 256, 1, 21248] + - Exact: [1024, 256, 1, 21504] + - Exact: [1024, 256, 1, 22016] + - Exact: [1024, 256, 1, 28672] + - Exact: [256, 2560, 1, 8976] + - Exact: [256, 2816, 1, 8976] + - Exact: [256, 3328, 1, 8976] + - Exact: [256, 3584, 1, 8976] + - Exact: [256, 3840, 1, 8976] + - Exact: [256, 4096, 1, 8976] + - Exact: [256, 4352, 1, 8976] + - Exact: [1024, 1024, 1, 32768] + - Exact: 
[1024, 512, 1, 32768] + - Exact: [479, 1024, 1, 32768] + - Exact: [512, 256, 1, 55296] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 1024, 1, 9600] + - Exact: [1024, 1024, 1, 10064] + - Exact: [1024, 1024, 1, 10080] + - Exact: [1024, 1024, 1, 9216] + - Exact: [480, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 16384] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 10496] + - Exact: [1024, 1024, 1, 10224] + - Exact: [1024, 1024, 1, 10192] + - Exact: [1024, 1024, 1, 10208] + - Exact: [1024, 1024, 1, 10184] + - Exact: [1024, 1024, 1, 10120] + - Exact: [1024, 1024, 1, 10152] + - Exact: [1024, 1024, 1, 12288] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 512] + - Exact: [1024, 1024, 1, 200] + - Exact: [1024, 1024, 1, 4096] + - Exact: [1024, 1024, 1, 2048] + - Exact: [768, 768, 1, 16] + - Exact: [768, 768, 1, 320] + - Exact: [768, 768, 1, 4096] + - Exact: [768, 768, 1, 32] + - Exact: [768, 768, 1, 640] + - Exact: [768, 768, 1, 64] + - Exact: [768, 768, 1, 1280] + - Exact: [1024, 1024, 1, 3072] + - Exact: [1024, 1024, 1, 120] + - Exact: [1024, 1024, 1, 1] + - Exact: [1024, 1024, 1, 20] + - Exact: [1024, 1024, 1, 4] + - Exact: [1024, 1024, 1, 6] + - Exact: [1024, 1024, 1, 80] + - Exact: [128, 64, 512, 128] + - Exact: [512, 64, 64, 512] + 
- Exact: [64, 64, 768, 64] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [448, 2944, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 
1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 
6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 128, 1, 1280] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [64, 6784, 1, 1280] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [1024, 1024, 1, 1024] + - Exact: [448, 1024, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [64, 6784, 1, 128] + - Exact: [5888, 64, 1, 1280] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [64, 5056, 1, 128] + - Exact: [5888, 64, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: 
[3584, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [64, 1536, 64, 384] + - Exact: [64, 1536, 64, 256] + - Exact: [64, 92, 688, 92] + - Exact: [1024, 1024, 1, 3975] + - Exact: [64, 123, 528, 123] + - Exact: [64, 102, 624, 100] + - Exact: [64, 112, 576, 111] + - Exact: [64, 102, 624, 102] + - Exact: [64, 133, 480, 135] + - Exact: [1024, 1024, 1, 4026] + - Exact: [64, 160, 400, 159] + - Exact: [1024, 1024, 1, 3780] + - Exact: [64, 228, 272, 232] + - Exact: [1024, 1024, 1, 3822] + - Exact: [64, 77, 816, 77] + - Exact: [64, 159, 400, 159] + - Exact: [64, 135, 480, 134] + - Exact: [64, 99, 624, 99] + - Exact: [1024, 1024, 1, 3942] + - Exact: [1024, 1024, 1, 3861] + - Exact: [1024, 1024, 1, 4000] + - Exact: [1024, 1024, 1, 3870] + - Exact: [64, 65, 992, 65] + - Exact: [64, 133, 480, 133] + - Exact: [64, 232, 272, 232] + - Exact: [64, 148, 432, 148] + - Exact: [1024, 1024, 1, 4032] + - Exact: [1024, 1024, 1, 4012] + - Exact: [1024, 1024, 1, 3681] + - Exact: [1024, 1024, 1, 3927] + - Exact: [1024, 1024, 1, 3894] + - Exact: [64, 132, 480, 135] + - Exact: [64, 135, 480, 135] + - Exact: [1024, 1024, 1, 3876] + - Exact: [64, 84, 752, 85] + - Exact: [1024, 1024, 1, 4050] + - Exact: [64, 132, 480, 132] + - Exact: [64, 99, 624, 102] + - Exact: [64, 143, 432, 148] + - Exact: [1024, 1024, 1, 3584] + - Exact: [64, 162, 400, 162] + - Exact: [64, 148, 
432, 147] + - Exact: [1024, 1024, 1, 3960] + - Exact: [64, 123, 528, 122] + - Exact: [64, 102, 624, 101] + - Exact: [1024, 1024, 1, 3978] + - Exact: [64, 160, 400, 160] + - Exact: [1024, 1024, 1, 3995] + - Exact: [64, 132, 480, 134] + - Exact: [64, 111, 576, 111] + - Exact: [64, 100, 624, 100] + - Exact: [1024, 1024, 1, 3977] + - Exact: [64, 112, 576, 112] + - Exact: [64, 159, 400, 162] + - Exact: [64, 122, 528, 122] + - Exact: [64, 228, 272, 228] + - Exact: [1024, 1024, 1, 3925] + - Exact: [64, 93, 688, 93] + - Exact: [1024, 1024, 1, 3956] + - Exact: [1024, 1024, 1, 3976] + - Exact: [64, 111, 576, 112] + - Exact: [64, 100, 624, 102] + - Exact: [1024, 1024, 1, 3955] + - Exact: [1024, 1024, 1, 4030] + - Exact: [1024, 1024, 1, 3906] + - Exact: [64, 101, 624, 102] + - Exact: [1024, 1024, 1, 3796] + - Exact: [1024, 1024, 1, 3859] + - Exact: [64, 71, 896, 71] + - Exact: [1024, 1024, 1, 3860] + - Exact: [1024, 1024, 1, 4005] + - Exact: [64, 84, 752, 84] + - Exact: [1024, 1024, 1, 3990] + - Exact: [64, 134, 480, 134] + - Exact: [64, 78, 816, 78] + - Exact: [1024, 1024, 1, 3999] + - Exact: [1024, 1024, 1, 4020] + - Exact: [1024, 1024, 1, 3939] + - Exact: [64, 77, 816, 78] + - Exact: [1024, 1024, 1, 4059] + - Exact: [1024, 1024, 1, 3944] + - Exact: [64, 193, 320, 193] + - Exact: [1024, 1024, 1, 3720] + - Exact: [1024, 1024, 1, 3910] + - Exact: [64, 143, 432, 143] + - Exact: [64, 92, 688, 93] + - Exact: [64, 101, 624, 101] + - Exact: [1024, 1024, 1, 3969] + - Exact: [1024, 1024, 1, 3948] + - Exact: [1024, 1024, 1, 3996] + - Exact: [1024, 1024, 1, 3900] + - Exact: [1024, 1024, 1, 3640] + - Exact: [64, 147, 432, 147] + - Exact: [1024, 1024, 1, 3751] + - Exact: [64, 177, 352, 177] + - Exact: [64, 85, 752, 85] + - Exact: [1024, 1024, 1, 3712] + - Exact: [1024, 1024, 1, 128] + - Exact: [64, 256, 192, 256] + - Exact: [64, 128, 384, 128] + - Exact: [64, 192, 36, 25088] + - Exact: [128, 128, 64, 25] + - Exact: [64, 192, 64, 3200] + - Exact: [64, 128, 64, 23104] + - Exact: [128, 128, 
64, 1600] + - Exact: [80, 192, 64, 4608] + - Exact: [64, 128, 36, 30] + - Exact: [64, 128, 64, 11552] + - Exact: [128, 192, 64, 946] + - Exact: [64, 192, 64, 12800] + - Exact: [224, 224, 64, 128] + - Exact: [128, 128, 64, 3360] + - Exact: [128, 128, 64, 420] + - Exact: [64, 128, 64, 361] + - Exact: [64, 128, 36, 53824] + - Exact: [128, 160, 36, 512] + - Exact: [147, 64, 36, 18816] + - Exact: [96, 128, 64, 946] + - Exact: [128, 128, 64, 50] + - Exact: [160, 224, 36, 128] + - Exact: [192, 224, 64, 1152] + - Exact: [128, 128, 36, 784] + - Exact: [96, 128, 64, 288] + - Exact: [128, 128, 64, 400] + - Exact: [128, 128, 64, 800] + - Exact: [96, 128, 36, 512] + - Exact: [96, 128, 64, 800] + - Exact: [192, 224, 64, 128] + - Exact: [128, 128, 64, 288] + - Exact: [96, 208, 36, 512] + - Exact: [64, 128, 36, 1568] + - Exact: [192, 192, 36, 512] + - Exact: [128, 128, 36, 512] + - Exact: [96, 208, 64, 1152] + - Exact: [128, 192, 64, 3200] + - Exact: [160, 160, 64, 288] + - Exact: [128, 128, 36, 440] + - Exact: [96, 128, 36, 1568] + - Exact: [112, 224, 36, 2048] + - Exact: [128, 128, 36, 7040] + - Exact: [128, 128, 36, 1568] + - Exact: [160, 224, 64, 128] + - Exact: [192, 224, 36, 2592] + - Exact: [64, 128, 64, 2888] + - Exact: [64, 128, 36, 480] + - Exact: [147, 64, 64, 9702] + - Exact: [64, 192, 64, 3698] + - Exact: [73, 192, 64, 10439] + - Exact: [128, 128, 36, 880] + - Exact: [192, 224, 36, 128] + - Exact: [64, 128, 36, 12544] + - Exact: [160, 160, 36, 512] + - Exact: [128, 128, 36, 3136] + - Exact: [112, 224, 36, 512] + - Exact: [128, 128, 36, 49] + - Exact: [112, 224, 64, 1152] + - Exact: [128, 192, 36, 1568] + - Exact: [128, 192, 36, 512] + - Exact: [192, 192, 64, 288] + - Exact: [96, 208, 64, 242] + - Exact: [64, 128, 64, 5776] + - Exact: [128, 192, 64, 288] + - Exact: [96, 128, 36, 6272] + - Exact: [96, 128, 64, 3200] + - Exact: [128, 192, 64, 800] + - Exact: [64, 128, 64, 10] + - Exact: [96, 208, 64, 288] + - Exact: [64, 128, 64, 160] + - Exact: [128, 128, 64, 1568] + - 
Exact: [112, 224, 64, 242] + - Exact: [160, 192, 64, 288] + - Exact: [128, 160, 64, 288] + - Exact: [128, 128, 64, 210] + - Exact: [73, 192, 36, 23360] + - Exact: [160, 192, 36, 512] + - Exact: [64, 128, 64, 722] + - Exact: [112, 224, 64, 288] + - Exact: [64, 192, 36, 6272] + - Exact: [64, 128, 36, 6272] + - Exact: [128, 128, 36, 3200] + - Exact: [128, 128, 36, 392] + - Exact: [80, 192, 36, 10368] + - Exact: [224, 224, 36, 128] + - Exact: [64, 128, 36, 784] + - Exact: [128, 128, 64, 200] + - Exact: [5329, 64, 32, 80] + - Exact: [64, 2048, 32, 384] + - Exact: [289, 1792, 1, 320] + - Exact: [1001, 1024, 1, 32] + - Exact: [784, 400, 1, 32] + - Exact: [64, 1536, 32, 256] + - Exact: [289, 2592, 1, 384] + - Exact: [64, 2048, 32, 448] + - Exact: [289, 2016, 1, 256] + - Exact: [64, 1536, 32, 384] + - Exact: [64, 1280, 32, 320] + - Exact: [289, 3456, 1, 384] + - Exact: [64, 1280, 32, 384] + - Exact: [729, 1600, 1, 192] + - Exact: [289, 1344, 1, 192] + - Exact: [64, 2048, 32, 320] + - Exact: [64, 1280, 32, 448] + - Exact: [64, 1280, 32, 192] + - Exact: [289, 1792, 1, 256] + - Exact: [64, 2048, 32, 192] + - Exact: [5329, 64, 128, 80] + - Exact: [64, 1280, 128, 448] + - Exact: [64, 2048, 128, 192] + - Exact: [64, 1280, 128, 384] + - Exact: [64, 1280, 128, 320] + - Exact: [64, 1280, 128, 192] + - Exact: [256, 4096, 1, 6400] + - Exact: [512, 2048, 1, 3427] + - Exact: [512, 2048, 1, 3552] + - Exact: [512, 2048, 1, 3840] + - Exact: [2048, 512, 1, 3427] + - Exact: [2048, 512, 1, 3452] + - Exact: [2048, 512, 1, 3472] + - Exact: [2048, 512, 1, 3475] + - Exact: [64, 64, 496, 64] + - Exact: [64, 64, 496, 65] + - Exact: [64, 65, 496, 65] + - Exact: [64, 71, 448, 71] + - Exact: [64, 77, 408, 77] + - Exact: [64, 77, 408, 78] + - Exact: [64, 78, 408, 78] + - Exact: [64, 85, 376, 85] + - Exact: [64, 93, 344, 93] + - Exact: [64, 112, 288, 112] + - Exact: [64, 122, 264, 122] + - Exact: [64, 123, 264, 122] + - Exact: [64, 123, 264, 123] + - Exact: [64, 134, 240, 134] + - Exact: [64, 135, 240, 
134] + - Exact: [64, 135, 240, 135] + - Exact: [64, 1280, 64, 192] + - Exact: [64, 1280, 64, 320] + - Exact: [64, 1280, 64, 384] + - Exact: [64, 1280, 64, 448] + - Exact: [64, 2048, 64, 192] + - Exact: [64, 2048, 64, 320] + - Exact: [64, 2048, 64, 384] + - Exact: [64, 2048, 64, 448] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [5329, 64, 64, 80] + - Exact: [257, 4096, 1, 1024] + - Exact: [512, 2048, 1, 2790] + - Exact: [512, 2048, 1, 2864] + - Exact: [512, 2048, 1, 3092] + - Exact: [512, 2048, 1, 3113] + - Exact: [512, 2048, 1, 3137] + - Exact: [512, 2048, 1, 3165] + - Exact: [512, 2048, 1, 3166] + - Exact: [512, 2048, 1, 3194] + - Exact: [512, 2048, 1, 3219] + - Exact: [512, 2048, 1, 3222] + - Exact: [512, 2048, 1, 3234] + - Exact: [512, 2048, 1, 3237] + - Exact: [512, 2048, 1, 3242] + - Exact: [512, 2048, 1, 3246] + - Exact: [512, 2048, 1, 3249] + - Exact: [512, 2048, 1, 3251] + - Exact: [512, 2048, 1, 3257] + - Exact: [512, 2048, 1, 3262] + - Exact: [512, 2048, 1, 3268] + - Exact: [512, 2048, 1, 3282] + - Exact: [512, 2048, 1, 3286] + - Exact: [512, 2048, 1, 3287] + - Exact: [512, 2048, 1, 3293] + - Exact: [512, 2048, 1, 3297] + - Exact: [512, 2048, 1, 3307] + - Exact: [512, 2048, 1, 3314] + - Exact: [512, 2048, 1, 3315] + - Exact: [512, 2048, 1, 3319] + - Exact: [512, 2048, 1, 3322] + - Exact: [512, 2048, 1, 3323] + - Exact: [512, 2048, 1, 3324] + - Exact: [512, 2048, 1, 3325] + - Exact: [512, 2048, 1, 3327] + - Exact: [512, 2048, 1, 3329] + - Exact: [512, 2048, 1, 3332] + - Exact: [512, 2048, 1, 3336] + - Exact: [512, 2048, 1, 3339] + - Exact: [512, 2048, 1, 3342] + - Exact: [512, 2048, 1, 3344] + - Exact: [512, 2048, 1, 3358] + - Exact: [512, 2048, 1, 3360] + - Exact: [512, 2048, 1, 3364] + - Exact: [512, 2048, 1, 3365] + - Exact: [512, 2048, 1, 3369] + - Exact: [512, 2048, 1, 3371] + - Exact: [512, 2048, 1, 3374] + - Exact: [512, 2048, 1, 3376] + - Exact: [512, 2048, 1, 3377] + - Exact: [512, 2048, 1, 3378] + - Exact: [512, 2048, 
1, 3381] + - Exact: [512, 2048, 1, 3382] + - Exact: [512, 2048, 1, 3383] + - Exact: [512, 2048, 1, 3384] + - Exact: [512, 2048, 1, 3385] + - Exact: [512, 2048, 1, 3386] + - Exact: [512, 2048, 1, 3388] + - Exact: [512, 2048, 1, 3390] + - Exact: [512, 2048, 1, 3391] + - Exact: [512, 2048, 1, 3396] + - Exact: [512, 2048, 1, 3399] + - Exact: [512, 2048, 1, 3402] + - Exact: [512, 2048, 1, 3410] + - Exact: [512, 2048, 1, 3412] + - Exact: [512, 2048, 1, 3414] + - Exact: [512, 2048, 1, 3415] + - Exact: [512, 2048, 1, 3418] + - Exact: [512, 2048, 1, 3420] + - Exact: [512, 2048, 1, 3422] + - Exact: [512, 2048, 1, 3425] + - Exact: [512, 2048, 1, 3426] + - Exact: [512, 2048, 1, 3428] + - Exact: [512, 2048, 1, 3430] + - Exact: [512, 2048, 1, 3431] + - Exact: [512, 2048, 1, 3432] + - Exact: [512, 2048, 1, 3438] + - Exact: [512, 2048, 1, 3439] + - Exact: [512, 2048, 1, 3440] + - Exact: [512, 2048, 1, 3443] + - Exact: [512, 2048, 1, 3445] + - Exact: [512, 2048, 1, 3447] + - Exact: [512, 2048, 1, 3448] + - Exact: [512, 2048, 1, 3450] + - Exact: [512, 2048, 1, 3451] + - Exact: [512, 2048, 1, 3452] + - Exact: [512, 2048, 1, 3453] + - Exact: [512, 2048, 1, 3455] + - Exact: [512, 2048, 1, 3456] + - Exact: [512, 2048, 1, 3457] + - Exact: [512, 2048, 1, 3458] + - Exact: [512, 2048, 1, 3459] + - Exact: [512, 2048, 1, 3460] + - Exact: [512, 2048, 1, 3461] + - Exact: [512, 2048, 1, 3462] + - Exact: [512, 2048, 1, 3466] + - Exact: [512, 2048, 1, 3467] + - Exact: [512, 2048, 1, 3468] + - Exact: [512, 2048, 1, 3470] + - Exact: [512, 2048, 1, 3471] + - Exact: [512, 2048, 1, 3472] + - Exact: [512, 2048, 1, 3475] + - Exact: [512, 2048, 1, 3476] + - Exact: [512, 2048, 1, 3477] + - Exact: [512, 2048, 1, 3478] + - Exact: [512, 2048, 1, 3479] + - Exact: [512, 2048, 1, 3480] + - Exact: [512, 2048, 1, 3481] + - Exact: [512, 2048, 1, 3483] + - Exact: [512, 2048, 1, 3484] + - Exact: [512, 2048, 1, 3487] + - Exact: [512, 2048, 1, 3489] + - Exact: [512, 2048, 1, 3490] + - Exact: [512, 2048, 1, 3491] + - 
Exact: [512, 2048, 1, 3493] + - Exact: [512, 2048, 1, 3494] + - Exact: [512, 2048, 1, 3495] + - Exact: [512, 2048, 1, 3497] + - Exact: [512, 2048, 1, 3498] + - Exact: [512, 2048, 1, 3499] + - Exact: [512, 2048, 1, 3501] + - Exact: [512, 2048, 1, 3503] + - Exact: [512, 2048, 1, 3507] + - Exact: [512, 2048, 1, 3508] + - Exact: [512, 2048, 1, 3509] + - Exact: [512, 2048, 1, 3511] + - Exact: [512, 2048, 1, 3514] + - Exact: [512, 2048, 1, 3515] + - Exact: [512, 2048, 1, 3517] + - Exact: [512, 2048, 1, 3518] + - Exact: [512, 2048, 1, 3519] + - Exact: [512, 2048, 1, 3520] + - Exact: [512, 2048, 1, 3523] + - Exact: [512, 2048, 1, 3528] + - Exact: [512, 2048, 1, 3529] + - Exact: [512, 2048, 1, 3530] + - Exact: [512, 2048, 1, 3532] + - Exact: [512, 2048, 1, 3533] + - Exact: [512, 2048, 1, 3534] + - Exact: [512, 2048, 1, 3538] + - Exact: [512, 2048, 1, 3539] + - Exact: [512, 2048, 1, 3541] + - Exact: [512, 2048, 1, 3547] + - Exact: [512, 2048, 1, 3548] + - Exact: [512, 2048, 1, 3564] + - Exact: [512, 2048, 1, 3575] + - Exact: [512, 2048, 1, 3598] + - Exact: [512, 2048, 1, 3599] + - Exact: [512, 2048, 1, 3608] + - Exact: [512, 2048, 1, 3780] + - Exact: [512, 2048, 1, 3796] + - Exact: [512, 2048, 1, 3822] + - Exact: [512, 2048, 1, 3859] + - Exact: [512, 2048, 1, 3870] + - Exact: [512, 2048, 1, 3876] + - Exact: [512, 2048, 1, 3906] + - Exact: [512, 2048, 1, 3910] + - Exact: [512, 2048, 1, 3925] + - Exact: [512, 2048, 1, 3942] + - Exact: [512, 2048, 1, 3944] + - Exact: [512, 2048, 1, 3955] + - Exact: [512, 2048, 1, 3968] + - Exact: [512, 2048, 1, 3969] + - Exact: [512, 2048, 1, 3976] + - Exact: [512, 2048, 1, 3977] + - Exact: [512, 2048, 1, 3978] + - Exact: [512, 2048, 1, 3990] + - Exact: [512, 2048, 1, 3995] + - Exact: [512, 2048, 1, 3996] + - Exact: [512, 2048, 1, 3999] + - Exact: [512, 2048, 1, 4005] + - Exact: [512, 2048, 1, 4012] + - Exact: [512, 2048, 1, 4020] + - Exact: [512, 2048, 1, 4026] + - Exact: [512, 2048, 1, 4030] + - Exact: [512, 2048, 1, 4032] + - Exact: [2048, 
512, 1, 2790] + - Exact: [2048, 512, 1, 2864] + - Exact: [2048, 512, 1, 3092] + - Exact: [2048, 512, 1, 3113] + - Exact: [2048, 512, 1, 3137] + - Exact: [2048, 512, 1, 3165] + - Exact: [2048, 512, 1, 3166] + - Exact: [2048, 512, 1, 3194] + - Exact: [2048, 512, 1, 3219] + - Exact: [2048, 512, 1, 3222] + - Exact: [2048, 512, 1, 3234] + - Exact: [2048, 512, 1, 3237] + - Exact: [2048, 512, 1, 3242] + - Exact: [2048, 512, 1, 3246] + - Exact: [2048, 512, 1, 3249] + - Exact: [2048, 512, 1, 3251] + - Exact: [2048, 512, 1, 3257] + - Exact: [2048, 512, 1, 3262] + - Exact: [2048, 512, 1, 3268] + - Exact: [2048, 512, 1, 3282] + - Exact: [2048, 512, 1, 3286] + - Exact: [2048, 512, 1, 3287] + - Exact: [2048, 512, 1, 3293] + - Exact: [2048, 512, 1, 3297] + - Exact: [2048, 512, 1, 3307] + - Exact: [2048, 512, 1, 3314] + - Exact: [2048, 512, 1, 3315] + - Exact: [2048, 512, 1, 3319] + - Exact: [2048, 512, 1, 3322] + - Exact: [2048, 512, 1, 3323] + - Exact: [2048, 512, 1, 3324] + - Exact: [2048, 512, 1, 3325] + - Exact: [2048, 512, 1, 3327] + - Exact: [2048, 512, 1, 3329] + - Exact: [2048, 512, 1, 3332] + - Exact: [2048, 512, 1, 3336] + - Exact: [2048, 512, 1, 3339] + - Exact: [2048, 512, 1, 3342] + - Exact: [2048, 512, 1, 3344] + - Exact: [2048, 512, 1, 3358] + - Exact: [2048, 512, 1, 3360] + - Exact: [2048, 512, 1, 3364] + - Exact: [2048, 512, 1, 3365] + - Exact: [2048, 512, 1, 3369] + - Exact: [2048, 512, 1, 3371] + - Exact: [2048, 512, 1, 3374] + - Exact: [2048, 512, 1, 3376] + - Exact: [2048, 512, 1, 3377] + - Exact: [2048, 512, 1, 3378] + - Exact: [2048, 512, 1, 3381] + - Exact: [2048, 512, 1, 3382] + - Exact: [2048, 512, 1, 3383] + - Exact: [2048, 512, 1, 3384] + - Exact: [2048, 512, 1, 3385] + - Exact: [2048, 512, 1, 3386] + - Exact: [2048, 512, 1, 3388] + - Exact: [2048, 512, 1, 3390] + - Exact: [2048, 512, 1, 3391] + - Exact: [2048, 512, 1, 3396] + - Exact: [2048, 512, 1, 3399] + - Exact: [2048, 512, 1, 3402] + - Exact: [2048, 512, 1, 3410] + - Exact: [2048, 512, 1, 3412] + 
- Exact: [2048, 512, 1, 3414] + - Exact: [2048, 512, 1, 3415] + - Exact: [2048, 512, 1, 3418] + - Exact: [2048, 512, 1, 3420] + - Exact: [2048, 512, 1, 3422] + - Exact: [2048, 512, 1, 3425] + - Exact: [2048, 512, 1, 3426] + - Exact: [2048, 512, 1, 3428] + - Exact: [2048, 512, 1, 3430] + - Exact: [2048, 512, 1, 3431] + - Exact: [2048, 512, 1, 3432] + - Exact: [2048, 512, 1, 3438] + - Exact: [2048, 512, 1, 3439] + - Exact: [2048, 512, 1, 3440] + - Exact: [2048, 512, 1, 3443] + - Exact: [2048, 512, 1, 3445] + - Exact: [2048, 512, 1, 3447] + - Exact: [2048, 512, 1, 3448] + - Exact: [2048, 512, 1, 3450] + - Exact: [2048, 512, 1, 3451] + - Exact: [2048, 512, 1, 3453] + - Exact: [2048, 512, 1, 3455] + - Exact: [2048, 512, 1, 3456] + - Exact: [2048, 512, 1, 3457] + - Exact: [2048, 512, 1, 3458] + - Exact: [2048, 512, 1, 3459] + - Exact: [2048, 512, 1, 3460] + - Exact: [2048, 512, 1, 3461] + - Exact: [2048, 512, 1, 3462] + - Exact: [2048, 512, 1, 3466] + - Exact: [2048, 512, 1, 3467] + - Exact: [2048, 512, 1, 3468] + - Exact: [2048, 512, 1, 3470] + - Exact: [2048, 512, 1, 3471] + - Exact: [2048, 512, 1, 3476] + - Exact: [2048, 512, 1, 3477] + - Exact: [2048, 512, 1, 3478] + - Exact: [2048, 512, 1, 3479] + - Exact: [2048, 512, 1, 3480] + - Exact: [2048, 512, 1, 3481] + - Exact: [2048, 512, 1, 3483] + - Exact: [2048, 512, 1, 3484] + - Exact: [2048, 512, 1, 3487] + - Exact: [2048, 512, 1, 3489] + - Exact: [2048, 512, 1, 3490] + - Exact: [2048, 512, 1, 3491] + - Exact: [2048, 512, 1, 3493] + - Exact: [2048, 512, 1, 3494] + - Exact: [2048, 512, 1, 3495] + - Exact: [2048, 512, 1, 3497] + - Exact: [2048, 512, 1, 3498] + - Exact: [2048, 512, 1, 3499] + - Exact: [2048, 512, 1, 3501] + - Exact: [2048, 512, 1, 3503] + - Exact: [2048, 512, 1, 3507] + - Exact: [2048, 512, 1, 3508] + - Exact: [2048, 512, 1, 3509] + - Exact: [2048, 512, 1, 3511] + - Exact: [2048, 512, 1, 3514] + - Exact: [2048, 512, 1, 3515] + - Exact: [2048, 512, 1, 3517] + - Exact: [2048, 512, 1, 3518] + - Exact: [2048, 
512, 1, 3519] + - Exact: [2048, 512, 1, 3520] + - Exact: [2048, 512, 1, 3523] + - Exact: [2048, 512, 1, 3528] + - Exact: [2048, 512, 1, 3529] + - Exact: [2048, 512, 1, 3530] + - Exact: [2048, 512, 1, 3532] + - Exact: [2048, 512, 1, 3533] + - Exact: [2048, 512, 1, 3534] + - Exact: [2048, 512, 1, 3538] + - Exact: [2048, 512, 1, 3539] + - Exact: [2048, 512, 1, 3541] + - Exact: [2048, 512, 1, 3547] + - Exact: [2048, 512, 1, 3548] + - Exact: [2048, 512, 1, 3552] + - Exact: [2048, 512, 1, 3564] + - Exact: [2048, 512, 1, 3575] + - Exact: [2048, 512, 1, 3598] + - Exact: [2048, 512, 1, 3599] + - Exact: [2048, 512, 1, 3608] + - Exact: [2048, 512, 1, 3780] + - Exact: [2048, 512, 1, 3796] + - Exact: [2048, 512, 1, 3822] + - Exact: [2048, 512, 1, 3840] + - Exact: [2048, 512, 1, 3859] + - Exact: [2048, 512, 1, 3870] + - Exact: [2048, 512, 1, 3876] + - Exact: [2048, 512, 1, 3906] + - Exact: [2048, 512, 1, 3910] + - Exact: [2048, 512, 1, 3925] + - Exact: [2048, 512, 1, 3942] + - Exact: [2048, 512, 1, 3944] + - Exact: [2048, 512, 1, 3955] + - Exact: [2048, 512, 1, 3968] + - Exact: [2048, 512, 1, 3969] + - Exact: [2048, 512, 1, 3976] + - Exact: [2048, 512, 1, 3977] + - Exact: [2048, 512, 1, 3978] + - Exact: [2048, 512, 1, 3990] + - Exact: [2048, 512, 1, 3995] + - Exact: [2048, 512, 1, 3996] + - Exact: [2048, 512, 1, 3999] + - Exact: [2048, 512, 1, 4005] + - Exact: [2048, 512, 1, 4012] + - Exact: [2048, 512, 1, 4020] + - Exact: [2048, 512, 1, 4026] + - Exact: [2048, 512, 1, 4030] + - Exact: [2048, 512, 1, 4032] + - Exact: [64, 102, 312, 102] + - Exact: [64, 512, 16, 512] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 1024, 1, 3840] + - Exact: [1024, 1024, 1, 3968] + - Exact: [1024, 1024, 1, 7200] + - Exact: [1024, 1024, 1, 8160] + - Exact: [768, 768, 1, 384] + - Exact: [768, 384, 1, 384] + - Exact: [1152, 576, 1, 384] + - Exact: [384, 768, 1, 384] + - Exact: [1024, 1024, 1, 32] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [1024, 1024, 1, 1600] + - Exact: 
[2048, 256, 1, 1024] + - Exact: [256, 1280, 1, 8976] + - Exact: [512, 2048, 1, 256] + - Exact: [560, 1024, 1, 1600] + - Exact: [560, 1024, 1, 200] + - Exact: [1024, 1024, 1, 960] + - Exact: [2304, 128, 1, 128] + - Exact: [2688, 128, 1, 128] + - Exact: [3072, 128, 1, 128] + - Exact: [3456, 128, 1, 128] + - Exact: [3840, 128, 1, 128] + - Exact: [4224, 128, 1, 128] + - Exact: [4608, 128, 1, 128] + - Exact: [4992, 128, 1, 128] + - Exact: [5376, 128, 1, 128] + - Exact: [5760, 128, 1, 128] + - Exact: [6144, 128, 1, 128] + - Exact: [6528, 128, 1, 128] + - Exact: [6912, 128, 1, 128] + - Exact: [7296, 128, 1, 128] + - Exact: [7680, 128, 1, 128] + - Exact: [8064, 128, 1, 128] + - Exact: [8448, 128, 1, 128] + - Exact: [8832, 128, 1, 128] + - Exact: [2304, 128, 1, 256] + - Exact: [2688, 128, 1, 256] + - Exact: [3072, 128, 1, 256] + - Exact: [3456, 128, 1, 256] + - Exact: [3840, 128, 1, 256] + - Exact: [4224, 128, 1, 256] + - Exact: [4608, 128, 1, 256] + - Exact: [4992, 128, 1, 256] + - Exact: [5376, 128, 1, 256] + - Exact: [5760, 128, 1, 256] + - Exact: [6144, 128, 1, 256] + - Exact: [6528, 128, 1, 256] + - Exact: [6912, 128, 1, 256] + - Exact: [7296, 128, 1, 256] + - Exact: [7680, 128, 1, 256] + - Exact: [8064, 128, 1, 256] + - Exact: [8448, 128, 1, 256] + - Exact: [8832, 128, 1, 256] + - Exact: [768, 768, 1, 768] + - Exact: [384, 1536, 1, 384] + - Exact: [384, 1920, 1, 384] + - Exact: [384, 2304, 1, 384] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 256] + - Exact: [1225, 64, 64, 288] + - Exact: [5329, 80, 64, 64] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 
2048] + - Exact: [64, 448, 32, 2048] + - Exact: [1225, 64, 32, 192] + - Exact: [1225, 64, 32, 256] + - Exact: [1225, 64, 32, 288] + - Exact: [5329, 80, 32, 64] + - Exact: [289, 128, 32, 768] + - Exact: [289, 160, 32, 768] + - Exact: [289, 192, 32, 768] + - Exact: [3136, 64, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [1024, 1024, 1, 6912] + - Exact: [1024, 512, 1, 4096] + - Exact: [480, 1024, 1, 4096] + - Exact: [1024, 512, 1, 6912] + - Exact: [480, 1024, 1, 6912] + - Exact: [100, 512, 120, 128] + - Exact: [100, 512, 18, 128] + - Exact: [100, 512, 19, 128] + - Exact: [1444, 576, 1, 128] + - Exact: [173280, 64, 1, 128] + - Exact: [25992, 64, 1, 128] + - Exact: [27436, 64, 1, 128] + - Exact: [361, 2304, 1, 512] + - Exact: [960, 1024, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 1024, 1, 77] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 10] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 1024, 1, 39] + - Exact: [1024, 1024, 1, 780] + - Exact: [1024, 1024, 1, 4992] + - Exact: [1024, 1024, 1, 308] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 1024, 1, 40] + - Exact: [1024, 1024, 1, 800] + - Exact: [1024, 1024, 1, 5120] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 1024, 1, 41] + - Exact: [1024, 1024, 1, 820] + - Exact: [1024, 1024, 1, 5248] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 1024, 1, 5] + - Exact: [1024, 1024, 1, 385] + - Exact: [1024, 1024, 1, 2560] + - Exact: [1024, 1024, 1, 462] + - Exact: [64, 128, 128, 128] + - Exact: [1024, 1024, 1, 8] + - Exact: [1024, 1024, 1, 160] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 1024, 1, 9] + - Exact: [1024, 1024, 1, 180] + - Exact: [1024, 1024, 1, 1152] + - Exact: [1024, 1024, 1, 6528] + - Exact: [1024, 1024, 1, 7104] + - Exact: [1024, 1024, 1, 8064] + - Exact: [2048, 512, 1, 1] + - Exact: [1024, 1024, 1, 16] + - Exact: [512, 64, 256, 512] + - Exact: [64, 512, 256, 512] + - Exact: [512, 64, 128, 512] + - Exact: [64, 512, 128, 512] + - Exact: 
[512, 64, 40, 512] + - Exact: [64, 512, 40, 512] + - Exact: [1024, 96, 64, 1024] + - Exact: [96, 1024, 64, 1024] + - Exact: [1024, 96, 128, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [1024, 64, 256, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [1024, 64, 32, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [1024, 64, 64, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [1024, 64, 128, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [1024, 1024, 1, 64] + - Exact: [64, 128, 1024, 128] + - Exact: [128, 64, 1024, 128] + - Exact: [1024, 1024, 1, 3456] + - Exact: [1024, 1024, 1, 864] + - Exact: [1024, 512, 1, 3456] + - Exact: [1024, 512, 1, 864] + - Exact: [256, 3456, 1, 1] + - Exact: [256, 4096, 1, 1] + - Exact: [480, 1024, 1, 3456] + - Exact: [480, 1024, 1, 864] + - Exact: [64, 128, 1280, 128] + - Exact: [128, 64, 1280, 128] + - Exact: [1024, 1024, 1, 82] + - Exact: [128, 64, 1312, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [1024, 1024, 1, 12] + - Exact: [1024, 1024, 1, 6144] + - Exact: [64, 512, 192, 512] + - Exact: [512, 64, 192, 512] + - Exact: [3136, 64, 64, 128] + - Exact: [3136, 64, 32, 128] + - Exact: [196, 2304, 1, 256] + - Exact: [784, 1152, 1, 128] + - Exact: [64, 128, 2048, 128] + - Exact: [128, 64, 2048, 128] + - Exact: [128, 64, 1536, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [1024, 1024, 1, 96] + - Exact: [92416, 64, 25, 64] + - Exact: [50176, 64, 36, 64] + - Exact: [36864, 64, 49, 64] + - Exact: [25600, 64, 64, 64] + - Exact: [64, 128, 192, 128] + - Exact: [128, 64, 192, 128] + - Exact: [768, 768, 1, 2048] + - Exact: [64, 384, 144, 384] + - Exact: [384, 64, 144, 384] + - Exact: [768, 768, 1, 4608] + - Exact: [64, 512, 48, 512] + - Exact: [512, 64, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [128, 64, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [384, 64, 192, 384] + - Exact: [1024, 1024, 1, 4608] + - Exact: [768, 512, 2, 2048] + - Exact: [713, 512, 2, 2048] + - Exact: [672, 512, 2, 2048] + - Exact: [660, 512, 2, 
2048] + - Exact: [726, 512, 2, 2048] + - Exact: [1008, 512, 2, 2048] + - Exact: [748, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [840, 512, 2, 2048] + - Exact: [850, 256, 2, 3] + - Exact: [805, 256, 2, 12] + - Exact: [805, 256, 2, 3] + - Exact: [850, 256, 2, 12] + - Exact: [768, 256, 2, 12] + - Exact: [864, 256, 2, 3] + - Exact: [950, 256, 2, 12] + - Exact: [864, 256, 2, 12] + - Exact: [950, 256, 2, 3] + - Exact: [768, 256, 2, 3] + - Exact: [1024, 320, 1, 1024] + - Exact: [96, 1024, 160, 1024] + - Exact: [1024, 96, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [1024, 96, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [1024, 96, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [1024, 96, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [1024, 96, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [1024, 96, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [1024, 96, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [1024, 96, 32, 1024] + - Exact: [512, 64, 320, 512] + - Exact: [64, 512, 320, 512] + - Exact: [512, 64, 80, 512] + - Exact: [1024, 64, 512, 1024] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - 
ProblemSizes: + - Exact: [64, 64, 64, 13216] + - Exact: [64, 96, 36, 10368] + - Exact: [64, 64, 36, 12544] + - Exact: [64, 64, 36, 11552] + - Exact: [1024, 256, 1, 10496] + - Exact: [1024, 256, 1, 11520] + - Exact: [1024, 256, 1, 12032] + - Exact: [1024, 256, 1, 13568] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 14848] + - Exact: [1024, 256, 1, 15104] + - Exact: [1024, 256, 1, 15872] + - Exact: [1024, 256, 1, 16128] + - Exact: [1024, 256, 1, 17152] + - Exact: [1024, 256, 1, 17408] + - Exact: [1024, 256, 1, 18944] + - Exact: [1024, 256, 1, 19712] + - Exact: [1024, 256, 1, 19968] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 256, 1, 8448] + - Exact: [1024, 256, 1, 9728] + - Exact: [1024, 256, 1, 9984] + - Exact: [512, 256, 1, 32768] + - Exact: [256, 128, 1, 55296] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 200] + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [4288, 64, 1, 1280] + - Exact: [1024, 256, 1, 256] + - 
Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [64, 2944, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1408, 128, 1, 128] + - Exact: [1024, 256, 1, 128] + - Exact: [2944, 64, 1, 128] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 2944, 1, 1280] + 
- Exact: [256, 704, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [448, 448, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 256, 2048] + - Exact: [64, 38, 1680, 38] + - Exact: [64, 59, 1088, 59] + - Exact: [64, 32, 1984, 32] + - Exact: [64, 54, 1184, 54] + - Exact: [64, 49, 1296, 49] + - Exact: [64, 45, 1424, 45] + - Exact: [64, 35, 1808, 35] + - Exact: [64, 41, 1552, 41] + - Exact: [64, 64, 36, 3136] + - Exact: [64, 64, 64, 826] + - Exact: [64, 64, 64, 1600] + - Exact: [64, 96, 64, 288] + - Exact: [96, 96, 36, 1568] + - Exact: [96, 96, 36, 2592] + - Exact: [64, 96, 64, 800] + - Exact: [35, 96, 36, 8960] + - Exact: [32, 64, 36, 43808] + - Exact: [64, 64, 64, 81] + - Exact: [64, 96, 36, 512] + - Exact: [64, 64, 64, 3200] + - Exact: [64, 64, 36, 3520] + - Exact: [64, 64, 64, 5408] + - Exact: [35, 96, 36, 13440] + - Exact: [96, 96, 64, 1152] + - Exact: [32, 64, 36, 90] + - Exact: [64, 64, 64, 800] + - Exact: [64, 64, 36, 1568] + - Exact: [64, 64, 36, 196] + - Exact: [35, 96, 64, 4235] + - Exact: [149, 32, 36, 19072] + - Exact: [64, 96, 36, 1568] + - Exact: [96, 96, 64, 800] + - Exact: [32, 64, 64, 640] + - Exact: [64, 64, 36, 392] + - Exact: [64, 64, 64, 1652] + 
- Exact: [64, 96, 36, 2592] + - Exact: [64, 64, 36, 6272] + - Exact: [32, 64, 64, 20000] + - Exact: [64, 64, 64, 648] + - Exact: [32, 64, 36, 1440] + - Exact: [64, 64, 64, 100] + - Exact: [64, 96, 64, 4608] + - Exact: [64, 64, 64, 200] + - Exact: [32, 64, 64, 40] + - Exact: [64, 96, 64, 1152] + - Exact: [149, 32, 64, 8195] + - Exact: [35, 96, 64, 6160] + - Exact: [64, 64, 36, 1760] + - Exact: [64, 2880, 1, 320] + - Exact: [49, 832, 32, 256] + - Exact: [289, 1120, 1, 160] + - Exact: [64, 1728, 1, 320] + - Exact: [49, 832, 32, 160] + - Exact: [49, 832, 32, 384] + - Exact: [289, 896, 1, 192] + - Exact: [289, 896, 1, 128] + - Exact: [196, 800, 1, 64] + - Exact: [64, 1344, 1, 512] + - Exact: [64, 1152, 1, 384] + - Exact: [64, 1152, 1, 448] + - Exact: [49, 832, 32, 128] + - Exact: [49, 832, 32, 48] + - Exact: [64, 1152, 1, 256] + - Exact: [49, 832, 32, 32] + - Exact: [289, 1120, 1, 192] + - Exact: [196, 600, 1, 64] + - Exact: [49, 832, 32, 192] + - Exact: [64, 1728, 1, 192] + - Exact: [64, 38, 840, 38] + - Exact: [64, 49, 648, 49] + - Exact: [64, 32, 992, 32] + - Exact: [64, 35, 904, 35] + - Exact: [64, 41, 776, 41] + - Exact: [64, 45, 712, 45] + - Exact: [64, 54, 592, 54] + - Exact: [64, 59, 544, 59] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 64, 512] + - Exact: [33, 32, 1600, 33] + - Exact: [33, 32, 200, 33] + - Exact: [67, 2048, 1, 512] + - Exact: [512, 512, 1, 3780] + - Exact: [512, 512, 1, 3796] + - Exact: [512, 512, 1, 3822] + - Exact: [512, 512, 1, 3840] + - Exact: [512, 512, 1, 3859] + - Exact: [512, 512, 1, 3870] + - Exact: [512, 512, 1, 3876] + - Exact: [512, 512, 1, 3906] + - Exact: [512, 512, 1, 3910] + - Exact: [512, 512, 1, 3925] + - Exact: [512, 512, 1, 3927] + - Exact: [512, 512, 1, 3942] + - Exact: [512, 512, 1, 3944] + - Exact: [512, 512, 1, 3955] + - Exact: [512, 512, 1, 3968] + - Exact: [512, 512, 1, 3969] + - Exact: [512, 512, 1, 3976] + - Exact: [512, 512, 1, 3977] + - Exact: [512, 512, 1, 3978] + - Exact: [512, 512, 1, 3990] + - Exact: 
[512, 512, 1, 3995] + - Exact: [512, 512, 1, 3996] + - Exact: [512, 512, 1, 3999] + - Exact: [512, 512, 1, 4005] + - Exact: [512, 512, 1, 4012] + - Exact: [512, 512, 1, 4020] + - Exact: [512, 512, 1, 4026] + - Exact: [512, 512, 1, 4030] + - Exact: [512, 512, 1, 4032] + - Exact: [512, 512, 1, 4050] + - Exact: [512, 512, 1, 4059] + - Exact: [384, 384, 1, 384] + - Exact: [384, 192, 1, 384] + - Exact: [1024, 256, 1, 1024] + - Exact: [1024, 256, 1, 1280] + - Exact: [1024, 256, 1, 2304] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 256, 1, 3072] + - Exact: [1024, 256, 1, 3328] + - Exact: [1024, 256, 1, 3584] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 256, 1, 4352] + - Exact: [1024, 256, 1, 4608] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 256, 1, 5376] + - Exact: [1024, 256, 1, 5632] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 256, 1, 6400] + - Exact: [1024, 256, 1, 7680] + - Exact: [1024, 256, 1, 7936] + - Exact: [512, 512, 1, 1600] + - Exact: [100, 2048, 1, 512] + - Exact: [74, 2048, 1, 512] + - Exact: [74, 2048, 1, 960] + - Exact: [768, 128, 1, 128] + - Exact: [1152, 128, 1, 128] + - Exact: [1536, 128, 1, 128] + - Exact: [1920, 128, 1, 128] + - Exact: [768, 128, 1, 256] + - Exact: [1152, 128, 1, 256] + - Exact: [1536, 128, 1, 256] + - Exact: [1920, 128, 1, 256] + - Exact: [448, 448, 1, 448] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - Exact: [1225, 48, 64, 288] + - Exact: [1225, 32, 32, 192] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [1225, 48, 32, 288] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [512, 256, 1, 4096] + - Exact: [512, 256, 1, 6912] + - Exact: [100, 2304, 1, 512] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [32, 64, 4608, 32] + - Exact: [32, 64, 4608, 35] + - Exact: [34, 64, 4736, 24] + - Exact: [34, 64, 4736, 34] + - Exact: [35, 64, 4608, 35] + - Exact: [64, 32, 
4608, 32] + - Exact: [64, 32, 4608, 35] + - Exact: [64, 34, 4736, 24] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 35] + - Exact: [256, 864, 1, 1] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 864] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 4608, 1, 512] + - Exact: [56, 512, 64, 512] + - Exact: [228, 256, 2, 12] + - Exact: [228, 256, 2, 3] + - Exact: [187, 256, 2, 12] + - Exact: [247, 256, 2, 12] + - Exact: [176, 256, 2, 3] + - Exact: [187, 256, 2, 3] + - Exact: [221, 256, 2, 3] + - Exact: [221, 256, 2, 12] + - Exact: [176, 256, 2, 12] + - Exact: [247, 256, 2, 3] + - Exact: [216, 256, 2, 3] + - Exact: [192, 256, 2, 12] + - Exact: [192, 256, 2, 3] + - Exact: [216, 256, 2, 12] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [32, 32, 36, 43808] + - Exact: [32, 32, 64, 20000] + - Exact: [256, 128, 1, 32768] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - 
[ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 128] + - Exact: [2368, 4, 1, 256] + - Exact: [1408, 4, 1, 256] + - Exact: [5056, 4, 1, 1280] + - Exact: [1408, 4, 1, 3328] + - Exact: [6784, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [1856, 4, 1, 256] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [2048, 1, 1, 512] + - Exact: [2048, 1, 1, 960] + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 2, 1, 4] + - Exact: [2048, 2, 1, 8] + - Exact: [2560, 2, 1, 2] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - 
WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + - Exact: [49, 1200, 1, 128] + - Exact: [1, 1152, 1, 256] + - Exact: [25, 1152, 1, 256] + - Exact: [9, 1152, 1, 256] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - 
VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [16, 32, 36, 5760] + - Exact: [3, 64, 36, 6272] + - Exact: [3, 64, 64, 46208] + - Exact: [3, 64, 64, 92416] + - Exact: [1, 16, 36, 23040] + - Exact: [1, 16, 64, 10240] + - Exact: [3, 64, 36, 25088] + - Exact: [3, 64, 64, 11552] + - Exact: [3, 64, 36, 200704] + - Exact: [3, 64, 64, 23104] + - Exact: [3, 64, 36, 100352] + - Exact: [3, 64, 36, 50176] + - Exact: [8, 384, 64, 6600] + - Exact: [65, 1024, 1, 6400] + - Exact: [13, 512, 1, 32768] + - Exact: [256, 1, 1, 32768] + - Exact: [256, 4, 1, 6912] + - Exact: [13, 512, 1, 55296] + - Exact: [1024, 2, 1, 4992] + - Exact: [1024, 2, 1, 5120] + - Exact: [1024, 2, 1, 5248] + - Exact: [13, 512, 1, 6912] + - Exact: [256, 1, 1, 6912] + - Exact: [256, 128, 1, 6912] + - Exact: [768, 2, 1, 4608] + - Exact: [1024, 2, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 64, 1, 512] + - Exact: [512, 32, 1, 200] + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: 
[448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 1024, 1, 128] + - Exact: [4, 64, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [448, 4, 1, 1280] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [512, 32, 1, 512] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 256, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - Exact: [512, 16, 1, 512] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [64, 1024, 1, 3328] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [4, 448, 1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [4, 128, 1, 3328] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 
128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [64, 64, 1, 3328] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 128, 1, 128] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [64, 23, 2720, 23] + - Exact: [64, 19, 3264, 19] + - Exact: [64, 25, 2512, 25] + - Exact: [64, 9, 6544, 9] + - Exact: [64, 7, 8192, 7] + - Exact: [64, 8, 7280, 8] + - Exact: [64, 27, 2336, 27] + - Exact: [64, 16, 3840, 16] + - Exact: [64, 11, 5456, 11] + - Exact: [64, 21, 2976, 21] + - Exact: [64, 15, 4096, 15] + - Exact: [64, 10, 5952, 10] + - Exact: [64, 14, 4368, 14] + - Exact: [64, 13, 4672, 13] + - Exact: [64, 12, 5040, 12] + - Exact: [64, 29, 2176, 29] + - Exact: [64, 17, 
3632, 17] + - Exact: [64, 18, 3440, 18] + - Exact: [768, 2, 1, 16] + - Exact: [768, 2, 1, 32] + - Exact: [3, 64, 64, 2888] + - Exact: [1, 16, 64, 640] + - Exact: [512, 24, 36, 800] + - Exact: [16, 32, 36, 360] + - Exact: [1, 16, 36, 1440] + - Exact: [512, 24, 64, 512] + - Exact: [3, 64, 36, 3136] + - Exact: [256, 24, 64, 32] + - Exact: [256, 16, 36, 3200] + - Exact: [256, 16, 36, 32] + - Exact: [512, 24, 36, 288] + - Exact: [512, 24, 64, 128] + - Exact: [3, 64, 64, 1444] + - Exact: [16, 32, 64, 160] + - Exact: [256, 16, 64, 32] + - Exact: [256, 16, 64, 1568] + - Exact: [256, 24, 36, 128] + - Exact: [16, 32, 64, 2560] + - Exact: [49, 800, 1, 128] + - Exact: [64, 12, 2520, 12] + - Exact: [64, 13, 2336, 13] + - Exact: [64, 14, 2184, 14] + - Exact: [64, 15, 2048, 15] + - Exact: [64, 16, 1920, 16] + - Exact: [64, 17, 1816, 17] + - Exact: [64, 18, 1720, 18] + - Exact: [64, 19, 1632, 19] + - Exact: [64, 21, 1488, 21] + - Exact: [64, 23, 1360, 23] + - Exact: [64, 25, 1256, 25] + - Exact: [64, 27, 1168, 27] + - Exact: [64, 29, 1088, 29] + - Exact: [1024, 2, 1, 512] + - Exact: [1024, 2, 1, 3072] + - Exact: [1024, 2, 1, 6] + - Exact: [3, 64, 512, 3] + - Exact: [9, 64, 512, 9] + - Exact: [1024, 1, 1, 200] + - Exact: [5, 64, 512, 5] + - Exact: [1024, 2, 1, 1] + - Exact: [1024, 2, 1, 2048] + - Exact: [17, 64, 1, 15] + - Exact: [17, 64, 1, 17] + - Exact: [30, 64, 1, 30] + - Exact: [30, 64, 1, 31] + - Exact: [31, 64, 1, 31] + - Exact: [64, 17, 1, 15] + - Exact: [64, 17, 1, 17] + - Exact: [64, 30, 1, 30] + - Exact: [64, 30, 1, 31] + - Exact: [64, 31, 1, 31] + - Exact: [14, 64, 1, 14] + - Exact: [15, 64, 1, 14] + - Exact: [15, 64, 1, 15] + - Exact: [64, 14, 1, 14] + - Exact: [64, 15, 1, 14] + - Exact: [64, 15, 1, 15] + - Exact: [1024, 2, 1, 32] + - Exact: [1024, 2, 1, 4] + - Exact: [512, 32, 1, 1600] + - Exact: [1024, 64, 1, 960] + - Exact: [512, 64, 1, 512] + - Exact: [384, 128, 1, 128] + - Exact: [384, 128, 1, 256] + - Exact: [64, 64, 1, 64] + - Exact: [256, 4, 1, 4096] + - Exact: 
[25, 256, 120, 128] + - Exact: [25, 256, 18, 128] + - Exact: [25, 256, 19, 128] + - Exact: [9, 256, 120, 128] + - Exact: [9, 256, 18, 128] + - Exact: [9, 256, 19, 128] + - Exact: [1024, 2, 1, 10] + - Exact: [1024, 2, 1, 1280] + - Exact: [1024, 2, 1, 39] + - Exact: [1024, 2, 1, 40] + - Exact: [1024, 2, 1, 41] + - Exact: [1024, 2, 1, 5] + - Exact: [1024, 2, 1, 2560] + - Exact: [1024, 2, 1, 8] + - Exact: [1024, 2, 1, 1024] + - Exact: [1024, 2, 1, 9] + - Exact: [1024, 2, 1, 1152] + - Exact: [4, 64, 32768, 4] + - Exact: [4, 64, 38400, 4] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [14, 64, 10880, 14] + - Exact: [15, 64, 10880, 14] + - Exact: [15, 64, 7680, 15] + - Exact: [15, 64, 10880, 15] + - Exact: [17, 64, 7680, 15] + - Exact: [17, 64, 6144, 17] + - Exact: [17, 64, 7680, 17] + - Exact: [21, 64, 6144, 17] + - Exact: [21, 64, 6144, 21] + - Exact: [24, 64, 4736, 24] + - Exact: [30, 64, 2048, 30] + - Exact: [30, 64, 2048, 31] + - Exact: [31, 64, 2048, 31] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 15, 10880, 14] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 17, 7680, 15] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 21, 6144, 17] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 30, 2048, 31] + - Exact: [64, 31, 2048, 31] + - Exact: [64, 512, 1, 512] + - Exact: [5, 64, 1, 5] + - Exact: [33, 32, 1, 33] + - Exact: [1024, 1, 1, 1600] + - Exact: [5, 64, 960, 5] + - Exact: [27, 128, 32768, 27] + - Exact: [1024, 2, 1, 16] + - Exact: [1024, 2, 1, 64] + - Exact: [13, 512, 1, 3456] + - Exact: [13, 512, 1, 4096] + - Exact: [13, 512, 1, 864] + - Exact: [256, 1, 1, 3456] + - Exact: [256, 1, 1, 4096] + - Exact: [256, 1, 1, 864] + - Exact: [256, 128, 1, 3456] + - Exact: [256, 128, 1, 4096] + - Exact: [256, 128, 1, 864] + - Exact: [1024, 2, 1, 80] + - Exact: [1024, 2, 1, 82] + - Exact: [1024, 2, 1, 12] + - Exact: [64, 24, 6816, 24] + - 
Exact: [64, 26, 6272, 26] + - Exact: [1024, 2, 1, 128] + - Exact: [1024, 2, 1, 96] + - Exact: [768, 2, 1, 2048] + - Exact: [1024, 81, 1, 1024] + - Exact: [2, 1024, 1, 6] + - Exact: [1024, 2, 1, 20] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml new file mode 100644 index 000000000..867ee7da1 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml @@ -0,0 +1,5584 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: 
[False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 4096, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [30528, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 1024] + - Exact: [4096, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [30528, 2048, 1, 1024] + - Exact: [30522, 320, 1, 768] + - Exact: [3072, 4096, 1, 768] + - Exact: [768, 4096, 1, 3072] + - Exact: [768, 4096, 1, 768] + - Exact: [30522, 160, 1, 768] + - Exact: [30522, 640, 1, 768] + - Exact: [30522, 1280, 1, 768] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 2048, 1, 3072] + - Exact: [1024, 3072, 1, 3072] + - Exact: [3072, 2048, 1, 1024] + - Exact: [3072, 3072, 1, 1024] + - Exact: [3072, 512, 1, 1024] + - Exact: [30522, 160, 1, 1024] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [256, 256, 192, 64] + - Exact: [256, 256, 96, 64] + - Exact: [128, 128, 384, 64] + - Exact: [128, 128, 96, 64] + - Exact: [512, 512, 16, 64] + - Exact: [512, 512, 96, 64] + - Exact: [512, 512, 128, 64] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [2368, 6784, 1, 128] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [6144, 24000, 1, 2048] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1760, 6400, 1, 1760] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + 
- Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [6784, 2944, 1, 128] + - Exact: [2560, 1600, 1, 2560] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2944, 6784, 1, 3328] + - Exact: [6144, 5984, 1, 2048] + - Exact: [3584, 704, 1, 3328] + - Exact: [2048, 1600, 1, 512] + - Exact: [448, 4288, 1, 256] + - Exact: [1856, 4288, 1, 128] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [2048, 7000, 1, 2048] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 
3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [2368, 2368, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [1856, 5056, 1, 128] + - Exact: [8192, 3200, 1, 2048] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2560, 800, 1, 2560] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [2560, 6400, 1, 2560] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 1024, 1, 128] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [8448, 12000, 1, 2816] + - Exact: 
[4288, 1408, 1, 128] + - Exact: [1856, 5888, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4096, 800, 1, 1024] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [4096, 400, 1, 1024] + - Exact: [3072, 24000, 1, 1024] + - Exact: [5888, 1856, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [3584, 448, 1, 1280] + - Exact: [8448, 5984, 1, 2816] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [4608, 5984, 1, 1536] + - Exact: [1760, 3200, 1, 1760] + - Exact: [1024, 1856, 1, 256] + - Exact: 
[4096, 1600, 1, 1024] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [8192, 1600, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [704, 6784, 1, 128] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2048, 3200, 1, 512] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [4608, 12000, 1, 1536] + - Exact: [5056, 5056, 1, 128] + - Exact: [8192, 800, 1, 2048] + - Exact: [448, 5056, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [2048, 800, 1, 512] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5888, 3584, 1, 128] + - Exact: [1024, 2944, 1, 128] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - 
Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [6144, 12000, 1, 2048] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [1856, 2368, 1, 128] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [2048, 1600, 1, 2048] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [5124, 9124, 1, 2048] + - Exact: [2944, 1408, 1, 128] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: [2944, 4288, 1, 128] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [4096, 3200, 1, 1024] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [4288, 6784, 
1, 1280] + - Exact: [2560, 7000, 1, 2560] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 1856, 1, 128] + - Exact: [3584, 5056, 1, 3328] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2944, 2944, 1, 128] + - Exact: [1856, 3584, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [4608, 24000, 1, 1536] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [2048, 6400, 1, 2048] + - Exact: [512, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [1024, 24000, 1, 1536] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 
1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [2048, 800, 1, 2048] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [2368, 5056, 1, 128] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [4096, 7000, 1, 4096] + - Exact: [704, 2944, 1, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [16384, 400, 1, 4096] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2560] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1408, 5888, 1, 128] + - Exact: [7680, 12000, 1, 2560] + - Exact: [1408, 1024, 1, 256] + - Exact: [8192, 400, 1, 2048] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5056, 3584, 1, 1280] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1856, 2944, 1, 1280] + - Exact: [1024, 48000, 1, 2816] + - Exact: [2944, 1408, 1, 256] + - Exact: 
[4288, 1408, 1, 3328] + - Exact: [5888, 2944, 1, 128] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 5056, 1, 128] + - Exact: [1760, 1600, 1, 1760] + - Exact: [5888, 1408, 1, 3328] + - Exact: [2368, 1856, 1, 256] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 5888, 1, 128] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [2560, 3200, 1, 2560] + - Exact: [5888, 6784, 1, 128] + - Exact: [6144, 48000, 1, 2048] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [1024, 2368, 1, 128] + - Exact: [16384, 800, 1, 4096] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2944, 4288, 1, 3328] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [2048, 3200, 1, 2048] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 
2560] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [1760, 800, 1, 1760] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1408, 2368, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [448, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 128] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [4288, 704, 1, 1280] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [6784, 1856, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [16384, 1600, 1, 4096] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [704, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5056, 1024, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 
2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4288, 1024, 1, 128] + - Exact: [4096, 512, 1, 32] + - Exact: [2048, 1024, 1, 1664] + - Exact: [4096, 512, 1, 1408] + - Exact: [4096, 1024, 1, 1280] + - Exact: [2048, 1024, 1, 640] + - Exact: [4096, 1024, 1, 13312] + - Exact: [2048, 1024, 1, 13312] + - Exact: [2048, 1024, 1, 3584] + - Exact: [4096, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [4096, 1024, 1, 8320] + - Exact: [4096, 1024, 1, 15360] + - Exact: [4096, 512, 1, 3072] + - Exact: [4096, 512, 1, 13312] + - Exact: [4096, 1024, 1, 3840] + - Exact: [2048, 1024, 1, 3200] + - Exact: [4096, 512, 1, 3840] + - Exact: [4096, 512, 1, 5632] + - Exact: [4096, 512, 1, 64] + - Exact: [2048, 1024, 1, 512] + - Exact: [4096, 512, 1, 8192] + - Exact: [4096, 512, 1, 2304] + - Exact: [4096, 512, 1, 2816] + - Exact: [2048, 1024, 1, 7680] + - Exact: [4096, 512, 1, 1920] + - Exact: [4096, 1024, 1, 32] + - Exact: [4096, 512, 1, 16640] + - Exact: [2048, 1024, 1, 1024] + - Exact: [4096, 512, 1, 1792] + - Exact: [4096, 1024, 1, 8192] + - Exact: [2048, 1024, 1, 4160] + - Exact: [4096, 512, 1, 10240] + - Exact: [4096, 512, 1, 512] + - Exact: [2048, 1024, 1, 6656] + - Exact: [2048, 1024, 1, 14336] + - Exact: [4096, 512, 1, 11264] + - Exact: [4096, 512, 1, 128] + - Exact: [4096, 512, 1, 768] + - Exact: [4096, 1024, 1, 11264] + - Exact: [4096, 1024, 1, 16640] + - Exact: [2048, 1024, 1, 5632] + - Exact: [4096, 512, 1, 12288] + - Exact: [4096, 1024, 1, 5632] + - Exact: [2048, 1024, 1, 10240] + - Exact: [4096, 1024, 1, 640] + - Exact: [2048, 1024, 1, 12288] + - Exact: [4096, 1024, 1, 10240] + - Exact: [2048, 1024, 1, 4608] + - Exact: [4096, 512, 1, 3584] + - Exact: [4096, 1024, 1, 4608] + - Exact: [4096, 1024, 1, 3328] + - Exact: [2048, 1024, 1, 9216] + - Exact: [2048, 1024, 1, 2304] + - Exact: [4096, 512, 1, 6144] + - Exact: [4096, 512, 1, 15360] + - Exact: [4096, 1024, 1, 7168] + - Exact: [4096, 1024, 
1, 9216] + - Exact: [4096, 1024, 1, 7680] + - Exact: [2048, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 64] + - Exact: [2048, 1024, 1, 1280] + - Exact: [2048, 1024, 1, 3328] + - Exact: [4096, 512, 1, 14336] + - Exact: [4096, 512, 1, 8320] + - Exact: [4096, 1024, 1, 6656] + - Exact: [2048, 1024, 1, 256] + - Exact: [4096, 512, 1, 1024] + - Exact: [4096, 1024, 1, 1536] + - Exact: [2048, 1024, 1, 32] + - Exact: [4096, 512, 1, 640] + - Exact: [4096, 512, 1, 16384] + - Exact: [4096, 1024, 1, 512] + - Exact: [2048, 1024, 1, 1152] + - Exact: [4096, 1024, 1, 2080] + - Exact: [4096, 1024, 1, 768] + - Exact: [4096, 1024, 1, 2560] + - Exact: [2048, 1024, 1, 64] + - Exact: [4096, 1024, 1, 16384] + - Exact: [4096, 512, 1, 6656] + - Exact: [2048, 1024, 1, 128] + - Exact: [2048, 1024, 1, 2080] + - Exact: [2048, 1024, 1, 16640] + - Exact: [2048, 1024, 1, 3072] + - Exact: [4096, 1024, 1, 1408] + - Exact: [4096, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 2560] + - Exact: [4096, 1024, 1, 128] + - Exact: [4096, 1024, 1, 14336] + - Exact: [4096, 512, 1, 9216] + - Exact: [2048, 1024, 1, 2048] + - Exact: [4096, 512, 1, 1536] + - Exact: [2048, 1024, 1, 16384] + - Exact: [4096, 1024, 1, 1024] + - Exact: [4096, 1024, 1, 1664] + - Exact: [4096, 512, 1, 384] + - Exact: [4096, 512, 1, 3328] + - Exact: [4096, 1024, 1, 256] + - Exact: [2048, 1024, 1, 7168] + - Exact: [2048, 1024, 1, 1536] + - Exact: [4096, 512, 1, 7168] + - Exact: [4096, 1024, 1, 896] + - Exact: [4096, 1024, 1, 4096] + - Exact: [2048, 1024, 1, 6144] + - Exact: [4096, 512, 1, 4160] + - Exact: [4096, 512, 1, 2080] + - Exact: [4096, 1024, 1, 5120] + - Exact: [2048, 1024, 1, 1920] + - Exact: [2048, 1024, 1, 15360] + - Exact: [4096, 1024, 1, 2816] + - Exact: [4096, 512, 1, 256] + - Exact: [2048, 1024, 1, 5120] + - Exact: [2048, 1024, 1, 4096] + - Exact: [4096, 512, 1, 4608] + - Exact: [4096, 512, 1, 1664] + - Exact: [2048, 1024, 1, 896] + - Exact: [4096, 1024, 1, 4160] + - Exact: [2048, 1024, 1, 11264] + - Exact: [2048, 1024, 1, 384] + - 
Exact: [2048, 1024, 1, 3840] + - Exact: [4096, 512, 1, 1280] + - Exact: [4096, 1024, 1, 1152] + - Exact: [2048, 1024, 1, 1408] + - Exact: [4096, 512, 1, 896] + - Exact: [4096, 1024, 1, 3072] + - Exact: [2048, 1024, 1, 2816] + - Exact: [4096, 1024, 1, 1792] + - Exact: [4096, 512, 1, 1152] + - Exact: [4096, 512, 1, 7680] + - Exact: [4096, 1024, 1, 384] + - Exact: [2048, 1024, 1, 1792] + - Exact: [4096, 1024, 1, 3584] + - Exact: [2048, 1024, 1, 768] + - Exact: [2048, 1024, 1, 8320] + - Exact: [4096, 512, 1, 2048] + - Exact: [4096, 512, 1, 2560] + - Exact: [4096, 1024, 1, 2304] + - Exact: [4096, 512, 1, 5120] + - Exact: [4096, 1024, 1, 6144] + - Exact: [1024, 3392, 1, 4096] + - Exact: [1024, 3301, 1, 4096] + - Exact: [1024, 3443, 1, 4096] + - Exact: [132, 134, 480, 64] + - Exact: [162, 162, 400, 64] + - Exact: [4096, 3548, 1, 1024] + - Exact: [4096, 2977, 1, 1024] + - Exact: [132, 135, 480, 64] + - Exact: [1024, 2985, 1, 4096] + - Exact: [33708, 3681, 1, 1024] + - Exact: [4096, 3443, 1, 1024] + - Exact: [1024, 3400, 1, 4096] + - Exact: [4096, 3995, 1, 1024] + - Exact: [4096, 3190, 1, 1024] + - Exact: [4096, 3594, 1, 1024] + - Exact: [159, 162, 400, 64] + - Exact: [1024, 3565, 1, 4096] + - Exact: [4096, 3422, 1, 1024] + - Exact: [1024, 3214, 1, 4096] + - Exact: [33708, 3584, 1, 1024] + - Exact: [33708, 3640, 1, 1024] + - Exact: [4096, 3263, 1, 1024] + - Exact: [4096, 3296, 1, 1024] + - Exact: [1024, 3557, 1, 4096] + - Exact: [4096, 3463, 1, 1024] + - Exact: [4096, 3528, 1, 1024] + - Exact: [4096, 3226, 1, 1024] + - Exact: [4096, 3439, 1, 1024] + - Exact: [1024, 3523, 1, 4096] + - Exact: [1024, 3098, 1, 4096] + - Exact: [4096, 3121, 1, 1024] + - Exact: [33708, 3894, 1, 1024] + - Exact: [1024, 3548, 1, 4096] + - Exact: [1024, 3451, 1, 4096] + - Exact: [4096, 3353, 1, 1024] + - Exact: [4096, 3402, 1, 1024] + - Exact: [4096, 3939, 1, 1024] + - Exact: [133, 133, 480, 64] + - Exact: [1024, 3559, 1, 4096] + - Exact: [1024, 2977, 1, 4096] + - Exact: [1024, 3478, 1, 4096] + - 
Exact: [134, 134, 480, 64] + - Exact: [1024, 3368, 1, 4096] + - Exact: [4096, 4012, 1, 1024] + - Exact: [4096, 3486, 1, 1024] + - Exact: [1024, 3479, 1, 4096] + - Exact: [1024, 3505, 1, 4096] + - Exact: [4096, 3381, 1, 1024] + - Exact: [4096, 3430, 1, 1024] + - Exact: [1024, 3554, 1, 4096] + - Exact: [4096, 3271, 1, 1024] + - Exact: [1024, 3063, 1, 4096] + - Exact: [1024, 3209, 1, 4096] + - Exact: [4096, 3503, 1, 1024] + - Exact: [4096, 3344, 1, 1024] + - Exact: [1024, 3147, 1, 4096] + - Exact: [1024, 3322, 1, 4096] + - Exact: [1024, 3341, 1, 4096] + - Exact: [1024, 3516, 1, 4096] + - Exact: [1024, 3454, 1, 4096] + - Exact: [4096, 3969, 1, 1024] + - Exact: [4096, 3466, 1, 1024] + - Exact: [1024, 3999, 1, 1024] + - Exact: [1024, 4032, 1, 1024] + - Exact: [1024, 3403, 1, 4096] + - Exact: [4096, 3361, 1, 1024] + - Exact: [1024, 3527, 1, 4096] + - Exact: [1024, 3822, 1, 4096] + - Exact: [4096, 3315, 1, 1024] + - Exact: [232, 232, 272, 64] + - Exact: [1024, 3336, 1, 4096] + - Exact: [228, 232, 272, 64] + - Exact: [4096, 3547, 1, 1024] + - Exact: [4096, 3340, 1, 1024] + - Exact: [1024, 3906, 1, 1024] + - Exact: [1024, 3295, 1, 4096] + - Exact: [4096, 3294, 1, 1024] + - Exact: [33708, 3968, 1, 1024] + - Exact: [1024, 3473, 1, 4096] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3189, 1, 1024] + - Exact: [4096, 3494, 1, 1024] + - Exact: [1024, 3522, 1, 4096] + - Exact: [33708, 3944, 1, 1024] + - Exact: [135, 135, 480, 64] + - Exact: [4096, 3421, 1, 1024] + - Exact: [4096, 3311, 1, 1024] + - Exact: [1024, 3990, 1, 1024] + - Exact: [1024, 3290, 1, 4096] + - Exact: [4096, 3565, 1, 1024] + - Exact: [1024, 3484, 1, 4096] + - Exact: [4096, 3384, 1, 1024] + - Exact: [1024, 3422, 1, 4096] + - Exact: [4096, 3681, 1, 1024] + - Exact: [1024, 3584, 1, 1024] + - Exact: [4096, 4050, 1, 1024] + - Exact: [1024, 3996, 1, 4096] + - Exact: [4096, 3169, 1, 1024] + - Exact: [4096, 3538, 1, 1024] + - Exact: [1024, 3495, 1, 4096] + - Exact: [4096, 3401, 1, 1024] + - Exact: [1024, 3560, 1, 
4096] + - Exact: [133, 135, 480, 64] + - Exact: [1024, 3263, 1, 4096] + - Exact: [1024, 3870, 1, 4096] + - Exact: [4096, 3555, 1, 1024] + - Exact: [4096, 3412, 1, 1024] + - Exact: [1024, 3296, 1, 4096] + - Exact: [1024, 3379, 1, 4096] + - Exact: [4096, 3302, 1, 1024] + - Exact: [1024, 3490, 1, 4096] + - Exact: [1024, 3428, 1, 4096] + - Exact: [1024, 3976, 1, 4096] + - Exact: [4096, 3485, 1, 1024] + - Exact: [4096, 3534, 1, 1024] + - Exact: [1024, 3064, 1, 4096] + - Exact: [4096, 3216, 1, 1024] + - Exact: [1024, 3450, 1, 4096] + - Exact: [1024, 3533, 1, 4096] + - Exact: [1024, 4030, 1, 1024] + - Exact: [1024, 3311, 1, 4096] + - Exact: [1024, 3468, 1, 4096] + - Exact: [4096, 3359, 1, 1024] + - Exact: [4096, 3392, 1, 1024] + - Exact: [1024, 3925, 1, 1024] + - Exact: [4096, 3233, 1, 1024] + - Exact: [4096, 3956, 1, 1024] + - Exact: [1024, 3463, 1, 4096] + - Exact: [1024, 3126, 1, 4096] + - Exact: [1024, 3363, 1, 4096] + - Exact: [4096, 3465, 1, 1024] + - Exact: [33708, 3996, 1, 1024] + - Exact: [1024, 3231, 1, 4096] + - Exact: [33708, 3978, 1, 1024] + - Exact: [4096, 3476, 1, 1024] + - Exact: [4096, 3339, 1, 1024] + - Exact: [4096, 3452, 1, 1024] + - Exact: [1024, 3396, 1, 4096] + - Exact: [4096, 3293, 1, 1024] + - Exact: [1024, 3432, 1, 4096] + - Exact: [4096, 3493, 1, 1024] + - Exact: [4096, 3350, 1, 1024] + - Exact: [1024, 3079, 1, 4096] + - Exact: [1024, 3101, 1, 4096] + - Exact: [33708, 3939, 1, 1024] + - Exact: [4096, 3256, 1, 1024] + - Exact: [1024, 3439, 1, 4096] + - Exact: [1024, 3510, 1, 4096] + - Exact: [4096, 3900, 1, 1024] + - Exact: [1024, 3470, 1, 4096] + - Exact: [4096, 3456, 1, 1024] + - Exact: [4096, 3014, 1, 1024] + - Exact: [4096, 3367, 1, 1024] + - Exact: [4096, 3432, 1, 1024] + - Exact: [33708, 4026, 1, 1024] + - Exact: [4096, 3273, 1, 1024] + - Exact: [4096, 3130, 1, 1024] + - Exact: [1024, 3496, 1, 4096] + - Exact: [1024, 3995, 1, 4096] + - Exact: [1024, 3939, 1, 4096] + - Exact: [1024, 3121, 1, 4096] + - Exact: [1024, 3232, 1, 4096] + - Exact: 
[4096, 3147, 1, 1024] + - Exact: [4096, 3516, 1, 1024] + - Exact: [1024, 3969, 1, 1024] + - Exact: [1024, 3364, 1, 4096] + - Exact: [4096, 3411, 1, 1024] + - Exact: [147, 147, 432, 64] + - Exact: [4096, 3301, 1, 1024] + - Exact: [1024, 3513, 1, 4096] + - Exact: [1024, 3469, 1, 4096] + - Exact: [1024, 3095, 1, 4096] + - Exact: [4096, 3533, 1, 1024] + - Exact: [4096, 3390, 1, 1024] + - Exact: [4096, 3582, 1, 1024] + - Exact: [1024, 3956, 1, 1024] + - Exact: [4096, 3585, 1, 1024] + - Exact: [4096, 3231, 1, 1024] + - Exact: [1024, 3205, 1, 4096] + - Exact: [4096, 3496, 1, 1024] + - Exact: [1024, 3143, 1, 4096] + - Exact: [1024, 3318, 1, 4096] + - Exact: [1024, 3353, 1, 4096] + - Exact: [1024, 3464, 1, 4096] + - Exact: [4096, 2736, 1, 1024] + - Exact: [1024, 3402, 1, 4096] + - Exact: [4096, 3138, 1, 1024] + - Exact: [1024, 3860, 1, 4096] + - Exact: [148, 148, 432, 64] + - Exact: [1024, 3539, 1, 4096] + - Exact: [4096, 3211, 1, 1024] + - Exact: [1024, 3332, 1, 4096] + - Exact: [1024, 3466, 1, 4096] + - Exact: [4096, 3475, 1, 1024] + - Exact: [4096, 3524, 1, 1024] + - Exact: [4096, 2985, 1, 1024] + - Exact: [4096, 3222, 1, 1024] + - Exact: [4096, 3451, 1, 1024] + - Exact: [1024, 3181, 1, 4096] + - Exact: [1024, 3640, 1, 4096] + - Exact: [1024, 3375, 1, 4096] + - Exact: [1024, 3550, 1, 4096] + - Exact: [1024, 4020, 1, 1024] + - Exact: [4096, 3349, 1, 1024] + - Exact: [4096, 3398, 1, 1024] + - Exact: [33708, 3976, 1, 1024] + - Exact: [1024, 2917, 1, 4096] + - Exact: [33708, 3910, 1, 1024] + - Exact: [4096, 3860, 1, 1024] + - Exact: [4096, 3304, 1, 1024] + - Exact: [1024, 3286, 1, 4096] + - Exact: [1024, 3460, 1, 4096] + - Exact: [1024, 4026, 1, 4096] + - Exact: [4096, 3471, 1, 1024] + - Exact: [193, 193, 320, 64] + - Exact: [1024, 3894, 1, 1024] + - Exact: [1024, 3506, 1, 4096] + - Exact: [1024, 4000, 1, 1024] + - Exact: [1024, 3900, 1, 4096] + - Exact: [1024, 3445, 1, 4096] + - Exact: [4096, 3442, 1, 1024] + - Exact: [1024, 3358, 1, 4096] + - Exact: [1024, 3211, 1, 4096] + 
- Exact: [4096, 3515, 1, 1024] + - Exact: [1024, 3564, 1, 4096] + - Exact: [4096, 3057, 1, 1024] + - Exact: [1024, 3343, 1, 4096] + - Exact: [4096, 3262, 1, 1024] + - Exact: [1024, 3518, 1, 4096] + - Exact: [33708, 3876, 1, 1024] + - Exact: [4096, 3462, 1, 1024] + - Exact: [1024, 3265, 1, 4096] + - Exact: [4096, 3389, 1, 1024] + - Exact: [4096, 3438, 1, 1024] + - Exact: [1024, 3955, 1, 1024] + - Exact: [1024, 3545, 1, 4096] + - Exact: [1024, 3144, 1, 4096] + - Exact: [1024, 3417, 1, 4096] + - Exact: [4096, 3543, 1, 1024] + - Exact: [4096, 3352, 1, 1024] + - Exact: [33708, 3975, 1, 1024] + - Exact: [148, 147, 432, 64] + - Exact: [4096, 3137, 1, 1024] + - Exact: [4096, 3506, 1, 1024] + - Exact: [1024, 3975, 1, 1024] + - Exact: [1024, 3859, 1, 4096] + - Exact: [4096, 3369, 1, 1024] + - Exact: [1024, 3434, 1, 4096] + - Exact: [1024, 3292, 1, 4096] + - Exact: [4096, 3523, 1, 1024] + - Exact: [4096, 3380, 1, 1024] + - Exact: [1024, 3408, 1, 4096] + - Exact: [4096, 3221, 1, 1024] + - Exact: [4096, 3270, 1, 1024] + - Exact: [143, 143, 432, 64] + - Exact: [1024, 3303, 1, 4096] + - Exact: [4096, 3502, 1, 1024] + - Exact: [1024, 3222, 1, 4096] + - Exact: [4096, 2505, 1, 1024] + - Exact: [4096, 3397, 1, 1024] + - Exact: [4096, 3562, 1, 1024] + - Exact: [4096, 3095, 1, 1024] + - Exact: [1024, 3226, 1, 4096] + - Exact: [177, 177, 352, 64] + - Exact: [4096, 3360, 1, 1024] + - Exact: [1024, 3942, 1, 1024] + - Exact: [1024, 3298, 1, 4096] + - Exact: [1024, 3381, 1, 4096] + - Exact: [4096, 3314, 1, 1024] + - Exact: [1024, 3492, 1, 4096] + - Exact: [1024, 3430, 1, 4096] + - Exact: [4096, 3977, 1, 1024] + - Exact: [4096, 3546, 1, 1024] + - Exact: [4096, 3640, 1, 1024] + - Exact: [4096, 3441, 1, 1024] + - Exact: [33708, 4059, 1, 1024] + - Exact: [1024, 3978, 1, 1024] + - Exact: [1024, 3376, 1, 4096] + - Exact: [1024, 3482, 1, 4096] + - Exact: [1024, 3563, 1, 4096] + - Exact: [4096, 4020, 1, 1024] + - Exact: [1024, 3271, 1, 4096] + - Exact: [1024, 3291, 1, 4096] + - Exact: [1024, 3431, 
1, 4096] + - Exact: [1024, 3481, 1, 4096] + - Exact: [4096, 3461, 1, 1024] + - Exact: [1024, 3574, 1, 4096] + - Exact: [1024, 4059, 1, 1024] + - Exact: [1024, 3421, 1, 4096] + - Exact: [4096, 3224, 1, 1024] + - Exact: [4096, 3437, 1, 1024] + - Exact: [4096, 3168, 1, 1024] + - Exact: [33708, 3990, 1, 1024] + - Exact: [1024, 3349, 1, 4096] + - Exact: [4096, 3335, 1, 1024] + - Exact: [4096, 3400, 1, 1024] + - Exact: [160, 159, 400, 64] + - Exact: [1024, 3398, 1, 4096] + - Exact: [1024, 3780, 1, 4096] + - Exact: [4096, 3098, 1, 1024] + - Exact: [1024, 4012, 1, 4096] + - Exact: [4096, 3505, 1, 1024] + - Exact: [4096, 3554, 1, 1024] + - Exact: [4096, 3063, 1, 1024] + - Exact: [1024, 3503, 1, 4096] + - Exact: [1024, 3166, 1, 4096] + - Exact: [1024, 3425, 1, 4096] + - Exact: [1024, 3344, 1, 4096] + - Exact: [4096, 3484, 1, 1024] + - Exact: [1024, 3681, 1, 1024] + - Exact: [1024, 4050, 1, 1024] + - Exact: [4096, 3379, 1, 1024] + - Exact: [4096, 3428, 1, 1024] + - Exact: [1024, 3304, 1, 4096] + - Exact: [1024, 3387, 1, 4096] + - Exact: [4096, 3126, 1, 1024] + - Exact: [1024, 3498, 1, 4096] + - Exact: [1024, 3436, 1, 4096] + - Exact: [4096, 3501, 1, 1024] + - Exact: [4096, 3358, 1, 1024] + - Exact: [4096, 3232, 1, 1024] + - Exact: [1024, 3585, 1, 4096] + - Exact: [4096, 3143, 1, 1024] + - Exact: [4096, 3464, 1, 1024] + - Exact: [1024, 3366, 1, 4096] + - Exact: [4096, 3375, 1, 1024] + - Exact: [4096, 2917, 1, 1024] + - Exact: [4096, 4026, 1, 1024] + - Exact: [1024, 3277, 1, 4096] + - Exact: [1024, 3103, 1, 4096] + - Exact: [33708, 3995, 1, 1024] + - Exact: [1024, 3297, 1, 4096] + - Exact: [4096, 3545, 1, 1024] + - Exact: [1024, 3399, 1, 4096] + - Exact: [33708, 3796, 1, 1024] + - Exact: [4096, 3292, 1, 1024] + - Exact: [33708, 3859, 1, 1024] + - Exact: [4096, 3566, 1, 1024] + - Exact: [4096, 3894, 1, 1024] + - Exact: [4096, 3492, 1, 1024] + - Exact: [1024, 3977, 1, 1024] + - Exact: [1024, 3272, 1, 4096] + - Exact: [135, 134, 480, 64] + - Exact: [1024, 3355, 1, 4096] + - Exact: 
[4096, 3419, 1, 1024] + - Exact: [1024, 3404, 1, 4096] + - Exact: [4096, 3999, 1, 1024] + - Exact: [4096, 3166, 1, 1024] + - Exact: [33708, 3840, 1, 1024] + - Exact: [4096, 4032, 1, 1024] + - Exact: [1024, 3573, 1, 4096] + - Exact: [4096, 3366, 1, 1024] + - Exact: [1024, 3541, 1, 4096] + - Exact: [4096, 3207, 1, 1024] + - Exact: [4096, 3272, 1, 1024] + - Exact: [1024, 3334, 1, 4096] + - Exact: [228, 228, 272, 64] + - Exact: [4096, 3183, 1, 1024] + - Exact: [4096, 3536, 1, 1024] + - Exact: [1024, 4005, 1, 1024] + - Exact: [1024, 3245, 1, 4096] + - Exact: [4096, 3447, 1, 1024] + - Exact: [1024, 3183, 1, 4096] + - Exact: [1024, 3361, 1, 4096] + - Exact: [33708, 3870, 1, 1024] + - Exact: [1024, 3321, 1, 4096] + - Exact: [1024, 3486, 1, 4096] + - Exact: [4096, 4005, 1, 1024] + - Exact: [4096, 3410, 1, 1024] + - Exact: [1024, 3944, 1, 1024] + - Exact: [4096, 3300, 1, 1024] + - Exact: [4096, 3579, 1, 1024] + - Exact: [4096, 3483, 1, 1024] + - Exact: [4096, 3532, 1, 1024] + - Exact: [1024, 3140, 1, 4096] + - Exact: [1024, 3372, 1, 4096] + - Exact: [1024, 3224, 1, 4096] + - Exact: [4096, 3230, 1, 1024] + - Exact: [4096, 3427, 1, 1024] + - Exact: [1024, 3796, 1, 1024] + - Exact: [143, 148, 432, 64] + - Exact: [1024, 3616, 1, 4096] + - Exact: [1024, 3315, 1, 4096] + - Exact: [1024, 3476, 1, 4096] + - Exact: [1024, 3509, 1, 4096] + - Exact: [4096, 3357, 1, 1024] + - Exact: [4096, 3406, 1, 1024] + - Exact: [1024, 3558, 1, 4096] + - Exact: [4096, 3593, 1, 1024] + - Exact: [4096, 3247, 1, 1024] + - Exact: [4096, 3088, 1, 1024] + - Exact: [1024, 3213, 1, 4096] + - Exact: [4096, 3511, 1, 1024] + - Exact: [1024, 3365, 1, 4096] + - Exact: [1024, 3504, 1, 4096] + - Exact: [1024, 3442, 1, 4096] + - Exact: [4096, 3474, 1, 1024] + - Exact: [4096, 2984, 1, 1024] + - Exact: [1024, 3876, 1, 4096] + - Exact: [4096, 3337, 1, 1024] + - Exact: [4096, 3450, 1, 1024] + - Exact: [1024, 3547, 1, 4096] + - Exact: [4096, 3291, 1, 1024] + - Exact: [1024, 3340, 1, 4096] + - Exact: [4096, 3491, 1, 1024] 
+ - Exact: [4096, 3348, 1, 1024] + - Exact: [4096, 3906, 1, 1024] + - Exact: [1024, 3477, 1, 4096] + - Exact: [1024, 3397, 1, 4096] + - Exact: [4096, 3165, 1, 1024] + - Exact: [4096, 3470, 1, 1024] + - Exact: [1024, 3526, 1, 4096] + - Exact: [4096, 3365, 1, 1024] + - Exact: [4096, 3319, 1, 1024] + - Exact: [1024, 3401, 1, 4096] + - Exact: [1024, 3294, 1, 4096] + - Exact: [159, 159, 400, 64] + - Exact: [1024, 3472, 1, 4096] + - Exact: [4096, 3328, 1, 1024] + - Exact: [1024, 3861, 1, 1024] + - Exact: [1024, 3910, 1, 1024] + - Exact: [1024, 3410, 1, 4096] + - Exact: [1024, 3395, 1, 4096] + - Exact: [4096, 3282, 1, 1024] + - Exact: [1024, 3751, 1, 1024] + - Exact: [4096, 3145, 1, 1024] + - Exact: [4096, 3514, 1, 1024] + - Exact: [4096, 3944, 1, 1024] + - Exact: [1024, 3515, 1, 4096] + - Exact: [4096, 3409, 1, 1024] + - Exact: [4096, 3564, 1, 1024] + - Exact: [4096, 3299, 1, 1024] + - Exact: [1024, 3057, 1, 4096] + - Exact: [4096, 3531, 1, 1024] + - Exact: [4096, 3388, 1, 1024] + - Exact: [1024, 3189, 1, 4096] + - Exact: [1024, 3300, 1, 4096] + - Exact: [1024, 3720, 1, 4096] + - Exact: [1024, 3383, 1, 4096] + - Exact: [1024, 3494, 1, 4096] + - Exact: [1024, 3448, 1, 4096] + - Exact: [4096, 3542, 1, 1024] + - Exact: [1024, 3488, 1, 4096] + - Exact: [4096, 3405, 1, 1024] + - Exact: [1024, 3262, 1, 4096] + - Exact: [33708, 4005, 1, 1024] + - Exact: [1024, 3594, 1, 4096] + - Exact: [4096, 3103, 1, 1024] + - Exact: [4096, 3136, 1, 1024] + - Exact: [1024, 3378, 1, 4096] + - Exact: [4096, 3559, 1, 1024] + - Exact: [4096, 3368, 1, 1024] + - Exact: [4096, 3209, 1, 1024] + - Exact: [4096, 3322, 1, 1024] + - Exact: [1024, 3483, 1, 4096] + - Exact: [4096, 3473, 1, 1024] + - Exact: [4096, 3522, 1, 1024] + - Exact: [1024, 3532, 1, 4096] + - Exact: [4096, 3449, 1, 1024] + - Exact: [1024, 3351, 1, 4096] + - Exact: [1024, 3462, 1, 4096] + - Exact: [4096, 3396, 1, 1024] + - Exact: [132, 132, 480, 64] + - Exact: [1024, 3416, 1, 4096] + - Exact: [4096, 3469, 1, 1024] + - Exact: [1024, 
3582, 1, 4096] + - Exact: [1024, 3230, 1, 4096] + - Exact: [1024, 3489, 1, 4096] + - Exact: [1024, 3427, 1, 4096] + - Exact: [1024, 3346, 1, 4096] + - Exact: [33708, 3977, 1, 1024] + - Exact: [4096, 3796, 1, 1024] + - Exact: [4096, 3176, 1, 1024] + - Exact: [4096, 3990, 1, 1024] + - Exact: [1024, 3257, 1, 4096] + - Exact: [4096, 3343, 1, 1024] + - Exact: [4096, 3440, 1, 1024] + - Exact: [33708, 4030, 1, 1024] + - Exact: [1024, 3190, 1, 4096] + - Exact: [1024, 3389, 1, 4096] + - Exact: [1024, 3500, 1, 4096] + - Exact: [1024, 3471, 1, 4096] + - Exact: [1024, 3438, 1, 4096] + - Exact: [4096, 3513, 1, 1024] + - Exact: [1024, 3562, 1, 4096] + - Exact: [4096, 3616, 1, 1024] + - Exact: [4096, 3955, 1, 1024] + - Exact: [1024, 3441, 1, 4096] + - Exact: [1024, 3236, 1, 4096] + - Exact: [1024, 3524, 1, 4096] + - Exact: [4096, 3460, 1, 1024] + - Exact: [1024, 3384, 1, 4096] + - Exact: [4096, 3387, 1, 1024] + - Exact: [4096, 3436, 1, 1024] + - Exact: [4096, 3277, 1, 1024] + - Exact: [1024, 3457, 1, 4096] + - Exact: [1024, 3999, 1, 4096] + - Exact: [1024, 4032, 1, 4096] + - Exact: [4096, 3541, 1, 1024] + - Exact: [4096, 3334, 1, 1024] + - Exact: [1024, 3393, 1, 4096] + - Exact: [1024, 3411, 1, 4096] + - Exact: [1024, 3822, 1, 1024] + - Exact: [1024, 3593, 1, 4096] + - Exact: [33708, 3822, 1, 1024] + - Exact: [4096, 3504, 1, 1024] + - Exact: [1024, 3163, 1, 4096] + - Exact: [1024, 3357, 1, 4096] + - Exact: [1024, 3906, 1, 4096] + - Exact: [4096, 3415, 1, 1024] + - Exact: [1024, 3406, 1, 4096] + - Exact: [4096, 3321, 1, 1024] + - Exact: [4096, 3584, 1, 1024] + - Exact: [1024, 2736, 1, 4096] + - Exact: [1024, 3110, 1, 4096] + - Exact: [33708, 3999, 1, 1024] + - Exact: [1024, 3093, 1, 4096] + - Exact: [4096, 3378, 1, 1024] + - Exact: [1024, 3543, 1, 4096] + - Exact: [33708, 3925, 1, 1024] + - Exact: [1024, 3352, 1, 4096] + - Exact: [4096, 3780, 1, 1024] + - Exact: [1024, 3990, 1, 4096] + - Exact: [4096, 3500, 1, 1024] + - Exact: [4096, 3996, 1, 1024] + - Exact: [1024, 3247, 1, 4096] 
+ - Exact: [4096, 3395, 1, 1024] + - Exact: [1024, 3169, 1, 4096] + - Exact: [1024, 3088, 1, 4096] + - Exact: [1024, 3584, 1, 4096] + - Exact: [4096, 3093, 1, 1024] + - Exact: [1024, 3538, 1, 4096] + - Exact: [1024, 3996, 1, 1024] + - Exact: [1024, 3581, 1, 4096] + - Exact: [4096, 3374, 1, 1024] + - Exact: [33708, 3751, 1, 1024] + - Exact: [4096, 3215, 1, 1024] + - Exact: [4096, 3312, 1, 1024] + - Exact: [4096, 3581, 1, 1024] + - Exact: [4096, 3479, 1, 1024] + - Exact: [4096, 3544, 1, 1024] + - Exact: [1024, 3870, 1, 1024] + - Exact: [1024, 3374, 1, 4096] + - Exact: [1024, 2967, 1, 4096] + - Exact: [4096, 3455, 1, 1024] + - Exact: [4096, 3942, 1, 1024] + - Exact: [1024, 3528, 1, 4096] + - Exact: [4096, 3186, 1, 1024] + - Exact: [1024, 3976, 1, 1024] + - Exact: [1024, 3511, 1, 4096] + - Exact: [4096, 3573, 1, 1024] + - Exact: [4096, 3561, 1, 1024] + - Exact: [4096, 3418, 1, 1024] + - Exact: [33708, 3906, 1, 1024] + - Exact: [4096, 3259, 1, 1024] + - Exact: [4096, 3308, 1, 1024] + - Exact: [1024, 3419, 1, 4096] + - Exact: [1024, 3215, 1, 4096] + - Exact: [1024, 4030, 1, 4096] + - Exact: [4096, 3459, 1, 1024] + - Exact: [1024, 3572, 1, 4096] + - Exact: [1024, 3137, 1, 4096] + - Exact: [1024, 3312, 1, 4096] + - Exact: [1024, 3925, 1, 4096] + - Exact: [1024, 3453, 1, 4096] + - Exact: [4096, 3435, 1, 1024] + - Exact: [1024, 3176, 1, 4096] + - Exact: [1024, 3444, 1, 4096] + - Exact: [4096, 3975, 1, 1024] + - Exact: [4096, 3182, 1, 1024] + - Exact: [1024, 3475, 1, 4096] + - Exact: [33708, 3955, 1, 1024] + - Exact: [4096, 3446, 1, 1024] + - Exact: [1024, 3138, 1, 4096] + - Exact: [1024, 3549, 1, 4096] + - Exact: [4096, 3287, 1, 1024] + - Exact: [1024, 3342, 1, 4096] + - Exact: [4096, 3519, 1, 1024] + - Exact: [4096, 3552, 1, 1024] + - Exact: [4096, 3859, 1, 1024] + - Exact: [33708, 3969, 1, 1024] + - Exact: [1024, 3369, 1, 4096] + - Exact: [4096, 3482, 1, 1024] + - Exact: [1024, 3306, 1, 4096] + - Exact: [1024, 3474, 1, 4096] + - Exact: [4096, 3377, 1, 1024] + - Exact: 
[4096, 3426, 1, 1024] + - Exact: [4096, 2935, 1, 1024] + - Exact: [4096, 3267, 1, 1024] + - Exact: [1024, 3299, 1, 4096] + - Exact: [1024, 3456, 1, 4096] + - Exact: [1024, 3280, 1, 4096] + - Exact: [1024, 3555, 1, 4096] + - Exact: [4096, 3499, 1, 1024] + - Exact: [4096, 3356, 1, 1024] + - Exact: [1024, 3412, 1, 4096] + - Exact: [1024, 2984, 1, 4096] + - Exact: [4096, 3141, 1, 1024] + - Exact: [4096, 3510, 1, 1024] + - Exact: [1024, 3995, 1, 1024] + - Exact: [1024, 3517, 1, 4096] + - Exact: [1024, 3455, 1, 4096] + - Exact: [1024, 3939, 1, 1024] + - Exact: [1024, 3447, 1, 4096] + - Exact: [1024, 3969, 1, 4096] + - Exact: [4096, 3527, 1, 1024] + - Exact: [4096, 3336, 1, 1024] + - Exact: [1024, 3191, 1, 4096] + - Exact: [1024, 3302, 1, 4096] + - Exact: [1024, 3337, 1, 4096] + - Exact: [4096, 3290, 1, 1024] + - Exact: [1024, 3512, 1, 4096] + - Exact: [1024, 3433, 1, 4096] + - Exact: [4096, 3876, 1, 1024] + - Exact: [4096, 3490, 1, 1024] + - Exact: [4096, 3064, 1, 1024] + - Exact: [1024, 3508, 1, 4096] + - Exact: [1024, 3956, 1, 4096] + - Exact: [4096, 3417, 1, 1024] + - Exact: [1024, 3248, 1, 4096] + - Exact: [1024, 2499, 1, 4096] + - Exact: [1024, 3186, 1, 4096] + - Exact: [1024, 3180, 1, 4096] + - Exact: [4096, 3364, 1, 1024] + - Exact: [4096, 3976, 1, 1024] + - Exact: [4096, 3205, 1, 1024] + - Exact: [4096, 3318, 1, 1024] + - Exact: [1024, 3377, 1, 4096] + - Exact: [1024, 3485, 1, 4096] + - Exact: [4096, 3181, 1, 1024] + - Exact: [4096, 3550, 1, 1024] + - Exact: [1024, 3534, 1, 4096] + - Exact: [1024, 3860, 1, 1024] + - Exact: [160, 160, 400, 64] + - Exact: [4096, 3445, 1, 1024] + - Exact: [1024, 3391, 1, 4096] + - Exact: [1024, 3221, 1, 4096] + - Exact: [4096, 3079, 1, 1024] + - Exact: [4096, 3144, 1, 1024] + - Exact: [1024, 3270, 1, 4096] + - Exact: [1024, 3561, 1, 4096] + - Exact: [1024, 3480, 1, 4096] + - Exact: [4096, 3408, 1, 1024] + - Exact: [1024, 3418, 1, 4096] + - Exact: [4096, 3298, 1, 1024] + - Exact: [1024, 3640, 1, 1024] + - Exact: [1024, 3449, 1, 4096] 
+ - Exact: [1024, 4020, 1, 4096] + - Exact: [4096, 3481, 1, 1024] + - Exact: [4096, 3530, 1, 1024] + - Exact: [1024, 3216, 1, 4096] + - Exact: [1024, 3491, 1, 4096] + - Exact: [1024, 3154, 1, 4096] + - Exact: [4096, 3425, 1, 1024] + - Exact: [1024, 3348, 1, 4096] + - Exact: [1024, 3415, 1, 4096] + - Exact: [1024, 4026, 1, 1024] + - Exact: [1024, 3367, 1, 4096] + - Exact: [1024, 3259, 1, 4096] + - Exact: [1024, 3894, 1, 4096] + - Exact: [4096, 3355, 1, 1024] + - Exact: [4096, 3404, 1, 1024] + - Exact: [1024, 3308, 1, 4096] + - Exact: [4096, 3245, 1, 1024] + - Exact: [1024, 3502, 1, 4096] + - Exact: [33708, 4032, 1, 1024] + - Exact: [1024, 3424, 1, 4096] + - Exact: [4096, 3509, 1, 1024] + - Exact: [4096, 3558, 1, 1024] + - Exact: [1024, 3900, 1, 1024] + - Exact: [1024, 2505, 1, 4096] + - Exact: [4096, 3472, 1, 1024] + - Exact: [1024, 3386, 1, 4096] + - Exact: [4096, 3383, 1, 1024] + - Exact: [4096, 3448, 1, 1024] + - Exact: [4096, 4030, 1, 1024] + - Exact: [4096, 3289, 1, 1024] + - Exact: [1024, 3459, 1, 4096] + - Exact: [1024, 2918, 1, 4096] + - Exact: [4096, 3489, 1, 1024] + - Exact: [4096, 3346, 1, 1024] + - Exact: [4096, 3572, 1, 1024] + - Exact: [1024, 3955, 1, 4096] + - Exact: [4096, 3236, 1, 1024] + - Exact: [4096, 3163, 1, 1024] + - Exact: [4096, 3468, 1, 1024] + - Exact: [1024, 3165, 1, 4096] + - Exact: [1024, 3276, 1, 4096] + - Exact: [1024, 3359, 1, 4096] + - Exact: [4096, 3363, 1, 1024] + - Exact: [1024, 3385, 1, 4096] + - Exact: [1024, 3207, 1, 4096] + - Exact: [1024, 3458, 1, 4096] + - Exact: [4096, 3110, 1, 1024] + - Exact: [4096, 3925, 1, 1024] + - Exact: [1024, 3975, 1, 4096] + - Exact: [4096, 3549, 1, 1024] + - Exact: [4096, 3342, 1, 1024] + - Exact: [1024, 3859, 1, 1024] + - Exact: [1024, 3497, 1, 4096] + - Exact: [4096, 3280, 1, 1024] + - Exact: [1024, 3435, 1, 4096] + - Exact: [1024, 3354, 1, 4096] + - Exact: [4096, 3191, 1, 1024] + - Exact: [4096, 3512, 1, 1024] + - Exact: [1024, 3055, 1, 4096] + - Exact: [4096, 2499, 1, 1024] + - Exact: [1024, 
3233, 1, 4096] + - Exact: [4096, 3423, 1, 1024] + - Exact: [1024, 3319, 1, 4096] + - Exact: [4096, 3297, 1, 1024] + - Exact: [4096, 3154, 1, 1024] + - Exact: [1024, 3540, 1, 4096] + - Exact: [1024, 3289, 1, 4096] + - Exact: [4096, 3529, 1, 1024] + - Exact: [4096, 3386, 1, 1024] + - Exact: [4096, 3276, 1, 1024] + - Exact: [1024, 3244, 1, 4096] + - Exact: [1024, 3182, 1, 4096] + - Exact: [4096, 3540, 1, 1024] + - Exact: [1024, 3360, 1, 4096] + - Exact: [1024, 3942, 1, 4096] + - Exact: [4096, 3403, 1, 1024] + - Exact: [4096, 3101, 1, 1024] + - Exact: [4096, 2918, 1, 1024] + - Exact: [1024, 3465, 1, 4096] + - Exact: [33708, 3780, 1, 1024] + - Exact: [4096, 3557, 1, 1024] + - Exact: [4096, 3414, 1, 1024] + - Exact: [1024, 3948, 1, 1024] + - Exact: [4096, 3320, 1, 1024] + - Exact: [4096, 2765, 1, 1024] + - Exact: [1024, 3978, 1, 4096] + - Exact: [4096, 3487, 1, 1024] + - Exact: [4096, 3520, 1, 1024] + - Exact: [1024, 3139, 1, 4096] + - Exact: [1024, 3314, 1, 4096] + - Exact: [4096, 3431, 1, 1024] + - Exact: [1024, 3446, 1, 4096] + - Exact: [1024, 4059, 1, 4096] + - Exact: [4096, 3345, 1, 1024] + - Exact: [4096, 3394, 1, 1024] + - Exact: [1024, 3927, 1, 1024] + - Exact: [4096, 3235, 1, 1024] + - Exact: [1024, 3328, 1, 4096] + - Exact: [33708, 3956, 1, 1024] + - Exact: [4096, 3467, 1, 1024] + - Exact: [1024, 3287, 1, 4096] + - Exact: [4096, 3214, 1, 1024] + - Exact: [4096, 3910, 1, 1024] + - Exact: [1024, 3780, 1, 1024] + - Exact: [1024, 3371, 1, 4096] + - Exact: [4096, 3478, 1, 1024] + - Exact: [1024, 3546, 1, 4096] + - Exact: [1024, 4012, 1, 1024] + - Exact: [4096, 3341, 1, 1024] + - Exact: [4096, 3454, 1, 1024] + - Exact: [4096, 3295, 1, 1024] + - Exact: [4096, 3072, 1, 1024] + - Exact: [1024, 3282, 1, 4096] + - Exact: [33708, 3720, 1, 1024] + - Exact: [1024, 3681, 1, 4096] + - Exact: [1024, 4050, 1, 4096] + - Exact: [4096, 3495, 1, 1024] + - Exact: [4096, 3560, 1, 1024] + - Exact: [4096, 3751, 1, 1024] + - Exact: [1024, 3414, 1, 4096] + - Exact: [33708, 3860, 1, 1024] 
+ - Exact: [1024, 3325, 1, 4096] + - Exact: [4096, 3458, 1, 1024] + - Exact: [4096, 2967, 1, 1024] + - Exact: [1024, 3519, 1, 4096] + - Exact: [4096, 3385, 1, 1024] + - Exact: [4096, 3434, 1, 1024] + - Exact: [1024, 3552, 1, 4096] + - Exact: [4096, 3822, 1, 1024] + - Exact: [1024, 3544, 1, 4096] + - Exact: [4096, 3539, 1, 1024] + - Exact: [4096, 3332, 1, 1024] + - Exact: [1024, 3145, 1, 4096] + - Exact: [1024, 3535, 1, 4096] + - Exact: [1024, 3320, 1, 4096] + - Exact: [33708, 4012, 1, 1024] + - Exact: [4096, 3286, 1, 1024] + - Exact: [1024, 3514, 1, 4096] + - Exact: [1024, 2765, 1, 4096] + - Exact: [1024, 3452, 1, 4096] + - Exact: [4096, 3518, 1, 1024] + - Exact: [1024, 3529, 1, 4096] + - Exact: [4096, 3413, 1, 1024] + - Exact: [33708, 4050, 1, 1024] + - Exact: [1024, 3525, 1, 4096] + - Exact: [4096, 3303, 1, 1024] + - Exact: [1024, 3382, 1, 4096] + - Exact: [1024, 3390, 1, 4096] + - Exact: [1024, 3977, 1, 4096] + - Exact: [1024, 3184, 1, 4096] + - Exact: [4096, 3535, 1, 1024] + - Exact: [4096, 3376, 1, 1024] + - Exact: [4096, 3978, 1, 1024] + - Exact: [1024, 3136, 1, 4096] + - Exact: [1024, 3293, 1, 4096] + - Exact: [4096, 3266, 1, 1024] + - Exact: [1024, 3487, 1, 4096] + - Exact: [1024, 3409, 1, 4096] + - Exact: [4096, 3498, 1, 1024] + - Exact: [1024, 3520, 1, 4096] + - Exact: [1024, 3530, 1, 4096] + - Exact: [4096, 3393, 1, 1024] + - Exact: [4096, 3140, 1, 1024] + - Exact: [1024, 3536, 1, 4096] + - Exact: [1024, 3288, 1, 4096] + - Exact: [1024, 4005, 1, 4096] + - Exact: [1024, 3579, 1, 4096] + - Exact: [4096, 3372, 1, 1024] + - Exact: [1024, 3440, 1, 4096] + - Exact: [4096, 3213, 1, 1024] + - Exact: [4096, 3477, 1, 1024] + - Exact: [4096, 3526, 1, 1024] + - Exact: [1024, 3493, 1, 4096] + - Exact: [1024, 3944, 1, 4096] + - Exact: [4096, 3453, 1, 1024] + - Exact: [1024, 3350, 1, 4096] + - Exact: [4096, 3184, 1, 1024] + - Exact: [1024, 3423, 1, 4096] + - Exact: [4096, 3351, 1, 1024] + - Exact: [4096, 3416, 1, 1024] + - Exact: [1024, 3796, 1, 4096] + - Exact: [4096, 
3257, 1, 1024] + - Exact: [4096, 3306, 1, 1024] + - Exact: [33708, 4020, 1, 1024] + - Exact: [1024, 3426, 1, 4096] + - Exact: [4096, 3457, 1, 1024] + - Exact: [1024, 2935, 1, 4096] + - Exact: [1024, 3046, 1, 4096] + - Exact: [4096, 3433, 1, 1024] + - Exact: [1024, 3256, 1, 4096] + - Exact: [1024, 3531, 1, 4096] + - Exact: [4096, 3180, 1, 1024] + - Exact: [1024, 3388, 1, 4096] + - Exact: [4096, 3444, 1, 1024] + - Exact: [1024, 3501, 1, 4096] + - Exact: [1024, 3266, 1, 4096] + - Exact: [1024, 3267, 1, 4096] + - Exact: [1024, 3461, 1, 4096] + - Exact: [4096, 3870, 1, 1024] + - Exact: [4096, 3517, 1, 1024] + - Exact: [1024, 3566, 1, 4096] + - Exact: [4096, 3574, 1, 1024] + - Exact: [1024, 3876, 1, 1024] + - Exact: [4096, 3720, 1, 1024] + - Exact: [4096, 3248, 1, 1024] + - Exact: [4096, 4059, 1, 1024] + - Exact: [1024, 3380, 1, 4096] + - Exact: [4096, 3480, 1, 1024] + - Exact: [1024, 3335, 1, 4096] + - Exact: [1024, 3345, 1, 4096] + - Exact: [4096, 3391, 1, 1024] + - Exact: [4096, 3424, 1, 1024] + - Exact: [1024, 3394, 1, 4096] + - Exact: [4096, 3265, 1, 1024] + - Exact: [1024, 3014, 1, 4096] + - Exact: [4096, 3497, 1, 1024] + - Exact: [4096, 3354, 1, 1024] + - Exact: [4096, 3055, 1, 1024] + - Exact: [1024, 3499, 1, 4096] + - Exact: [1024, 3162, 1, 4096] + - Exact: [4096, 3244, 1, 1024] + - Exact: [1024, 3437, 1, 4096] + - Exact: [1024, 3356, 1, 4096] + - Exact: [4096, 3139, 1, 1024] + - Exact: [4096, 3508, 1, 1024] + - Exact: [1024, 3235, 1, 4096] + - Exact: [1024, 3910, 1, 4096] + - Exact: [4096, 3371, 1, 1024] + - Exact: [1024, 3751, 1, 4096] + - Exact: [4096, 3325, 1, 1024] + - Exact: [1024, 3413, 1, 4096] + - Exact: [1024, 3542, 1, 4096] + - Exact: [33708, 3900, 1, 1024] + - Exact: [4096, 3525, 1, 1024] + - Exact: [4096, 3382, 1, 1024] + - Exact: [1024, 3339, 1, 4096] + - Exact: [4096, 3288, 1, 1024] + - Exact: [1024, 3141, 1, 4096] + - Exact: [1024, 3168, 1, 4096] + - Exact: [4096, 3488, 1, 1024] + - Exact: [4096, 3046, 1, 1024] + - Exact: [1024, 3362, 1, 4096] + 
- Exact: [33708, 3942, 1, 1024] + - Exact: [4096, 3399, 1, 1024] + - Exact: [1024, 3720, 1, 1024] + - Exact: [4096, 3563, 1, 1024] + - Exact: [1024, 3273, 1, 4096] + - Exact: [4096, 3162, 1, 1024] + - Exact: [1024, 3467, 1, 4096] + - Exact: [1024, 3130, 1, 4096] + - Exact: [1024, 3405, 1, 4096] + - Exact: [4096, 3362, 1, 1024] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3712, 1, 36548] + - Exact: [1024, 3712, 1, 1024] + - Exact: [4032, 384, 1, 64] + - Exact: [1024, 2048, 1, 49] + - Exact: [4608, 512, 1, 49] + - Exact: [9216, 512, 1, 4096] + - Exact: [3456, 384, 1, 289] + - Exact: [3456, 384, 1, 169] + - Exact: [4096, 512, 1, 1001] + - Exact: [384, 448, 49, 512] + - Exact: [384, 448, 64, 256] + - Exact: [384, 448, 36, 256] + - Exact: [384, 448, 49, 256] + - Exact: [384, 448, 64, 512] + - Exact: [384, 448, 36, 512] + - Exact: [1024, 6400, 1, 65] + - Exact: [4096, 6400, 1, 256] + - Exact: [512, 3194, 1, 2048] + - Exact: [512, 3222, 1, 2048] + - Exact: [512, 3234, 1, 2048] + - Exact: [512, 3242, 1, 2048] + - Exact: [512, 3257, 1, 2048] + - Exact: [512, 3332, 1, 2048] + - Exact: [512, 3336, 1, 2048] + - Exact: [512, 3378, 1, 2048] + - Exact: [512, 3396, 1, 2048] + - Exact: [512, 3399, 1, 2048] + - Exact: [512, 3451, 1, 2048] + - Exact: [512, 3456, 1, 2048] + - Exact: [512, 3458, 1, 2048] + - Exact: [512, 3467, 1, 2048] + - Exact: [512, 3468, 1, 2048] + - Exact: [512, 3470, 1, 2048] + - Exact: [512, 3477, 1, 2048] + - Exact: [512, 3478, 1, 2048] + - Exact: [512, 3495, 1, 2048] + - Exact: [512, 3507, 1, 2048] + - Exact: [512, 3515, 1, 2048] + - Exact: [512, 3517, 1, 2048] + - Exact: [2048, 2864, 1, 512] + - Exact: [2048, 3287, 1, 512] + - Exact: [2048, 3412, 1, 512] + - Exact: [2048, 3456, 1, 512] + - Exact: [2048, 3466, 1, 512] + - Exact: [2048, 3476, 1, 512] + - Exact: [2048, 3999, 1, 512] + - Exact: [33708, 189, 1, 512] + - Exact: [33708, 2496, 1, 512] + - Exact: [33708, 3864, 1, 512] + - Exact: [33708, 3969, 1, 512] + - Exact: [33708, 3995, 1, 512] + - Exact: 
[134, 134, 240, 64] + - Exact: [135, 134, 240, 64] + - Exact: [135, 135, 240, 64] + - Exact: [512, 2790, 1, 2048] + - Exact: [512, 2864, 1, 2048] + - Exact: [512, 3092, 1, 2048] + - Exact: [512, 3113, 1, 2048] + - Exact: [512, 3137, 1, 2048] + - Exact: [512, 3165, 1, 2048] + - Exact: [512, 3166, 1, 2048] + - Exact: [512, 3219, 1, 2048] + - Exact: [512, 3237, 1, 2048] + - Exact: [512, 3246, 1, 2048] + - Exact: [512, 3249, 1, 2048] + - Exact: [512, 3251, 1, 2048] + - Exact: [512, 3262, 1, 2048] + - Exact: [512, 3268, 1, 2048] + - Exact: [512, 3282, 1, 2048] + - Exact: [512, 3286, 1, 2048] + - Exact: [512, 3287, 1, 2048] + - Exact: [512, 3293, 1, 2048] + - Exact: [512, 3297, 1, 2048] + - Exact: [512, 3307, 1, 2048] + - Exact: [512, 3314, 1, 2048] + - Exact: [512, 3315, 1, 2048] + - Exact: [512, 3319, 1, 2048] + - Exact: [512, 3322, 1, 2048] + - Exact: [512, 3323, 1, 2048] + - Exact: [512, 3324, 1, 2048] + - Exact: [512, 3325, 1, 2048] + - Exact: [512, 3327, 1, 2048] + - Exact: [512, 3329, 1, 2048] + - Exact: [512, 3339, 1, 2048] + - Exact: [512, 3342, 1, 2048] + - Exact: [512, 3344, 1, 2048] + - Exact: [512, 3358, 1, 2048] + - Exact: [512, 3360, 1, 2048] + - Exact: [512, 3364, 1, 2048] + - Exact: [512, 3365, 1, 2048] + - Exact: [512, 3369, 1, 2048] + - Exact: [512, 3371, 1, 2048] + - Exact: [512, 3374, 1, 2048] + - Exact: [512, 3376, 1, 2048] + - Exact: [512, 3377, 1, 2048] + - Exact: [512, 3381, 1, 2048] + - Exact: [512, 3382, 1, 2048] + - Exact: [512, 3383, 1, 2048] + - Exact: [512, 3384, 1, 2048] + - Exact: [512, 3385, 1, 2048] + - Exact: [512, 3386, 1, 2048] + - Exact: [512, 3388, 1, 2048] + - Exact: [512, 3390, 1, 2048] + - Exact: [512, 3391, 1, 2048] + - Exact: [512, 3402, 1, 2048] + - Exact: [512, 3410, 1, 2048] + - Exact: [512, 3412, 1, 2048] + - Exact: [512, 3414, 1, 2048] + - Exact: [512, 3415, 1, 2048] + - Exact: [512, 3418, 1, 2048] + - Exact: [512, 3420, 1, 2048] + - Exact: [512, 3422, 1, 2048] + - Exact: [512, 3425, 1, 2048] + - Exact: [512, 3426, 1, 
2048] + - Exact: [512, 3427, 1, 2048] + - Exact: [512, 3428, 1, 2048] + - Exact: [512, 3430, 1, 2048] + - Exact: [512, 3431, 1, 2048] + - Exact: [512, 3432, 1, 2048] + - Exact: [512, 3438, 1, 2048] + - Exact: [512, 3439, 1, 2048] + - Exact: [512, 3440, 1, 2048] + - Exact: [512, 3443, 1, 2048] + - Exact: [512, 3445, 1, 2048] + - Exact: [512, 3447, 1, 2048] + - Exact: [512, 3448, 1, 2048] + - Exact: [512, 3450, 1, 2048] + - Exact: [512, 3452, 1, 2048] + - Exact: [512, 3453, 1, 2048] + - Exact: [512, 3455, 1, 2048] + - Exact: [512, 3457, 1, 2048] + - Exact: [512, 3459, 1, 2048] + - Exact: [512, 3460, 1, 2048] + - Exact: [512, 3461, 1, 2048] + - Exact: [512, 3462, 1, 2048] + - Exact: [512, 3466, 1, 2048] + - Exact: [512, 3471, 1, 2048] + - Exact: [512, 3472, 1, 2048] + - Exact: [512, 3475, 1, 2048] + - Exact: [512, 3476, 1, 2048] + - Exact: [512, 3479, 1, 2048] + - Exact: [512, 3480, 1, 2048] + - Exact: [512, 3481, 1, 2048] + - Exact: [512, 3483, 1, 2048] + - Exact: [512, 3484, 1, 2048] + - Exact: [512, 3487, 1, 2048] + - Exact: [512, 3489, 1, 2048] + - Exact: [512, 3490, 1, 2048] + - Exact: [512, 3491, 1, 2048] + - Exact: [512, 3493, 1, 2048] + - Exact: [512, 3494, 1, 2048] + - Exact: [512, 3497, 1, 2048] + - Exact: [512, 3498, 1, 2048] + - Exact: [512, 3499, 1, 2048] + - Exact: [512, 3501, 1, 2048] + - Exact: [512, 3503, 1, 2048] + - Exact: [512, 3508, 1, 2048] + - Exact: [512, 3509, 1, 2048] + - Exact: [512, 3511, 1, 2048] + - Exact: [512, 3514, 1, 2048] + - Exact: [512, 3518, 1, 2048] + - Exact: [512, 3519, 1, 2048] + - Exact: [512, 3520, 1, 2048] + - Exact: [512, 3523, 1, 2048] + - Exact: [512, 3528, 1, 2048] + - Exact: [512, 3529, 1, 2048] + - Exact: [512, 3530, 1, 2048] + - Exact: [512, 3532, 1, 2048] + - Exact: [512, 3533, 1, 2048] + - Exact: [512, 3534, 1, 2048] + - Exact: [512, 3538, 1, 2048] + - Exact: [512, 3539, 1, 2048] + - Exact: [512, 3541, 1, 2048] + - Exact: [512, 3547, 1, 2048] + - Exact: [512, 3548, 1, 2048] + - Exact: [512, 3552, 1, 2048] + - 
Exact: [512, 3564, 1, 2048] + - Exact: [512, 3575, 1, 2048] + - Exact: [512, 3598, 1, 2048] + - Exact: [512, 3599, 1, 2048] + - Exact: [512, 3608, 1, 2048] + - Exact: [512, 3780, 1, 512] + - Exact: [512, 3780, 1, 2048] + - Exact: [512, 3796, 1, 512] + - Exact: [512, 3796, 1, 2048] + - Exact: [512, 3822, 1, 512] + - Exact: [512, 3822, 1, 2048] + - Exact: [512, 3840, 1, 512] + - Exact: [512, 3840, 1, 2048] + - Exact: [512, 3859, 1, 512] + - Exact: [512, 3859, 1, 2048] + - Exact: [512, 3870, 1, 512] + - Exact: [512, 3870, 1, 2048] + - Exact: [512, 3876, 1, 512] + - Exact: [512, 3876, 1, 2048] + - Exact: [512, 3906, 1, 512] + - Exact: [512, 3906, 1, 2048] + - Exact: [512, 3910, 1, 512] + - Exact: [512, 3910, 1, 2048] + - Exact: [512, 3925, 1, 512] + - Exact: [512, 3925, 1, 2048] + - Exact: [512, 3927, 1, 512] + - Exact: [512, 3942, 1, 512] + - Exact: [512, 3942, 1, 2048] + - Exact: [512, 3944, 1, 512] + - Exact: [512, 3944, 1, 2048] + - Exact: [512, 3955, 1, 512] + - Exact: [512, 3955, 1, 2048] + - Exact: [512, 3968, 1, 512] + - Exact: [512, 3968, 1, 2048] + - Exact: [512, 3969, 1, 512] + - Exact: [512, 3969, 1, 2048] + - Exact: [512, 3976, 1, 512] + - Exact: [512, 3976, 1, 2048] + - Exact: [512, 3977, 1, 512] + - Exact: [512, 3977, 1, 2048] + - Exact: [512, 3978, 1, 512] + - Exact: [512, 3978, 1, 2048] + - Exact: [512, 3990, 1, 512] + - Exact: [512, 3990, 1, 2048] + - Exact: [512, 3995, 1, 512] + - Exact: [512, 3995, 1, 2048] + - Exact: [512, 3996, 1, 512] + - Exact: [512, 3996, 1, 2048] + - Exact: [512, 3999, 1, 512] + - Exact: [512, 3999, 1, 2048] + - Exact: [512, 4005, 1, 512] + - Exact: [512, 4005, 1, 2048] + - Exact: [512, 4012, 1, 512] + - Exact: [512, 4012, 1, 2048] + - Exact: [512, 4020, 1, 512] + - Exact: [512, 4020, 1, 2048] + - Exact: [512, 4026, 1, 512] + - Exact: [512, 4026, 1, 2048] + - Exact: [512, 4030, 1, 512] + - Exact: [512, 4030, 1, 2048] + - Exact: [512, 4032, 1, 512] + - Exact: [512, 4032, 1, 2048] + - Exact: [512, 4050, 1, 512] + - Exact: [512, 
4059, 1, 512] + - Exact: [2048, 2790, 1, 512] + - Exact: [2048, 3092, 1, 512] + - Exact: [2048, 3113, 1, 512] + - Exact: [2048, 3137, 1, 512] + - Exact: [2048, 3165, 1, 512] + - Exact: [2048, 3166, 1, 512] + - Exact: [2048, 3194, 1, 512] + - Exact: [2048, 3219, 1, 512] + - Exact: [2048, 3222, 1, 512] + - Exact: [2048, 3234, 1, 512] + - Exact: [2048, 3237, 1, 512] + - Exact: [2048, 3242, 1, 512] + - Exact: [2048, 3246, 1, 512] + - Exact: [2048, 3249, 1, 512] + - Exact: [2048, 3251, 1, 512] + - Exact: [2048, 3257, 1, 512] + - Exact: [2048, 3262, 1, 512] + - Exact: [2048, 3268, 1, 512] + - Exact: [2048, 3282, 1, 512] + - Exact: [2048, 3286, 1, 512] + - Exact: [2048, 3293, 1, 512] + - Exact: [2048, 3297, 1, 512] + - Exact: [2048, 3307, 1, 512] + - Exact: [2048, 3314, 1, 512] + - Exact: [2048, 3315, 1, 512] + - Exact: [2048, 3319, 1, 512] + - Exact: [2048, 3322, 1, 512] + - Exact: [2048, 3323, 1, 512] + - Exact: [2048, 3324, 1, 512] + - Exact: [2048, 3325, 1, 512] + - Exact: [2048, 3327, 1, 512] + - Exact: [2048, 3329, 1, 512] + - Exact: [2048, 3332, 1, 512] + - Exact: [2048, 3336, 1, 512] + - Exact: [2048, 3339, 1, 512] + - Exact: [2048, 3342, 1, 512] + - Exact: [2048, 3344, 1, 512] + - Exact: [2048, 3358, 1, 512] + - Exact: [2048, 3360, 1, 512] + - Exact: [2048, 3364, 1, 512] + - Exact: [2048, 3365, 1, 512] + - Exact: [2048, 3369, 1, 512] + - Exact: [2048, 3371, 1, 512] + - Exact: [2048, 3374, 1, 512] + - Exact: [2048, 3376, 1, 512] + - Exact: [2048, 3377, 1, 512] + - Exact: [2048, 3378, 1, 512] + - Exact: [2048, 3381, 1, 512] + - Exact: [2048, 3382, 1, 512] + - Exact: [2048, 3383, 1, 512] + - Exact: [2048, 3384, 1, 512] + - Exact: [2048, 3385, 1, 512] + - Exact: [2048, 3386, 1, 512] + - Exact: [2048, 3388, 1, 512] + - Exact: [2048, 3390, 1, 512] + - Exact: [2048, 3391, 1, 512] + - Exact: [2048, 3396, 1, 512] + - Exact: [2048, 3399, 1, 512] + - Exact: [2048, 3402, 1, 512] + - Exact: [2048, 3410, 1, 512] + - Exact: [2048, 3414, 1, 512] + - Exact: [2048, 3415, 1, 512] + 
- Exact: [2048, 3418, 1, 512] + - Exact: [2048, 3420, 1, 512] + - Exact: [2048, 3422, 1, 512] + - Exact: [2048, 3425, 1, 512] + - Exact: [2048, 3426, 1, 512] + - Exact: [2048, 3427, 1, 512] + - Exact: [2048, 3428, 1, 512] + - Exact: [2048, 3430, 1, 512] + - Exact: [2048, 3431, 1, 512] + - Exact: [2048, 3432, 1, 512] + - Exact: [2048, 3438, 1, 512] + - Exact: [2048, 3439, 1, 512] + - Exact: [2048, 3440, 1, 512] + - Exact: [2048, 3443, 1, 512] + - Exact: [2048, 3445, 1, 512] + - Exact: [2048, 3447, 1, 512] + - Exact: [2048, 3448, 1, 512] + - Exact: [2048, 3450, 1, 512] + - Exact: [2048, 3451, 1, 512] + - Exact: [2048, 3452, 1, 512] + - Exact: [2048, 3453, 1, 512] + - Exact: [2048, 3455, 1, 512] + - Exact: [2048, 3457, 1, 512] + - Exact: [2048, 3458, 1, 512] + - Exact: [2048, 3459, 1, 512] + - Exact: [2048, 3460, 1, 512] + - Exact: [2048, 3461, 1, 512] + - Exact: [2048, 3462, 1, 512] + - Exact: [2048, 3467, 1, 512] + - Exact: [2048, 3468, 1, 512] + - Exact: [2048, 3470, 1, 512] + - Exact: [2048, 3471, 1, 512] + - Exact: [2048, 3472, 1, 512] + - Exact: [2048, 3475, 1, 512] + - Exact: [2048, 3477, 1, 512] + - Exact: [2048, 3478, 1, 512] + - Exact: [2048, 3479, 1, 512] + - Exact: [2048, 3480, 1, 512] + - Exact: [2048, 3481, 1, 512] + - Exact: [2048, 3483, 1, 512] + - Exact: [2048, 3484, 1, 512] + - Exact: [2048, 3487, 1, 512] + - Exact: [2048, 3489, 1, 512] + - Exact: [2048, 3490, 1, 512] + - Exact: [2048, 3491, 1, 512] + - Exact: [2048, 3493, 1, 512] + - Exact: [2048, 3494, 1, 512] + - Exact: [2048, 3495, 1, 512] + - Exact: [2048, 3497, 1, 512] + - Exact: [2048, 3498, 1, 512] + - Exact: [2048, 3499, 1, 512] + - Exact: [2048, 3501, 1, 512] + - Exact: [2048, 3503, 1, 512] + - Exact: [2048, 3507, 1, 512] + - Exact: [2048, 3508, 1, 512] + - Exact: [2048, 3509, 1, 512] + - Exact: [2048, 3511, 1, 512] + - Exact: [2048, 3514, 1, 512] + - Exact: [2048, 3515, 1, 512] + - Exact: [2048, 3517, 1, 512] + - Exact: [2048, 3518, 1, 512] + - Exact: [2048, 3519, 1, 512] + - Exact: [2048, 
3520, 1, 512] + - Exact: [2048, 3523, 1, 512] + - Exact: [2048, 3528, 1, 512] + - Exact: [2048, 3529, 1, 512] + - Exact: [2048, 3530, 1, 512] + - Exact: [2048, 3532, 1, 512] + - Exact: [2048, 3533, 1, 512] + - Exact: [2048, 3534, 1, 512] + - Exact: [2048, 3538, 1, 512] + - Exact: [2048, 3539, 1, 512] + - Exact: [2048, 3541, 1, 512] + - Exact: [2048, 3547, 1, 512] + - Exact: [2048, 3548, 1, 512] + - Exact: [2048, 3552, 1, 512] + - Exact: [2048, 3564, 1, 512] + - Exact: [2048, 3575, 1, 512] + - Exact: [2048, 3598, 1, 512] + - Exact: [2048, 3599, 1, 512] + - Exact: [2048, 3608, 1, 512] + - Exact: [2048, 3780, 1, 512] + - Exact: [2048, 3796, 1, 512] + - Exact: [2048, 3822, 1, 512] + - Exact: [2048, 3840, 1, 512] + - Exact: [2048, 3859, 1, 512] + - Exact: [2048, 3870, 1, 512] + - Exact: [2048, 3876, 1, 512] + - Exact: [2048, 3906, 1, 512] + - Exact: [2048, 3910, 1, 512] + - Exact: [2048, 3925, 1, 512] + - Exact: [2048, 3942, 1, 512] + - Exact: [2048, 3944, 1, 512] + - Exact: [2048, 3955, 1, 512] + - Exact: [2048, 3968, 1, 512] + - Exact: [2048, 3969, 1, 512] + - Exact: [2048, 3976, 1, 512] + - Exact: [2048, 3977, 1, 512] + - Exact: [2048, 3978, 1, 512] + - Exact: [2048, 3990, 1, 512] + - Exact: [2048, 3995, 1, 512] + - Exact: [2048, 3996, 1, 512] + - Exact: [2048, 4005, 1, 512] + - Exact: [2048, 4012, 1, 512] + - Exact: [2048, 4020, 1, 512] + - Exact: [2048, 4026, 1, 512] + - Exact: [2048, 4030, 1, 512] + - Exact: [2048, 4032, 1, 512] + - Exact: [33708, 184, 1, 512] + - Exact: [33708, 208, 1, 512] + - Exact: [33708, 246, 1, 512] + - Exact: [33708, 264, 1, 512] + - Exact: [33708, 465, 1, 512] + - Exact: [33708, 468, 1, 512] + - Exact: [33708, 493, 1, 512] + - Exact: [33708, 540, 1, 512] + - Exact: [33708, 550, 1, 512] + - Exact: [33708, 560, 1, 512] + - Exact: [33708, 644, 1, 512] + - Exact: [33708, 714, 1, 512] + - Exact: [33708, 720, 1, 512] + - Exact: [33708, 781, 1, 512] + - Exact: [33708, 936, 1, 512] + - Exact: [33708, 980, 1, 512] + - Exact: [33708, 1232, 1, 512] 
+ - Exact: [33708, 1290, 1, 512] + - Exact: [33708, 1350, 1, 512] + - Exact: [33708, 1424, 1, 512] + - Exact: [33708, 1458, 1, 512] + - Exact: [33708, 1462, 1, 512] + - Exact: [33708, 1520, 1, 512] + - Exact: [33708, 1596, 1, 512] + - Exact: [33708, 1599, 1, 512] + - Exact: [33708, 1615, 1, 512] + - Exact: [33708, 1680, 1, 512] + - Exact: [33708, 1917, 1, 512] + - Exact: [33708, 2205, 1, 512] + - Exact: [33708, 2418, 1, 512] + - Exact: [33708, 3776, 1, 512] + - Exact: [33708, 3780, 1, 512] + - Exact: [33708, 3796, 1, 512] + - Exact: [33708, 3822, 1, 512] + - Exact: [33708, 3835, 1, 512] + - Exact: [33708, 3840, 1, 512] + - Exact: [33708, 3859, 1, 512] + - Exact: [33708, 3870, 1, 512] + - Exact: [33708, 3876, 1, 512] + - Exact: [33708, 3906, 1, 512] + - Exact: [33708, 3910, 1, 512] + - Exact: [33708, 3925, 1, 512] + - Exact: [33708, 3942, 1, 512] + - Exact: [33708, 3944, 1, 512] + - Exact: [33708, 3955, 1, 512] + - Exact: [33708, 3968, 1, 512] + - Exact: [33708, 3976, 1, 512] + - Exact: [33708, 3977, 1, 512] + - Exact: [33708, 3978, 1, 512] + - Exact: [33708, 3990, 1, 512] + - Exact: [33708, 3996, 1, 512] + - Exact: [33708, 3999, 1, 512] + - Exact: [33708, 4005, 1, 512] + - Exact: [33708, 4012, 1, 512] + - Exact: [33708, 4020, 1, 512] + - Exact: [33708, 4026, 1, 512] + - Exact: [33708, 4030, 1, 512] + - Exact: [33708, 4032, 1, 512] + - Exact: [3072, 512, 1, 3072] + - Exact: [511, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8193, 1, 8192] + - Exact: [3072, 3072, 1, 3071] + - Exact: [8192, 8192, 1, 8193] + - Exact: [7681, 8192, 1, 8192] + - Exact: [7680, 8192, 1, 8193] + - Exact: [513, 4096, 1, 4096] + - Exact: [3073, 512, 1, 3072] + - Exact: [7680, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4097] + - Exact: [8192, 8191, 1, 8192] + - Exact: [8192, 512, 1, 8193] + - Exact: [2880, 3071, 1, 3072] + - Exact: [2880, 3072, 1, 3072] + - Exact: [4096, 511, 1, 4096] + - Exact: [512, 3072, 1, 3072] + - Exact: [512, 8191, 1, 8192] + - Exact: [4096, 4095, 1, 
4096] + - Exact: [8192, 511, 1, 8192] + - Exact: [8192, 512, 1, 8192] + - Exact: [511, 3072, 1, 3072] + - Exact: [7680, 8193, 1, 8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 512, 1, 3073] + - Exact: [513, 8192, 1, 8192] + - Exact: [7679, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4097] + - Exact: [512, 3072, 1, 3071] + - Exact: [7680, 8192, 1, 8191] + - Exact: [3072, 511, 1, 3072] + - Exact: [8193, 8192, 1, 8192] + - Exact: [512, 4096, 1, 4095] + - Exact: [512, 3071, 1, 3072] + - Exact: [3073, 3072, 1, 3072] + - Exact: [512, 3073, 1, 3072] + - Exact: [4096, 4096, 1, 4095] + - Exact: [1920, 2048, 1, 2047] + - Exact: [1920, 2049, 1, 2048] + - Exact: [512, 8192, 1, 8191] + - Exact: [3840, 4096, 1, 4096] + - Exact: [8191, 512, 1, 8192] + - Exact: [2881, 3072, 1, 3072] + - Exact: [512, 4096, 1, 4096] + - Exact: [3841, 4096, 1, 4096] + - Exact: [2880, 3072, 1, 3073] + - Exact: [4095, 512, 1, 4096] + - Exact: [1919, 2048, 1, 2048] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [511, 4096, 1, 4096] + - Exact: [8192, 513, 1, 8192] + - Exact: [513, 3072, 1, 3072] + - Exact: [7680, 8191, 1, 8192] + - Exact: [512, 4097, 1, 4096] + - Exact: [2047, 2048, 1, 2048] + - Exact: [2049, 2048, 1, 2048] + - Exact: [3840, 4095, 1, 4096] + - Exact: [2880, 3072, 1, 3071] + - Exact: [3072, 3072, 1, 3073] + - Exact: [2880, 3073, 1, 3072] + - Exact: [4096, 513, 1, 4096] + - Exact: [4097, 512, 1, 4096] + - Exact: [8192, 512, 1, 8191] + - Exact: [1921, 2048, 1, 2048] + - Exact: [512, 3072, 1, 3073] + - Exact: [2048, 2049, 1, 2048] + - Exact: [3072, 512, 1, 3071] + - Exact: [3071, 3072, 1, 3072] + - Exact: [3840, 4097, 1, 4096] + - Exact: [2048, 2047, 1, 2048] + - Exact: [2879, 3072, 1, 3072] + - Exact: [3072, 513, 1, 3072] + - Exact: [512, 4095, 1, 4096] + - Exact: [3071, 512, 1, 3072] + - Exact: [4096, 512, 1, 4096] + - Exact: [4097, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2047] + - Exact: [3839, 4096, 1, 4096] + - Exact: [512, 4096, 1, 4097] + - 
Exact: [3072, 3073, 1, 3072] + - Exact: [2048, 2048, 1, 2049] + - Exact: [8191, 8192, 1, 8192] + - Exact: [3072, 3071, 1, 3072] + - Exact: [4096, 512, 1, 4097] + - Exact: [3840, 4096, 1, 4095] + - Exact: [1920, 2047, 1, 2048] + - Exact: [8192, 8192, 1, 8191] + - Exact: [3072, 3072, 1, 3072] + - Exact: [512, 8193, 1, 8192] + - Exact: [4096, 512, 1, 4095] + - Exact: [8193, 512, 1, 8192] + - Exact: [4095, 4096, 1, 4096] + - Exact: [4096, 4097, 1, 4096] + - Exact: [512, 8192, 1, 8192] + - Exact: [512, 8192, 1, 8193] + - Exact: [1920, 2048, 1, 2049] + - Exact: [479, 3072, 1, 3072] + - Exact: [479, 4096, 1, 4096] + - Exact: [479, 8192, 1, 8192] + - Exact: [480, 3072, 1, 3071] + - Exact: [480, 3072, 1, 3073] + - Exact: [480, 3073, 1, 3072] + - Exact: [480, 4095, 1, 4096] + - Exact: [480, 4096, 1, 4095] + - Exact: [480, 4096, 1, 4097] + - Exact: [480, 4097, 1, 4096] + - Exact: [480, 8191, 1, 8192] + - Exact: [480, 8192, 1, 8191] + - Exact: [480, 8192, 1, 8193] + - Exact: [480, 8193, 1, 8192] + - Exact: [481, 3072, 1, 3072] + - Exact: [481, 4096, 1, 4096] + - Exact: [481, 8192, 1, 8192] + - Exact: [3072, 479, 1, 3072] + - Exact: [3072, 480, 1, 3071] + - Exact: [3072, 480, 1, 3073] + - Exact: [3072, 481, 1, 3072] + - Exact: [3073, 480, 1, 3072] + - Exact: [480, 3072, 1, 3072] + - Exact: [480, 4096, 1, 4096] + - Exact: [480, 8192, 1, 8192] + - Exact: [3072, 480, 1, 3072] + - Exact: [4096, 480, 1, 4096] + - Exact: [8192, 480, 1, 8192] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 
9520, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [42720, 3968, 1, 1024] + - Exact: [42720, 7200, 1, 1024] + - Exact: [42720, 9520, 1, 1024] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 960, 1, 74] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 2048, 1, 960] + - Exact: [4096, 1024, 1, 257] + - Exact: [10240, 8976, 1, 256] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1600, 1, 560] + - Exact: [10496, 8976, 1, 256] + - Exact: [11264, 8976, 1, 256] + - Exact: [11776, 8976, 1, 256] + - Exact: [12544, 8976, 1, 256] + - Exact: [1280, 8976, 1, 256] + - Exact: [13312, 8976, 1, 256] + - Exact: [13568, 8976, 1, 256] + - Exact: [13824, 8976, 1, 256] + - Exact: [15104, 8976, 1, 256] + - Exact: [15360, 8976, 1, 256] + - Exact: [15872, 8976, 1, 256] + - Exact: [16128, 8976, 1, 256] + - Exact: [17152, 8976, 1, 256] + - Exact: [1792, 8976, 1, 256] + - Exact: [18176, 8976, 1, 256] + - Exact: [18688, 8976, 1, 256] + - Exact: [18944, 8976, 1, 256] + - Exact: [19712, 8976, 1, 256] + - Exact: [19968, 8976, 1, 256] + - Exact: [20480, 8976, 1, 256] + - Exact: [2048, 1536, 1, 512] + - Exact: [2048, 1536, 1, 768] + - Exact: [2048, 684, 1, 512] + - Exact: [2048, 684, 1, 768] + - Exact: [2048, 8976, 1, 256] + - Exact: [20992, 8976, 1, 256] + - Exact: [21248, 8976, 1, 256] + - Exact: [2304, 8976, 1, 256] + - Exact: [23552, 8976, 1, 256] + - Exact: [2560, 8976, 1, 256] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + - Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - Exact: [256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 
22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [2816, 8976, 1, 256] + - Exact: [28672, 8976, 1, 256] + - Exact: [3072, 8976, 1, 256] + - Exact: [31488, 8976, 1, 256] + - Exact: [3328, 8976, 1, 256] + - Exact: [33536, 8976, 1, 256] + - Exact: [3840, 8976, 1, 256] + - Exact: [4096, 8976, 1, 256] + - Exact: [4352, 8976, 1, 256] + - Exact: [44505, 8976, 1, 256] + - Exact: [4608, 8976, 1, 256] + - Exact: [4864, 8976, 1, 256] + - Exact: [5120, 8976, 1, 256] + - Exact: [5376, 8976, 1, 256] + - Exact: [5632, 8976, 1, 256] + - Exact: [5888, 8976, 1, 256] + - Exact: [6144, 8976, 1, 256] + - Exact: [6400, 8976, 1, 256] + - Exact: [684, 8976, 1, 256] + - Exact: [7168, 8976, 1, 256] + - Exact: [7936, 8976, 1, 256] + - Exact: [8192, 8976, 1, 256] + - Exact: [8448, 8976, 1, 256] + - Exact: [8960, 8976, 1, 256] + - Exact: [9472, 8976, 1, 256] + - Exact: [9728, 8976, 1, 256] + - Exact: [9984, 8976, 1, 256] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + - Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - Exact: [256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - 
Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [512, 32768, 1, 13] + - Exact: [256, 32768, 1, 512] + - Exact: [128, 32768, 1, 512] + - Exact: [1024, 32768, 1, 479] + - Exact: [1024, 32768, 1, 1024] + - Exact: [512, 32768, 1, 1024] + - Exact: [1023, 2048, 1, 4096] + - Exact: [1025, 2048, 1, 4096] + - Exact: [1024, 2047, 1, 4096] + - Exact: [1024, 2049, 1, 4096] + - Exact: [1024, 2048, 1, 4095] + - Exact: [1024, 2048, 1, 4097] + - Exact: [1023, 3072, 1, 1024] + - Exact: [1025, 3072, 1, 1024] + - Exact: [1024, 3071, 1, 1024] + - Exact: [1024, 3073, 1, 1024] + - Exact: [1024, 3072, 1, 1023] + - Exact: [1024, 3072, 1, 1025] + - Exact: [3071, 512, 1, 1024] + - Exact: [3073, 512, 1, 1024] + - Exact: [3072, 511, 1, 1024] + - Exact: [3072, 513, 1, 1024] + - Exact: [3072, 512, 1, 1023] + - Exact: [3072, 512, 1, 1025] + - Exact: [128, 32768, 1, 256] + - Exact: [1024, 4096, 1, 480] + - Exact: [512, 4096, 1, 1024] + - Exact: [512, 55296, 1, 13] + - Exact: [256, 55296, 1, 512] + - Exact: [128, 55296, 1, 256] + - Exact: [1024, 6912, 1, 480] + - Exact: [1024, 6912, 1, 1024] + - Exact: [512, 6912, 1, 1024] + - Exact: [256, 6912, 1, 512] + - Exact: [1151, 1152, 1, 1152] + - Exact: [1153, 1152, 1, 1152] + - Exact: [1152, 1151, 
1, 1152] + - Exact: [1152, 1153, 1, 1152] + - Exact: [1152, 1152, 1, 1151] + - Exact: [1152, 1152, 1, 1153] + - Exact: [1535, 1536, 1, 1536] + - Exact: [1537, 1536, 1, 1536] + - Exact: [1536, 1535, 1, 1536] + - Exact: [1536, 1537, 1, 1536] + - Exact: [1536, 1536, 1, 1535] + - Exact: [1536, 1536, 1, 1537] + - Exact: [1919, 1920, 1, 1920] + - Exact: [1921, 1920, 1, 1920] + - Exact: [1920, 1919, 1, 1920] + - Exact: [1920, 1921, 1, 1920] + - Exact: [1920, 1920, 1, 1919] + - Exact: [1920, 1920, 1, 1921] + - Exact: [2303, 2304, 1, 2304] + - Exact: [2305, 2304, 1, 2304] + - Exact: [2304, 2303, 1, 2304] + - Exact: [2304, 2305, 1, 2304] + - Exact: [2304, 2304, 1, 2303] + - Exact: [2304, 2304, 1, 2305] + - Exact: [2687, 2688, 1, 2688] + - Exact: [2689, 2688, 1, 2688] + - Exact: [2688, 2687, 1, 2688] + - Exact: [2688, 2689, 1, 2688] + - Exact: [2688, 2688, 1, 2687] + - Exact: [2688, 2688, 1, 2689] + - Exact: [3455, 3456, 1, 3456] + - Exact: [3457, 3456, 1, 3456] + - Exact: [3456, 3455, 1, 3456] + - Exact: [3456, 3457, 1, 3456] + - Exact: [3456, 3456, 1, 3455] + - Exact: [3456, 3456, 1, 3457] + - Exact: [3839, 3840, 1, 3840] + - Exact: [3841, 3840, 1, 3840] + - Exact: [3840, 3839, 1, 3840] + - Exact: [3840, 3841, 1, 3840] + - Exact: [3840, 3840, 1, 3839] + - Exact: [3840, 3840, 1, 3841] + - Exact: [4223, 4224, 1, 4224] + - Exact: [4225, 4224, 1, 4224] + - Exact: [4224, 4223, 1, 4224] + - Exact: [4224, 4225, 1, 4224] + - Exact: [4224, 4224, 1, 4223] + - Exact: [4224, 4224, 1, 4225] + - Exact: [4607, 4608, 1, 4608] + - Exact: [4609, 4608, 1, 4608] + - Exact: [4608, 4607, 1, 4608] + - Exact: [4608, 4609, 1, 4608] + - Exact: [4608, 4608, 1, 4607] + - Exact: [4608, 4608, 1, 4609] + - Exact: [4991, 4992, 1, 4992] + - Exact: [4993, 4992, 1, 4992] + - Exact: [4992, 4991, 1, 4992] + - Exact: [4992, 4993, 1, 4992] + - Exact: [4992, 4992, 1, 4991] + - Exact: [4992, 4992, 1, 4993] + - Exact: [5375, 5376, 1, 5376] + - Exact: [5377, 5376, 1, 5376] + - Exact: [5376, 5375, 1, 5376] + - Exact: 
[5376, 5377, 1, 5376] + - Exact: [5376, 5376, 1, 5375] + - Exact: [5376, 5376, 1, 5377] + - Exact: [5759, 5760, 1, 5760] + - Exact: [5761, 5760, 1, 5760] + - Exact: [5760, 5759, 1, 5760] + - Exact: [5760, 5761, 1, 5760] + - Exact: [5760, 5760, 1, 5759] + - Exact: [5760, 5760, 1, 5761] + - Exact: [6143, 6144, 1, 6144] + - Exact: [6145, 6144, 1, 6144] + - Exact: [6144, 6143, 1, 6144] + - Exact: [6144, 6145, 1, 6144] + - Exact: [6144, 6144, 1, 6143] + - Exact: [6144, 6144, 1, 6145] + - Exact: [6527, 6528, 1, 6528] + - Exact: [6529, 6528, 1, 6528] + - Exact: [6528, 6527, 1, 6528] + - Exact: [6528, 6529, 1, 6528] + - Exact: [6528, 6528, 1, 6527] + - Exact: [6528, 6528, 1, 6529] + - Exact: [6911, 6912, 1, 6912] + - Exact: [6913, 6912, 1, 6912] + - Exact: [6912, 6911, 1, 6912] + - Exact: [6912, 6913, 1, 6912] + - Exact: [6912, 6912, 1, 6911] + - Exact: [6912, 6912, 1, 6913] + - Exact: [7295, 7296, 1, 7296] + - Exact: [7297, 7296, 1, 7296] + - Exact: [7296, 7295, 1, 7296] + - Exact: [7296, 7297, 1, 7296] + - Exact: [7296, 7296, 1, 7295] + - Exact: [7296, 7296, 1, 7297] + - Exact: [7679, 7680, 1, 7680] + - Exact: [7681, 7680, 1, 7680] + - Exact: [7680, 7679, 1, 7680] + - Exact: [7680, 7681, 1, 7680] + - Exact: [7680, 7680, 1, 7679] + - Exact: [7680, 7680, 1, 7681] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [256, 128, 49, 1152] + - Exact: [256, 128, 121, 120] + - Exact: [256, 128, 169, 120] + - Exact: [256, 128, 36, 120] + - Exact: [256, 128, 49, 120] + - 
Exact: [256, 128, 64, 120] + - Exact: [256, 128, 36, 12000] + - Exact: [256, 128, 49, 1216] + - Exact: [256, 128, 121, 18] + - Exact: [256, 128, 169, 18] + - Exact: [256, 128, 36, 18] + - Exact: [256, 128, 49, 18] + - Exact: [256, 128, 64, 18] + - Exact: [256, 128, 36, 1800] + - Exact: [256, 128, 121, 19] + - Exact: [256, 128, 169, 19] + - Exact: [256, 128, 36, 19] + - Exact: [256, 128, 49, 19] + - Exact: [256, 128, 64, 19] + - Exact: [256, 128, 36, 1900] + - Exact: [256, 128, 49, 480] + - Exact: [256, 128, 81, 480] + - Exact: [256, 128, 64, 5880] + - Exact: [256, 128, 49, 72] + - Exact: [256, 128, 81, 72] + - Exact: [256, 128, 49, 76] + - Exact: [256, 128, 81, 76] + - Exact: [256, 128, 49, 7680] + - Exact: [256, 128, 64, 882] + - Exact: [256, 128, 64, 931] + - Exact: [256, 256, 49, 1152] + - Exact: [256, 256, 36, 12000] + - Exact: [256, 256, 49, 1216] + - Exact: [256, 256, 36, 1800] + - Exact: [256, 256, 36, 1900] + - Exact: [256, 256, 64, 5880] + - Exact: [256, 256, 49, 7680] + - Exact: [256, 256, 64, 882] + - Exact: [256, 256, 64, 931] + - Exact: [340, 256, 49, 1152] + - Exact: [340, 256, 36, 120] + - Exact: [340, 256, 49, 120] + - Exact: [340, 256, 64, 120] + - Exact: [340, 256, 36, 12000] + - Exact: [340, 256, 49, 1216] + - Exact: [340, 256, 36, 18] + - Exact: [340, 256, 49, 18] + - Exact: [340, 256, 64, 18] + - Exact: [340, 256, 36, 1800] + - Exact: [340, 256, 36, 19] + - Exact: [340, 256, 49, 19] + - Exact: [340, 256, 64, 19] + - Exact: [340, 256, 36, 1900] + - Exact: [340, 256, 64, 5880] + - Exact: [340, 256, 49, 7680] + - Exact: [340, 256, 64, 882] + - Exact: [340, 256, 64, 931] + - Exact: [510, 256, 49, 120] + - Exact: [510, 256, 64, 120] + - Exact: [510, 256, 49, 18] + - Exact: [510, 256, 64, 18] + - Exact: [510, 256, 49, 19] + - Exact: [510, 256, 64, 19] + - Exact: [510, 256, 36, 480] + - Exact: [510, 256, 36, 72] + - Exact: [510, 256, 36, 76] + - Exact: [510, 512, 36, 1080] + - Exact: [510, 512, 36, 162] + - Exact: [510, 512, 36, 171] + - Exact: [510, 
512, 49, 1920] + - Exact: [510, 512, 64, 1920] + - Exact: [510, 512, 49, 288] + - Exact: [510, 512, 64, 288] + - Exact: [510, 512, 36, 3000] + - Exact: [510, 512, 49, 304] + - Exact: [510, 512, 64, 304] + - Exact: [510, 512, 36, 450] + - Exact: [510, 512, 36, 475] + - Exact: [510, 512, 49, 480] + - Exact: [510, 512, 64, 480] + - Exact: [510, 512, 49, 72] + - Exact: [510, 512, 64, 72] + - Exact: [510, 512, 49, 76] + - Exact: [510, 512, 64, 76] + - Exact: [512, 256, 81, 1080] + - Exact: [512, 256, 25, 12000] + - Exact: [512, 256, 81, 162] + - Exact: [512, 256, 81, 171] + - Exact: [512, 256, 25, 1800] + - Exact: [512, 256, 25, 1900] + - Exact: [512, 256, 121, 1920] + - Exact: [512, 256, 169, 1920] + - Exact: [512, 256, 49, 1920] + - Exact: [512, 256, 121, 288] + - Exact: [512, 256, 169, 288] + - Exact: [512, 256, 49, 288] + - Exact: [512, 256, 25, 3000] + - Exact: [512, 256, 81, 3000] + - Exact: [512, 256, 121, 304] + - Exact: [512, 256, 169, 304] + - Exact: [512, 256, 49, 304] + - Exact: [512, 256, 25, 450] + - Exact: [512, 256, 81, 450] + - Exact: [512, 256, 25, 475] + - Exact: [512, 256, 81, 475] + - Exact: [512, 256, 121, 480] + - Exact: [512, 256, 169, 480] + - Exact: [512, 256, 49, 5880] + - Exact: [512, 256, 121, 72] + - Exact: [512, 256, 169, 72] + - Exact: [512, 256, 121, 76] + - Exact: [512, 256, 169, 76] + - Exact: [512, 256, 49, 882] + - Exact: [512, 256, 49, 931] + - Exact: [2304, 512, 1, 100] + - Exact: [2304, 512, 1, 361] + - Exact: [4608, 510, 1, 100] + - Exact: [4608, 510, 1, 361] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [30522, 616, 1, 1024] + - Exact: [128, 128, 128, 64] + - Exact: [128, 128, 160, 64] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [30522, 200, 1, 1024] + - Exact: [128, 128, 624, 64] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [30522, 780, 
1, 1024] + - Exact: [30522, 308, 1, 1024] + - Exact: [128, 128, 640, 64] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [30522, 800, 1, 1024] + - Exact: [128, 128, 656, 64] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [30522, 820, 1, 1024] + - Exact: [512, 512, 80, 64] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [30522, 385, 1, 1024] + - Exact: [30522, 462, 1, 1024] + - Exact: [128, 128, 144, 64] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [30522, 180, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [33712, 8192, 1, 1024] + - Exact: [33712, 9600, 1, 1024] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [42720, 10080, 1, 1024] + - Exact: [42720, 6528, 1, 1024] + - Exact: [42720, 7104, 1, 1024] + - Exact: [1024, 32768, 1, 480] + - Exact: [30592, 1024, 1, 2048] + - Exact: [6144, 1024, 1, 2048] + - Exact: [8192, 1024, 1, 2048] + - Exact: [30592, 8192, 1, 1024] + - Exact: [3072, 8192, 1, 1024] + - Exact: [512, 512, 256, 64] + - Exact: [30592, 2048, 1, 1024] + - Exact: [30592, 4096, 1, 1024] + - 
Exact: [3072, 4096, 1, 1024] + - Exact: [1920, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 640] + - Exact: [7680, 2048, 1, 2560] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 6144] + - Exact: [4608, 4096, 1, 1536] + - Exact: [50304, 4096, 1, 1536] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1536, 8192, 1, 6144] + - Exact: [4608, 8192, 1, 1536] + - Exact: [50304, 8192, 1, 1536] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 16384, 1, 4096] + - Exact: [3072, 16384, 1, 1024] + - Exact: [4096, 16384, 1, 1024] + - Exact: [50304, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [50304, 2048, 1, 1024] + - Exact: [1024, 1024, 32, 64] + - Exact: [50304, 4096, 1, 1024] + - Exact: [1024, 1024, 64, 64] + - Exact: [50304, 8192, 1, 1024] + - Exact: [1024, 1024, 128, 64] + - Exact: [30528, 8192, 1, 1024] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 480] + - Exact: [512, 3456, 1, 1024] + - Exact: [512, 3456, 1, 13] + - Exact: [512, 4096, 1, 13] + - Exact: [512, 6912, 1, 13] + - Exact: [30528, 640, 1, 1024] + - Exact: [30528, 1280, 1, 1024] + - Exact: [30528, 1600, 1, 1024] + - Exact: [1024, 10240, 1, 1024] + - Exact: [4096, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 10496, 1, 4096] + - Exact: [30528, 1640, 1, 1024] + - Exact: [4096, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [30528, 160, 1, 1024] + - Exact: [30528, 240, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 4096] + - Exact: [512, 512, 192, 64] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [4096, 
10224, 1, 1024] + - Exact: [3072, 10224, 1, 1024] + - Exact: [3072, 10240, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [4096, 10192, 1, 1024] + - Exact: [3072, 10192, 1, 1024] + - Exact: [3072, 10200, 1, 1024] + - Exact: [1024, 10184, 1, 1024] + - Exact: [3072, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 4096] + - Exact: [4096, 10208, 1, 1024] + - Exact: [2048, 10224, 1, 1024] + - Exact: [2048, 10240, 1, 1024] + - Exact: [1024, 10120, 1, 1024] + - Exact: [2048, 10192, 1, 1024] + - Exact: [1024, 10152, 1, 1024] + - Exact: [3072, 10080, 1, 1024] + - Exact: [256, 256, 25, 12544] + - Exact: [256, 256, 49, 3200] + - Exact: [256, 256, 25, 6272] + - Exact: [256, 256, 49, 6400] + - Exact: [512, 512, 49, 1152] + - Exact: [512, 512, 25, 2048] + - Exact: [512, 512, 49, 2304] + - Exact: [512, 512, 25, 4096] + - Exact: [128, 128, 2048, 64] + - Exact: [30528, 2560, 1, 1024] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 1024] + - Exact: [1024, 12288, 1, 4096] + - Exact: [30528, 1920, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [128, 128, 81, 12544] + - Exact: [128, 128, 121, 9216] + - Exact: [128, 128, 169, 6400] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 49, 2304] + - Exact: [256, 256, 64, 2304] + - Exact: [256, 256, 81, 4096] + - Exact: [256, 256, 121, 2304] + - Exact: [256, 256, 169, 2304] + - Exact: [512, 512, 81, 1024] + - Exact: [512, 512, 121, 1024] + - Exact: [512, 512, 169, 1024] + - Exact: [512, 512, 36, 1024] + - Exact: [512, 512, 49, 1024] + - Exact: [512, 512, 64, 1024] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 768] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 1024] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [256, 
256, 36, 432] + - Exact: [256, 256, 36, 456] + - Exact: [256, 256, 36, 504] + - Exact: [256, 256, 49, 1120] + - Exact: [256, 256, 36, 442] + - Exact: [256, 256, 49, 950] + - Exact: [256, 256, 64, 616] + - Exact: [256, 256, 64, 660] + - Exact: [256, 256, 36, 408] + - Exact: [256, 256, 49, 1008] + - Exact: [256, 256, 36, 462] + - Exact: [256, 256, 36, 468] + - Exact: [256, 256, 36, 494] + - Exact: [512, 512, 64, 48] + - Exact: [256, 256, 64, 140] + - Exact: [512, 512, 64, 56] + - Exact: [512, 512, 49, 90] + - Exact: [512, 512, 49, 60] + - Exact: [256, 256, 49, 864] + - Exact: [256, 256, 64, 224] + - Exact: [256, 256, 64, 176] + - Exact: [256, 256, 64, 154] + - Exact: [512, 512, 49, 80] + - Exact: [256, 256, 49, 1200] + - Exact: [256, 256, 64, 704] + - Exact: [256, 256, 64, 768] + - Exact: [256, 256, 49, 1160] + - Exact: [256, 256, 49, 320] + - Exact: [512, 512, 49, 70] + - Exact: [256, 256, 49, 1240] + - Exact: [256, 256, 36, 384] + - Exact: [1024, 2048, 1, 888] + - Exact: [1024, 2048, 1, 713] + - Exact: [1024, 2048, 1, 660] + - Exact: [1024, 2048, 1, 726] + - Exact: [1024, 2048, 1, 672] + - Exact: [1024, 2048, 1, 850] + - Exact: [1024, 2048, 1, 805] + - Exact: [1024, 2048, 1, 864] + - Exact: [1024, 2048, 1, 768] + - Exact: [1024, 2048, 1, 950] + - Exact: [1024, 1024, 160, 96] + - Exact: [2880, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 960] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [25216, 16384, 1, 1920] + - Exact: [1024, 1024, 40, 96] + - Exact: [2880, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 960] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [25216, 4096, 1, 1920] + - Exact: [1024, 1024, 80, 96] + - Exact: [2880, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 960] + - Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [25216, 8192, 1, 1920] + - Exact: [1024, 1024, 96, 96] + - Exact: [1728, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 576] + - Exact: [2304, 16384, 1, 2304] + - 
Exact: [12672, 16384, 1, 2304] + - Exact: [1024, 1024, 24, 96] + - Exact: [1728, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 576] + - Exact: [2304, 4096, 1, 2304] + - Exact: [12672, 4096, 1, 2304] + - Exact: [1024, 1024, 48, 96] + - Exact: [1728, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 576] + - Exact: [2304, 8192, 1, 2304] + - Exact: [12672, 8192, 1, 2304] + - Exact: [1024, 1024, 16, 96] + - Exact: [1152, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 384] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [6400, 4096, 1, 3072] + - Exact: [1024, 1024, 32, 96] + - Exact: [1152, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 384] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [6400, 8192, 1, 3072] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [29000, 199, 1, 2048] + - Exact: [29000, 221, 1, 2048] + - Exact: [29000, 224, 1, 2048] + - Exact: [29000, 229, 1, 2048] + - Exact: [29000, 234, 1, 2048] + - Exact: [29000, 242, 1, 2048] + - Exact: [29000, 246, 1, 2048] + - Exact: [29000, 247, 1, 2048] + - Exact: [29000, 256, 1, 2048] + - Exact: [29000, 262, 1, 2048] + - Exact: [29000, 264, 1, 2048] + - Exact: [29000, 265, 1, 2048] + - Exact: [29000, 274, 1, 2048] + - Exact: [29000, 277, 1, 2048] + - Exact: [29000, 279, 1, 2048] + - Exact: [29000, 288, 1, 2048] + - Exact: [29000, 296, 1, 2048] + - Exact: [29000, 315, 1, 2048] + - Exact: [29000, 335, 1, 2048] + - Exact: [4096, 4096, 1, 2048] + - Exact: [29000, 2283, 1, 1024] + - Exact: [29000, 2296, 1, 1024] + - Exact: [29000, 2306, 1, 1024] + - Exact: [29000, 2309, 1, 1024] + - Exact: [29000, 2318, 1, 1024] + - Exact: [29000, 2320, 1, 1024] + - Exact: [29000, 2324, 1, 1024] + - Exact: [29000, 2325, 1, 1024] + - Exact: [29000, 2329, 1, 1024] + - Exact: [29000, 2338, 1, 1024] + - Exact: [29000, 2345, 1, 1024] + - Exact: [29000, 2350, 1, 1024] + - Exact: [29000, 2362, 1, 1024] + - Exact: [29000, 2366, 1, 1024] + - Exact: [29000, 2368, 1, 1024] + - Exact: 
[29000, 2374, 1, 1024] + - Exact: [29000, 2390, 1, 1024] + - Exact: [512, 512, 320, 64] + - Exact: [29000, 561, 1, 1024] + - Exact: [29000, 574, 1, 1024] + - Exact: [29000, 600, 1, 1024] + - Exact: [29000, 608, 1, 1024] + - Exact: [29000, 615, 1, 1024] + - Exact: [29000, 622, 1, 1024] + - Exact: [29000, 625, 1, 1024] + - Exact: [29000, 626, 1, 1024] + - Exact: [29000, 628, 1, 1024] + - Exact: [29000, 636, 1, 1024] + - Exact: [29000, 651, 1, 1024] + - Exact: [29000, 658, 1, 1024] + - Exact: [29000, 669, 1, 1024] + - Exact: [29000, 670, 1, 1024] + - Exact: [29000, 672, 1, 1024] + - Exact: [29000, 684, 1, 1024] + - Exact: [29000, 716, 1, 1024] + - Exact: [29000, 730, 1, 1024] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 4096] + - Exact: [3072, 32768, 1, 1024] + - Exact: [4096, 32768, 1, 1024] + - Exact: [50304, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4096, 256, 1, 12288] + - Exact: [2048, 256, 1, 13312] + - Exact: [4096, 256, 1, 15360] + - Exact: [2048, 512, 1, 16640] + - Exact: [4096, 256, 1, 14336] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 512, 1, 16384] + - Exact: [4096, 256, 1, 9216] + - 
Exact: [1024, 512, 1, 12288] + - Exact: [4096, 200, 1, 12288] + - Exact: [1024, 1024, 1, 13312] + - Exact: [2048, 256, 1, 16384] + - Exact: [2048, 512, 1, 16384] + - Exact: [1024, 1024, 1, 8320] + - Exact: [2048, 256, 1, 14336] + - Exact: [4096, 200, 1, 16640] + - Exact: [1024, 1024, 1, 16640] + - Exact: [1024, 1024, 1, 14336] + - Exact: [2048, 512, 1, 9216] + - Exact: [1024, 1024, 1, 15360] + - Exact: [2048, 512, 1, 8192] + - Exact: [2048, 512, 1, 13312] + - Exact: [1024, 1024, 1, 11264] + - Exact: [1024, 512, 1, 16640] + - Exact: [2048, 512, 1, 10240] + - Exact: [2048, 256, 1, 16640] + - Exact: [4096, 256, 1, 13312] + - Exact: [4096, 200, 1, 15360] + - Exact: [2048, 512, 1, 12288] + - Exact: [4096, 256, 1, 8192] + - Exact: [2048, 512, 1, 15360] + - Exact: [2048, 512, 1, 11264] + - Exact: [2048, 256, 1, 12288] + - Exact: [1024, 1024, 1, 12288] + - Exact: [4096, 256, 1, 16384] + - Exact: [2048, 256, 1, 15360] + - Exact: [2048, 512, 1, 8320] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 9216] + - Exact: [4096, 200, 1, 16384] + - Exact: [2048, 512, 1, 14336] + - Exact: [1024, 512, 1, 13312] + - Exact: [4096, 256, 1, 8320] + - Exact: [4096, 200, 1, 13312] + - Exact: [1024, 512, 1, 14336] + - Exact: [4096, 256, 1, 11264] + - Exact: [4096, 256, 1, 10240] + - Exact: [4096, 200, 1, 14336] + - Exact: [4096, 256, 1, 16640] + - Exact: [1024, 512, 1, 15360] + - Exact: [1024, 1024, 1, 16384] + - Exact: [224, 192, 36, 10368] + - Exact: [320, 256, 9, 19584] + - Exact: [256, 256, 11, 13056] + - Exact: [320, 256, 9, 9792] + - Exact: [320, 256, 11, 13056] + - Exact: [256, 256, 9, 9792] + - Exact: [256, 224, 9, 19584] + - Exact: [256, 256, 9, 19584] + - Exact: [128, 128, 36, 12000] + - Exact: [128, 128, 49, 12800] + - Exact: [128, 128, 25, 25088] + - Exact: [128, 128, 49, 25600] + - Exact: [128, 128, 25, 50176] + - Exact: [128, 128, 36, 12544] + - Exact: [128, 128, 49, 9216] + - Exact: [1024, 1024, 1, 12544] + - Exact: [1024, 1000, 1, 12544] + +# bodys midSize + - # 
BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 512, 1, 1600] + - Exact: [2048, 512, 1, 100] + - Exact: [768, 640, 1, 768] + - Exact: [768, 1280, 1, 768] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 3072] + - Exact: [30522, 120, 1, 1024] + - Exact: [30522, 80, 1, 1024] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [64, 64, 768, 64] + - Exact: [64, 64, 96, 64] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2048, 400, 1, 512] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 
128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [6784, 64, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [1024, 700, 1, 512] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: 
[1408, 448, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 3584, 1, 3328] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [704, 704, 1, 128] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [448, 1024, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - 
Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [2560, 128, 1, 2560] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [448, 2944, 1, 128] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [64, 5056, 1, 128] + - Exact: [64, 6784, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [5888, 64, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: [2048, 200, 1, 3200] + - Exact: [2048, 256, 1, 3328] + - Exact: [4096, 200, 1, 11264] + - Exact: [2048, 512, 1, 1024] + - Exact: [1024, 1024, 1, 64] + - Exact: [512, 1024, 1, 1536] + - Exact: [1024, 512, 1, 512] + - Exact: [2048, 
512, 1, 640] + - Exact: [1024, 1024, 1, 512] + - Exact: [2048, 256, 1, 2048] + - Exact: [1024, 512, 1, 128] + - Exact: [2048, 512, 1, 256] + - Exact: [4096, 200, 1, 2560] + - Exact: [1024, 1024, 1, 1152] + - Exact: [2048, 200, 1, 32] + - Exact: [512, 1024, 1, 2816] + - Exact: [2048, 200, 1, 2080] + - Exact: [2048, 200, 1, 1024] + - Exact: [4096, 200, 1, 4096] + - Exact: [1024, 512, 1, 11264] + - Exact: [1024, 1024, 1, 1792] + - Exact: [4096, 200, 1, 768] + - Exact: [4096, 256, 1, 1024] + - Exact: [1024, 512, 1, 256] + - Exact: [1024, 512, 1, 1408] + - Exact: [1024, 512, 1, 5632] + - Exact: [4096, 200, 1, 256] + - Exact: [512, 1024, 1, 3072] + - Exact: [1024, 1024, 1, 4160] + - Exact: [2048, 256, 1, 384] + - Exact: [4096, 200, 1, 640] + - Exact: [1024, 1024, 1, 7168] + - Exact: [4096, 256, 1, 768] + - Exact: [2048, 256, 1, 6656] + - Exact: [2048, 200, 1, 3072] + - Exact: [1024, 512, 1, 2816] + - Exact: [4096, 256, 1, 7680] + - Exact: [4096, 200, 1, 1024] + - Exact: [2048, 200, 1, 1792] + - Exact: [1024, 1024, 1, 2816] + - Exact: [2048, 512, 1, 1536] + - Exact: [4096, 256, 1, 3072] + - Exact: [2048, 256, 1, 5632] + - Exact: [1024, 512, 1, 6656] + - Exact: [4096, 200, 1, 2080] + - Exact: [2048, 200, 1, 13312] + - Exact: [4096, 256, 1, 3584] + - Exact: [2048, 256, 1, 8192] + - Exact: [2048, 512, 1, 512] + - Exact: [2048, 512, 1, 1152] + - Exact: [2048, 200, 1, 9216] + - Exact: [2048, 200, 1, 2560] + - Exact: [2048, 256, 1, 4608] + - Exact: [2048, 256, 1, 3584] + - Exact: [1024, 512, 1, 640] + - Exact: [2048, 512, 1, 768] + - Exact: [2048, 200, 1, 1408] + - Exact: [4096, 200, 1, 2048] + - Exact: [1024, 1024, 1, 5632] + - Exact: [2048, 512, 1, 3584] + - Exact: [1024, 512, 1, 64] + - Exact: [4096, 200, 1, 7680] + - Exact: [1024, 1024, 1, 1280] + - Exact: [2048, 200, 1, 896] + - Exact: [2048, 256, 1, 32] + - Exact: [2048, 256, 1, 1280] + - Exact: [4096, 256, 1, 4096] + - Exact: [2048, 256, 1, 11264] + - Exact: [4096, 200, 1, 9216] + - Exact: [1024, 512, 1, 4096] + - Exact: 
[4096, 200, 1, 3840] + - Exact: [1024, 1024, 1, 1920] + - Exact: [2048, 200, 1, 7168] + - Exact: [4096, 256, 1, 1152] + - Exact: [2048, 256, 1, 1920] + - Exact: [2048, 512, 1, 4160] + - Exact: [2048, 512, 1, 5632] + - Exact: [4096, 256, 1, 7168] + - Exact: [4096, 200, 1, 128] + - Exact: [2048, 200, 1, 5120] + - Exact: [1024, 1024, 1, 6656] + - Exact: [512, 1024, 1, 3200] + - Exact: [2048, 256, 1, 1536] + - Exact: [4096, 256, 1, 256] + - Exact: [2048, 512, 1, 1408] + - Exact: [1024, 512, 1, 2080] + - Exact: [2048, 512, 1, 2304] + - Exact: [4096, 200, 1, 512] + - Exact: [2048, 200, 1, 1280] + - Exact: [1024, 1024, 1, 2304] + - Exact: [2048, 512, 1, 4608] + - Exact: [4096, 256, 1, 6144] + - Exact: [4096, 256, 1, 896] + - Exact: [2048, 256, 1, 640] + - Exact: [2048, 512, 1, 384] + - Exact: [2048, 200, 1, 16384] + - Exact: [4096, 200, 1, 10240] + - Exact: [1024, 512, 1, 9216] + - Exact: [4096, 200, 1, 1920] + - Exact: [2048, 512, 1, 7680] + - Exact: [1024, 512, 1, 3584] + - Exact: [1024, 1024, 1, 32] + - Exact: [2048, 512, 1, 1664] + - Exact: [2048, 200, 1, 2048] + - Exact: [1024, 1024, 1, 3584] + - Exact: [4096, 256, 1, 6656] + - Exact: [4096, 256, 1, 4160] + - Exact: [2048, 256, 1, 3072] + - Exact: [2048, 256, 1, 8320] + - Exact: [1024, 512, 1, 3200] + - Exact: [1024, 512, 1, 896] + - Exact: [2048, 512, 1, 1280] + - Exact: [4096, 200, 1, 64] + - Exact: [1024, 1024, 1, 5120] + - Exact: [2048, 512, 1, 6656] + - Exact: [1024, 1024, 1, 128] + - Exact: [512, 1024, 1, 1792] + - Exact: [4096, 256, 1, 2816] + - Exact: [1024, 1024, 1, 4096] + - Exact: [2048, 200, 1, 4160] + - Exact: [1024, 512, 1, 768] + - Exact: [4096, 200, 1, 8320] + - Exact: [2048, 512, 1, 896] + - Exact: [4096, 200, 1, 7168] + - Exact: [2048, 200, 1, 3840] + - Exact: [1024, 1024, 1, 768] + - Exact: [4096, 256, 1, 2304] + - Exact: [2048, 200, 1, 16640] + - Exact: [2048, 256, 1, 2816] + - Exact: [1024, 512, 1, 384] + - Exact: [2048, 200, 1, 7680] + - Exact: [1024, 512, 1, 4608] + - Exact: [4096, 200, 1, 32] 
+ - Exact: [4096, 200, 1, 3328] + - Exact: [1024, 1024, 1, 1408] + - Exact: [2048, 200, 1, 15360] + - Exact: [512, 1024, 1, 2048] + - Exact: [4096, 256, 1, 5632] + - Exact: [2048, 256, 1, 1408] + - Exact: [2048, 256, 1, 6144] + - Exact: [4096, 256, 1, 3328] + - Exact: [2048, 512, 1, 6144] + - Exact: [2048, 512, 1, 3200] + - Exact: [2048, 200, 1, 4608] + - Exact: [1024, 1024, 1, 6144] + - Exact: [4096, 256, 1, 1664] + - Exact: [2048, 200, 1, 384] + - Exact: [4096, 256, 1, 1792] + - Exact: [2048, 512, 1, 2816] + - Exact: [4096, 256, 1, 384] + - Exact: [2048, 256, 1, 128] + - Exact: [1024, 1024, 1, 640] + - Exact: [4096, 200, 1, 5632] + - Exact: [2048, 200, 1, 1152] + - Exact: [4096, 256, 1, 512] + - Exact: [1024, 1024, 1, 384] + - Exact: [2048, 200, 1, 512] + - Exact: [2048, 256, 1, 9216] + - Exact: [2048, 256, 1, 1792] + - Exact: [4096, 200, 1, 1792] + - Exact: [2048, 200, 1, 1536] + - Exact: [1024, 1024, 1, 3072] + - Exact: [1024, 1024, 1, 2080] + - Exact: [2048, 200, 1, 2304] + - Exact: [2048, 256, 1, 7168] + - Exact: [2048, 512, 1, 1792] + - Exact: [1024, 1024, 1, 4608] + - Exact: [512, 1024, 1, 1280] + - Exact: [2048, 256, 1, 3200] + - Exact: [1024, 512, 1, 3328] + - Exact: [1024, 512, 1, 4160] + - Exact: [4096, 200, 1, 6656] + - Exact: [2048, 200, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [2048, 256, 1, 64] + - Exact: [2048, 256, 1, 2304] + - Exact: [4096, 200, 1, 8192] + - Exact: [1024, 512, 1, 7168] + - Exact: [1024, 512, 1, 1792] + - Exact: [4096, 200, 1, 2816] + - Exact: [1024, 1024, 1, 896] + - Exact: [4096, 256, 1, 5120] + - Exact: [4096, 256, 1, 2048] + - Exact: [2048, 256, 1, 5120] + - Exact: [2048, 256, 1, 7680] + - Exact: [2048, 200, 1, 3584] + - Exact: [1024, 512, 1, 1536] + - Exact: [2048, 200, 1, 64] + - Exact: [2048, 200, 1, 4096] + - Exact: [1024, 1024, 1, 1536] + - Exact: [4096, 256, 1, 32] + - Exact: [4096, 256, 1, 1280] + - Exact: [2048, 256, 1, 1024] + - Exact: [1024, 512, 1, 1152] + - Exact: [2048, 512, 1, 3328] + - Exact: [4096, 
200, 1, 3584] + - Exact: [2048, 200, 1, 256] + - Exact: [4096, 256, 1, 1920] + - Exact: [2048, 256, 1, 1664] + - Exact: [4096, 200, 1, 5120] + - Exact: [1024, 512, 1, 8192] + - Exact: [4096, 200, 1, 896] + - Exact: [2048, 200, 1, 640] + - Exact: [4096, 200, 1, 1408] + - Exact: [2048, 200, 1, 5632] + - Exact: [1024, 512, 1, 2560] + - Exact: [4096, 200, 1, 1280] + - Exact: [1024, 1024, 1, 2560] + - Exact: [2048, 512, 1, 64] + - Exact: [2048, 200, 1, 8192] + - Exact: [2048, 512, 1, 3072] + - Exact: [4096, 256, 1, 640] + - Exact: [2048, 256, 1, 4096] + - Exact: [4096, 200, 1, 1664] + - Exact: [2048, 200, 1, 6656] + - Exact: [512, 1024, 1, 768] + - Exact: [2048, 200, 1, 8320] + - Exact: [4096, 256, 1, 3840] + - Exact: [1024, 1024, 1, 3200] + - Exact: [4096, 256, 1, 4608] + - Exact: [1024, 512, 1, 32] + - Exact: [1024, 512, 1, 3840] + - Exact: [2048, 512, 1, 1920] + - Exact: [4096, 200, 1, 6144] + - Exact: [2048, 200, 1, 2816] + - Exact: [1024, 1024, 1, 3840] + - Exact: [2048, 256, 1, 3840] + - Exact: [1024, 512, 1, 7680] + - Exact: [2048, 200, 1, 10240] + - Exact: [2048, 512, 1, 5120] + - Exact: [512, 1024, 1, 512] + - Exact: [2048, 512, 1, 32] + - Exact: [4096, 256, 1, 2560] + - Exact: [4096, 256, 1, 64] + - Exact: [2048, 200, 1, 768] + - Exact: [2048, 512, 1, 2560] + - Exact: [2048, 512, 1, 7168] + - Exact: [2048, 512, 1, 128] + - Exact: [4096, 200, 1, 2304] + - Exact: [2048, 512, 1, 4096] + - Exact: [2048, 256, 1, 2560] + - Exact: [2048, 256, 1, 4160] + - Exact: [1024, 512, 1, 1664] + - Exact: [2048, 512, 1, 2080] + - Exact: [2048, 512, 1, 3840] + - Exact: [4096, 200, 1, 3072] + - Exact: [1024, 1024, 1, 1664] + - Exact: [512, 1024, 1, 2304] + - Exact: [4096, 256, 1, 1408] + - Exact: [2048, 256, 1, 1152] + - Exact: [1024, 512, 1, 1280] + - Exact: [2048, 200, 1, 12288] + - Exact: [2048, 200, 1, 1664] + - Exact: [4096, 200, 1, 4608] + - Exact: [512, 1024, 1, 2560] + - Exact: [4096, 200, 1, 384] + - Exact: [2048, 200, 1, 128] + - Exact: [2048, 200, 1, 11264] + - Exact: 
[1024, 512, 1, 1920] + - Exact: [4096, 256, 1, 1536] + - Exact: [2048, 256, 1, 256] + - Exact: [2048, 256, 1, 10240] + - Exact: [1024, 512, 1, 5120] + - Exact: [1024, 512, 1, 8320] + - Exact: [1024, 512, 1, 10240] + - Exact: [1024, 1024, 1, 2048] + - Exact: [2048, 256, 1, 2080] + - Exact: [4096, 256, 1, 128] + - Exact: [2048, 256, 1, 896] + - Exact: [4096, 200, 1, 1152] + - Exact: [2048, 200, 1, 6144] + - Exact: [1024, 1024, 1, 7680] + - Exact: [2048, 200, 1, 1920] + - Exact: [4096, 256, 1, 2080] + - Exact: [2048, 200, 1, 14336] + - Exact: [1024, 512, 1, 6144] + - Exact: [1024, 512, 1, 2304] + - Exact: [4096, 200, 1, 4160] + - Exact: [4096, 200, 1, 1536] + - Exact: [2048, 320, 1, 64] + - Exact: [2048, 384, 1, 64] + - Exact: [1024, 384, 1, 289] + - Exact: [2048, 448, 1, 64] + - Exact: [102, 101, 624, 64] + - Exact: [101, 101, 624, 64] + - Exact: [85, 85, 752, 64] + - Exact: [112, 111, 576, 64] + - Exact: [65, 65, 992, 64] + - Exact: [77, 77, 816, 64] + - Exact: [111, 111, 576, 64] + - Exact: [84, 85, 752, 64] + - Exact: [84, 84, 752, 64] + - Exact: [71, 71, 896, 64] + - Exact: [122, 122, 528, 64] + - Exact: [78, 78, 816, 64] + - Exact: [112, 112, 576, 64] + - Exact: [77, 78, 816, 64] + - Exact: [111, 112, 576, 64] + - Exact: [92, 93, 688, 64] + - Exact: [102, 102, 624, 64] + - Exact: [99, 99, 624, 64] + - Exact: [100, 102, 624, 64] + - Exact: [123, 122, 528, 64] + - Exact: [99, 102, 624, 64] + - Exact: [93, 93, 688, 64] + - Exact: [123, 123, 528, 64] + - Exact: [100, 100, 624, 64] + - Exact: [101, 102, 624, 64] + - Exact: [102, 100, 624, 64] + - Exact: [92, 92, 688, 64] + - Exact: [3072, 128, 1, 4096] + - Exact: [1728, 320, 1, 64] + - Exact: [1440, 320, 1, 196] + - Exact: [2592, 384, 1, 289] + - Exact: [192, 80, 36, 10368] + - Exact: [1280, 384, 1, 64] + - Exact: [1280, 448, 1, 64] + - Exact: [3456, 256, 1, 169] + - Exact: [2304, 256, 1, 196] + - Exact: [224, 192, 36, 2592] + - Exact: [192, 128, 36, 1568] + - Exact: [1296, 288, 1, 196] + - Exact: [192, 64, 36, 6272] 
+ - Exact: [1728, 224, 1, 1225] + - Exact: [1152, 384, 1, 64] + - Exact: [1792, 256, 1, 289] + - Exact: [1728, 384, 1, 169] + - Exact: [1568, 256, 1, 289] + - Exact: [1152, 448, 1, 64] + - Exact: [1536, 256, 1, 64] + - Exact: [1440, 320, 1, 49] + - Exact: [1344, 512, 1, 64] + - Exact: [1152, 256, 1, 196] + - Exact: [1728, 192, 1, 1225] + - Exact: [2048, 512, 1, 49] + - Exact: [512, 2048, 1, 49] + - Exact: [1728, 192, 1, 64] + - Exact: [1536, 384, 1, 64] + - Exact: [2048, 192, 1, 64] + - Exact: [128, 96, 36, 1568] + - Exact: [128, 128, 36, 3136] + - Exact: [1280, 320, 1, 64] + - Exact: [1792, 320, 1, 289] + - Exact: [2880, 320, 1, 64] + - Exact: [1728, 384, 1, 49] + - Exact: [512, 1024, 1, 196] + - Exact: [224, 192, 36, 5184] + - Exact: [192, 80, 36, 20736] + - Exact: [224, 192, 64, 4608] + - Exact: [224, 192, 64, 2304] + - Exact: [192, 80, 49, 14400] + - Exact: [224, 192, 49, 6272] + - Exact: [224, 192, 49, 3136] + - Exact: [192, 80, 36, 41472] + - Exact: [192, 80, 49, 28800] + - Exact: [192, 80, 64, 9216] + - Exact: [256, 224, 9, 9792] + - Exact: [256, 256, 9, 4896] + - Exact: [320, 256, 9, 4896] + - Exact: [224, 192, 9, 19584] + - Exact: [192, 192, 11, 3264] + - Exact: [192, 192, 11, 6528] + - Exact: [192, 192, 9, 4896] + - Exact: [224, 192, 11, 6528] + - Exact: [192, 192, 9, 19584] + - Exact: [256, 224, 11, 13056] + - Exact: [224, 192, 11, 13056] + - Exact: [256, 256, 11, 3264] + - Exact: [320, 256, 11, 6528] + - Exact: [192, 192, 9, 9792] + - Exact: [224, 224, 9, 9792] + - Exact: [224, 192, 11, 3264] + - Exact: [224, 224, 11, 6528] + - Exact: [224, 224, 9, 19584] + - Exact: [192, 192, 11, 13056] + - Exact: [224, 224, 9, 4896] + - Exact: [320, 256, 11, 3264] + - Exact: [256, 256, 11, 6528] + - Exact: [224, 192, 9, 4896] + - Exact: [224, 224, 11, 13056] + - Exact: [224, 224, 11, 3264] + - Exact: [256, 224, 11, 6528] + - Exact: [256, 224, 11, 3264] + - Exact: [224, 192, 9, 9792] + - Exact: [256, 224, 9, 4896] + - Exact: [64, 64, 496, 64] + - Exact: [135, 135, 32, 
64] + - Exact: [64, 65, 496, 64] + - Exact: [65, 65, 472, 64] + - Exact: [65, 65, 496, 64] + - Exact: [70, 70, 216, 64] + - Exact: [70, 71, 216, 64] + - Exact: [71, 71, 216, 64] + - Exact: [71, 71, 448, 64] + - Exact: [77, 77, 248, 64] + - Exact: [77, 77, 408, 64] + - Exact: [77, 78, 248, 64] + - Exact: [77, 78, 408, 64] + - Exact: [78, 78, 248, 64] + - Exact: [78, 78, 408, 64] + - Exact: [80, 80, 152, 64] + - Exact: [80, 84, 152, 64] + - Exact: [84, 84, 152, 64] + - Exact: [85, 85, 376, 64] + - Exact: [93, 93, 344, 64] + - Exact: [102, 102, 312, 64] + - Exact: [112, 112, 288, 64] + - Exact: [122, 122, 264, 64] + - Exact: [123, 122, 264, 64] + - Exact: [123, 123, 264, 64] + - Exact: [511, 2048, 1, 2048] + - Exact: [1024, 512, 1, 1025] + - Exact: [512, 1023, 1, 1024] + - Exact: [1025, 1024, 1, 1024] + - Exact: [2048, 513, 1, 2048] + - Exact: [1024, 1024, 1, 1025] + - Exact: [960, 1024, 1, 1023] + - Exact: [1024, 1024, 1, 1024] + - Exact: [960, 1025, 1, 1024] + - Exact: [2049, 512, 1, 2048] + - Exact: [513, 1024, 1, 1024] + - Exact: [512, 2048, 1, 2048] + - Exact: [1024, 511, 1, 1024] + - Exact: [1024, 512, 1, 1023] + - Exact: [960, 1024, 1, 1025] + - Exact: [959, 1024, 1, 1024] + - Exact: [2048, 512, 1, 2049] + - Exact: [511, 1024, 1, 1024] + - Exact: [512, 2049, 1, 2048] + - Exact: [1024, 513, 1, 1024] + - Exact: [2048, 512, 1, 2047] + - Exact: [1025, 512, 1, 1024] + - Exact: [1024, 1024, 1, 1023] + - Exact: [513, 2048, 1, 2048] + - Exact: [1024, 1025, 1, 1024] + - Exact: [512, 2048, 1, 2049] + - Exact: [1024, 1023, 1, 1024] + - Exact: [960, 1023, 1, 1024] + - Exact: [2048, 511, 1, 2048] + - Exact: [1023, 512, 1, 1024] + - Exact: [2047, 512, 1, 2048] + - Exact: [512, 1024, 1, 1024] + - Exact: [512, 1024, 1, 1025] + - Exact: [512, 2047, 1, 2048] + - Exact: [512, 1025, 1, 1024] + - Exact: [512, 2048, 1, 2047] + - Exact: [960, 1024, 1, 1024] + - Exact: [961, 1024, 1, 1024] + - Exact: [512, 1024, 1, 1023] + - Exact: [1023, 1024, 1, 1024] + - Exact: [479, 1024, 1, 1024] 
+ - Exact: [479, 2048, 1, 2048] + - Exact: [480, 1023, 1, 1024] + - Exact: [480, 1024, 1, 1023] + - Exact: [480, 1024, 1, 1025] + - Exact: [480, 1025, 1, 1024] + - Exact: [480, 2047, 1, 2048] + - Exact: [480, 2048, 1, 2047] + - Exact: [480, 2048, 1, 2049] + - Exact: [480, 2049, 1, 2048] + - Exact: [480, 3071, 1, 3072] + - Exact: [481, 1024, 1, 1024] + - Exact: [481, 2048, 1, 2048] + - Exact: [1023, 480, 1, 1024] + - Exact: [1024, 479, 1, 1024] + - Exact: [1024, 480, 1, 1023] + - Exact: [1024, 480, 1, 1025] + - Exact: [1024, 481, 1, 1024] + - Exact: [1025, 480, 1, 1024] + - Exact: [2047, 480, 1, 2048] + - Exact: [2048, 479, 1, 2048] + - Exact: [2048, 480, 1, 2047] + - Exact: [2048, 480, 1, 2049] + - Exact: [2048, 481, 1, 2048] + - Exact: [2049, 480, 1, 2048] + - Exact: [3071, 480, 1, 3072] + - Exact: [480, 1024, 1, 1024] + - Exact: [480, 2048, 1, 2048] + - Exact: [1024, 480, 1, 1024] + - Exact: [2048, 480, 1, 2048] + - Exact: [1024, 512, 1, 2048] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1600] + - Exact: [1024, 1024, 1, 960] + - Exact: [2048, 215, 1, 512] + - Exact: [2048, 215, 1, 768] + - Exact: [2048, 256, 1, 512] + - Exact: [2048, 256, 1, 768] + - Exact: [2048, 512, 1, 2048] + - Exact: [2048, 512, 1, 67] + - Exact: [2048, 512, 1, 74] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3328, 1, 1024] + - Exact: [256, 3584, 1, 1024] + - Exact: [512, 1600, 1, 512] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3584, 1, 1024] + - Exact: [767, 1280, 1, 768] + - Exact: [769, 1280, 1, 768] + - Exact: [768, 1279, 1, 768] + - Exact: [768, 1281, 1, 768] + - Exact: [768, 1280, 1, 767] + - Exact: [768, 1280, 1, 769] + - Exact: [256, 4096, 1, 512] + - Exact: [767, 768, 1, 768] + 
- Exact: [769, 768, 1, 768] + - Exact: [768, 767, 1, 768] + - Exact: [768, 769, 1, 768] + - Exact: [768, 768, 1, 767] + - Exact: [768, 768, 1, 769] + - Exact: [768, 768, 1, 768] + - Exact: [128, 128, 49, 1152] + - Exact: [128, 128, 49, 1216] + - Exact: [128, 128, 36, 1800] + - Exact: [128, 128, 36, 1900] + - Exact: [128, 128, 64, 5880] + - Exact: [128, 128, 49, 7680] + - Exact: [128, 128, 64, 882] + - Exact: [128, 128, 64, 931] + - Exact: [128, 64, 121, 1152] + - Exact: [128, 64, 81, 12000] + - Exact: [128, 64, 121, 1216] + - Exact: [128, 64, 81, 1800] + - Exact: [128, 64, 81, 1900] + - Exact: [128, 64, 49, 20280] + - Exact: [128, 64, 49, 3042] + - Exact: [128, 64, 49, 3211] + - Exact: [128, 64, 169, 5880] + - Exact: [128, 64, 121, 7680] + - Exact: [128, 64, 169, 882] + - Exact: [128, 64, 169, 931] + - Exact: [256, 128, 25, 1080] + - Exact: [256, 128, 25, 162] + - Exact: [256, 128, 25, 171] + - Exact: [1152, 256, 1, 1] + - Exact: [1152, 256, 1, 1444] + - Exact: [1152, 256, 1, 25] + - Exact: [1152, 256, 1, 9] + - Exact: [2304, 256, 1, 1444] + - Exact: [2304, 340, 1, 1] + - Exact: [2304, 340, 1, 1444] + - Exact: [2304, 340, 1, 9] + - Exact: [2304, 510, 1, 25] + - Exact: [30522, 77, 1, 1024] + - Exact: [1024, 780, 1, 1024] + - Exact: [1024, 800, 1, 1024] + - Exact: [1024, 820, 1, 1024] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 512, 128, 512] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 480] + - Exact: [128, 3456, 1, 256] + - Exact: [128, 4096, 1, 256] + - Exact: [128, 6912, 1, 256] + - Exact: [256, 3456, 1, 512] + - Exact: [512, 864, 1, 1024] + - Exact: [512, 864, 1, 13] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + 
- Exact: [64, 512, 192, 512] + - Exact: [1024, 512, 1, 196] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [128, 128, 64, 6400] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [128, 128, 49, 1120] + - Exact: [128, 128, 49, 1064] + - Exact: [128, 128, 49, 1040] + - Exact: [128, 128, 64, 600] + - Exact: [128, 128, 64, 616] + - Exact: [128, 128, 49, 950] + - Exact: [128, 128, 49, 972] + - Exact: [128, 128, 64, 560] + - Exact: [128, 128, 49, 1008] + - Exact: [128, 128, 64, 532] + - Exact: [128, 128, 49, 1080] + - Exact: [128, 128, 64, 588] + - Exact: [128, 128, 49, 1160] + - Exact: [128, 128, 49, 988] + - Exact: [128, 128, 49, 936] + - Exact: [512, 1024, 1, 3800] + - Exact: [512, 1024, 1, 3400] + - Exact: [512, 1024, 1, 3456] + - Exact: [2048, 512, 1, 950] + - Exact: [512, 1024, 1, 3552] + - Exact: [512, 1024, 1, 3220] + - Exact: [2048, 512, 1, 850] + - Exact: [512, 2048, 1, 864] + - Exact: [512, 2048, 1, 768] + - Exact: [2048, 512, 1, 805] + - Exact: [512, 1024, 1, 2852] + - Exact: [512, 2048, 1, 888] + - Exact: [2048, 512, 1, 864] + - Exact: [2048, 512, 1, 888] + - Exact: [2048, 256, 1, 950] + - Exact: [2048, 512, 1, 713] + - Exact: [512, 1024, 1, 2688] + - Exact: [512, 1024, 1, 2640] + - Exact: [512, 1024, 1, 2904] + - Exact: [1024, 512, 1, 950] + - Exact: [512, 2048, 1, 672] + - Exact: [512, 2048, 1, 660] + - Exact: [512, 2048, 1, 1008] + - Exact: [2048, 256, 1, 850] + - Exact: [2048, 512, 1, 726] + - Exact: [1024, 512, 1, 850] + - Exact: [2048, 512, 1, 660] + - Exact: [2048, 512, 1, 672] + - Exact: [512, 2048, 1, 840] + - Exact: [2048, 512, 1, 1008] + - Exact: [512, 2048, 1, 792] + - Exact: [1024, 512, 1, 805] + - Exact: [512, 2048, 1, 1050] + - Exact: [2048, 512, 1, 748] + - Exact: [2048, 256, 1, 864] + - Exact: [1024, 512, 1, 864] + - Exact: [2048, 512, 1, 875] + - Exact: [2048, 512, 1, 840] + - Exact: [2048, 512, 1, 
792] + - Exact: [512, 2048, 1, 736] + - Exact: [2048, 256, 1, 888] + - Exact: [512, 2048, 1, 704] + - Exact: [512, 2048, 1, 588] + - Exact: [1024, 512, 1, 888] + - Exact: [512, 2048, 1, 816] + - Exact: [1024, 512, 1, 713] + - Exact: [2048, 512, 1, 736] + - Exact: [2048, 512, 1, 588] + - Exact: [2048, 512, 1, 704] + - Exact: [1024, 512, 1, 660] + - Exact: [2048, 256, 1, 660] + - Exact: [2048, 256, 1, 672] + - Exact: [1024, 512, 1, 672] + - Exact: [1024, 512, 1, 726] + - Exact: [512, 2048, 1, 630] + - Exact: [512, 2048, 1, 600] + - Exact: [2048, 256, 1, 805] + - Exact: [2048, 256, 1, 713] + - Exact: [2048, 256, 1, 726] + - Exact: [320, 1024, 1, 1024] + - Exact: [1024, 1000, 1, 1024] + - Exact: [320, 1000, 1, 1024] + - Exact: [128, 128, 49, 1280] + - Exact: [128, 128, 49, 1360] + - Exact: [128, 128, 49, 1200] + - Exact: [128, 128, 49, 1240] + - Exact: [2304, 256, 1, 704] + - Exact: [2304, 256, 1, 736] + - Exact: [2304, 256, 1, 792] + - Exact: [2304, 256, 1, 748] + - Exact: [2304, 256, 1, 726] + - Exact: [2304, 256, 1, 713] + - Exact: [2304, 256, 1, 768] + - Exact: [512, 2048, 1, 759] + - Exact: [512, 2048, 1, 925] + - Exact: [2304, 256, 1, 805] + - Exact: [512, 2048, 1, 900] + - Exact: [512, 2048, 1, 875] + - Exact: [512, 2048, 1, 748] + - Exact: [512, 2048, 1, 726] + - Exact: [512, 2048, 1, 713] + - Exact: [512, 2048, 1, 805] + - Exact: [512, 2048, 1, 850] + - Exact: [512, 2048, 1, 950] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 512, 80, 512] + - Exact: [29000, 109, 1, 2560] + - Exact: [29000, 121, 1, 2560] + - Exact: [29000, 65, 1, 2560] + - Exact: [29000, 66, 1, 2560] + - Exact: [29000, 67, 1, 2560] + - Exact: [29000, 69, 1, 2560] + - Exact: [29000, 70, 1, 2560] + - Exact: [29000, 71, 1, 2560] + - Exact: 
[29000, 73, 1, 2560] + - Exact: [29000, 74, 1, 2560] + - Exact: [29000, 75, 1, 2560] + - Exact: [29000, 77, 1, 2560] + - Exact: [29000, 78, 1, 2560] + - Exact: [29000, 80, 1, 2560] + - Exact: [29000, 81, 1, 2560] + - Exact: [29000, 82, 1, 2560] + - Exact: [29000, 83, 1, 2560] + - Exact: [29000, 84, 1, 2560] + - Exact: [29000, 88, 1, 2560] + - Exact: [29000, 89, 1, 2560] + - Exact: [29000, 90, 1, 2560] + - Exact: [29000, 92, 1, 2560] + - Exact: [29000, 95, 1, 2560] + - Exact: [29000, 98, 1, 2560] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 200, 1, 13312] + - Exact: [1024, 256, 1, 15360] + - Exact: [1024, 256, 1, 16384] + - Exact: [1024, 200, 1, 16384] + - Exact: [1024, 256, 1, 12288] + - Exact: [1024, 200, 1, 12288] + - Exact: [1024, 200, 1, 15360] + - Exact: [1024, 256, 1, 9216] + - Exact: [1024, 200, 1, 14336] + - Exact: [1024, 256, 1, 16640] + - Exact: [1024, 200, 1, 8192] + - Exact: [1024, 200, 1, 10240] + - Exact: [1024, 200, 1, 9216] + - Exact: [1024, 256, 1, 11264] + - Exact: [1024, 200, 1, 8320] + - Exact: [1024, 256, 1, 8320] + - Exact: [1024, 200, 1, 16640] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 13312] + - Exact: [1024, 200, 1, 11264] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 
256, 1, 10240] + - Exact: [96, 64, 64, 18432] + - Exact: [96, 64, 36, 10368] + - Exact: [96, 64, 36, 20736] + - Exact: [96, 96, 36, 10368] + - Exact: [96, 64, 49, 28800] + - Exact: [96, 64, 36, 41472] + - Exact: [64, 64, 11, 233600] + - Exact: [64, 64, 11, 116800] + - Exact: [64, 64, 9, 172864] + - Exact: [64, 64, 11, 58400] + - Exact: [192, 160, 9, 19584] + - Exact: [128, 128, 9, 9792] + - Exact: [192, 160, 11, 13056] + - Exact: [64, 64, 9, 86432] + - Exact: [128, 128, 9, 19584] + - Exact: [160, 160, 11, 13056] + - Exact: [160, 160, 9, 19584] + - Exact: [192, 128, 9, 19584] + - Exact: [192, 160, 9, 9792] + - Exact: [64, 64, 9, 345728] + - Exact: [128, 128, 11, 13056] + - Exact: [160, 160, 9, 9792] + - Exact: [192, 128, 11, 13056] + - Exact: [192, 128, 9, 9792] + - Exact: [128, 64, 25, 43320] + - Exact: [64, 64, 64, 20280] + - Exact: [64, 64, 49, 27000] + - Exact: [64, 64, 36, 43320] + - Exact: [64, 64, 36, 50176] + - Exact: [64, 64, 49, 36864] + - Exact: [64, 64, 64, 25600] + - Exact: [256, 256, 1, 60800] + - Exact: [256, 256, 1, 54400] + - Exact: [256, 256, 1, 51520] + - Exact: [256, 256, 1, 55296] + - Exact: [256, 256, 1, 56832] + - Exact: [256, 256, 1, 45632] + - Exact: [256, 256, 1, 49152] + - Exact: [256, 512, 1, 13600] + - Exact: [256, 256, 1, 43008] + - Exact: [256, 512, 1, 15200] + - Exact: [256, 512, 1, 12880] + - Exact: [256, 512, 1, 13824] + - Exact: [512, 256, 1, 13824] + - Exact: [256, 512, 1, 14208] + - Exact: [512, 256, 1, 14208] + - Exact: [512, 256, 1, 15200] + - Exact: [256, 512, 1, 12288] + - Exact: [512, 256, 1, 12288] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 
8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 200, 1, 560] + - Exact: [768, 320, 1, 768] + - Exact: [1024, 120, 1, 1024] + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [35, 8457, 1, 1760] + - Exact: [64, 2944, 1, 128] + - Exact: [8448, 32, 1, 2816] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [2560, 32, 1, 2560] + - Exact: [128, 1408, 1, 256] + - Exact: [35, 8457, 1, 2560] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [1408, 128, 1, 128] + - Exact: [128, 1024, 1, 1280] + - Exact: [2944, 64, 1, 128] + - Exact: [1024, 128, 1, 3328] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 
1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [35, 8457, 1, 2048] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [35, 8457, 1, 4096] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [128, 704, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [64, 2944, 1, 3328] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 1408, 1, 128] + - Exact: [256, 704, 1, 128] + - Exact: [1408, 64, 1, 128] + - Exact: [448, 448, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2048, 128, 1, 2048] + - Exact: [1408, 128, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 
3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [256, 1024, 1, 196] + - Exact: [1024, 256, 1, 1536] + - Exact: [1024, 200, 1, 1408] + - Exact: [1024, 200, 1, 6144] + - Exact: [1024, 256, 1, 3328] + - Exact: [512, 256, 1, 3200] + - Exact: [1024, 200, 1, 4608] + - Exact: [512, 256, 1, 1792] + - Exact: [1024, 200, 1, 1792] + - Exact: [512, 200, 1, 2816] + - Exact: [512, 200, 1, 3072] + - Exact: [1024, 200, 1, 128] + - Exact: [1024, 200, 1, 5120] + - Exact: [1024, 256, 1, 256] + - Exact: [512, 256, 1, 2560] + - Exact: [1024, 256, 1, 4160] + - Exact: [1024, 200, 1, 512] + - Exact: [512, 512, 1, 1536] + - Exact: [1024, 256, 1, 896] + - Exact: [1024, 200, 1, 3200] + - Exact: [1024, 200, 1, 1536] + - Exact: [1024, 256, 1, 1024] + - Exact: [128, 1024, 1, 512] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 200, 1, 2304] + - Exact: [1024, 256, 1, 1664] + - Exact: [512, 512, 1, 1024] + - Exact: [1024, 256, 1, 2080] + - Exact: [512, 200, 1, 768] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 200, 1, 64] + - Exact: [512, 512, 1, 2304] + - Exact: [128, 1024, 1, 2048] + - Exact: [512, 200, 1, 2560] + - Exact: [512, 256, 1, 1024] + - Exact: [1024, 256, 1, 1920] + - Exact: [512, 200, 1, 2304] + - Exact: [1024, 256, 1, 384] + - Exact: [1024, 256, 1, 32] + - Exact: [1024, 200, 1, 2816] + - Exact: [1024, 200, 1, 3072] + - Exact: [512, 256, 1, 1536] + - Exact: [1024, 256, 1, 512] + - Exact: [256, 512, 1, 512] + - Exact: [1024, 200, 1, 3840] + - Exact: [256, 1024, 1, 512] + - Exact: [1024, 256, 1, 1152] + - Exact: [512, 512, 1, 2816] + - Exact: [512, 200, 1, 1280] + - Exact: [512, 200, 1, 3200] + - Exact: [1024, 256, 1, 2304] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 200, 1, 2560] + - Exact: [1024, 256, 1, 5632] + - Exact: [512, 256, 1, 768] + - Exact: [1024, 256, 1, 3072] + - Exact: [256, 512, 1, 2048] + - Exact: [1024, 200, 1, 1152] + - Exact: [512, 
512, 1, 3072] + - Exact: [1024, 200, 1, 1664] + - Exact: [1024, 200, 1, 32] + - Exact: [1024, 200, 1, 384] + - Exact: [512, 256, 1, 2304] + - Exact: [256, 512, 1, 1024] + - Exact: [1024, 200, 1, 3328] + - Exact: [1024, 200, 1, 2080] + - Exact: [512, 200, 1, 1792] + - Exact: [1024, 256, 1, 1792] + - Exact: [1024, 200, 1, 7168] + - Exact: [512, 256, 1, 3072] + - Exact: [1024, 200, 1, 2048] + - Exact: [512, 512, 1, 1280] + - Exact: [1024, 200, 1, 1280] + - Exact: [512, 200, 1, 512] + - Exact: [1024, 256, 1, 2560] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 256, 1, 3200] + - Exact: [512, 512, 1, 2560] + - Exact: [1024, 256, 1, 640] + - Exact: [1024, 256, 1, 3584] + - Exact: [512, 512, 1, 3200] + - Exact: [1024, 256, 1, 7680] + - Exact: [512, 200, 1, 1536] + - Exact: [512, 256, 1, 2816] + - Exact: [1024, 200, 1, 768] + - Exact: [512, 200, 1, 2048] + - Exact: [1024, 256, 1, 128] + - Exact: [1024, 200, 1, 4096] + - Exact: [1024, 256, 1, 1280] + - Exact: [1024, 200, 1, 896] + - Exact: [1024, 256, 1, 4608] + - Exact: [128, 1024, 1, 1024] + - Exact: [1024, 256, 1, 2048] + - Exact: [512, 256, 1, 1280] + - Exact: [256, 1024, 1, 2048] + - Exact: [512, 512, 1, 2048] + - Exact: [512, 256, 1, 512] + - Exact: [1024, 200, 1, 7680] + - Exact: [1024, 200, 1, 6656] + - Exact: [512, 200, 1, 1024] + - Exact: [1024, 256, 1, 3840] + - Exact: [512, 512, 1, 768] + - Exact: [1024, 256, 1, 64] + - Exact: [1024, 200, 1, 1920] + - Exact: [1024, 256, 1, 7168] + - Exact: [512, 512, 1, 1792] + - Exact: [1024, 200, 1, 256] + - Exact: [256, 1024, 1, 1024] + - Exact: [1024, 200, 1, 640] + - Exact: [1024, 200, 1, 4160] + - Exact: [1024, 200, 1, 5632] + - Exact: [1024, 256, 1, 6656] + - Exact: [1024, 256, 1, 768] + - Exact: [512, 256, 1, 2048] + - Exact: [1024, 200, 1, 3584] + - Exact: [1024, 256, 1, 1408] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 128, 1, 289] + - Exact: [768, 192, 1, 289] + - Exact: [32, 32, 1984, 64] + - Exact: [54, 54, 1184, 64] + - Exact: [35, 35, 1808, 64] + - Exact: 
[45, 45, 1424, 64] + - Exact: [49, 49, 1296, 64] + - Exact: [59, 59, 1088, 64] + - Exact: [41, 41, 1552, 64] + - Exact: [38, 38, 1680, 64] + - Exact: [2048, 128, 1, 4096] + - Exact: [1024, 128, 1, 1024] + - Exact: [1152, 128, 1, 784] + - Exact: [864, 96, 1, 1225] + - Exact: [896, 192, 1, 289] + - Exact: [768, 128, 1, 289] + - Exact: [1344, 192, 1, 289] + - Exact: [384, 192, 1, 1225] + - Exact: [832, 192, 1, 49] + - Exact: [1280, 192, 1, 64] + - Exact: [512, 256, 1, 196] + - Exact: [864, 96, 1, 289] + - Exact: [896, 128, 1, 289] + - Exact: [1200, 64, 1, 1225] + - Exact: [1024, 256, 1, 289] + - Exact: [1024, 256, 1, 196] + - Exact: [1120, 192, 1, 289] + - Exact: [800, 96, 1, 784] + - Exact: [864, 128, 1, 784] + - Exact: [1344, 224, 1, 289] + - Exact: [1152, 192, 1, 784] + - Exact: [800, 128, 1, 196] + - Exact: [864, 208, 1, 196] + - Exact: [720, 192, 1, 5041] + - Exact: [576, 192, 1, 3136] + - Exact: [832, 256, 1, 49] + - Exact: [1200, 128, 1, 49] + - Exact: [528, 256, 1, 196] + - Exact: [256, 512, 1, 784] + - Exact: [480, 192, 1, 196] + - Exact: [96, 64, 36, 2592] + - Exact: [96, 96, 36, 2592] + - Exact: [1024, 192, 1, 289] + - Exact: [528, 160, 1, 196] + - Exact: [512, 160, 1, 196] + - Exact: [768, 160, 1, 289] + - Exact: [64, 32, 36, 43808] + - Exact: [832, 160, 1, 49] + - Exact: [2048, 64, 1, 1001] + - Exact: [2048, 128, 1, 1001] + - Exact: [1536, 64, 1, 1001] + - Exact: [96, 96, 49, 3136] + - Exact: [64, 32, 49, 57600] + - Exact: [96, 64, 49, 6272] + - Exact: [64, 32, 49, 115200] + - Exact: [96, 96, 64, 2304] + - Exact: [96, 96, 49, 6272] + - Exact: [96, 64, 36, 5184] + - Exact: [64, 32, 64, 40000] + - Exact: [96, 64, 64, 4608] + - Exact: [96, 96, 36, 5184] + - Exact: [96, 64, 64, 2304] + - Exact: [96, 64, 49, 3136] + - Exact: [64, 32, 36, 87616] + - Exact: [64, 32, 64, 80000] + - Exact: [96, 96, 64, 4608] + - Exact: [64, 32, 36, 175232] + - Exact: [128, 128, 11, 3264] + - Exact: [192, 128, 11, 6528] + - Exact: [128, 128, 11, 6528] + - Exact: [160, 160, 9, 4896] 
+ - Exact: [192, 160, 11, 6528] + - Exact: [192, 128, 9, 4896] + - Exact: [128, 128, 9, 4896] + - Exact: [192, 128, 11, 3264] + - Exact: [160, 160, 11, 3264] + - Exact: [192, 160, 9, 4896] + - Exact: [192, 160, 11, 3264] + - Exact: [160, 160, 11, 6528] + - Exact: [4096, 64, 1, 1024] + - Exact: [49, 49, 160, 64] + - Exact: [54, 54, 592, 64] + - Exact: [59, 59, 512, 64] + - Exact: [104, 104, 16, 64] + - Exact: [32, 32, 624, 64] + - Exact: [32, 32, 992, 64] + - Exact: [35, 35, 384, 64] + - Exact: [35, 35, 904, 64] + - Exact: [38, 38, 320, 64] + - Exact: [38, 38, 840, 64] + - Exact: [41, 41, 312, 64] + - Exact: [41, 41, 776, 64] + - Exact: [45, 45, 392, 64] + - Exact: [45, 45, 712, 64] + - Exact: [49, 49, 648, 64] + - Exact: [54, 54, 200, 64] + - Exact: [59, 59, 544, 64] + - Exact: [91, 91, 40, 64] + - Exact: [91, 93, 40, 64] + - Exact: [93, 93, 40, 64] + - Exact: [102, 102, 56, 64] + - Exact: [103, 103, 16, 64] + - Exact: [103, 104, 16, 64] + - Exact: [112, 112, 16, 64] + - Exact: [112, 123, 16, 64] + - Exact: [119, 119, 32, 64] + - Exact: [119, 135, 32, 64] + - Exact: [123, 123, 16, 64] + - Exact: [512, 512, 1, 512] + - Exact: [513, 512, 1, 512] + - Exact: [512, 512, 1, 513] + - Exact: [512, 512, 1, 511] + - Exact: [512, 513, 1, 512] + - Exact: [512, 511, 1, 512] + - Exact: [511, 512, 1, 512] + - Exact: [479, 512, 1, 512] + - Exact: [480, 511, 1, 512] + - Exact: [480, 512, 1, 511] + - Exact: [480, 512, 1, 513] + - Exact: [480, 513, 1, 512] + - Exact: [481, 512, 1, 512] + - Exact: [511, 480, 1, 512] + - Exact: [512, 479, 1, 512] + - Exact: [512, 480, 1, 511] + - Exact: [512, 480, 1, 513] + - Exact: [512, 481, 1, 512] + - Exact: [513, 480, 1, 512] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 64] + - Exact: [2048, 114, 1, 512] + - Exact: [2048, 114, 1, 768] + - Exact: [256, 684, 1, 1024] + - Exact: [33, 33, 1600, 32] + - Exact: [256, 684, 1, 1024] + - Exact: [383, 384, 1, 384] + - Exact: [385, 384, 1, 384] + - Exact: [384, 383, 1, 
384] + - Exact: [384, 385, 1, 384] + - Exact: [384, 384, 1, 383] + - Exact: [384, 384, 1, 385] + - Exact: [384, 384, 1, 384] + - Exact: [128, 64, 25, 6498] + - Exact: [128, 64, 25, 6859] + - Exact: [64, 64, 64, 3042] + - Exact: [64, 64, 64, 3211] + - Exact: [64, 64, 49, 4050] + - Exact: [64, 64, 49, 4275] + - Exact: [64, 64, 36, 6498] + - Exact: [64, 64, 36, 6859] + - Exact: [1152, 128, 1, 1444] + - Exact: [512, 256, 1, 361] + - Exact: [576, 128, 1, 1444] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [32, 32, 4608, 64] + - Exact: [32, 35, 4608, 64] + - Exact: [34, 34, 4736, 64] + - Exact: [35, 35, 4608, 64] + - Exact: [128, 864, 1, 256] + - Exact: [256, 864, 1, 512] + - Exact: [512, 256, 1, 784] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 256, 1, 3800] + - Exact: [1024, 256, 1, 3400] + - Exact: [256, 1024, 1, 3400] + - Exact: [1024, 256, 1, 3220] + - Exact: [256, 1024, 1, 3220] + - Exact: [1024, 256, 1, 3456] + - Exact: [256, 1024, 1, 3456] + - Exact: [256, 1024, 1, 3072] + - Exact: [1024, 256, 1, 3552] + - Exact: [256, 1024, 1, 3552] + - Exact: [256, 1024, 1, 2852] + - Exact: [1024, 256, 1, 2852] + - Exact: [256, 512, 1, 10752] + - Exact: [256, 1024, 1, 3800] + - Exact: [256, 512, 1, 10560] + - Exact: [256, 1024, 1, 2992] + - Exact: [256, 1024, 1, 2688] + - Exact: [1024, 256, 1, 2688] + - Exact: [256, 1024, 1, 2904] + - Exact: [1024, 256, 1, 2904] + - Exact: [256, 1024, 1, 2640] + - Exact: [1024, 256, 1, 2640] + - Exact: [1024, 256, 1, 4032] + - Exact: [1024, 256, 1, 2992] + - Exact: [256, 1024, 1, 3360] + - Exact: [1024, 256, 1, 3360] + - Exact: [1024, 256, 1, 3500] + - Exact: [256, 1024, 1, 3500] + - Exact: [1024, 256, 1, 3168] + - Exact: [256, 1024, 1, 3168] + - Exact: [256, 1024, 1, 3036] + - Exact: [1024, 256, 1, 4200] + - Exact: [1024, 256, 1, 3600] + - Exact: [256, 1024, 1, 3600] + - Exact: [256, 1024, 1, 2944] + - Exact: [1024, 256, 1, 2944] + - Exact: [1024, 256, 1, 3700] + - Exact: [256, 
1024, 1, 2352] + - Exact: [1024, 256, 1, 2352] + - Exact: [256, 1024, 1, 3700] + - Exact: [256, 1024, 1, 2816] + - Exact: [256, 512, 1, 11408] + - Exact: [1024, 256, 1, 3036] + - Exact: [1024, 256, 1, 3264] + - Exact: [256, 1024, 1, 3264] + - Exact: [1024, 256, 1, 3864] + - Exact: [256, 1024, 1, 4032] + - Exact: [1024, 256, 1, 3128] + - Exact: [256, 1024, 1, 3128] + - Exact: [256, 1024, 1, 3200] + - Exact: [256, 512, 1, 11616] + - Exact: [1024, 256, 1, 4000] + - Exact: [256, 1024, 1, 2520] + - Exact: [1024, 256, 1, 2520] + - Exact: [256, 1024, 1, 2976] + - Exact: [256, 1024, 1, 2400] + - Exact: [1024, 256, 1, 2400] + - Exact: [1024, 256, 1, 3696] + - Exact: [1024, 256, 1, 3900] + - Exact: [1024, 256, 1, 3772] + - Exact: [256, 1024, 1, 3696] + - Exact: [256, 1024, 1, 2728] + - Exact: [1024, 256, 1, 2728] + - Exact: [1024, 256, 1, 2480] + - Exact: [256, 1024, 1, 2480] + - Exact: [1024, 256, 1, 2880] + - Exact: [512, 256, 1, 3220] + - Exact: [256, 1024, 1, 2880] + - Exact: [256, 1024, 1, 4200] + - Exact: [1024, 256, 1, 3648] + - Exact: [1024, 256, 1, 3312] + - Exact: [256, 1024, 1, 3648] + - Exact: [1024, 256, 1, 3300] + - Exact: [1024, 256, 1, 3528] + - Exact: [256, 1024, 1, 2604] + - Exact: [1024, 256, 1, 2604] + - Exact: [512, 256, 1, 11408] + - Exact: [256, 1024, 1, 3312] + - Exact: [256, 1024, 1, 3300] + - Exact: [256, 1024, 1, 3528] + - Exact: [1024, 256, 1, 2976] + - Exact: [1024, 256, 1, 2760] + - Exact: [512, 256, 1, 3800] + - Exact: [256, 1024, 1, 2760] + - Exact: [1024, 256, 1, 2160] + - Exact: [256, 1024, 1, 2160] + - Exact: [512, 256, 1, 11616] + - Exact: [512, 256, 1, 2852] + - Exact: [256, 1024, 1, 3864] + - Exact: [512, 256, 1, 2640] + - Exact: [256, 1024, 1, 4000] + - Exact: [512, 256, 1, 2904] + - Exact: [256, 1024, 1, 3900] + - Exact: [512, 256, 1, 2688] + - Exact: [256, 1024, 1, 3772] + - Exact: [512, 256, 1, 3400] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 3552] + - Exact: [29000, 35, 1, 2560] + - Exact: [29000, 36, 1, 2560] + - 
Exact: [29000, 39, 1, 2560] + - Exact: [29000, 40, 1, 2560] + - Exact: [29000, 42, 1, 2560] + - Exact: [29000, 43, 1, 2560] + - Exact: [29000, 44, 1, 2560] + - Exact: [29000, 46, 1, 2560] + - Exact: [29000, 48, 1, 2560] + - Exact: [29000, 49, 1, 2560] + - Exact: [29000, 50, 1, 2560] + - Exact: [29000, 51, 1, 2560] + - Exact: [29000, 53, 1, 2560] + - Exact: [29000, 54, 1, 2560] + - Exact: [29000, 55, 1, 2560] + - Exact: [29000, 56, 1, 2560] + - Exact: [29000, 57, 1, 2560] + - Exact: [29000, 58, 1, 2560] + - Exact: [29000, 59, 1, 2560] + - Exact: [29000, 61, 1, 2560] + - Exact: [29000, 63, 1, 2560] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [288, 64, 1, 21609] + - Exact: [32, 32, 36, 43808] + - Exact: [32, 32, 64, 40000] + - Exact: [32, 32, 49, 115200] + - Exact: [32, 32, 36, 175232] + - Exact: [32, 32, 49, 57600] + - Exact: [32, 32, 36, 87616] + - Exact: [32, 32, 64, 80000] + - Exact: [256, 128, 1, 13600] + - Exact: [256, 128, 1, 12880] + - Exact: [128, 512, 1, 15200] + - Exact: [512, 128, 1, 15200] + - Exact: [128, 512, 1, 11408] + - Exact: [256, 128, 1, 13824] + - Exact: [128, 512, 1, 11616] + - Exact: [256, 128, 1, 14208] + - Exact: [128, 512, 1, 14208] + - Exact: [256, 128, 1, 15200] + - Exact: [512, 128, 1, 11408] + - Exact: [512, 128, 1, 16800] + - Exact: [128, 
512, 1, 11264] + - Exact: [512, 128, 1, 11616] + - Exact: [512, 128, 1, 16128] + - Exact: [512, 128, 1, 11968] + - Exact: [128, 512, 1, 11968] + - Exact: [512, 128, 1, 12288] + - Exact: [128, 512, 1, 12288] + - Exact: [128, 512, 1, 12672] + - Exact: [512, 128, 1, 11776] + - Exact: [512, 128, 1, 12144] + - Exact: [512, 128, 1, 11264] + - Exact: [128, 512, 1, 12144] + - Exact: [512, 128, 1, 12672] + - Exact: [128, 512, 1, 12512] + - Exact: [128, 512, 1, 11776] + - Exact: [256, 128, 1, 12288] + - Exact: [40, 40, 1, 1909283] + - Exact: [40, 40, 1, 3818566] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [30522, 20, 1, 1024] + - Exact: [1760, 32, 1, 1760] + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [1760, 16, 1, 1760] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [8448, 16, 1, 2816] + - Exact: [2368, 4, 1, 256] + - Exact: [5888, 4, 1, 128] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [2048, 16, 1, 2048] + - Exact: [1408, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2368, 4, 1, 3328] + - Exact: [5056, 4, 1, 1280] + - Exact: [3072, 16, 1, 1024] + - Exact: [1408, 4, 1, 3328] 
+ - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [1856, 4, 1, 256] + - Exact: [6784, 4, 1, 128] + - Exact: [4288, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [2560, 16, 1, 2560] + - Exact: [7680, 16, 1, 2560] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [6784, 4, 1, 3328] + - Exact: [2048, 32, 1, 2048] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 1280] + - Exact: [4608, 16, 1, 1536] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [2048, 32, 1, 1001] + - Exact: [1536, 32, 1, 1001] + - Exact: [1600, 1, 1, 1024] + - Exact: [32768, 1, 1, 256] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2560] + - Exact: [3456, 1, 1, 256] + - Exact: [4096, 1, 1, 256] + - Exact: [6912, 1, 1, 256] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2560] + - Exact: [29000, 27, 1, 2560] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 
3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + - Exact: [32, 1600, 1, 512] + - Exact: [2, 2048, 1, 1024] + - Exact: [1, 4096, 1, 256] + - Exact: [1, 6912, 1, 256] + - Exact: [2, 2048, 1, 768] + - Exact: [2, 4608, 1, 768] + - Exact: [2, 4608, 1, 1024] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [64, 80, 1, 
5329] + - Exact: [576, 96, 1, 5329] + - Exact: [288, 32, 1, 21609] + - Exact: [576, 96, 1, 5041] + - Exact: [27, 32, 1, 22201] + - Exact: [160, 64, 1, 5329] + - Exact: [448, 64, 1, 5329] + - Exact: [147, 64, 1, 12544] + - Exact: [147, 64, 1, 22500] + - Exact: [576, 64, 1, 5625] + - Exact: [256, 128, 1, 10752] + - Exact: [256, 128, 1, 10560] + - Exact: [256, 128, 1, 11408] + - Exact: [256, 12, 1, 11408] + - Exact: [256, 128, 1, 11616] + - Exact: [256, 12, 1, 11616] + - Exact: [256, 12, 1, 12288] + - Exact: [11, 11, 1, 1909283] + - Exact: [11, 11, 1, 3818566] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [768, 32, 1, 768] + - Exact: [768, 64, 1, 768] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 20, 1, 1024] + - Exact: [768, 16, 1, 768] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 6, 1, 1024] + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [448, 128, 1, 128] + - Exact: [256, 256, 1, 3328] + - Exact: [4, 64, 1, 1280] + - Exact: [64, 
64, 1, 3328] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 1024, 1, 1280] + - Exact: [4, 448, 1, 1280] + - Exact: [448, 4, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [64, 128, 1, 128] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [4, 448, 1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [4, 64, 1, 128] + - Exact: [64, 1024, 1, 128] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] 
+ - Exact: [4, 4, 1, 3328] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [4, 128, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 200, 1, 1024] + - Exact: [32, 512, 1, 1024] + - Exact: [1, 512, 1, 1024] + - Exact: [128, 512, 1, 2048] + - Exact: [64, 256, 1, 1024] + - Exact: [1, 200, 1, 1024] + - Exact: [128, 512, 1, 1024] + - Exact: [32, 256, 1, 2048] + - Exact: [32, 256, 1, 512] + - Exact: [256, 200, 1, 1024] + - Exact: [1, 256, 1, 2048] + - Exact: [32, 200, 1, 2048] + - Exact: [128, 200, 1, 1024] + - Exact: [128, 256, 1, 2048] + - Exact: [64, 1024, 1, 1024] + - Exact: [1, 512, 1, 2048] + - Exact: [128, 256, 1, 512] + - Exact: [128, 200, 1, 2048] + - Exact: [64, 200, 1, 512] + - Exact: [1, 256, 1, 1024] + - Exact: [1, 1024, 1, 1024] + - Exact: [256, 256, 1, 2048] + - Exact: [128, 256, 1, 1024] + - Exact: [1, 256, 1, 4096] + - Exact: [32, 512, 1, 512] + - Exact: [64, 200, 1, 2048] + - Exact: [1, 200, 1, 2048] + - Exact: [1, 512, 1, 4096] + - Exact: [256, 256, 1, 1024] + - Exact: [64, 256, 1, 2048] + - Exact: [1, 200, 1, 4096] + - Exact: [32, 256, 
1, 1024] + - Exact: [32, 200, 1, 1024] + - Exact: [32, 512, 1, 2048] + - Exact: [128, 200, 1, 512] + - Exact: [64, 1024, 1, 2048] + - Exact: [1, 1024, 1, 2048] + - Exact: [32, 1024, 1, 512] + - Exact: [64, 1024, 1, 512] + - Exact: [1, 1024, 1, 4096] + - Exact: [64, 256, 1, 512] + - Exact: [256, 200, 1, 512] + - Exact: [32, 1024, 1, 1024] + - Exact: [32, 200, 1, 512] + - Exact: [256, 256, 1, 512] + - Exact: [128, 512, 1, 512] + - Exact: [256, 200, 1, 2048] + - Exact: [64, 512, 1, 2048] + - Exact: [32, 1024, 1, 2048] + - Exact: [256, 64, 1, 1225] + - Exact: [384, 64, 1, 1225] + - Exact: [288, 64, 1, 1225] + - Exact: [384, 96, 1, 1225] + - Exact: [11, 11, 5456, 64] + - Exact: [14, 14, 4368, 64] + - Exact: [23, 23, 2720, 64] + - Exact: [13, 13, 4672, 64] + - Exact: [29, 29, 2176, 64] + - Exact: [12, 12, 5040, 64] + - Exact: [27, 27, 2336, 64] + - Exact: [10, 10, 5952, 64] + - Exact: [7, 7, 8192, 64] + - Exact: [16, 16, 3840, 64] + - Exact: [17, 17, 3632, 64] + - Exact: [9, 9, 6544, 64] + - Exact: [8, 8, 7280, 64] + - Exact: [21, 21, 2976, 64] + - Exact: [19, 19, 3264, 64] + - Exact: [25, 25, 2512, 64] + - Exact: [18, 18, 3440, 64] + - Exact: [15, 15, 4096, 64] + - Exact: [2, 16, 1, 768] + - Exact: [2, 8, 1, 768] + - Exact: [2, 64, 1, 768] + - Exact: [256, 128, 1, 784] + - Exact: [192, 48, 1, 1225] + - Exact: [64, 256, 1, 3136] + - Exact: [512, 144, 1, 196] + - Exact: [400, 32, 1, 784] + - Exact: [832, 48, 1, 49] + - Exact: [192, 32, 1, 784] + - Exact: [288, 48, 1, 1225] + - Exact: [512, 112, 1, 196] + - Exact: [528, 32, 1, 196] + - Exact: [576, 64, 1, 3136] + - Exact: [480, 64, 1, 196] + - Exact: [192, 64, 1, 784] + - Exact: [192, 32, 1, 1225] + - Exact: [400, 48, 1, 196] + - Exact: [480, 16, 1, 196] + - Exact: [512, 64, 1, 196] + - Exact: [800, 64, 1, 196] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 784] + - Exact: [256, 48, 1, 1225] + - Exact: [192, 16, 1, 784] + - Exact: [576, 96, 1, 1225] + - Exact: [512, 128, 1, 196] + - Exact: [192, 96, 1, 784] + - 
Exact: [192, 64, 1, 1225] + - Exact: [512, 32, 1, 196] + - Exact: [528, 128, 1, 196] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + - Exact: [256, 32, 1, 784] + - Exact: [480, 96, 1, 196] + - Exact: [1024, 32, 1, 1001] + - Exact: [18, 18, 648, 64] + - Exact: [7, 7, 736, 64] + - Exact: [8, 8, 264, 64] + - Exact: [9, 9, 416, 64] + - Exact: [10, 10, 448, 64] + - Exact: [11, 11, 568, 64] + - Exact: [12, 12, 480, 64] + - Exact: [12, 12, 2520, 64] + - Exact: [13, 13, 576, 64] + - Exact: [13, 13, 2336, 64] + - Exact: [14, 14, 704, 64] + - Exact: [14, 14, 2184, 64] + - Exact: [15, 15, 688, 64] + - Exact: [15, 15, 2048, 64] + - Exact: [16, 16, 712, 64] + - Exact: [16, 16, 1920, 64] + - Exact: [17, 17, 688, 64] + - Exact: [17, 17, 1816, 64] + - Exact: [18, 18, 1720, 64] + - Exact: [19, 19, 680, 64] + - Exact: [19, 19, 1632, 64] + - Exact: [21, 21, 1472, 64] + - Exact: [21, 21, 1488, 64] + - Exact: [23, 23, 64, 64] + - Exact: [23, 23, 1360, 64] + - Exact: [25, 25, 176, 64] + - Exact: [25, 25, 1256, 64] + - Exact: [26, 26, 56, 64] + - Exact: [26, 27, 56, 64] + - Exact: [27, 27, 56, 64] + - Exact: [27, 27, 1168, 64] + - Exact: [29, 29, 136, 64] + - Exact: [29, 29, 1088, 64] + - Exact: [256, 1, 1, 4] + - Exact: [2, 1, 1, 1024] + - Exact: [1024, 1, 1, 1024] + - Exact: [2, 6, 1, 1024] + - Exact: [2, 8, 1, 1024] + - Exact: [14, 14, 1, 64] + - Exact: [15, 14, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [17, 15, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [30, 30, 1, 64] + - Exact: [30, 31, 1, 64] + - Exact: [31, 31, 1, 64] + - Exact: [1024, 32, 1, 1024] + - Exact: [2, 32, 1, 1024] + - Exact: [2, 4, 1, 1024] + - Exact: [64, 512, 1, 512] + - Exact: [64, 960, 1, 1024] + - Exact: [200, 1, 1, 1024] + - Exact: [512, 1, 1, 2048] + - Exact: [64, 512, 1, 1024] + - Exact: [3, 3, 512, 64] + - Exact: [5, 5, 512, 64] + - Exact: [9, 9, 512, 64] + - Exact: [128, 256, 1, 1444] + - Exact: [256, 128, 1, 25] + - Exact: [256, 128, 1, 9] + - Exact: [256, 256, 1, 1444] + - Exact: [512, 
128, 1, 100] + - Exact: [64, 128, 1, 1444] + - Exact: [1024, 77, 1, 1024] + - Exact: [2, 10, 1, 1024] + - Exact: [1024, 10, 1, 1024] + - Exact: [2, 39, 1, 1024] + - Exact: [1024, 39, 1, 1024] + - Exact: [2, 40, 1, 1024] + - Exact: [1024, 40, 1, 1024] + - Exact: [2, 41, 1, 1024] + - Exact: [1024, 41, 1, 1024] + - Exact: [2, 5, 1, 1024] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 8, 1, 1024] + - Exact: [2, 9, 1, 1024] + - Exact: [1024, 9, 1, 1024] + - Exact: [4, 4, 32768, 64] + - Exact: [4, 4, 38400, 64] + - Exact: [14, 14, 10880, 64] + - Exact: [15, 14, 10880, 64] + - Exact: [15, 15, 7680, 64] + - Exact: [15, 15, 10880, 64] + - Exact: [17, 15, 7680, 64] + - Exact: [17, 17, 6144, 64] + - Exact: [17, 17, 7680, 64] + - Exact: [21, 17, 6144, 64] + - Exact: [21, 21, 6144, 64] + - Exact: [24, 24, 4736, 64] + - Exact: [30, 30, 2048, 64] + - Exact: [30, 31, 2048, 64] + - Exact: [31, 31, 2048, 64] + - Exact: [34, 24, 4736, 64] + - Exact: [128, 128, 1, 64] + - Exact: [2, 1024, 1, 1024] + - Exact: [5, 5, 1, 64] + - Exact: [33, 33, 1, 32] + - Exact: [5, 5, 960, 64] + - Exact: [27, 27, 32768, 128] + - Exact: [960, 1, 1, 2048] + - Exact: [2, 2, 1, 2048] + - Exact: [1024, 16, 1, 1024] + - Exact: [2, 16, 1, 1024] + - Exact: [2, 4, 1, 2560] + - Exact: [1024, 64, 1, 1024] + - Exact: [2, 64, 1, 1024] + - Exact: [864, 1, 1, 256] + - Exact: [2, 80, 1, 1024] + - Exact: [1024, 82, 1, 1024] + - Exact: [2, 82, 1, 1024] + - Exact: [1024, 12, 1, 1024] + - Exact: [2, 12, 1, 1024] + - Exact: [24, 24, 6816, 64] + - Exact: [26, 26, 6272, 64] + - Exact: [256, 128, 1, 3136] + - Exact: [2, 128, 1, 1024] + - Exact: [2, 96, 1, 1024] + - Exact: [768, 12, 1, 768] + - Exact: [768, 4, 1, 768] + - Exact: [256, 80, 1, 784] + - Exact: [256, 12, 1, 3800] + - Exact: [256, 3, 1, 3800] + - Exact: [256, 12, 1, 950] + - Exact: [256, 3, 1, 950] + - Exact: [256, 12, 1, 3220] + - Exact: [256, 3, 1, 3220] + - Exact: [256, 12, 1, 3072] + - Exact: [256, 3, 1, 3072] + - Exact: [256, 12, 1, 850] + - Exact: [256, 3, 1, 
850] + - Exact: [256, 12, 1, 2852] + - Exact: [256, 3, 1, 2852] + - Exact: [256, 12, 1, 805] + - Exact: [256, 3, 1, 805] + - Exact: [256, 3, 1, 864] + - Exact: [256, 3, 1, 768] + - Exact: [256, 12, 1, 864] + - Exact: [256, 12, 1, 768] + - Exact: [256, 12, 1, 2904] + - Exact: [256, 3, 1, 2904] + - Exact: [256, 3, 1, 713] + - Exact: [256, 12, 1, 888] + - Exact: [256, 3, 1, 888] + - Exact: [256, 12, 1, 713] + - Exact: [256, 3, 1, 660] + - Exact: [256, 3, 1, 672] + - Exact: [256, 12, 1, 660] + - Exact: [256, 3, 1, 726] + - Exact: [256, 12, 1, 672] + - Exact: [256, 3, 1, 247] + - Exact: [256, 12, 1, 726] + - Exact: [256, 3, 1, 216] + - Exact: [256, 3, 1, 3400] + - Exact: [256, 3, 1, 221] + - Exact: [256, 12, 1, 3552] + - Exact: [256, 3, 1, 3456] + - Exact: [256, 3, 1, 204] + - Exact: [256, 12, 1, 3400] + - Exact: [256, 12, 1, 3456] + - Exact: [256, 12, 1, 221] + - Exact: [256, 3, 1, 3552] + - Exact: [256, 3, 1, 228] + - Exact: [256, 3, 1, 234] + - Exact: [256, 12, 1, 234] + - Exact: [81, 1024, 1, 1024] + - Exact: [81, 1000, 1, 1024] + - Exact: [256, 12, 1, 228] + - Exact: [256, 3, 1, 252] + - Exact: [256, 12, 1, 252] + - Exact: [256, 12, 1, 247] + - Exact: [1024, 6, 1, 2] + - Exact: [2, 8, 1, 2048] + - Exact: [2, 20, 1, 1024] + - Exact: [2, 2, 1, 2560] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml new file mode 100644 index 000000000..a9743054a --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml @@ -0,0 +1,1293 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + 
ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + StridedBatched: False + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 
5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [448, 5056, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [5888, 2944, 1, 128] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [1024, 4288, 1, 128] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [6784, 5888, 1, 3328] + - Exact: 
[1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 1408, 1, 128] + - Exact: [4288, 5056, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 128] + - Exact: [5888, 1856, 1, 128] + - Exact: [2368, 1024, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - 
Exact: [1856, 1024, 1, 256] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [1856, 4288, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [2944, 4288, 1, 3328] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [1856, 2368, 1, 128] + - Exact: [3584, 704, 1, 1280] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: 
[1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [5056, 5056, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [2368, 5056, 1, 128] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [5888, 3584, 1, 128] + - Exact: [1856, 1856, 1, 128] + - Exact: [3584, 1024, 1, 3328] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - 
Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [2944, 2944, 1, 128] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [1024, 2368, 1, 128] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] 
+ - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 4288, 1, 3328] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [704, 4288, 1, 128] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - 
Exact: [1408, 5888, 1, 128] + - Exact: [1408, 1024, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 5056, 1, 128] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + 
- Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 6784, 1, 128] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [2368, 6784, 1, 128] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [1856, 5056, 1, 128] + - Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5888, 1408, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [4288, 1024, 1, 128] + - Exact: [1920, 2048, 1, 2048] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3840, 4096, 1, 4096] + - Exact: [7680, 8192, 1, 
8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 3072, 1, 3072] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8192, 1, 8192] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: 
[2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [448, 2944, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - 
Exact: [704, 448, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [64, 6784, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [2368, 256, 1, 128] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 128, 1, 1280] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [64, 6784, 1, 
1280] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [1024, 1024, 1, 1024] + - Exact: [448, 1024, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [64, 6784, 1, 128] + - Exact: [5888, 64, 1, 1280] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [1024, 1024, 1, 128] + - Exact: [64, 5056, 1, 128] + - Exact: [5888, 64, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 
704, 1, 3328] + - Exact: [960, 1024, 1, 1024] + - Exact: [768, 768, 1, 768] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [4288, 64, 1, 1280] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [64, 2944, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1408, 128, 1, 128] + - Exact: [1024, 256, 1, 128] + - 
Exact: [2944, 64, 1, 128] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 2944, 1, 1280] + - Exact: [256, 704, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - 
Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [512, 512, 1, 512] + - Exact: [384, 384, 1, 384] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 128] + - Exact: [2368, 4, 1, 256] + - Exact: [1408, 4, 1, 256] + - Exact: [5056, 4, 1, 1280] + - Exact: [1408, 4, 1, 3328] + - Exact: [6784, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [1856, 4, 1, 256] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - 
Exact: [5056, 4, 1, 3328] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # 
, 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [64, 1024, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 1024, 1, 128] + - Exact: [4, 64, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [448, 4, 1, 1280] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 256, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - 
Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [4, 448, 1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [4, 128, 1, 3328] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [64, 64, 1, 3328] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 128, 1, 128] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 
128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml new file mode 100644 index 000000000..5a310a265 --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml @@ -0,0 +1,5202 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - 
Exact: [1024, 4096, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 1024] + - Exact: [4096, 2048, 1, 1024] + - Exact: [768, 4096, 1, 2] + - Exact: [768, 4096, 1, 768] + - Exact: [3072, 4096, 1, 768] + - Exact: [768, 2048, 1, 2] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [3072, 1024, 1, 768] + - Exact: [3072, 512, 1, 768] + - Exact: [1024, 3072, 1, 1024] + - Exact: [3072, 2048, 1, 1024] + - Exact: [3072, 3072, 1, 1024] + - Exact: [3072, 512, 1, 1024] + - Exact: [3072, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 2] + - Exact: [1024, 3072, 1, 2] + - Exact: [1024, 4096, 1, 2] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [6144, 6000, 1, 2560] + - Exact: [2368, 6784, 1, 128] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [1408, 3584, 1, 256] + - Exact: [4288, 448, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [512, 6000, 1, 2560] + - Exact: 
[2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 448, 1, 1280] + - Exact: [2944, 5888, 1, 256] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [6784, 2944, 1, 128] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [256, 193600, 1, 64] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [2048, 7000, 1, 2048] + - Exact: [5056, 704, 1, 256] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [1024, 1500, 1, 2560] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 4288, 1, 1280] + - Exact: [2368, 2368, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [1856, 5056, 1, 128] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [512, 6000, 1, 2816] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - 
Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1024, 12544, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 1024, 1, 128] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [4288, 5888, 1, 1280] + - Exact: [1024, 6000, 1, 1536] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [512, 3000, 1, 1536] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [4288, 1408, 1, 128] + - Exact: [1856, 5888, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4096, 7000, 1, 4096] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 3000, 1, 2560] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [3072, 24000, 1, 1024] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5888, 1856, 1, 128] + - Exact: [448, 6784, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 
256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [4288, 2368, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [512, 6000, 1, 2048] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [704, 6784, 1, 128] + - Exact: [6784, 3584, 1, 256] + - Exact: [1856, 1408, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [3584, 6784, 1, 128] + - Exact: [2368, 6784, 1, 1280] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 3000, 1, 2816] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [1024, 1500, 1, 2816] + - Exact: [1024, 6000, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [6144, 3000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [2944, 6784, 
1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [4608, 6000, 1, 1536] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [4288, 4288, 1, 256] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [3072, 1500, 1, 128] + - Exact: [2048, 3136, 1, 512] + - Exact: [3025, 256, 64, 64] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5888, 3584, 1, 128] + - Exact: [6784, 6784, 1, 3328] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [1024, 1408, 1, 128] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 
5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [1856, 2368, 1, 128] + - Exact: [6784, 2368, 1, 3328] + - Exact: [1408, 6784, 1, 128] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [6784, 4288, 1, 3328] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [8448, 6000, 1, 2816] + - Exact: [3584, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [6784, 448, 1, 256] + - Exact: [5124, 9124, 1, 2048] + - Exact: [2944, 5056, 1, 3328] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [3072, 1500, 1, 1024] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [512, 6000, 1, 1536] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [3072, 6000, 1, 1024] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: [2944, 4288, 1, 128] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 1280] + - Exact: [1024, 3000, 1, 2048] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 1856, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [2944, 6784, 1, 256] + - Exact: [2944, 2944, 1, 128] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - 
Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [512, 48000, 1, 2816] + - Exact: [512, 3000, 1, 2816] + - Exact: [1024, 5888, 1, 256] + - Exact: [6784, 1408, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [1408, 5056, 1, 256] + - Exact: [512, 50176, 1, 128] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [1024, 24000, 1, 1536] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [1408, 2944, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [4288, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - 
Exact: [1856, 5888, 1, 128] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [5124, 1500, 1, 2048] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [6144, 1500, 1, 2560] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [2368, 5056, 1, 128] + - Exact: [1408, 1856, 1, 3328] + - Exact: [5888, 5056, 1, 3328] + - Exact: [7680, 6000, 1, 2560] + - Exact: [6784, 1408, 1, 1280] + - Exact: [512, 3000, 1, 2560] + - Exact: [704, 2944, 1, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1024, 1500, 1, 1536] + - Exact: [1408, 1408, 1, 1280] + - Exact: [3072, 3000, 1, 1024] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [704, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2560] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [1408, 5888, 1, 128] + - Exact: [512, 3136, 1, 2048] + - Exact: [1408, 1024, 1, 256] + - Exact: [8448, 1500, 1, 2816] + - Exact: [2560, 7000, 1, 2560] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [4608, 3000, 1, 1536] + - Exact: [5056, 3584, 1, 1280] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1856, 2944, 1, 1280] + - Exact: [4608, 1500, 1, 1536] + - Exact: [1024, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5888, 2944, 1, 128] + - Exact: [2944, 
1024, 1, 128] + - Exact: [5124, 700, 1, 2048] + - Exact: [6784, 5056, 1, 128] + - Exact: [256, 12544, 1, 1024] + - Exact: [5888, 1408, 1, 3328] + - Exact: [2368, 1856, 1, 256] + - Exact: [256, 5056, 1, 256] + - Exact: [5056, 5056, 1, 128] + - Exact: [448, 3584, 1, 3328] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1856, 4288, 1, 128] + - Exact: [1024, 6000, 1, 2560] + - Exact: [1024, 5056, 1, 256] + - Exact: [5056, 5888, 1, 128] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [1024, 2368, 1, 128] + - Exact: [3072, 48000, 1, 1024] + - Exact: [1024, 5888, 1, 128] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [8448, 3000, 1, 2816] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [7680, 1500, 1, 2560] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2944, 4288, 1, 3328] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [4608, 24000, 1, 1536] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [7680, 3000, 1, 2560] + - Exact: [4224, 1500, 1, 176] + - Exact: [5124, 700, 1, 2560] + - Exact: [6784, 256, 1, 128] + - 
Exact: [5888, 704, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [5124, 1500, 1, 2560] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [448, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 128] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [448, 5888, 1, 1280] + - Exact: [512, 3000, 1, 2048] + - Exact: [5056, 448, 1, 128] + - Exact: [4288, 704, 1, 1280] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [3584, 4288, 1, 128] + - Exact: [1024, 6000, 1, 2816] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [1024, 1500, 1, 2048] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [704, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [1024, 3000, 1, 1536] + - Exact: [5056, 1024, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 
1, 1280] + - Exact: [128, 50176, 1, 512] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4288, 1024, 1, 128] + - Exact: [784, 128, 128, 512] + - Exact: [784, 512, 256, 128] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 512, 128, 128] + - Exact: [784, 128, 256, 512] + - Exact: [3136, 256, 128, 64] + - Exact: [4096, 512, 1, 1024] + - Exact: [2048, 768, 1, 512] + - Exact: [4096, 512, 1, 2048] + - Exact: [4096, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 1024] + - Exact: [2048, 1024, 1, 512] + - Exact: [4096, 1024, 1, 4096] + - Exact: [2048, 1024, 1, 1024] + - Exact: [4096, 384, 1, 2048] + - Exact: [1225, 192, 64, 384] + - Exact: [289, 128, 64, 1024] + - Exact: [4096, 384, 1, 1536] + - Exact: [289, 192, 64, 1024] + - Exact: [4096, 384, 1, 1280] + - Exact: [4096, 448, 1, 1280] + - Exact: [289, 256, 64, 1024] + - Exact: [4096, 448, 1, 2048] + - Exact: [289, 384, 64, 1024] + - Exact: [1024, 3594, 1, 4096] + - Exact: [4096, 3103, 1, 1024] + - Exact: [4096, 3136, 1, 1024] + - Exact: [1024, 3141, 1, 4096] + - Exact: [4096, 3559, 1, 1024] + - Exact: [4096, 3368, 1, 1024] + - Exact: [1024, 3335, 1, 4096] + - Exact: [1024, 3510, 1, 4096] + - Exact: [4096, 3209, 1, 1024] + - Exact: [4096, 3322, 1, 1024] + - Exact: [1024, 3400, 1, 4096] + - Exact: [1024, 3995, 1, 4096] + - Exact: [1024, 3503, 1, 4096] + - Exact: [4096, 3594, 1, 1024] + - Exact: [4096, 3473, 1, 1024] + - Exact: [4096, 3522, 1, 1024] + - Exact: [1024, 3103, 1, 4096] + - Exact: [1024, 3214, 1, 4096] + - Exact: [4096, 3449, 1, 1024] + - Exact: [1024, 3136, 1, 4096] + - Exact: [1024, 3955, 1, 33708] + - Exact: [1024, 3780, 1, 4096] + - Exact: [1024, 3906, 1, 33708] + - Exact: [1024, 3386, 1, 4096] + - Exact: [4096, 3396, 1, 1024] + - Exact: [1024, 3183, 1, 4096] + - Exact: [1024, 3098, 1, 4096] + - Exact: [1024, 3548, 1, 4096] + - Exact: [1024, 3224, 1, 4096] + - Exact: [4096, 3469, 1, 1024] + - Exact: [1024, 3582, 1, 4096] + - Exact: [1024, 2977, 1, 
4096] + - Exact: [1024, 3939, 1, 1024] + - Exact: [4096, 3176, 1, 1024] + - Exact: [1024, 3559, 1, 4096] + - Exact: [1024, 3478, 1, 4096] + - Exact: [4096, 3343, 1, 1024] + - Exact: [4096, 3440, 1, 1024] + - Exact: [1024, 3996, 1, 33708] + - Exact: [1024, 4012, 1, 4096] + - Exact: [1024, 3322, 1, 4096] + - Exact: [1024, 3990, 1, 33708] + - Exact: [1024, 3314, 1, 4096] + - Exact: [4096, 3513, 1, 1024] + - Exact: [1024, 3562, 1, 4096] + - Exact: [1024, 3443, 1, 4096] + - Exact: [1024, 3554, 1, 4096] + - Exact: [1024, 3063, 1, 4096] + - Exact: [4096, 3460, 1, 1024] + - Exact: [1024, 3209, 1, 4096] + - Exact: [1024, 3147, 1, 4096] + - Exact: [4096, 3387, 1, 1024] + - Exact: [4096, 3436, 1, 1024] + - Exact: [1024, 3341, 1, 4096] + - Exact: [1024, 3516, 1, 4096] + - Exact: [4096, 3277, 1, 1024] + - Exact: [1024, 3454, 1, 4096] + - Exact: [1024, 3969, 1, 4096] + - Exact: [1024, 3999, 1, 4096] + - Exact: [1024, 4032, 1, 4096] + - Exact: [4096, 3541, 1, 1024] + - Exact: [4096, 3334, 1, 1024] + - Exact: [1024, 3365, 1, 4096] + - Exact: [1024, 3527, 1, 4096] + - Exact: [1024, 3190, 1, 4096] + - Exact: [4096, 3906, 1, 1024] + - Exact: [1024, 3593, 1, 4096] + - Exact: [1024, 3336, 1, 4096] + - Exact: [4096, 3504, 1, 1024] + - Exact: [4096, 3977, 1, 1024] + - Exact: [1024, 3906, 1, 4096] + - Exact: [4096, 3415, 1, 1024] + - Exact: [1024, 3295, 1, 4096] + - Exact: [4096, 3321, 1, 1024] + - Exact: [1024, 3072, 1, 4096] + - Exact: [1024, 3408, 1, 4096] + - Exact: [1024, 3522, 1, 4096] + - Exact: [4096, 3751, 1, 1024] + - Exact: [4096, 3378, 1, 1024] + - Exact: [1024, 3925, 1, 33708] + - Exact: [1024, 3990, 1, 1024] + - Exact: [1024, 3290, 1, 4096] + - Exact: [4096, 3500, 1, 1024] + - Exact: [4096, 3565, 1, 1024] + - Exact: [1024, 3484, 1, 4096] + - Exact: [4096, 3395, 1, 1024] + - Exact: [1024, 3681, 1, 1024] + - Exact: [1024, 3584, 1, 1024] + - Exact: [4096, 3093, 1, 1024] + - Exact: [1024, 4050, 1, 1024] + - Exact: [1024, 3301, 1, 4096] + - Exact: [1024, 3581, 1, 4096] + - Exact: 
[4096, 3374, 1, 1024] + - Exact: [1024, 3449, 1, 4096] + - Exact: [4096, 3215, 1, 1024] + - Exact: [4096, 3312, 1, 1024] + - Exact: [4096, 3479, 1, 1024] + - Exact: [4096, 3544, 1, 1024] + - Exact: [1024, 3263, 1, 4096] + - Exact: [4096, 3455, 1, 1024] + - Exact: [1024, 3379, 1, 4096] + - Exact: [1024, 3490, 1, 4096] + - Exact: [1024, 3368, 1, 4096] + - Exact: [4096, 3186, 1, 1024] + - Exact: [1024, 3428, 1, 4096] + - Exact: [4096, 3561, 1, 1024] + - Exact: [4096, 3418, 1, 1024] + - Exact: [1024, 3064, 1, 4096] + - Exact: [4096, 3259, 1, 1024] + - Exact: [4096, 3308, 1, 1024] + - Exact: [1024, 3533, 1, 4096] + - Exact: [1024, 3344, 1, 4096] + - Exact: [1024, 4030, 1, 1024] + - Exact: [4096, 3459, 1, 1024] + - Exact: [1024, 3572, 1, 4096] + - Exact: [1024, 3925, 1, 1024] + - Exact: [4096, 3435, 1, 1024] + - Exact: [1024, 3956, 1, 4096] + - Exact: [1024, 3463, 1, 4096] + - Exact: [4096, 3182, 1, 1024] + - Exact: [4096, 3976, 1, 1024] + - Exact: [1024, 3417, 1, 4096] + - Exact: [1024, 3528, 1, 4096] + - Exact: [4096, 3446, 1, 1024] + - Exact: [1024, 3543, 1, 4096] + - Exact: [4096, 3287, 1, 1024] + - Exact: [1024, 3499, 1, 4096] + - Exact: [1024, 3231, 1, 4096] + - Exact: [4096, 3519, 1, 1024] + - Exact: [4096, 3552, 1, 1024] + - Exact: [1024, 3458, 1, 4096] + - Exact: [1024, 3374, 1, 4096] + - Exact: [1024, 3396, 1, 4096] + - Exact: [1024, 2967, 1, 4096] + - Exact: [4096, 3482, 1, 1024] + - Exact: [1024, 3226, 1, 4096] + - Exact: [4096, 3377, 1, 1024] + - Exact: [4096, 3426, 1, 1024] + - Exact: [4096, 2935, 1, 1024] + - Exact: [1024, 3439, 1, 4096] + - Exact: [4096, 3267, 1, 1024] + - Exact: [4096, 3499, 1, 1024] + - Exact: [4096, 3356, 1, 1024] + - Exact: [4096, 3939, 1, 1024] + - Exact: [1024, 3526, 1, 4096] + - Exact: [1024, 3859, 1, 33708] + - Exact: [1024, 3385, 1, 4096] + - Exact: [1024, 3496, 1, 4096] + - Exact: [4096, 3141, 1, 1024] + - Exact: [4096, 3510, 1, 1024] + - Exact: [1024, 3434, 1, 4096] + - Exact: [4096, 3969, 1, 1024] + - Exact: [1024, 3121, 1, 
4096] + - Exact: [1024, 3232, 1, 4096] + - Exact: [1024, 4030, 1, 33708] + - Exact: [1024, 3780, 1, 33708] + - Exact: [1024, 3969, 1, 1024] + - Exact: [4096, 3527, 1, 1024] + - Exact: [4096, 3336, 1, 1024] + - Exact: [4096, 3290, 1, 1024] + - Exact: [1024, 3469, 1, 4096] + - Exact: [4096, 3490, 1, 1024] + - Exact: [4096, 3064, 1, 1024] + - Exact: [4096, 3582, 1, 1024] + - Exact: [1024, 3956, 1, 1024] + - Exact: [4096, 3417, 1, 1024] + - Exact: [1024, 2736, 1, 4096] + - Exact: [1024, 3205, 1, 4096] + - Exact: [1024, 3143, 1, 4096] + - Exact: [1024, 4020, 1, 4096] + - Exact: [1024, 3318, 1, 4096] + - Exact: [4096, 3364, 1, 1024] + - Exact: [1024, 3353, 1, 4096] + - Exact: [1024, 3464, 1, 4096] + - Exact: [4096, 3205, 1, 1024] + - Exact: [4096, 3318, 1, 1024] + - Exact: [1024, 3402, 1, 4096] + - Exact: [4096, 3181, 1, 1024] + - Exact: [4096, 3550, 1, 1024] + - Exact: [4096, 3445, 1, 1024] + - Exact: [1024, 3138, 1, 4096] + - Exact: [4096, 3079, 1, 1024] + - Exact: [4096, 3144, 1, 1024] + - Exact: [4096, 3860, 1, 1024] + - Exact: [1024, 3515, 1, 4096] + - Exact: [4096, 3408, 1, 1024] + - Exact: [1024, 3181, 1, 4096] + - Exact: [4096, 3298, 1, 1024] + - Exact: [4096, 3585, 1, 1024] + - Exact: [1024, 3550, 1, 4096] + - Exact: [1024, 4020, 1, 1024] + - Exact: [4096, 3481, 1, 1024] + - Exact: [4096, 3530, 1, 1024] + - Exact: [4096, 3425, 1, 1024] + - Exact: [4096, 4026, 1, 1024] + - Exact: [1024, 3860, 1, 1024] + - Exact: [4096, 3975, 1, 1024] + - Exact: [1024, 3286, 1, 4096] + - Exact: [1024, 3176, 1, 4096] + - Exact: [1024, 3894, 1, 4096] + - Exact: [4096, 3355, 1, 1024] + - Exact: [4096, 3404, 1, 1024] + - Exact: [1024, 3501, 1, 4096] + - Exact: [4096, 3245, 1, 1024] + - Exact: [1024, 3431, 1, 4096] + - Exact: [1024, 4000, 1, 1024] + - Exact: [4096, 3509, 1, 1024] + - Exact: [4096, 3558, 1, 1024] + - Exact: [1024, 3535, 1, 4096] + - Exact: [1024, 3414, 1, 4096] + - Exact: [1024, 3445, 1, 4096] + - Exact: [1024, 3436, 1, 4096] + - Exact: [4096, 3472, 1, 1024] + - Exact: 
[1024, 3211, 1, 4096] + - Exact: [4096, 3383, 1, 1024] + - Exact: [4096, 3448, 1, 1024] + - Exact: [1024, 3343, 1, 4096] + - Exact: [1024, 3518, 1, 4096] + - Exact: [4096, 3289, 1, 1024] + - Exact: [1024, 3440, 1, 4096] + - Exact: [1024, 4032, 1, 33708] + - Exact: [4096, 3489, 1, 1024] + - Exact: [4096, 3346, 1, 1024] + - Exact: [1024, 3534, 1, 4096] + - Exact: [1024, 3079, 1, 4096] + - Exact: [1024, 3955, 1, 4096] + - Exact: [4096, 3236, 1, 1024] + - Exact: [1024, 3545, 1, 4096] + - Exact: [1024, 3144, 1, 4096] + - Exact: [4096, 3780, 1, 1024] + - Exact: [4096, 3163, 1, 1024] + - Exact: [4096, 3468, 1, 1024] + - Exact: [1024, 3539, 1, 4096] + - Exact: [1024, 3541, 1, 4096] + - Exact: [4096, 3363, 1, 1024] + - Exact: [1024, 3475, 1, 4096] + - Exact: [4096, 3110, 1, 1024] + - Exact: [1024, 3509, 1, 4096] + - Exact: [1024, 3413, 1, 4096] + - Exact: [1024, 3975, 1, 1024] + - Exact: [4096, 3549, 1, 1024] + - Exact: [4096, 3342, 1, 1024] + - Exact: [1024, 2985, 1, 4096] + - Exact: [1024, 3876, 1, 33708] + - Exact: [4096, 3280, 1, 1024] + - Exact: [4096, 3191, 1, 1024] + - Exact: [4096, 3512, 1, 1024] + - Exact: [1024, 3560, 1, 4096] + - Exact: [4096, 2499, 1, 1024] + - Exact: [1024, 3248, 1, 4096] + - Exact: [4096, 3423, 1, 1024] + - Exact: [4096, 3297, 1, 1024] + - Exact: [4096, 3154, 1, 1024] + - Exact: [1024, 3303, 1, 4096] + - Exact: [1024, 3222, 1, 4096] + - Exact: [1024, 3978, 1, 1024] + - Exact: [4096, 3529, 1, 1024] + - Exact: [4096, 3386, 1, 1024] + - Exact: [1024, 3451, 1, 4096] + - Exact: [4096, 3562, 1, 1024] + - Exact: [4096, 3276, 1, 1024] + - Exact: [1024, 3894, 1, 33708] + - Exact: [4096, 3540, 1, 1024] + - Exact: [1024, 3416, 1, 4096] + - Exact: [1024, 4005, 1, 33708] + - Exact: [1024, 3942, 1, 4096] + - Exact: [4096, 3403, 1, 1024] + - Exact: [4096, 3381, 1, 1024] + - Exact: [1024, 3492, 1, 4096] + - Exact: [4096, 3101, 1, 1024] + - Exact: [1024, 3430, 1, 4096] + - Exact: [1024, 3977, 1, 4096] + - Exact: [1024, 3640, 1, 4096] + - Exact: [4096, 3557, 1, 
1024] + - Exact: [4096, 3414, 1, 1024] + - Exact: [1024, 3391, 1, 4096] + - Exact: [1024, 3356, 1, 4096] + - Exact: [4096, 3320, 1, 1024] + - Exact: [4096, 2765, 1, 1024] + - Exact: [1024, 3411, 1, 4096] + - Exact: [1024, 3978, 1, 4096] + - Exact: [4096, 3487, 1, 1024] + - Exact: [4096, 3520, 1, 1024] + - Exact: [4096, 3942, 1, 1024] + - Exact: [4096, 3431, 1, 1024] + - Exact: [1024, 3271, 1, 4096] + - Exact: [4096, 4020, 1, 1024] + - Exact: [1024, 3481, 1, 4096] + - Exact: [1024, 3419, 1, 4096] + - Exact: [1024, 4059, 1, 4096] + - Exact: [4096, 3345, 1, 1024] + - Exact: [4096, 3394, 1, 1024] + - Exact: [1024, 3298, 1, 4096] + - Exact: [4096, 3235, 1, 1024] + - Exact: [1024, 3681, 1, 33708] + - Exact: [1024, 3362, 1, 4096] + - Exact: [4096, 3467, 1, 1024] + - Exact: [1024, 3349, 1, 4096] + - Exact: [1024, 3460, 1, 4096] + - Exact: [4096, 3214, 1, 1024] + - Exact: [1024, 3398, 1, 4096] + - Exact: [4096, 3478, 1, 1024] + - Exact: [1024, 4050, 1, 33708] + - Exact: [1024, 3244, 1, 4096] + - Exact: [4096, 3341, 1, 1024] + - Exact: [4096, 3454, 1, 1024] + - Exact: [1024, 3166, 1, 4096] + - Exact: [1024, 3425, 1, 4096] + - Exact: [4096, 3295, 1, 1024] + - Exact: [4096, 3072, 1, 1024] + - Exact: [4096, 3822, 1, 1024] + - Exact: [1024, 3681, 1, 4096] + - Exact: [1024, 4050, 1, 4096] + - Exact: [4096, 3495, 1, 1024] + - Exact: [4096, 3560, 1, 1024] + - Exact: [1024, 3524, 1, 4096] + - Exact: [1024, 3942, 1, 33708] + - Exact: [1024, 3304, 1, 4096] + - Exact: [1024, 3387, 1, 4096] + - Exact: [1024, 3498, 1, 4096] + - Exact: [4096, 3458, 1, 1024] + - Exact: [4096, 2967, 1, 1024] + - Exact: [4096, 3385, 1, 1024] + - Exact: [4096, 3434, 1, 1024] + - Exact: [1024, 3519, 1, 4096] + - Exact: [1024, 3511, 1, 4096] + - Exact: [1024, 3288, 1, 4096] + - Exact: [1024, 2918, 1, 4096] + - Exact: [4096, 3573, 1, 1024] + - Exact: [1024, 3822, 1, 33708] + - Exact: [4096, 3539, 1, 1024] + - Exact: [4096, 3332, 1, 1024] + - Exact: [4096, 3286, 1, 1024] + - Exact: [1024, 4026, 1, 4096] + - 
Exact: [1024, 3277, 1, 4096] + - Exact: [1024, 3471, 1, 4096] + - Exact: [4096, 3518, 1, 1024] + - Exact: [1024, 3393, 1, 4096] + - Exact: [4096, 3413, 1, 1024] + - Exact: [4096, 3303, 1, 1024] + - Exact: [1024, 3207, 1, 4096] + - Exact: [1024, 3894, 1, 1024] + - Exact: [1024, 3977, 1, 1024] + - Exact: [4096, 3535, 1, 1024] + - Exact: [4096, 3376, 1, 1024] + - Exact: [1024, 3355, 1, 4096] + - Exact: [1024, 3466, 1, 4096] + - Exact: [4096, 3266, 1, 1024] + - Exact: [1024, 3404, 1, 4096] + - Exact: [1024, 3999, 1, 1024] + - Exact: [4096, 3498, 1, 1024] + - Exact: [1024, 4032, 1, 1024] + - Exact: [1024, 3410, 1, 4096] + - Exact: [4096, 3393, 1, 1024] + - Exact: [1024, 3140, 1, 4096] + - Exact: [1024, 3910, 1, 33708] + - Exact: [1024, 3334, 1, 4096] + - Exact: [4096, 3140, 1, 1024] + - Exact: [1024, 4005, 1, 4096] + - Exact: [1024, 3579, 1, 4096] + - Exact: [4096, 3372, 1, 1024] + - Exact: [1024, 3245, 1, 4096] + - Exact: [4096, 3956, 1, 1024] + - Exact: [4096, 3213, 1, 1024] + - Exact: [1024, 3361, 1, 4096] + - Exact: [1024, 3536, 1, 4096] + - Exact: [4096, 3477, 1, 1024] + - Exact: [4096, 3526, 1, 1024] + - Exact: [1024, 4005, 1, 1024] + - Exact: [1024, 3530, 1, 4096] + - Exact: [1024, 3944, 1, 4096] + - Exact: [4096, 3453, 1, 1024] + - Exact: [4096, 3184, 1, 1024] + - Exact: [4096, 3579, 1, 1024] + - Exact: [4096, 3351, 1, 1024] + - Exact: [4096, 3416, 1, 1024] + - Exact: [1024, 3822, 1, 4096] + - Exact: [1024, 3796, 1, 4096] + - Exact: [4096, 3257, 1, 1024] + - Exact: [4096, 3306, 1, 1024] + - Exact: [1024, 3505, 1, 4096] + - Exact: [1024, 3315, 1, 4096] + - Exact: [1024, 3486, 1, 4096] + - Exact: [4096, 3457, 1, 1024] + - Exact: [4096, 3870, 1, 1024] + - Exact: [1024, 3447, 1, 4096] + - Exact: [1024, 3558, 1, 4096] + - Exact: [4096, 3433, 1, 1024] + - Exact: [4096, 3180, 1, 1024] + - Exact: [1024, 3213, 1, 4096] + - Exact: [1024, 3900, 1, 4096] + - Exact: [4096, 3444, 1, 1024] + - Exact: [1024, 3504, 1, 4096] + - Exact: [4096, 4059, 1, 1024] + - Exact: [1024, 
3442, 1, 4096] + - Exact: [4096, 3517, 1, 1024] + - Exact: [1024, 3566, 1, 4096] + - Exact: [4096, 3248, 1, 1024] + - Exact: [1024, 3547, 1, 4096] + - Exact: [1024, 3340, 1, 4096] + - Exact: [4096, 3480, 1, 1024] + - Exact: [4096, 3424, 1, 1024] + - Exact: [1024, 3906, 1, 1024] + - Exact: [4096, 3265, 1, 1024] + - Exact: [1024, 3384, 1, 4096] + - Exact: [1024, 3494, 1, 4096] + - Exact: [1024, 3236, 1, 4096] + - Exact: [4096, 3497, 1, 1024] + - Exact: [4096, 3354, 1, 1024] + - Exact: [4096, 3055, 1, 1024] + - Exact: [4096, 3244, 1, 1024] + - Exact: [4096, 3139, 1, 1024] + - Exact: [4096, 3508, 1, 1024] + - Exact: [4096, 4050, 1, 1024] + - Exact: [1024, 3472, 1, 4096] + - Exact: [1024, 3861, 1, 1024] + - Exact: [1024, 3910, 1, 1024] + - Exact: [4096, 3371, 1, 1024] + - Exact: [1024, 3751, 1, 4096] + - Exact: [4096, 3325, 1, 1024] + - Exact: [1024, 3321, 1, 4096] + - Exact: [1024, 3944, 1, 1024] + - Exact: [4096, 3525, 1, 1024] + - Exact: [4096, 3382, 1, 1024] + - Exact: [1024, 3453, 1, 4096] + - Exact: [4096, 3564, 1, 1024] + - Exact: [4096, 3288, 1, 1024] + - Exact: [1024, 3925, 1, 4096] + - Exact: [1024, 3057, 1, 4096] + - Exact: [4096, 3488, 1, 1024] + - Exact: [4096, 3046, 1, 1024] + - Exact: [1024, 3189, 1, 4096] + - Exact: [4096, 3399, 1, 1024] + - Exact: [1024, 3383, 1, 4096] + - Exact: [1024, 3415, 1, 4096] + - Exact: [1024, 3388, 1, 4096] + - Exact: [1024, 3376, 1, 4096] + - Exact: [1024, 3473, 1, 4096] + - Exact: [4096, 3162, 1, 1024] + - Exact: [1024, 3448, 1, 4096] + - Exact: [4096, 3362, 1, 1024] + - Exact: [1024, 3262, 1, 4096] + - Exact: [1024, 3184, 1, 4096] + - Exact: [1024, 3378, 1, 4096] + - Exact: [4096, 3548, 1, 1024] + - Exact: [4096, 2977, 1, 1024] + - Exact: [4096, 3443, 1, 1024] + - Exact: [1024, 3289, 1, 4096] + - Exact: [1024, 3483, 1, 4096] + - Exact: [4096, 3190, 1, 1024] + - Exact: [1024, 3421, 1, 4096] + - Exact: [1024, 3514, 1, 4096] + - Exact: [1024, 3532, 1, 4096] + - Exact: [1024, 3565, 1, 4096] + - Exact: [4096, 3422, 1, 1024] + - 
Exact: [4096, 3263, 1, 1024] + - Exact: [4096, 3296, 1, 1024] + - Exact: [4096, 3640, 1, 1024] + - Exact: [4096, 3463, 1, 1024] + - Exact: [4096, 3528, 1, 1024] + - Exact: [1024, 3351, 1, 4096] + - Exact: [1024, 3462, 1, 4096] + - Exact: [4096, 3226, 1, 1024] + - Exact: [4096, 3439, 1, 1024] + - Exact: [4096, 3121, 1, 1024] + - Exact: [1024, 4059, 1, 33708] + - Exact: [1024, 3311, 1, 4096] + - Exact: [1024, 3230, 1, 4096] + - Exact: [4096, 3353, 1, 1024] + - Exact: [4096, 3402, 1, 1024] + - Exact: [1024, 3427, 1, 4096] + - Exact: [1024, 3346, 1, 4096] + - Exact: [1024, 3126, 1, 4096] + - Exact: [1024, 3796, 1, 1024] + - Exact: [1024, 3990, 1, 4096] + - Exact: [1024, 3257, 1, 4096] + - Exact: [4096, 3996, 1, 1024] + - Exact: [1024, 3306, 1, 4096] + - Exact: [1024, 3389, 1, 4096] + - Exact: [1024, 3500, 1, 4096] + - Exact: [1024, 3999, 1, 33708] + - Exact: [4096, 3486, 1, 1024] + - Exact: [1024, 3438, 1, 4096] + - Exact: [4096, 3616, 1, 1024] + - Exact: [1024, 3955, 1, 1024] + - Exact: [4096, 3430, 1, 1024] + - Exact: [4096, 3271, 1, 1024] + - Exact: [1024, 3364, 1, 4096] + - Exact: [1024, 3497, 1, 4096] + - Exact: [4096, 3503, 1, 1024] + - Exact: [4096, 3344, 1, 1024] + - Exact: [1024, 3457, 1, 4096] + - Exact: [4096, 3466, 1, 1024] + - Exact: [1024, 3976, 1, 33708] + - Exact: [1024, 3395, 1, 4096] + - Exact: [4096, 3361, 1, 1024] + - Exact: [1024, 3751, 1, 33708] + - Exact: [1024, 3822, 1, 1024] + - Exact: [4096, 3315, 1, 1024] + - Exact: [1024, 3163, 1, 4096] + - Exact: [4096, 3547, 1, 1024] + - Exact: [4096, 3340, 1, 1024] + - Exact: [1024, 3296, 1, 4096] + - Exact: [1024, 3468, 1, 4096] + - Exact: [4096, 3294, 1, 1024] + - Exact: [1024, 3406, 1, 4096] + - Exact: [1024, 3860, 1, 33708] + - Exact: [1024, 3584, 1, 4096] + - Exact: [4096, 3189, 1, 1024] + - Exact: [4096, 3494, 1, 1024] + - Exact: [1024, 3093, 1, 4096] + - Exact: [4096, 3421, 1, 1024] + - Exact: [1024, 3479, 1, 4096] + - Exact: [1024, 3433, 1, 4096] + - Exact: [4096, 3311, 1, 1024] + - Exact: [1024, 
3381, 1, 4096] + - Exact: [1024, 3996, 1, 4096] + - Exact: [4096, 3384, 1, 1024] + - Exact: [1024, 3247, 1, 4096] + - Exact: [1024, 3169, 1, 4096] + - Exact: [1024, 3088, 1, 4096] + - Exact: [1024, 3363, 1, 4096] + - Exact: [1024, 3538, 1, 4096] + - Exact: [1024, 3996, 1, 1024] + - Exact: [4096, 3169, 1, 1024] + - Exact: [4096, 3538, 1, 1024] + - Exact: [4096, 3401, 1, 1024] + - Exact: [4096, 3581, 1, 1024] + - Exact: [1024, 3180, 1, 4096] + - Exact: [1024, 3870, 1, 1024] + - Exact: [4096, 3555, 1, 1024] + - Exact: [4096, 3412, 1, 1024] + - Exact: [4096, 3302, 1, 1024] + - Exact: [1024, 3561, 1, 4096] + - Exact: [1024, 3302, 1, 4096] + - Exact: [1024, 3976, 1, 4096] + - Exact: [4096, 3485, 1, 1024] + - Exact: [4096, 3534, 1, 1024] + - Exact: [1024, 3110, 1, 4096] + - Exact: [1024, 3401, 1, 4096] + - Exact: [4096, 3216, 1, 1024] + - Exact: [1024, 4020, 1, 33708] + - Exact: [1024, 3215, 1, 4096] + - Exact: [4096, 3566, 1, 1024] + - Exact: [1024, 3137, 1, 4096] + - Exact: [4096, 3359, 1, 1024] + - Exact: [4096, 3392, 1, 1024] + - Exact: [1024, 3506, 1, 4096] + - Exact: [4096, 3233, 1, 1024] + - Exact: [1024, 3444, 1, 4096] + - Exact: [1024, 3975, 1, 4096] + - Exact: [1024, 3870, 1, 33708] + - Exact: [4096, 3465, 1, 1024] + - Exact: [1024, 3523, 1, 4096] + - Exact: [4096, 3990, 1, 1024] + - Exact: [1024, 3549, 1, 4096] + - Exact: [1024, 3342, 1, 4096] + - Exact: [4096, 3476, 1, 1024] + - Exact: [1024, 3418, 1, 4096] + - Exact: [1024, 3859, 1, 1024] + - Exact: [4096, 3339, 1, 1024] + - Exact: [4096, 3452, 1, 1024] + - Exact: [4096, 3293, 1, 1024] + - Exact: [1024, 3369, 1, 4096] + - Exact: [1024, 3544, 1, 4096] + - Exact: [4096, 3493, 1, 1024] + - Exact: [4096, 3350, 1, 1024] + - Exact: [4096, 3256, 1, 1024] + - Exact: [1024, 3870, 1, 4096] + - Exact: [4096, 4012, 1, 1024] + - Exact: [1024, 3280, 1, 4096] + - Exact: [4096, 3456, 1, 1024] + - Exact: [1024, 3555, 1, 4096] + - Exact: [4096, 3014, 1, 1024] + - Exact: [1024, 3474, 1, 4096] + - Exact: [4096, 3367, 1, 1024] + 
- Exact: [4096, 3432, 1, 1024] + - Exact: [4096, 3273, 1, 1024] + - Exact: [4096, 3130, 1, 1024] + - Exact: [1024, 2984, 1, 4096] + - Exact: [1024, 3995, 1, 1024] + - Exact: [1024, 3517, 1, 4096] + - Exact: [1024, 3455, 1, 4096] + - Exact: [1024, 3939, 1, 4096] + - Exact: [4096, 3147, 1, 1024] + - Exact: [4096, 3516, 1, 1024] + - Exact: [1024, 3876, 1, 4096] + - Exact: [1024, 3191, 1, 4096] + - Exact: [4096, 3411, 1, 1024] + - Exact: [1024, 3337, 1, 4096] + - Exact: [1024, 3512, 1, 4096] + - Exact: [4096, 3301, 1, 1024] + - Exact: [1024, 3450, 1, 4096] + - Exact: [4096, 3533, 1, 1024] + - Exact: [4096, 3390, 1, 1024] + - Exact: [4096, 3231, 1, 1024] + - Exact: [1024, 2499, 1, 4096] + - Exact: [1024, 3186, 1, 4096] + - Exact: [1024, 3380, 1, 4096] + - Exact: [4096, 3496, 1, 1024] + - Exact: [1024, 3956, 1, 33708] + - Exact: [1024, 3976, 1, 1024] + - Exact: [4096, 2736, 1, 1024] + - Exact: [1024, 3291, 1, 4096] + - Exact: [1024, 3944, 1, 33708] + - Exact: [1024, 3485, 1, 4096] + - Exact: [4096, 3138, 1, 1024] + - Exact: [1024, 3423, 1, 4096] + - Exact: [1024, 3491, 1, 4096] + - Exact: [1024, 3860, 1, 4096] + - Exact: [4096, 3211, 1, 1024] + - Exact: [1024, 3221, 1, 4096] + - Exact: [1024, 2917, 1, 4096] + - Exact: [4096, 3475, 1, 1024] + - Exact: [4096, 3524, 1, 1024] + - Exact: [4096, 2985, 1, 1024] + - Exact: [1024, 3480, 1, 4096] + - Exact: [4096, 3222, 1, 1024] + - Exact: [4096, 3451, 1, 1024] + - Exact: [1024, 3969, 1, 33708] + - Exact: [1024, 3640, 1, 1024] + - Exact: [1024, 3297, 1, 4096] + - Exact: [4096, 3944, 1, 1024] + - Exact: [1024, 3216, 1, 4096] + - Exact: [4096, 3349, 1, 1024] + - Exact: [4096, 3398, 1, 1024] + - Exact: [1024, 3154, 1, 4096] + - Exact: [1024, 3978, 1, 33708] + - Exact: [1024, 3348, 1, 4096] + - Exact: [4096, 3304, 1, 1024] + - Exact: [4096, 4030, 1, 1024] + - Exact: [1024, 4026, 1, 1024] + - Exact: [4096, 3471, 1, 1024] + - Exact: [1024, 3259, 1, 4096] + - Exact: [1024, 3308, 1, 4096] + - Exact: [4096, 3391, 1, 1024] + - Exact: [1024, 
3312, 1, 4096] + - Exact: [1024, 3502, 1, 4096] + - Exact: [1024, 3968, 1, 33708] + - Exact: [1024, 3424, 1, 4096] + - Exact: [4096, 4032, 1, 1024] + - Exact: [1024, 3900, 1, 1024] + - Exact: [4096, 3442, 1, 1024] + - Exact: [1024, 3366, 1, 4096] + - Exact: [4096, 3999, 1, 1024] + - Exact: [1024, 3477, 1, 4096] + - Exact: [1024, 2505, 1, 4096] + - Exact: [4096, 3515, 1, 1024] + - Exact: [1024, 3564, 1, 4096] + - Exact: [4096, 3057, 1, 1024] + - Exact: [1024, 3339, 1, 4096] + - Exact: [4096, 3262, 1, 1024] + - Exact: [1024, 4030, 1, 4096] + - Exact: [1024, 3265, 1, 4096] + - Exact: [1024, 3459, 1, 4096] + - Exact: [4096, 3462, 1, 1024] + - Exact: [1024, 3513, 1, 4096] + - Exact: [1024, 3397, 1, 4096] + - Exact: [4096, 3572, 1, 1024] + - Exact: [4096, 3389, 1, 1024] + - Exact: [4096, 3438, 1, 1024] + - Exact: [1024, 3640, 1, 33708] + - Exact: [1024, 3995, 1, 33708] + - Exact: [1024, 3165, 1, 4096] + - Exact: [4096, 3543, 1, 1024] + - Exact: [4096, 3352, 1, 1024] + - Exact: [1024, 3359, 1, 4096] + - Exact: [1024, 3470, 1, 4096] + - Exact: [1024, 3392, 1, 4096] + - Exact: [4096, 3137, 1, 1024] + - Exact: [4096, 3506, 1, 1024] + - Exact: [1024, 3095, 1, 4096] + - Exact: [1024, 3859, 1, 4096] + - Exact: [4096, 3369, 1, 1024] + - Exact: [1024, 3435, 1, 4096] + - Exact: [1024, 3354, 1, 4096] + - Exact: [1024, 3055, 1, 4096] + - Exact: [4096, 3523, 1, 1024] + - Exact: [4096, 3380, 1, 1024] + - Exact: [1024, 3233, 1, 4096] + - Exact: [4096, 3221, 1, 1024] + - Exact: [4096, 3270, 1, 1024] + - Exact: [4096, 3593, 1, 1024] + - Exact: [1024, 3358, 1, 4096] + - Exact: [1024, 3540, 1, 4096] + - Exact: [4096, 3502, 1, 1024] + - Exact: [4096, 2505, 1, 1024] + - Exact: [4096, 3397, 1, 1024] + - Exact: [1024, 3300, 1, 4096] + - Exact: [4096, 3095, 1, 1024] + - Exact: [1024, 3182, 1, 4096] + - Exact: [1024, 3299, 1, 4096] + - Exact: [1024, 3276, 1, 4096] + - Exact: [1024, 3360, 1, 4096] + - Exact: [4096, 3360, 1, 1024] + - Exact: [4096, 2918, 1, 1024] + - Exact: [1024, 3939, 1, 33708] 
+ - Exact: [4096, 3314, 1, 1024] + - Exact: [1024, 3319, 1, 4096] + - Exact: [1024, 3942, 1, 1024] + - Exact: [1024, 3465, 1, 4096] + - Exact: [4096, 3546, 1, 1024] + - Exact: [1024, 3403, 1, 4096] + - Exact: [1024, 3948, 1, 1024] + - Exact: [4096, 3441, 1, 1024] + - Exact: [1024, 3139, 1, 4096] + - Exact: [1024, 3563, 1, 4096] + - Exact: [1024, 3508, 1, 4096] + - Exact: [1024, 3975, 1, 33708] + - Exact: [1024, 3446, 1, 4096] + - Exact: [1024, 3529, 1, 4096] + - Exact: [4096, 3461, 1, 1024] + - Exact: [1024, 3574, 1, 4096] + - Exact: [1024, 3101, 1, 4096] + - Exact: [1024, 3927, 1, 1024] + - Exact: [4096, 3224, 1, 1024] + - Exact: [4096, 3437, 1, 1024] + - Exact: [4096, 3900, 1, 1024] + - Exact: [1024, 3495, 1, 4096] + - Exact: [1024, 3977, 1, 33708] + - Exact: [1024, 3328, 1, 4096] + - Exact: [4096, 3168, 1, 1024] + - Exact: [1024, 4026, 1, 33708] + - Exact: [1024, 3292, 1, 4096] + - Exact: [1024, 3294, 1, 4096] + - Exact: [4096, 3335, 1, 1024] + - Exact: [4096, 3400, 1, 1024] + - Exact: [1024, 3287, 1, 4096] + - Exact: [1024, 3910, 1, 4096] + - Exact: [1024, 3780, 1, 1024] + - Exact: [4096, 3098, 1, 1024] + - Exact: [1024, 3584, 1, 33708] + - Exact: [1024, 3371, 1, 4096] + - Exact: [1024, 3546, 1, 4096] + - Exact: [1024, 4012, 1, 1024] + - Exact: [4096, 3505, 1, 1024] + - Exact: [4096, 3554, 1, 1024] + - Exact: [4096, 3063, 1, 1024] + - Exact: [1024, 3900, 1, 33708] + - Exact: [1024, 3345, 1, 4096] + - Exact: [1024, 3357, 1, 4096] + - Exact: [1024, 3282, 1, 4096] + - Exact: [4096, 3484, 1, 1024] + - Exact: [1024, 3557, 1, 4096] + - Exact: [1024, 3476, 1, 4096] + - Exact: [1024, 3751, 1, 1024] + - Exact: [4096, 3379, 1, 1024] + - Exact: [4096, 3428, 1, 1024] + - Exact: [4096, 3126, 1, 1024] + - Exact: [1024, 3325, 1, 4096] + - Exact: [4096, 3501, 1, 1024] + - Exact: [4096, 3358, 1, 1024] + - Exact: [1024, 3441, 1, 4096] + - Exact: [1024, 3552, 1, 4096] + - Exact: [4096, 3232, 1, 1024] + - Exact: [1024, 3412, 1, 4096] + - Exact: [1024, 3372, 1, 4096] + - Exact: 
[1024, 3585, 1, 4096] + - Exact: [4096, 3143, 1, 1024] + - Exact: [4096, 3464, 1, 1024] + - Exact: [1024, 3145, 1, 4096] + - Exact: [4096, 3375, 1, 1024] + - Exact: [4096, 2917, 1, 1024] + - Exact: [4096, 3978, 1, 1024] + - Exact: [1024, 2765, 1, 4096] + - Exact: [1024, 3452, 1, 4096] + - Exact: [4096, 3584, 1, 1024] + - Exact: [4096, 3545, 1, 1024] + - Exact: [1024, 3352, 1, 4096] + - Exact: [4096, 3292, 1, 1024] + - Exact: [1024, 3525, 1, 4096] + - Exact: [1024, 3266, 1, 4096] + - Exact: [1024, 3382, 1, 4096] + - Exact: [4096, 3492, 1, 1024] + - Exact: [4096, 3419, 1, 1024] + - Exact: [1024, 3796, 1, 33708] + - Exact: [1024, 3293, 1, 4096] + - Exact: [4096, 3796, 1, 1024] + - Exact: [1024, 3487, 1, 4096] + - Exact: [4096, 3166, 1, 1024] + - Exact: [1024, 3409, 1, 4096] + - Exact: [1024, 3520, 1, 4096] + - Exact: [1024, 3573, 1, 4096] + - Exact: [4096, 3366, 1, 1024] + - Exact: [4096, 3720, 1, 1024] + - Exact: [4096, 3207, 1, 1024] + - Exact: [4096, 3272, 1, 1024] + - Exact: [1024, 3390, 1, 4096] + - Exact: [4096, 3183, 1, 1024] + - Exact: [4096, 3536, 1, 1024] + - Exact: [4096, 3563, 1, 1024] + - Exact: [1024, 3482, 1, 4096] + - Exact: [4096, 3447, 1, 1024] + - Exact: [4096, 3955, 1, 1024] + - Exact: [4096, 4005, 1, 1024] + - Exact: [1024, 3493, 1, 4096] + - Exact: [4096, 3410, 1, 1024] + - Exact: [1024, 3422, 1, 4096] + - Exact: [1024, 3350, 1, 4096] + - Exact: [4096, 3300, 1, 1024] + - Exact: [4096, 3910, 1, 1024] + - Exact: [1024, 3489, 1, 4096] + - Exact: [4096, 3483, 1, 1024] + - Exact: [4096, 3532, 1, 1024] + - Exact: [4096, 3230, 1, 1024] + - Exact: [4096, 3427, 1, 1024] + - Exact: [1024, 3377, 1, 4096] + - Exact: [1024, 3488, 1, 4096] + - Exact: [1024, 3616, 1, 4096] + - Exact: [1024, 3426, 1, 4096] + - Exact: [4096, 3357, 1, 1024] + - Exact: [4096, 3406, 1, 1024] + - Exact: [1024, 3046, 1, 4096] + - Exact: [1024, 3272, 1, 4096] + - Exact: [1024, 3256, 1, 4096] + - Exact: [4096, 3247, 1, 1024] + - Exact: [4096, 3088, 1, 1024] + - Exact: [1024, 3531, 1, 
4096] + - Exact: [4096, 3511, 1, 1024] + - Exact: [1024, 3720, 1, 33708] + - Exact: [1024, 3267, 1, 4096] + - Exact: [1024, 3270, 1, 4096] + - Exact: [1024, 3461, 1, 4096] + - Exact: [4096, 3474, 1, 1024] + - Exact: [4096, 2984, 1, 1024] + - Exact: [1024, 3399, 1, 4096] + - Exact: [4096, 3574, 1, 1024] + - Exact: [1024, 3876, 1, 1024] + - Exact: [4096, 3337, 1, 1024] + - Exact: [4096, 3450, 1, 1024] + - Exact: [1024, 3720, 1, 1024] + - Exact: [1024, 4059, 1, 1024] + - Exact: [4096, 3291, 1, 1024] + - Exact: [4096, 3995, 1, 1024] + - Exact: [4096, 3491, 1, 1024] + - Exact: [4096, 3348, 1, 1024] + - Exact: [4096, 3925, 1, 1024] + - Exact: [4096, 3894, 1, 1024] + - Exact: [1024, 3456, 1, 4096] + - Exact: [1024, 3394, 1, 4096] + - Exact: [4096, 3165, 1, 1024] + - Exact: [4096, 3470, 1, 1024] + - Exact: [1024, 3014, 1, 4096] + - Exact: [1024, 3375, 1, 4096] + - Exact: [4096, 3859, 1, 1024] + - Exact: [4096, 3365, 1, 1024] + - Exact: [1024, 3162, 1, 4096] + - Exact: [1024, 3840, 1, 33708] + - Exact: [1024, 3437, 1, 4096] + - Exact: [4096, 3319, 1, 1024] + - Exact: [1024, 3320, 1, 4096] + - Exact: [4096, 3328, 1, 1024] + - Exact: [1024, 3235, 1, 4096] + - Exact: [4096, 3282, 1, 1024] + - Exact: [1024, 3367, 1, 4096] + - Exact: [1024, 3542, 1, 4096] + - Exact: [4096, 3145, 1, 1024] + - Exact: [4096, 3514, 1, 1024] + - Exact: [1024, 3432, 1, 4096] + - Exact: [4096, 3409, 1, 1024] + - Exact: [1024, 4012, 1, 33708] + - Exact: [4096, 3876, 1, 1024] + - Exact: [4096, 3299, 1, 1024] + - Exact: [1024, 3168, 1, 4096] + - Exact: [4096, 3681, 1, 1024] + - Exact: [4096, 3531, 1, 1024] + - Exact: [4096, 3388, 1, 1024] + - Exact: [1024, 3720, 1, 4096] + - Exact: [1024, 3332, 1, 4096] + - Exact: [1024, 3273, 1, 4096] + - Exact: [1024, 2935, 1, 4096] + - Exact: [1024, 3467, 1, 4096] + - Exact: [4096, 3542, 1, 1024] + - Exact: [1024, 3130, 1, 4096] + - Exact: [1024, 3405, 1, 4096] + - Exact: [1024, 3960, 1, 1024] + - Exact: [4096, 3405, 1, 1024] + - Exact: [1024, 10080, 1, 1024] + - 
Exact: [36548, 1216, 1, 1024] + - Exact: [1024, 2592, 1, 1024] + - Exact: [1024, 1568, 1, 1024] + - Exact: [1024, 4445, 1, 1024] + - Exact: [1024, 6272, 1, 1024] + - Exact: [36548, 3584, 1, 1024] + - Exact: [1024, 1827, 1, 1024] + - Exact: [1024, 3220, 1, 1024] + - Exact: [1024, 1856, 1, 1024] + - Exact: [1024, 1760, 1, 1024] + - Exact: [36548, 4235, 1, 1024] + - Exact: [1024, 1984, 1, 1024] + - Exact: [1024, 14720, 1, 1024] + - Exact: [1024, 1152, 1, 1024] + - Exact: [36548, 14976, 1, 1024] + - Exact: [36548, 1152, 1, 1024] + - Exact: [1024, 3392, 1, 1024] + - Exact: [1024, 1408, 1, 1024] + - Exact: [1024, 2080, 1, 1024] + - Exact: [1024, 1824, 1, 1024] + - Exact: [36548, 2432, 1, 1024] + - Exact: [36548, 1827, 1, 1024] + - Exact: [1024, 10176, 1, 1024] + - Exact: [1024, 1952, 1, 1024] + - Exact: [1024, 17024, 1, 1024] + - Exact: [1024, 1472, 1, 1024] + - Exact: [36548, 4459, 1, 1024] + - Exact: [1024, 3712, 1, 1024] + - Exact: [36548, 12928, 1, 1024] + - Exact: [1024, 1632, 1, 1024] + - Exact: [1024, 1696, 1, 1024] + - Exact: [36548, 1764, 1, 1024] + - Exact: [1024, 2944, 1, 1024] + - Exact: [36548, 14080, 1, 1024] + - Exact: [1024, 1280, 1, 1024] + - Exact: [1024, 13440, 1, 1024] + - Exact: [36548, 9120, 1, 1024] + - Exact: [1024, 3008, 1, 1024] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2208, 1, 1024] + - Exact: [1024, 1920, 1, 1024] + - Exact: [36548, 2496, 1, 1024] + - Exact: [1024, 2016, 1, 1024] + - Exact: [1024, 1184, 1, 1024] + - Exact: [1024, 1664, 1, 1024] + - Exact: [1024, 11424, 1, 1024] + - Exact: [1024, 1216, 1, 1024] + - Exact: [36548, 3185, 1, 1024] + - Exact: [36548, 9216, 1, 1024] + - Exact: [1024, 3200, 1, 1024] + - Exact: [1024, 2656, 1, 1024] + - Exact: [1024, 2368, 1, 1024] + - Exact: [1024, 4459, 1, 1024] + - Exact: [1024, 3808, 1, 1024] + - Exact: [1024, 2336, 1, 1024] + - Exact: [1024, 2304, 1, 1024] + - Exact: [1024, 1560, 1, 1024] + - Exact: [1024, 2496, 1, 1024] + - Exact: [1024, 1504, 1, 1024] + - Exact: [1024, 3232, 1, 1024] 
+ - Exact: [36548, 1015, 1, 1024] + - Exact: [1024, 2000, 1, 1024] + - Exact: [36548, 243, 1, 1024] + - Exact: [1024, 13184, 1, 1024] + - Exact: [1024, 2688, 1, 1024] + - Exact: [36548, 950, 1, 1024] + - Exact: [1024, 1764, 1, 1024] + - Exact: [1024, 1376, 1, 1024] + - Exact: [36548, 774, 1, 1024] + - Exact: [1024, 4256, 1, 1024] + - Exact: [36548, 3712, 1, 1024] + - Exact: [1024, 3360, 1, 1024] + - Exact: [1024, 2784, 1, 1024] + - Exact: [1024, 4992, 1, 1024] + - Exact: [36548, 1102, 1, 1024] + - Exact: [1024, 1536, 1, 1024] + - Exact: [1024, 2720, 1, 1024] + - Exact: [1024, 2752, 1, 1024] + - Exact: [1024, 2816, 1, 1024] + - Exact: [1024, 2624, 1, 1024] + - Exact: [1024, 2144, 1, 1024] + - Exact: [36548, 1131, 1, 1024] + - Exact: [1024, 3296, 1, 1024] + - Exact: [36548, 4992, 1, 1024] + - Exact: [1024, 1344, 1, 1024] + - Exact: [36548, 2401, 1, 1024] + - Exact: [1024, 15744, 1, 1024] + - Exact: [1024, 15232, 1, 1024] + - Exact: [1024, 1888, 1, 1024] + - Exact: [1024, 1792, 1, 1024] + - Exact: [36548, 1073, 1, 1024] + - Exact: [36548, 15488, 1, 1024] + - Exact: [1024, 2464, 1, 1024] + - Exact: [1024, 2272, 1, 1024] + - Exact: [1024, 2432, 1, 1024] + - Exact: [1024, 3936, 1, 1024] + - Exact: [36548, 13824, 1, 1024] + - Exact: [1024, 2401, 1, 1024] + - Exact: [1024, 2176, 1, 1024] + - Exact: [1024, 2240, 1, 1024] + - Exact: [1024, 1728, 1, 1024] + - Exact: [1024, 2528, 1, 1024] + - Exact: [1024, 2400, 1, 1024] + - Exact: [1024, 1440, 1, 1024] + - Exact: [1024, 2912, 1, 1024] + - Exact: [1024, 2880, 1, 1024] + - Exact: [1024, 4064, 1, 1024] + - Exact: [1024, 4655, 1, 1024] + - Exact: [36548, 6272, 1, 1024] + - Exact: [768, 2048, 1, 3072] + - Exact: [768, 4096, 1, 3072] + - Exact: [6272, 256, 1, 528] + - Exact: [3136, 2048, 1, 1024] + - Exact: [50176, 128, 1, 256] + - Exact: [12544, 1024, 1, 256] + - Exact: [12544, 256, 1, 1024] + - Exact: [3136, 512, 1, 1024] + - Exact: [3136, 2048, 1, 512] + - Exact: [289, 384, 32, 1024] + - Exact: [4096, 512, 1, 4096] + - Exact: 
[50176, 512, 1, 256] + - Exact: [12544, 1024, 1, 512] + - Exact: [12544, 256, 1, 512] + - Exact: [784, 128, 32, 256] + - Exact: [4096, 512, 1, 9216] + - Exact: [3136, 512, 1, 2048] + - Exact: [1225, 192, 32, 384] + - Exact: [8192, 320, 1, 1280] + - Exact: [8192, 320, 1, 2048] + - Exact: [8192, 384, 1, 1280] + - Exact: [8192, 384, 1, 2048] + - Exact: [8192, 448, 1, 2048] + - Exact: [8192, 448, 1, 1280] + - Exact: [256, 6400, 1, 4096] + - Exact: [512, 3433, 1, 2048] + - Exact: [512, 3439, 1, 2048] + - Exact: [512, 3461, 1, 2048] + - Exact: [512, 3479, 1, 2048] + - Exact: [512, 3494, 1, 2048] + - Exact: [512, 3520, 1, 2048] + - Exact: [512, 3530, 1, 2048] + - Exact: [512, 3541, 1, 2048] + - Exact: [512, 3564, 1, 2048] + - Exact: [512, 3776, 1, 2048] + - Exact: [512, 3859, 1, 512] + - Exact: [512, 3925, 1, 2048] + - Exact: [512, 3944, 1, 2048] + - Exact: [512, 3955, 1, 2048] + - Exact: [512, 3969, 1, 2048] + - Exact: [512, 3976, 1, 2048] + - Exact: [2048, 1232, 1, 512] + - Exact: [2048, 3165, 1, 512] + - Exact: [512, 2387, 1, 512] + - Exact: [512, 2418, 1, 512] + - Exact: [512, 2418, 1, 2048] + - Exact: [512, 2496, 1, 512] + - Exact: [512, 2496, 1, 2048] + - Exact: [512, 2790, 1, 2048] + - Exact: [512, 2864, 1, 2048] + - Exact: [512, 3092, 1, 2048] + - Exact: [512, 3113, 1, 2048] + - Exact: [512, 3137, 1, 2048] + - Exact: [512, 3165, 1, 2048] + - Exact: [512, 3166, 1, 2048] + - Exact: [512, 3194, 1, 2048] + - Exact: [512, 3219, 1, 2048] + - Exact: [512, 3222, 1, 2048] + - Exact: [512, 3234, 1, 2048] + - Exact: [512, 3237, 1, 2048] + - Exact: [512, 3242, 1, 2048] + - Exact: [512, 3246, 1, 2048] + - Exact: [512, 3249, 1, 2048] + - Exact: [512, 3251, 1, 2048] + - Exact: [512, 3257, 1, 2048] + - Exact: [512, 3262, 1, 2048] + - Exact: [512, 3268, 1, 2048] + - Exact: [512, 3282, 1, 2048] + - Exact: [512, 3286, 1, 2048] + - Exact: [512, 3287, 1, 2048] + - Exact: [512, 3293, 1, 2048] + - Exact: [512, 3297, 1, 2048] + - Exact: [512, 3307, 1, 2048] + - Exact: [512, 3314, 1, 
2048] + - Exact: [512, 3315, 1, 2048] + - Exact: [512, 3319, 1, 2048] + - Exact: [512, 3322, 1, 2048] + - Exact: [512, 3323, 1, 2048] + - Exact: [512, 3324, 1, 2048] + - Exact: [512, 3325, 1, 2048] + - Exact: [512, 3327, 1, 2048] + - Exact: [512, 3329, 1, 2048] + - Exact: [512, 3332, 1, 2048] + - Exact: [512, 3336, 1, 2048] + - Exact: [512, 3339, 1, 2048] + - Exact: [512, 3342, 1, 2048] + - Exact: [512, 3344, 1, 2048] + - Exact: [512, 3358, 1, 2048] + - Exact: [512, 3360, 1, 2048] + - Exact: [512, 3364, 1, 2048] + - Exact: [512, 3365, 1, 2048] + - Exact: [512, 3369, 1, 2048] + - Exact: [512, 3370, 1, 2048] + - Exact: [512, 3371, 1, 2048] + - Exact: [512, 3374, 1, 2048] + - Exact: [512, 3376, 1, 2048] + - Exact: [512, 3377, 1, 2048] + - Exact: [512, 3378, 1, 2048] + - Exact: [512, 3381, 1, 2048] + - Exact: [512, 3382, 1, 2048] + - Exact: [512, 3383, 1, 2048] + - Exact: [512, 3384, 1, 2048] + - Exact: [512, 3385, 1, 2048] + - Exact: [512, 3386, 1, 2048] + - Exact: [512, 3388, 1, 2048] + - Exact: [512, 3390, 1, 2048] + - Exact: [512, 3391, 1, 2048] + - Exact: [512, 3396, 1, 2048] + - Exact: [512, 3399, 1, 2048] + - Exact: [512, 3402, 1, 2048] + - Exact: [512, 3410, 1, 2048] + - Exact: [512, 3412, 1, 2048] + - Exact: [512, 3414, 1, 2048] + - Exact: [512, 3415, 1, 2048] + - Exact: [512, 3418, 1, 2048] + - Exact: [512, 3420, 1, 2048] + - Exact: [512, 3422, 1, 2048] + - Exact: [512, 3425, 1, 2048] + - Exact: [512, 3426, 1, 2048] + - Exact: [512, 3427, 1, 2048] + - Exact: [512, 3428, 1, 2048] + - Exact: [512, 3430, 1, 2048] + - Exact: [512, 3431, 1, 2048] + - Exact: [512, 3432, 1, 2048] + - Exact: [512, 3438, 1, 2048] + - Exact: [512, 3440, 1, 2048] + - Exact: [512, 3443, 1, 2048] + - Exact: [512, 3445, 1, 2048] + - Exact: [512, 3447, 1, 2048] + - Exact: [512, 3448, 1, 2048] + - Exact: [512, 3450, 1, 2048] + - Exact: [512, 3451, 1, 2048] + - Exact: [512, 3452, 1, 2048] + - Exact: [512, 3453, 1, 2048] + - Exact: [512, 3455, 1, 2048] + - Exact: [512, 3456, 1, 2048] + - 
Exact: [512, 3457, 1, 2048] + - Exact: [512, 3458, 1, 2048] + - Exact: [512, 3459, 1, 2048] + - Exact: [512, 3460, 1, 2048] + - Exact: [512, 3462, 1, 2048] + - Exact: [512, 3466, 1, 2048] + - Exact: [512, 3467, 1, 2048] + - Exact: [512, 3468, 1, 2048] + - Exact: [512, 3470, 1, 2048] + - Exact: [512, 3471, 1, 2048] + - Exact: [512, 3472, 1, 2048] + - Exact: [512, 3475, 1, 2048] + - Exact: [512, 3476, 1, 2048] + - Exact: [512, 3477, 1, 2048] + - Exact: [512, 3478, 1, 2048] + - Exact: [512, 3480, 1, 2048] + - Exact: [512, 3481, 1, 2048] + - Exact: [512, 3483, 1, 2048] + - Exact: [512, 3484, 1, 2048] + - Exact: [512, 3487, 1, 2048] + - Exact: [512, 3489, 1, 2048] + - Exact: [512, 3490, 1, 2048] + - Exact: [512, 3491, 1, 2048] + - Exact: [512, 3493, 1, 2048] + - Exact: [512, 3495, 1, 2048] + - Exact: [512, 3497, 1, 2048] + - Exact: [512, 3498, 1, 2048] + - Exact: [512, 3499, 1, 2048] + - Exact: [512, 3501, 1, 2048] + - Exact: [512, 3503, 1, 2048] + - Exact: [512, 3505, 1, 2048] + - Exact: [512, 3507, 1, 2048] + - Exact: [512, 3508, 1, 2048] + - Exact: [512, 3509, 1, 2048] + - Exact: [512, 3510, 1, 2048] + - Exact: [512, 3511, 1, 2048] + - Exact: [512, 3513, 1, 2048] + - Exact: [512, 3514, 1, 2048] + - Exact: [512, 3515, 1, 2048] + - Exact: [512, 3517, 1, 2048] + - Exact: [512, 3518, 1, 2048] + - Exact: [512, 3519, 1, 2048] + - Exact: [512, 3523, 1, 2048] + - Exact: [512, 3528, 1, 2048] + - Exact: [512, 3529, 1, 2048] + - Exact: [512, 3531, 1, 2048] + - Exact: [512, 3532, 1, 2048] + - Exact: [512, 3533, 1, 2048] + - Exact: [512, 3534, 1, 2048] + - Exact: [512, 3538, 1, 2048] + - Exact: [512, 3539, 1, 2048] + - Exact: [512, 3540, 1, 2048] + - Exact: [512, 3547, 1, 2048] + - Exact: [512, 3548, 1, 2048] + - Exact: [512, 3552, 1, 2048] + - Exact: [512, 3575, 1, 2048] + - Exact: [512, 3598, 1, 2048] + - Exact: [512, 3599, 1, 2048] + - Exact: [512, 3608, 1, 2048] + - Exact: [512, 3776, 1, 512] + - Exact: [512, 3780, 1, 512] + - Exact: [512, 3780, 1, 2048] + - Exact: [512, 
3780, 1, 33708] + - Exact: [512, 3796, 1, 512] + - Exact: [512, 3796, 1, 2048] + - Exact: [512, 3796, 1, 33708] + - Exact: [512, 3822, 1, 512] + - Exact: [512, 3822, 1, 2048] + - Exact: [512, 3822, 1, 33708] + - Exact: [512, 3835, 1, 512] + - Exact: [512, 3835, 1, 2048] + - Exact: [512, 3840, 1, 512] + - Exact: [512, 3840, 1, 2048] + - Exact: [512, 3840, 1, 33708] + - Exact: [512, 3859, 1, 2048] + - Exact: [512, 3859, 1, 33708] + - Exact: [512, 3864, 1, 512] + - Exact: [512, 3864, 1, 2048] + - Exact: [512, 3870, 1, 512] + - Exact: [512, 3870, 1, 2048] + - Exact: [512, 3870, 1, 33708] + - Exact: [512, 3876, 1, 512] + - Exact: [512, 3876, 1, 2048] + - Exact: [512, 3876, 1, 33708] + - Exact: [512, 3906, 1, 512] + - Exact: [512, 3906, 1, 2048] + - Exact: [512, 3906, 1, 33708] + - Exact: [512, 3910, 1, 512] + - Exact: [512, 3910, 1, 2048] + - Exact: [512, 3910, 1, 33708] + - Exact: [512, 3925, 1, 512] + - Exact: [512, 3925, 1, 33708] + - Exact: [512, 3927, 1, 512] + - Exact: [512, 3942, 1, 512] + - Exact: [512, 3942, 1, 2048] + - Exact: [512, 3942, 1, 33708] + - Exact: [512, 3944, 1, 512] + - Exact: [512, 3944, 1, 33708] + - Exact: [512, 3955, 1, 512] + - Exact: [512, 3955, 1, 33708] + - Exact: [512, 3968, 1, 512] + - Exact: [512, 3968, 1, 2048] + - Exact: [512, 3968, 1, 33708] + - Exact: [512, 3969, 1, 512] + - Exact: [512, 3969, 1, 33708] + - Exact: [512, 3976, 1, 512] + - Exact: [512, 3976, 1, 33708] + - Exact: [512, 3977, 1, 512] + - Exact: [512, 3977, 1, 2048] + - Exact: [512, 3977, 1, 33708] + - Exact: [512, 3978, 1, 512] + - Exact: [512, 3978, 1, 2048] + - Exact: [512, 3978, 1, 33708] + - Exact: [512, 3990, 1, 512] + - Exact: [512, 3990, 1, 2048] + - Exact: [512, 3990, 1, 33708] + - Exact: [512, 3995, 1, 512] + - Exact: [512, 3995, 1, 2048] + - Exact: [512, 3995, 1, 33708] + - Exact: [512, 3996, 1, 512] + - Exact: [512, 3996, 1, 2048] + - Exact: [512, 3996, 1, 33708] + - Exact: [512, 3999, 1, 512] + - Exact: [512, 3999, 1, 2048] + - Exact: [512, 3999, 1, 33708] + 
- Exact: [512, 4005, 1, 512] + - Exact: [512, 4005, 1, 2048] + - Exact: [512, 4005, 1, 33708] + - Exact: [512, 4012, 1, 512] + - Exact: [512, 4012, 1, 2048] + - Exact: [512, 4012, 1, 33708] + - Exact: [512, 4020, 1, 512] + - Exact: [512, 4020, 1, 2048] + - Exact: [512, 4020, 1, 33708] + - Exact: [512, 4026, 1, 512] + - Exact: [512, 4026, 1, 2048] + - Exact: [512, 4026, 1, 33708] + - Exact: [512, 4030, 1, 512] + - Exact: [512, 4030, 1, 2048] + - Exact: [512, 4030, 1, 33708] + - Exact: [512, 4032, 1, 512] + - Exact: [512, 4032, 1, 2048] + - Exact: [512, 4032, 1, 33708] + - Exact: [512, 4050, 1, 512] + - Exact: [512, 4059, 1, 512] + - Exact: [2048, 644, 1, 512] + - Exact: [2048, 668, 1, 512] + - Exact: [2048, 714, 1, 512] + - Exact: [2048, 720, 1, 512] + - Exact: [2048, 722, 1, 512] + - Exact: [2048, 781, 1, 512] + - Exact: [2048, 848, 1, 512] + - Exact: [2048, 872, 1, 512] + - Exact: [2048, 936, 1, 512] + - Exact: [2048, 980, 1, 512] + - Exact: [2048, 1139, 1, 512] + - Exact: [2048, 1184, 1, 512] + - Exact: [2048, 1186, 1, 512] + - Exact: [2048, 1279, 1, 512] + - Exact: [2048, 1290, 1, 512] + - Exact: [2048, 1327, 1, 512] + - Exact: [2048, 1331, 1, 512] + - Exact: [2048, 1341, 1, 512] + - Exact: [2048, 1350, 1, 512] + - Exact: [2048, 1359, 1, 512] + - Exact: [2048, 1391, 1, 512] + - Exact: [2048, 1424, 1, 512] + - Exact: [2048, 1458, 1, 512] + - Exact: [2048, 1462, 1, 512] + - Exact: [2048, 1467, 1, 512] + - Exact: [2048, 1472, 1, 512] + - Exact: [2048, 1520, 1, 512] + - Exact: [2048, 1596, 1, 512] + - Exact: [2048, 1599, 1, 512] + - Exact: [2048, 1615, 1, 512] + - Exact: [2048, 1680, 1, 512] + - Exact: [2048, 1709, 1, 512] + - Exact: [2048, 1902, 1, 512] + - Exact: [2048, 1917, 1, 512] + - Exact: [2048, 2076, 1, 512] + - Exact: [2048, 2195, 1, 512] + - Exact: [2048, 2205, 1, 512] + - Exact: [2048, 2418, 1, 512] + - Exact: [2048, 2496, 1, 512] + - Exact: [2048, 2790, 1, 512] + - Exact: [2048, 2864, 1, 512] + - Exact: [2048, 3092, 1, 512] + - Exact: [2048, 3113, 1, 
512] + - Exact: [2048, 3137, 1, 512] + - Exact: [2048, 3166, 1, 512] + - Exact: [2048, 3194, 1, 512] + - Exact: [2048, 3219, 1, 512] + - Exact: [2048, 3222, 1, 512] + - Exact: [2048, 3234, 1, 512] + - Exact: [2048, 3237, 1, 512] + - Exact: [2048, 3242, 1, 512] + - Exact: [2048, 3246, 1, 512] + - Exact: [2048, 3249, 1, 512] + - Exact: [2048, 3251, 1, 512] + - Exact: [2048, 3257, 1, 512] + - Exact: [2048, 3262, 1, 512] + - Exact: [2048, 3268, 1, 512] + - Exact: [2048, 3282, 1, 512] + - Exact: [2048, 3286, 1, 512] + - Exact: [2048, 3287, 1, 512] + - Exact: [2048, 3293, 1, 512] + - Exact: [2048, 3297, 1, 512] + - Exact: [2048, 3307, 1, 512] + - Exact: [2048, 3314, 1, 512] + - Exact: [2048, 3315, 1, 512] + - Exact: [2048, 3319, 1, 512] + - Exact: [2048, 3322, 1, 512] + - Exact: [2048, 3323, 1, 512] + - Exact: [2048, 3324, 1, 512] + - Exact: [2048, 3325, 1, 512] + - Exact: [2048, 3327, 1, 512] + - Exact: [2048, 3329, 1, 512] + - Exact: [2048, 3332, 1, 512] + - Exact: [2048, 3336, 1, 512] + - Exact: [2048, 3339, 1, 512] + - Exact: [2048, 3342, 1, 512] + - Exact: [2048, 3344, 1, 512] + - Exact: [2048, 3358, 1, 512] + - Exact: [2048, 3360, 1, 512] + - Exact: [2048, 3364, 1, 512] + - Exact: [2048, 3365, 1, 512] + - Exact: [2048, 3369, 1, 512] + - Exact: [2048, 3370, 1, 512] + - Exact: [2048, 3371, 1, 512] + - Exact: [2048, 3374, 1, 512] + - Exact: [2048, 3376, 1, 512] + - Exact: [2048, 3377, 1, 512] + - Exact: [2048, 3378, 1, 512] + - Exact: [2048, 3381, 1, 512] + - Exact: [2048, 3382, 1, 512] + - Exact: [2048, 3383, 1, 512] + - Exact: [2048, 3384, 1, 512] + - Exact: [2048, 3385, 1, 512] + - Exact: [2048, 3386, 1, 512] + - Exact: [2048, 3388, 1, 512] + - Exact: [2048, 3390, 1, 512] + - Exact: [2048, 3391, 1, 512] + - Exact: [2048, 3396, 1, 512] + - Exact: [2048, 3399, 1, 512] + - Exact: [2048, 3402, 1, 512] + - Exact: [2048, 3410, 1, 512] + - Exact: [2048, 3412, 1, 512] + - Exact: [2048, 3414, 1, 512] + - Exact: [2048, 3415, 1, 512] + - Exact: [2048, 3418, 1, 512] + - Exact: 
[2048, 3420, 1, 512] + - Exact: [2048, 3422, 1, 512] + - Exact: [2048, 3425, 1, 512] + - Exact: [2048, 3426, 1, 512] + - Exact: [2048, 3427, 1, 512] + - Exact: [2048, 3428, 1, 512] + - Exact: [2048, 3430, 1, 512] + - Exact: [2048, 3431, 1, 512] + - Exact: [2048, 3432, 1, 512] + - Exact: [2048, 3433, 1, 512] + - Exact: [2048, 3438, 1, 512] + - Exact: [2048, 3439, 1, 512] + - Exact: [2048, 3440, 1, 512] + - Exact: [2048, 3443, 1, 512] + - Exact: [2048, 3445, 1, 512] + - Exact: [2048, 3447, 1, 512] + - Exact: [2048, 3448, 1, 512] + - Exact: [2048, 3450, 1, 512] + - Exact: [2048, 3451, 1, 512] + - Exact: [2048, 3452, 1, 512] + - Exact: [2048, 3453, 1, 512] + - Exact: [2048, 3455, 1, 512] + - Exact: [2048, 3456, 1, 512] + - Exact: [2048, 3457, 1, 512] + - Exact: [2048, 3458, 1, 512] + - Exact: [2048, 3459, 1, 512] + - Exact: [2048, 3460, 1, 512] + - Exact: [2048, 3461, 1, 512] + - Exact: [2048, 3462, 1, 512] + - Exact: [2048, 3466, 1, 512] + - Exact: [2048, 3467, 1, 512] + - Exact: [2048, 3468, 1, 512] + - Exact: [2048, 3470, 1, 512] + - Exact: [2048, 3471, 1, 512] + - Exact: [2048, 3472, 1, 512] + - Exact: [2048, 3475, 1, 512] + - Exact: [2048, 3476, 1, 512] + - Exact: [2048, 3477, 1, 512] + - Exact: [2048, 3478, 1, 512] + - Exact: [2048, 3479, 1, 512] + - Exact: [2048, 3480, 1, 512] + - Exact: [2048, 3481, 1, 512] + - Exact: [2048, 3483, 1, 512] + - Exact: [2048, 3484, 1, 512] + - Exact: [2048, 3487, 1, 512] + - Exact: [2048, 3489, 1, 512] + - Exact: [2048, 3490, 1, 512] + - Exact: [2048, 3491, 1, 512] + - Exact: [2048, 3493, 1, 512] + - Exact: [2048, 3494, 1, 512] + - Exact: [2048, 3495, 1, 512] + - Exact: [2048, 3497, 1, 512] + - Exact: [2048, 3498, 1, 512] + - Exact: [2048, 3501, 1, 512] + - Exact: [2048, 3503, 1, 512] + - Exact: [2048, 3505, 1, 512] + - Exact: [2048, 3507, 1, 512] + - Exact: [2048, 3508, 1, 512] + - Exact: [2048, 3509, 1, 512] + - Exact: [2048, 3510, 1, 512] + - Exact: [2048, 3511, 1, 512] + - Exact: [2048, 3513, 1, 512] + - Exact: [2048, 3514, 1, 
512] + - Exact: [2048, 3515, 1, 512] + - Exact: [2048, 3517, 1, 512] + - Exact: [2048, 3518, 1, 512] + - Exact: [2048, 3519, 1, 512] + - Exact: [2048, 3520, 1, 512] + - Exact: [2048, 3523, 1, 512] + - Exact: [2048, 3528, 1, 512] + - Exact: [2048, 3529, 1, 512] + - Exact: [2048, 3530, 1, 512] + - Exact: [2048, 3531, 1, 512] + - Exact: [2048, 3532, 1, 512] + - Exact: [2048, 3533, 1, 512] + - Exact: [2048, 3534, 1, 512] + - Exact: [2048, 3538, 1, 512] + - Exact: [2048, 3539, 1, 512] + - Exact: [2048, 3540, 1, 512] + - Exact: [2048, 3541, 1, 512] + - Exact: [2048, 3547, 1, 512] + - Exact: [2048, 3548, 1, 512] + - Exact: [2048, 3552, 1, 512] + - Exact: [2048, 3564, 1, 512] + - Exact: [2048, 3575, 1, 512] + - Exact: [2048, 3598, 1, 512] + - Exact: [2048, 3599, 1, 512] + - Exact: [2048, 3608, 1, 512] + - Exact: [2048, 3776, 1, 512] + - Exact: [2048, 3780, 1, 512] + - Exact: [2048, 3796, 1, 512] + - Exact: [2048, 3822, 1, 512] + - Exact: [2048, 3835, 1, 512] + - Exact: [2048, 3840, 1, 512] + - Exact: [2048, 3859, 1, 512] + - Exact: [2048, 3864, 1, 512] + - Exact: [2048, 3870, 1, 512] + - Exact: [2048, 3876, 1, 512] + - Exact: [2048, 3906, 1, 512] + - Exact: [2048, 3910, 1, 512] + - Exact: [2048, 3925, 1, 512] + - Exact: [2048, 3942, 1, 512] + - Exact: [2048, 3944, 1, 512] + - Exact: [2048, 3955, 1, 512] + - Exact: [2048, 3968, 1, 512] + - Exact: [2048, 3969, 1, 512] + - Exact: [2048, 3976, 1, 512] + - Exact: [2048, 3977, 1, 512] + - Exact: [2048, 3978, 1, 512] + - Exact: [2048, 3990, 1, 512] + - Exact: [2048, 3995, 1, 512] + - Exact: [2048, 3996, 1, 512] + - Exact: [2048, 3999, 1, 512] + - Exact: [2048, 4005, 1, 512] + - Exact: [2048, 4012, 1, 512] + - Exact: [2048, 4020, 1, 512] + - Exact: [2048, 4026, 1, 512] + - Exact: [2048, 4030, 1, 512] + - Exact: [2048, 4032, 1, 512] + - Exact: [1024, 4096, 1, 3072] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 3968, 1, 42720] + - 
Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 7200, 1, 42720] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: [1024, 9520, 1, 4096] + - Exact: [1024, 9520, 1, 42720] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [1024, 2048, 1, 30528] + - Exact: [1024, 4096, 1, 30528] + - Exact: [1024, 10240, 1, 256] + - Exact: [1024, 10496, 1, 256] + - Exact: [1024, 11008, 1, 256] + - Exact: [1024, 11264, 1, 256] + - Exact: [1024, 11520, 1, 256] + - Exact: [1024, 12288, 1, 256] + - Exact: [1024, 13312, 1, 256] + - Exact: [1024, 13568, 1, 256] + - Exact: [1024, 14336, 1, 256] + - Exact: [1024, 14592, 1, 256] + - Exact: [1024, 14848, 1, 256] + - Exact: [1024, 15104, 1, 256] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1600, 1, 1] + - Exact: [1024, 16128, 1, 256] + - Exact: [1024, 17152, 1, 256] + - Exact: [1024, 1792, 1, 256] + - Exact: [1024, 18944, 1, 256] + - Exact: [1024, 19712, 1, 256] + - Exact: [1024, 19968, 1, 256] + - Exact: [1024, 20480, 1, 256] + - Exact: [1024, 2048, 1, 256] + - Exact: [1024, 20992, 1, 256] + - Exact: [1024, 21504, 1, 256] + - Exact: [1024, 22016, 1, 256] + - Exact: [1024, 23552, 1, 256] + - Exact: [1024, 2560, 1, 256] + - Exact: [1024, 28672, 1, 256] + - Exact: [1024, 3072, 1, 256] + - Exact: [1024, 3328, 1, 256] + - Exact: [1024, 33536, 1, 256] + - Exact: [1024, 3840, 1, 256] + - Exact: [1024, 40448, 1, 256] + - Exact: [1024, 4096, 1, 256] + - Exact: [1024, 4608, 1, 256] + - Exact: [1024, 4864, 1, 256] + - Exact: [1024, 5120, 1, 256] + - Exact: [1024, 5632, 1, 256] + - Exact: [1024, 6144, 1, 256] + - Exact: [1024, 6400, 1, 256] + - Exact: [1024, 7168, 1, 256] + - Exact: [1024, 7424, 1, 256] + 
- Exact: [1024, 7680, 1, 256] + - Exact: [1024, 7936, 1, 256] + - Exact: [1024, 8192, 1, 256] + - Exact: [1024, 8448, 1, 256] + - Exact: [1024, 8704, 1, 256] + - Exact: [1024, 8960, 1, 256] + - Exact: [1024, 9728, 1, 256] + - Exact: [1024, 9984, 1, 256] + - Exact: [2048, 1024, 1, 1] + - Exact: [2048, 1024, 1, 256] + - Exact: [256, 8976, 1, 10240] + - Exact: [256, 8976, 1, 10496] + - Exact: [256, 8976, 1, 11008] + - Exact: [256, 8976, 1, 11520] + - Exact: [256, 8976, 1, 12288] + - Exact: [256, 8976, 1, 14336] + - Exact: [256, 8976, 1, 14848] + - Exact: [256, 8976, 1, 15104] + - Exact: [256, 8976, 1, 1536] + - Exact: [256, 8976, 1, 15872] + - Exact: [256, 8976, 1, 17152] + - Exact: [256, 8976, 1, 19712] + - Exact: [256, 8976, 1, 19968] + - Exact: [256, 8976, 1, 20480] + - Exact: [256, 8976, 1, 2048] + - Exact: [256, 8976, 1, 20992] + - Exact: [256, 8976, 1, 22016] + - Exact: [256, 8976, 1, 2304] + - Exact: [256, 8976, 1, 2560] + - Exact: [256, 8976, 1, 26112] + - Exact: [256, 8976, 1, 2816] + - Exact: [256, 8976, 1, 3072] + - Exact: [256, 8976, 1, 33536] + - Exact: [256, 8976, 1, 4352] + - Exact: [256, 8976, 1, 44505] + - Exact: [256, 8976, 1, 4864] + - Exact: [256, 8976, 1, 5376] + - Exact: [256, 8976, 1, 5632] + - Exact: [256, 8976, 1, 5888] + - Exact: [256, 8976, 1, 6144] + - Exact: [256, 8976, 1, 6656] + - Exact: [256, 8976, 1, 7168] + - Exact: [256, 8976, 1, 7424] + - Exact: [256, 8976, 1, 8192] + - Exact: [256, 8976, 1, 8448] + - Exact: [256, 8976, 1, 8960] + - Exact: [256, 8976, 1, 9472] + - Exact: [256, 8976, 1, 9728] + - Exact: [256, 8976, 1, 9984] + - Exact: [3200, 1024, 1, 2048] + - Exact: [4096, 1024, 1, 1] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 3072, 1, 3072] + - Exact: [1024, 2048, 1, 3072] + - Exact: [30528, 4096, 1, 1024] + - Exact: [30528, 2048, 1, 1024] + - Exact: [512, 32768, 1, 256] + - Exact: [256, 32768, 1, 128] + - Exact: [1024, 32768, 1, 512] + - Exact: [1024, 32768, 1, 1024] + - Exact: [479, 32768, 1, 1024] + - Exact: [289, 128, 
64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 64, 64] + - Exact: [784, 512, 64, 128] + - Exact: [784, 128, 64, 512] + - Exact: [196, 1024, 64, 256] + - Exact: [196, 256, 64, 1024] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [256, 6912, 1, 4] + - Exact: [512, 4096, 1, 256] + - Exact: [1024, 4096, 1, 512] + - Exact: [480, 4096, 1, 1024] + - Exact: [512, 6912, 1, 256] + - Exact: [1024, 6912, 1, 512] + - Exact: [1024, 6912, 1, 1024] + - Exact: [480, 6912, 1, 1024] + - Exact: [256, 55296, 1, 128] + - Exact: [512, 55296, 1, 256] + - Exact: [1920, 2048, 1, 2048] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3840, 4096, 1, 4096] + - Exact: [7680, 8192, 1, 8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 3072, 1, 3072] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8192, 1, 8192] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [1152, 1152, 1, 384] + - Exact: [1536, 1536, 1, 384] + - Exact: [1920, 1920, 1, 384] + - Exact: [2304, 2304, 1, 384] + - Exact: [2688, 2688, 1, 384] + - Exact: [3072, 3072, 1, 384] + - Exact: [3456, 3456, 1, 384] + - Exact: [3840, 3840, 1, 384] + - Exact: [4224, 4224, 1, 384] + - Exact: [4608, 4608, 1, 384] + - Exact: [4992, 4992, 1, 384] + - Exact: [5376, 5376, 1, 384] + - Exact: [5760, 5760, 1, 384] + - Exact: [6144, 6144, 1, 384] + - Exact: [6528, 6528, 1, 384] + - Exact: [6912, 6912, 
1, 384] + - Exact: [7296, 7296, 1, 384] + - Exact: [7680, 7680, 1, 384] + - Exact: [8064, 8064, 1, 384] + - Exact: [8448, 8448, 1, 384] + - Exact: [8832, 8832, 1, 384] + - Exact: [9216, 9216, 1, 384] + - Exact: [9600, 9600, 1, 384] + - Exact: [9984, 9984, 1, 384] + - Exact: [10368, 10368, 1, 384] + - Exact: [10752, 10752, 1, 384] + - Exact: [11136, 11136, 1, 384] + - Exact: [11520, 11520, 1, 384] + - Exact: [11904, 11904, 1, 384] + - Exact: [12288, 12288, 1, 384] + - Exact: [12672, 12672, 1, 384] + - Exact: [13056, 13056, 1, 384] + - Exact: [13440, 13440, 1, 384] + - Exact: [13824, 13824, 1, 384] + - Exact: [14208, 14208, 1, 384] + - Exact: [14592, 14592, 1, 384] + - Exact: [14976, 14976, 1, 384] + - Exact: [15360, 15360, 1, 384] + - Exact: [15744, 15744, 1, 384] + - Exact: [16128, 16128, 1, 384] + - Exact: [16512, 16512, 1, 384] + - Exact: [16896, 16896, 1, 384] + - Exact: [17280, 17280, 1, 384] + - Exact: [17664, 17664, 1, 384] + - Exact: [18048, 18048, 1, 384] + - Exact: [18432, 18432, 1, 384] + - Exact: [18816, 18816, 1, 384] + - Exact: [19200, 19200, 1, 384] + - Exact: [19584, 19584, 1, 384] + - Exact: [19968, 19968, 1, 384] + - Exact: [20352, 20352, 1, 384] + - Exact: [20736, 20736, 1, 384] + - Exact: [21120, 21120, 1, 384] + - Exact: [21504, 21504, 1, 384] + - Exact: [21888, 21888, 1, 384] + - Exact: [22272, 22272, 1, 384] + - Exact: [22656, 22656, 1, 384] + - Exact: [23040, 23040, 1, 384] + - Exact: [8192, 1024, 1, 1024] + - Exact: [8192, 4096, 1, 1024] + - Exact: [16384, 16384, 1, 16384] + - Exact: [1444, 256, 120, 128] + - Exact: [1444, 256, 139, 128] + - Exact: [1444, 256, 160, 128] + - Exact: [1444, 256, 18, 128] + - Exact: [1444, 256, 19, 128] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 139, 256] + - Exact: [1444, 256, 160, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 256, 120, 512] + - Exact: [361, 256, 139, 512] + - Exact: [361, 256, 160, 512] + - Exact: [361, 256, 18, 512] + - Exact: [361, 256, 
19, 512] + - Exact: [173280, 128, 1, 64] + - Exact: [200716, 128, 1, 64] + - Exact: [231040, 128, 1, 64] + - Exact: [25992, 128, 1, 64] + - Exact: [27436, 128, 1, 64] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [1024, 1280, 1, 2] + - Exact: [1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [1024, 4992, 1, 2] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [1024, 5120, 1, 2] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [1024, 5248, 1, 2] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [1024, 2560, 1, 2] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 1152, 1, 2] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 8192, 1, 33712] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [1024, 9600, 1, 33712] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 10080, 1, 42720] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 6528, 1, 42720] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 7104, 1, 42720] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [480, 32768, 1, 1024] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 1024, 1, 30592] + - Exact: [2048, 1024, 1, 6144] + - Exact: [2048, 1024, 
1, 8192] + - Exact: [8192, 1024, 1, 2048] + - Exact: [1024, 8192, 1, 30592] + - Exact: [1024, 8192, 1, 3072] + - Exact: [512, 512, 256, 64] + - Exact: [1024, 2048, 1, 30592] + - Exact: [1024, 4096, 1, 30592] + - Exact: [512, 512, 128, 64] + - Exact: [2560, 2048, 1, 1920] + - Exact: [2560, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 7680] + - Exact: [640, 2048, 1, 2560] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 4608] + - Exact: [1536, 4096, 1, 50304] + - Exact: [1536, 4096, 1, 6144] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1536, 8192, 1, 4608] + - Exact: [1536, 8192, 1, 50304] + - Exact: [1536, 8192, 1, 6144] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 16384, 1, 3072] + - Exact: [1024, 16384, 1, 4096] + - Exact: [1024, 16384, 1, 50304] + - Exact: [4096, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [1024, 2048, 1, 50304] + - Exact: [1024, 1024, 32, 64] + - Exact: [1024, 4096, 1, 50304] + - Exact: [1024, 1024, 64, 64] + - Exact: [1024, 8192, 1, 50304] + - Exact: [1024, 1024, 128, 64] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 8192, 1, 30528] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 512] + - Exact: [256, 6912, 1, 128] + - Exact: [480, 3456, 1, 1024] + - Exact: [512, 3456, 1, 256] + - Exact: [1024, 1280, 1, 30528] + - Exact: [1024, 1600, 1, 30528] + - Exact: [1024, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [4096, 10240, 1, 1024] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 1640, 1, 30528] + - Exact: [1024, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 4096] + - Exact: [4096, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [1024, 6144, 1, 4096] + - Exact: [4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: [512, 512, 192, 64] + - Exact: [256, 6912, 1, 1] + - Exact: [3136, 128, 64, 64] + - Exact: 
[3136, 256, 64, 128] + - Exact: [784, 512, 64, 256] + - Exact: [3136, 128, 64, 256] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 1024, 64, 512] + - Exact: [784, 256, 64, 512] + - Exact: [784, 512, 64, 512] + - Exact: [196, 512, 64, 1024] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 64] + - Exact: [3136, 256, 32, 128] + - Exact: [784, 512, 32, 256] + - Exact: [3136, 128, 32, 256] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 1024, 32, 512] + - Exact: [784, 256, 32, 512] + - Exact: [784, 512, 32, 512] + - Exact: [196, 512, 32, 1024] + - Exact: [196, 1024, 32, 1024] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [1024, 10224, 1, 3072] + - Exact: [4096, 10224, 1, 1024] + - Exact: [1024, 10240, 1, 3072] + - Exact: [1024, 10192, 1, 3072] + - Exact: [4096, 10192, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [1024, 10200, 1, 3072] + - Exact: [1024, 10184, 1, 1024] + - Exact: [4096, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 3072] + - Exact: [1024, 10208, 1, 4096] + - Exact: [1024, 10224, 1, 2048] + - Exact: [1024, 10240, 1, 2048] + - Exact: [1024, 10120, 1, 1024] + - Exact: [1024, 10192, 1, 2048] + - Exact: [1024, 10152, 1, 1024] + - Exact: [1024, 10080, 1, 3072] + - Exact: [100352, 512, 1, 256] + - Exact: [12544, 2048, 1, 1024] + - Exact: [200704, 512, 1, 256] + - Exact: [25088, 1024, 1, 512] + - Exact: [50176, 1024, 1, 512] + - Exact: [6272, 2048, 1, 1024] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [196, 256, 128, 1024] + - Exact: [196, 256, 256, 1024] + - Exact: [196, 512, 128, 1024] + - Exact: [196, 512, 256, 1024] + - Exact: [3136, 128, 128, 256] + - Exact: [3136, 128, 256, 256] + - Exact: [784, 256, 128, 512] + - Exact: [784, 256, 256, 512] + - Exact: [128, 128, 2048, 64] + - Exact: [1024, 2560, 1, 30528] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 4096] + - Exact: [1024, 12288, 1, 1024] 
+ - Exact: [4096, 12288, 1, 1024] + - Exact: [1024, 1920, 1, 30528] + - Exact: [128, 128, 192, 64] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 2] + - Exact: [3072, 4608, 1, 768] + - Exact: [768, 4608, 1, 3072] + - Exact: [768, 4608, 1, 768] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 2] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [1024, 4608, 1, 1024] + - Exact: [3072, 256, 2, 1024] + - Exact: [2852, 256, 2, 1024] + - Exact: [3220, 256, 2, 1024] + - Exact: [850, 2048, 2, 512] + - Exact: [768, 2048, 2, 512] + - Exact: [2904, 256, 2, 1024] + - Exact: [805, 2048, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [2992, 256, 2, 1024] + - Exact: [3400, 256, 2, 1024] + - Exact: [4032, 256, 2, 1024] + - Exact: [15200, 128, 2, 512] + - Exact: [12288, 128, 2, 512] + - Exact: [888, 2048, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [3456, 256, 2, 1024] + - Exact: [2944, 256, 2, 1024] + - Exact: [2688, 256, 2, 1024] + - Exact: [13824, 128, 2, 512] + - Exact: [3036, 256, 2, 1024] + - Exact: [3168, 256, 2, 1024] + - Exact: [3360, 256, 2, 1024] + - Exact: [3552, 256, 2, 1024] + - Exact: [11616, 128, 2, 512] + - Exact: [4200, 256, 2, 1024] + - Exact: [840, 2048, 2, 512] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [3264, 256, 2, 1024] + - Exact: [713, 2048, 2, 512] + - Exact: [13600, 256, 2, 512] + - Exact: [12880, 256, 2, 512] + - Exact: [12288, 256, 2, 512] + - Exact: [2816, 256, 2, 1024] + - Exact: [850, 2048, 1, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [672, 2048, 2, 512] + - Exact: [13440, 128, 2, 512] + - Exact: [726, 2048, 2, 512] + - Exact: [3500, 256, 2, 1024] + - Exact: [13824, 256, 2, 512] + - Exact: [15200, 256, 2, 512] + - Exact: [3700, 256, 2, 1024] + - Exact: [748, 2048, 2, 512] + - Exact: [3600, 256, 2, 1024] + - Exact: [4032, 1024, 2, 256] + - Exact: [16128, 128, 2, 512] 
+ - Exact: [15200, 128, 1, 512] + - Exact: [13600, 128, 1, 512] + - Exact: [2904, 1024, 2, 256] + - Exact: [2992, 1024, 2, 256] + - Exact: [1536, 2048, 1, 1024] + - Exact: [24576, 128, 1, 256] + - Exact: [24576, 512, 1, 256] + - Exact: [25760, 128, 1, 256] + - Exact: [25760, 512, 1, 256] + - Exact: [6144, 256, 1, 512] + - Exact: [6440, 256, 1, 512] + - Exact: [3036, 1024, 2, 256] + - Exact: [13600, 512, 1, 128] + - Exact: [9408, 512, 2, 128] + - Exact: [56000, 256, 2, 64] + - Exact: [2852, 1024, 2, 256] + - Exact: [2816, 1024, 2, 256] + - Exact: [60800, 256, 1, 64] + - Exact: [2944, 1024, 2, 256] + - Exact: [11776, 512, 2, 128] + - Exact: [11616, 512, 2, 128] + - Exact: [4200, 1024, 2, 256] + - Exact: [54400, 256, 1, 64] + - Exact: [15200, 512, 1, 128] + - Exact: [2688, 1024, 2, 256] + - Exact: [12672, 512, 2, 128] + - Exact: [11968, 512, 2, 128] + - Exact: [46464, 256, 2, 64] + - Exact: [2400, 256, 2, 1024] + - Exact: [2520, 256, 2, 1024] + - Exact: [2400, 1024, 2, 256] + - Exact: [10752, 128, 2, 512] + - Exact: [45632, 256, 2, 64] + - Exact: [2520, 1024, 2, 256] + - Exact: [53760, 256, 2, 64] + - Exact: [2352, 256, 2, 1024] + - Exact: [47872, 256, 2, 64] + - Exact: [47104, 256, 2, 64] + - Exact: [50688, 256, 2, 64] + - Exact: [45056, 256, 2, 64] + - Exact: [13440, 512, 2, 128] + - Exact: [2352, 1024, 2, 256] + - Exact: [11264, 512, 2, 128] + - Exact: [10560, 128, 2, 512] + - Exact: [16128, 512, 2, 128] + - Exact: [37632, 256, 2, 64] + - Exact: [51520, 256, 2, 64] + - Exact: [14000, 512, 2, 128] + - Exact: [10560, 512, 2, 128] + - Exact: [64512, 256, 2, 64] + - Exact: [54400, 256, 2, 64] + - Exact: [3264, 1024, 2, 256] + - Exact: [10752, 512, 2, 128] + - Exact: [3168, 1024, 2, 256] + - Exact: [950, 2048, 1, 512] + - Exact: [55296, 256, 2, 256] + - Exact: [51520, 256, 2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [60800, 256, 2, 256] + - Exact: [54400, 256, 2, 256] + - Exact: [3700, 1024, 2, 256] + - Exact: [60800, 256, 2, 64] + - Exact: [3800, 1024, 1, 256] + 
- Exact: [3400, 1024, 1, 256] + - Exact: [3072, 1024, 2, 256] + - Exact: [3600, 1024, 2, 256] + - Exact: [12288, 512, 2, 128] + - Exact: [49152, 256, 2, 256] + - Exact: [12880, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [42240, 256, 2, 64] + - Exact: [1008, 2048, 2, 512] + - Exact: [3360, 1024, 2, 256] + - Exact: [14208, 512, 2, 128] + - Exact: [56832, 256, 2, 64] + - Exact: [43008, 256, 2, 64] + - Exact: [13600, 512, 2, 128] + - Exact: [3500, 1024, 2, 256] + - Exact: [2640, 1024, 2, 256] + - Exact: [13824, 512, 2, 128] + - Exact: [3800, 256, 2, 1024] + - Exact: [55296, 256, 2, 64] + - Exact: [2640, 256, 2, 1024] + - Exact: [15200, 512, 2, 128] + - Exact: [3552, 1024, 2, 256] + - Exact: [3220, 1024, 2, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [49152, 256, 2, 64] + - Exact: [3400, 1024, 2, 256] + - Exact: [950, 2048, 2, 512] + - Exact: [3800, 1024, 2, 256] + - Exact: [1610, 2048, 1, 1024] + - Exact: [6912, 256, 1, 512] + - Exact: [6800, 256, 1, 512] + - Exact: [27648, 128, 1, 256] + - Exact: [27200, 128, 1, 256] + - Exact: [30400, 128, 1, 256] + - Exact: [7600, 256, 1, 512] + - Exact: [6144, 1024, 1, 512] + - Exact: [6912, 1024, 1, 512] + - Exact: [6440, 1024, 1, 512] + - Exact: [27648, 512, 1, 256] + - Exact: [1728, 2048, 1, 1024] + - Exact: [27200, 512, 1, 256] + - Exact: [6800, 1024, 1, 512] + - Exact: [1700, 2048, 1, 1024] + - Exact: [7600, 1024, 1, 512] + - Exact: [30400, 512, 1, 256] + - Exact: [1900, 2048, 1, 1024] + - Exact: [12544, 1024, 1, 1024] + - Exact: [1024, 1024, 160, 96] + - Exact: [1920, 16384, 1, 25216] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [960, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 2880] + - Exact: [1024, 1024, 40, 96] + - Exact: [1920, 4096, 1, 25216] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [960, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 2880] + - Exact: [1024, 1024, 80, 96] + - Exact: [1920, 8192, 1, 25216] + - Exact: [3840, 8192, 1, 1920] + 
- Exact: [1920, 8192, 1, 3840] + - Exact: [960, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 2880] + - Exact: [1024, 1024, 96, 96] + - Exact: [2304, 16384, 1, 12672] + - Exact: [2304, 16384, 1, 2304] + - Exact: [576, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 1728] + - Exact: [1024, 1024, 24, 96] + - Exact: [2304, 4096, 1, 12672] + - Exact: [2304, 4096, 1, 2304] + - Exact: [576, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 1728] + - Exact: [1024, 1024, 48, 96] + - Exact: [2304, 8192, 1, 12672] + - Exact: [2304, 8192, 1, 2304] + - Exact: [576, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 1728] + - Exact: [1024, 1024, 16, 96] + - Exact: [3072, 4096, 1, 6400] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [384, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1152] + - Exact: [1024, 1024, 32, 96] + - Exact: [3072, 8192, 1, 6400] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [384, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1152] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 4096, 1, 2048] + - Exact: [1024, 2283, 1, 29000] + - Exact: [1024, 2296, 1, 29000] + - Exact: [1024, 2306, 1, 29000] + - Exact: [1024, 2309, 1, 29000] + - Exact: [1024, 2318, 1, 29000] + - Exact: [1024, 2320, 1, 29000] + - Exact: [1024, 2324, 1, 29000] + - Exact: [1024, 2325, 1, 29000] + - Exact: [1024, 2329, 1, 29000] + - Exact: [1024, 2338, 1, 29000] + - Exact: [1024, 2345, 1, 29000] + - Exact: [1024, 2350, 1, 29000] + - Exact: [1024, 2362, 1, 29000] + - Exact: [1024, 2366, 1, 29000] + - Exact: [1024, 2368, 1, 29000] + - Exact: [1024, 2374, 1, 29000] + - Exact: [1024, 2390, 1, 29000] + - Exact: [512, 512, 320, 64] + - Exact: [512, 512, 80, 64] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 3072] + - Exact: [1024, 32768, 1, 4096] + - Exact: [1024, 32768, 1, 50304] + - Exact: [4096, 32768, 1, 1024] + - 
Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [768, 320, 1, 30522] + - Exact: [768, 640, 1, 30522] + - Exact: [768, 1280, 1, 30522] + - Exact: [1024, 780, 1, 30522] + - Exact: [1024, 308, 1, 30522] + - Exact: [1024, 800, 1, 30522] + - Exact: [1024, 820, 1, 30522] + - Exact: [1024, 385, 1, 30522] + - Exact: [1024, 462, 1, 30522] + - Exact: [1024, 640, 1, 30528] + - Exact: [2048, 199, 1, 29000] + - Exact: [2048, 221, 1, 29000] + - Exact: [2048, 224, 1, 29000] + - Exact: [2048, 229, 1, 29000] + - Exact: [2048, 234, 1, 29000] + - Exact: [2048, 242, 1, 29000] + - Exact: [2048, 246, 1, 29000] + - Exact: [2048, 247, 1, 29000] + - Exact: [2048, 256, 1, 29000] + - Exact: [2048, 262, 1, 29000] + - Exact: [2048, 264, 1, 29000] + - Exact: [2048, 265, 1, 29000] + - Exact: [2048, 274, 1, 29000] + - Exact: [2048, 277, 1, 29000] + - Exact: [2048, 279, 1, 29000] + - Exact: [2048, 288, 1, 29000] + - Exact: [2048, 296, 1, 29000] + - Exact: [2048, 315, 1, 29000] + - Exact: [2048, 335, 1, 29000] + - Exact: [1024, 561, 1, 29000] + - Exact: [1024, 574, 1, 29000] + - Exact: [1024, 600, 1, 29000] + - Exact: [1024, 608, 1, 29000] + - Exact: [1024, 615, 1, 29000] + - Exact: [1024, 622, 1, 29000] + - Exact: [1024, 625, 1, 29000] + - 
Exact: [1024, 626, 1, 29000] + - Exact: [1024, 628, 1, 29000] + - Exact: [1024, 636, 1, 29000] + - Exact: [1024, 651, 1, 29000] + - Exact: [1024, 658, 1, 29000] + - Exact: [1024, 669, 1, 29000] + - Exact: [1024, 670, 1, 29000] + - Exact: [1024, 672, 1, 29000] + - Exact: [1024, 684, 1, 29000] + - Exact: [1024, 716, 1, 29000] + - Exact: [1024, 730, 1, 29000] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1600, 512, 1, 1024] + - Exact: [1024, 512, 1, 1] + - Exact: [1024, 512, 1, 64] + - Exact: [2048, 512, 1, 1] + - Exact: [768, 640, 1, 768] + - Exact: [768, 1024, 1, 2] + - Exact: [768, 1024, 1, 768] + - Exact: [768, 1280, 1, 768] + - Exact: [768, 512, 1, 2] + - Exact: [768, 512, 1, 768] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 2] + - Exact: [64, 64, 768, 64] + - Exact: [64, 64, 96, 64] + - Exact: [704, 1024, 1, 128] + - Exact: [1024, 1024, 1, 3328] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 
1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [64, 193600, 1, 64] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [512, 1500, 1, 2816] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [448, 1024, 1, 256] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [2944, 448, 1, 128] + - Exact: [64, 193600, 1, 256] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [704, 1024, 1, 256] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [512, 1500, 1, 2048] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [1024, 700, 1, 512] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [128, 5888, 1, 1280] + - Exact: [256, 
4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 448, 1, 128] + - Exact: [704, 1408, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [128, 2944, 1, 128] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [3025, 64, 64, 64] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [512, 1500, 1, 1536] + - Exact: [128, 3584, 1, 1280] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [1024, 1024, 1, 128] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: 
[2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [448, 1024, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [448, 1856, 1, 3328] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [128, 2368, 1, 256] + - Exact: [5888, 64, 1, 1280] + - Exact: [1024, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 2944, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 3328] + - Exact: [64, 5056, 1, 128] + - Exact: [64, 6784, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1408, 448, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [128, 4288, 1, 3328] + - Exact: [3584, 128, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [256, 4288, 1, 256] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [2560, 128, 1, 2560] + - Exact: [704, 1024, 1, 3328] + 
- Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [128, 3584, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [5888, 64, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [512, 1500, 1, 2560] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [3136, 64, 256, 256] + - Exact: [1024, 512, 1, 2048] + - Exact: [4096, 256, 1, 2048] + - Exact: [2048, 256, 1, 4096] + - Exact: [512, 768, 1, 2048] + - Exact: [2048, 256, 1, 1024] + - Exact: [2048, 200, 1, 512] + - Exact: [4096, 200, 1, 1024] + - Exact: [2048, 200, 1, 4096] + - Exact: [2048, 512, 1, 1024] + - Exact: [1024, 1024, 1, 512] + - Exact: [2048, 512, 1, 4096] + - Exact: [1024, 1024, 1, 4096] + - Exact: [4096, 200, 1, 2048] + - Exact: [2048, 200, 1, 1024] + - Exact: [1024, 768, 1, 512] + - Exact: [2048, 200, 1, 2048] + - Exact: [2048, 256, 1, 2048] + - Exact: [512, 768, 1, 512] + - Exact: [4096, 256, 1, 4096] + - Exact: [1024, 512, 1, 512] + - Exact: [1024, 1024, 1, 2048] + - Exact: [4096, 256, 1, 1024] + - Exact: [512, 768, 1, 1024] + - Exact: [1024, 512, 1, 4096] + - Exact: [4096, 200, 1, 4096] + - Exact: [2048, 256, 1, 512] + - Exact: [1024, 1024, 1, 1024] + - Exact: [4096, 192, 1, 2048] + - Exact: [5329, 64, 64, 160] + - Exact: [1225, 64, 64, 384] + - Exact: [4096, 320, 1, 1280] + - Exact: [4096, 192, 1, 1280] + - Exact: [1225, 96, 64, 384] + - Exact: [4096, 320, 1, 2048] + - Exact: [4096, 256, 1, 1536] + - Exact: [64, 147, 432, 148] + - Exact: [64, 123, 528, 123] + - Exact: [64, 111, 576, 112] + - Exact: [64, 77, 816, 77] + - Exact: [64, 92, 688, 92] + - Exact: [64, 159, 400, 159] + - Exact: [64, 85, 752, 84] + - Exact: [64, 122, 528, 123] + - Exact: [64, 93, 688, 92] + - Exact: [64, 102, 
624, 99] + - Exact: [64, 133, 480, 133] + - Exact: [64, 232, 272, 232] + - Exact: [64, 162, 400, 159] + - Exact: [64, 78, 816, 78] + - Exact: [64, 99, 624, 99] + - Exact: [64, 101, 624, 102] + - Exact: [64, 111, 576, 111] + - Exact: [64, 134, 480, 134] + - Exact: [64, 135, 480, 132] + - Exact: [64, 134, 480, 132] + - Exact: [64, 134, 480, 135] + - Exact: [64, 162, 400, 162] + - Exact: [64, 102, 624, 102] + - Exact: [64, 135, 480, 133] + - Exact: [64, 148, 432, 143] + - Exact: [64, 100, 624, 100] + - Exact: [64, 65, 992, 65] + - Exact: [64, 122, 528, 122] + - Exact: [64, 228, 272, 228] + - Exact: [64, 112, 576, 111] + - Exact: [64, 143, 432, 143] + - Exact: [64, 135, 480, 135] + - Exact: [64, 232, 272, 228] + - Exact: [64, 193, 320, 193] + - Exact: [64, 71, 896, 71] + - Exact: [64, 84, 752, 84] + - Exact: [64, 132, 480, 132] + - Exact: [64, 85, 752, 85] + - Exact: [64, 102, 624, 100] + - Exact: [64, 78, 816, 77] + - Exact: [64, 112, 576, 112] + - Exact: [64, 148, 432, 148] + - Exact: [64, 159, 400, 160] + - Exact: [64, 102, 624, 101] + - Exact: [64, 101, 624, 101] + - Exact: [64, 160, 400, 160] + - Exact: [64, 93, 688, 93] + - Exact: [64, 147, 432, 147] + - Exact: [64, 100, 624, 102] + - Exact: [64, 177, 352, 177] + - Exact: [500, 1024, 1, 512] + - Exact: [512, 1024, 1, 512] + - Exact: [200, 2048, 1, 512] + - Exact: [512, 2000, 1, 1024] + - Exact: [512, 2048, 1, 512] + - Exact: [200, 2000, 1, 100] + - Exact: [200, 2000, 1, 1024] + - Exact: [500, 1024, 1, 2048] + - Exact: [512, 2048, 1, 100] + - Exact: [512, 2048, 1, 2000] + - Exact: [200, 2000, 1, 10] + - Exact: [500, 2048, 1, 1024] + - Exact: [500, 2000, 1, 10] + - Exact: [500, 2048, 1, 100] + - Exact: [512, 1024, 1, 500] + - Exact: [200, 2000, 1, 2000] + - Exact: [500, 2048, 1, 2000] + - Exact: [512, 2048, 1, 1024] + - Exact: [512, 1024, 1, 100] + - Exact: [256, 2000, 1, 10] + - Exact: [512, 2000, 1, 100] + - Exact: [512, 2000, 1, 2048] + - Exact: [500, 1024, 1, 500] + - Exact: [256, 2000, 1, 100] + - Exact: [512, 
1024, 1, 2048] + - Exact: [500, 2048, 1, 2048] + - Exact: [200, 2048, 1, 10] + - Exact: [500, 2000, 1, 512] + - Exact: [500, 1024, 1, 1024] + - Exact: [200, 2000, 1, 500] + - Exact: [256, 2048, 1, 100] + - Exact: [500, 2000, 1, 1024] + - Exact: [256, 2048, 1, 1024] + - Exact: [200, 2048, 1, 1024] + - Exact: [512, 2048, 1, 500] + - Exact: [512, 2000, 1, 10] + - Exact: [500, 1024, 1, 2000] + - Exact: [512, 2000, 1, 512] + - Exact: [500, 2000, 1, 2000] + - Exact: [500, 1024, 1, 10] + - Exact: [256, 2048, 1, 10] + - Exact: [256, 2048, 1, 500] + - Exact: [256, 2048, 1, 2048] + - Exact: [256, 2000, 1, 512] + - Exact: [512, 1024, 1, 2000] + - Exact: [256, 2000, 1, 2000] + - Exact: [256, 2048, 1, 2000] + - Exact: [200, 2048, 1, 100] + - Exact: [200, 2000, 1, 2048] + - Exact: [500, 2048, 1, 512] + - Exact: [500, 2000, 1, 500] + - Exact: [200, 2048, 1, 2048] + - Exact: [200, 2048, 1, 500] + - Exact: [512, 2000, 1, 500] + - Exact: [200, 2048, 1, 2000] + - Exact: [500, 1024, 1, 100] + - Exact: [512, 1024, 1, 10] + - Exact: [512, 1024, 1, 1024] + - Exact: [500, 2048, 1, 10] + - Exact: [200, 2000, 1, 512] + - Exact: [256, 2000, 1, 500] + - Exact: [256, 2048, 1, 512] + - Exact: [256, 2000, 1, 2048] + - Exact: [500, 2048, 1, 500] + - Exact: [256, 2000, 1, 1024] + - Exact: [500, 2000, 1, 2048] + - Exact: [512, 2000, 1, 2000] + - Exact: [512, 2048, 1, 2048] + - Exact: [512, 2048, 1, 10] + - Exact: [500, 2000, 1, 100] + - Exact: [1024, 1131, 1, 1024] + - Exact: [1024, 1102, 1, 1024] + - Exact: [1024, 774, 1, 1024] + - Exact: [4096, 128, 1, 2048] + - Exact: [4096, 128, 1, 3072] + - Exact: [1024, 1120, 1, 1024] + - Exact: [1024, 1015, 1, 1024] + - Exact: [1024, 992, 1, 1024] + - Exact: [1024, 950, 1, 1024] + - Exact: [1024, 1088, 1, 1024] + - Exact: [64, 128, 96, 128] + - Exact: [768, 1024, 1, 3072] + - Exact: [768, 512, 1, 3072] + - Exact: [64, 256, 192, 256] + - Exact: [64, 128, 384, 128] + - Exact: [64, 256, 96, 256] + - Exact: [6272, 112, 1, 512] + - Exact: [2048, 320, 1, 1280] + - 
Exact: [5329, 64, 1, 448] + - Exact: [784, 64, 32, 192] + - Exact: [6272, 64, 1, 480] + - Exact: [6272, 64, 1, 512] + - Exact: [6272, 160, 1, 528] + - Exact: [289, 160, 32, 768] + - Exact: [5329, 64, 32, 160] + - Exact: [5329, 96, 1, 576] + - Exact: [1225, 64, 32, 288] + - Exact: [289, 192, 32, 768] + - Exact: [2048, 448, 1, 1280] + - Exact: [3136, 64, 32, 64] + - Exact: [6272, 128, 1, 528] + - Exact: [6272, 96, 1, 480] + - Exact: [2048, 448, 1, 2048] + - Exact: [784, 96, 32, 192] + - Exact: [1001, 512, 1, 4096] + - Exact: [2048, 192, 1, 1280] + - Exact: [1225, 64, 32, 256] + - Exact: [2048, 256, 1, 1536] + - Exact: [6272, 128, 1, 512] + - Exact: [1568, 384, 1, 832] + - Exact: [1568, 256, 1, 832] + - Exact: [1568, 192, 1, 832] + - Exact: [289, 192, 32, 1024] + - Exact: [1225, 64, 32, 384] + - Exact: [2048, 320, 1, 2048] + - Exact: [2048, 384, 1, 1536] + - Exact: [5041, 96, 1, 576] + - Exact: [6272, 192, 1, 480] + - Exact: [5041, 192, 1, 720] + - Exact: [289, 128, 32, 768] + - Exact: [12544, 64, 1, 147] + - Exact: [6272, 160, 1, 512] + - Exact: [1225, 64, 32, 192] + - Exact: [784, 64, 32, 256] + - Exact: [6272, 144, 1, 512] + - Exact: [8192, 192, 1, 1280] + - Exact: [8192, 192, 1, 2048] + - Exact: [65, 6400, 1, 1024] + - Exact: [512, 1290, 1, 2048] + - Exact: [512, 2205, 1, 2048] + - Exact: [64, 512, 16, 512] + - Exact: [512, 600, 1, 2048] + - Exact: [512, 644, 1, 512] + - Exact: [512, 644, 1, 2048] + - Exact: [512, 668, 1, 2048] + - Exact: [512, 714, 1, 512] + - Exact: [512, 714, 1, 2048] + - Exact: [512, 720, 1, 512] + - Exact: [512, 720, 1, 2048] + - Exact: [512, 722, 1, 2048] + - Exact: [512, 781, 1, 512] + - Exact: [512, 781, 1, 2048] + - Exact: [512, 848, 1, 2048] + - Exact: [512, 872, 1, 2048] + - Exact: [512, 936, 1, 512] + - Exact: [512, 936, 1, 2048] + - Exact: [512, 980, 1, 512] + - Exact: [512, 980, 1, 2048] + - Exact: [512, 1139, 1, 2048] + - Exact: [512, 1184, 1, 2048] + - Exact: [512, 1186, 1, 2048] + - Exact: [512, 1232, 1, 512] + - Exact: [512, 
1232, 1, 2048] + - Exact: [512, 1279, 1, 2048] + - Exact: [512, 1290, 1, 512] + - Exact: [512, 1327, 1, 2048] + - Exact: [512, 1331, 1, 2048] + - Exact: [512, 1341, 1, 2048] + - Exact: [512, 1350, 1, 512] + - Exact: [512, 1350, 1, 2048] + - Exact: [512, 1359, 1, 2048] + - Exact: [512, 1391, 1, 2048] + - Exact: [512, 1424, 1, 512] + - Exact: [512, 1424, 1, 2048] + - Exact: [512, 1458, 1, 512] + - Exact: [512, 1458, 1, 2048] + - Exact: [512, 1462, 1, 512] + - Exact: [512, 1462, 1, 2048] + - Exact: [512, 1467, 1, 2048] + - Exact: [512, 1472, 1, 2048] + - Exact: [512, 1520, 1, 512] + - Exact: [512, 1520, 1, 2048] + - Exact: [512, 1596, 1, 512] + - Exact: [512, 1596, 1, 2048] + - Exact: [512, 1599, 1, 512] + - Exact: [512, 1599, 1, 2048] + - Exact: [512, 1615, 1, 512] + - Exact: [512, 1615, 1, 2048] + - Exact: [512, 1680, 1, 512] + - Exact: [512, 1680, 1, 2048] + - Exact: [512, 1709, 1, 2048] + - Exact: [512, 1890, 1, 512] + - Exact: [512, 1902, 1, 2048] + - Exact: [512, 1917, 1, 512] + - Exact: [512, 1917, 1, 2048] + - Exact: [512, 2076, 1, 2048] + - Exact: [512, 2195, 1, 2048] + - Exact: [512, 2205, 1, 512] + - Exact: [2048, 198, 1, 512] + - Exact: [2048, 207, 1, 512] + - Exact: [2048, 208, 1, 512] + - Exact: [2048, 245, 1, 512] + - Exact: [2048, 246, 1, 512] + - Exact: [2048, 264, 1, 512] + - Exact: [2048, 401, 1, 512] + - Exact: [2048, 439, 1, 512] + - Exact: [2048, 443, 1, 512] + - Exact: [2048, 446, 1, 512] + - Exact: [2048, 465, 1, 512] + - Exact: [2048, 468, 1, 512] + - Exact: [2048, 493, 1, 512] + - Exact: [2048, 495, 1, 512] + - Exact: [2048, 511, 1, 512] + - Exact: [2048, 512, 1, 512] + - Exact: [2048, 540, 1, 512] + - Exact: [2048, 550, 1, 512] + - Exact: [2048, 560, 1, 512] + - Exact: [2048, 600, 1, 512] + - Exact: [64, 64, 496, 64] + - Exact: [64, 65, 496, 64] + - Exact: [64, 65, 496, 65] + - Exact: [64, 70, 216, 70] + - Exact: [64, 71, 216, 71] + - Exact: [64, 78, 248, 77] + - Exact: [64, 80, 152, 80] + - Exact: [64, 93, 344, 93] + - Exact: [64, 102, 312, 
102] + - Exact: [64, 122, 264, 122] + - Exact: [64, 122, 264, 123] + - Exact: [64, 123, 264, 123] + - Exact: [64, 512, 96, 512] + - Exact: [64, 512, 128, 512] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [2048, 512, 1, 2048] + - Exact: [512, 1600, 1, 32] + - Exact: [512, 1600, 1, 512] + - Exact: [560, 1600, 1, 1024] + - Exact: [1024, 512, 1, 3072] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 256] + - Exact: [1225, 64, 64, 288] + - Exact: [5329, 80, 64, 64] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 2048] + - Exact: [64, 448, 32, 2048] + - Exact: [5329, 80, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [256, 4096, 1, 4] + - Exact: [960, 1024, 1, 1024] + - Exact: [768, 768, 1, 768] + - Exact: [768, 768, 1, 384] + - Exact: [100, 128, 120, 512] + - Exact: [100, 128, 139, 512] + - Exact: [100, 128, 160, 512] + - Exact: [22500, 64, 1, 147] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 616, 1, 1024] + - Exact: [64, 128, 128, 128] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 2] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 780, 1, 1024] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 800, 1, 1024] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 820, 1, 1024] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 960, 1, 64] + - Exact: [64, 512, 256, 512] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - 
Exact: [96, 1024, 128, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 512] + - Exact: [256, 3456, 1, 128] + - Exact: [256, 4096, 1, 128] + - Exact: [480, 864, 1, 1024] + - Exact: [512, 864, 1, 256] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [256, 4096, 1, 1] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [950, 512, 2, 2048] + - Exact: [3400, 256, 1, 1024] + - Exact: [3800, 256, 1, 1024] + - Exact: [850, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [950, 256, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [51520, 64, 2, 256] + - Exact: [46464, 64, 2, 256] + - Exact: [49152, 64, 2, 256] + - Exact: [1900, 512, 1, 1024] + - Exact: [1700, 512, 1, 1024] + - Exact: [1610, 512, 1, 1024] + - Exact: [1536, 512, 1, 1024] + - Exact: [1728, 512, 1, 1024] + - Exact: [1024, 1024, 1, 320] + - Exact: [51520, 64, 2, 64] + - Exact: [55296, 64, 2, 64] + - Exact: [49152, 64, 2, 64] + - Exact: [54400, 64, 2, 64] + - Exact: [42240, 64, 2, 256] + - Exact: [672, 512, 2, 2048] + - Exact: [54400, 64, 2, 256] + - Exact: [56832, 64, 2, 256] + - Exact: [55296, 64, 2, 256] + - Exact: [60800, 64, 2, 64] + - Exact: [660, 512, 2, 2048] + - Exact: [768, 512, 2, 2048] + - Exact: [43008, 64, 2, 256] + - Exact: [864, 256, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [768, 256, 2, 2048] + - Exact: [45632, 64, 2, 256] + - Exact: [713, 512, 2, 2048] + - Exact: [805, 256, 2, 2048] + - Exact: [60800, 64, 2, 256] + - Exact: [850, 256, 2, 2048] + - Exact: [1024, 1024, 1, 81] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - 
Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 80, 1, 30522] + - Exact: [1024, 120, 1, 30522] + - Exact: [1024, 77, 1, 30522] + - Exact: [1024, 200, 1, 30522] + - Exact: [1024, 160, 1, 30522] + - Exact: [1024, 180, 1, 30522] + - Exact: [1024, 160, 1, 30528] + - Exact: [1024, 240, 1, 30528] + - Exact: [2560, 109, 1, 29000] + - Exact: [2560, 121, 1, 29000] + - Exact: [2560, 65, 1, 29000] + - Exact: [2560, 66, 1, 29000] + - Exact: [2560, 67, 1, 29000] + - Exact: [2560, 69, 1, 29000] + - Exact: [2560, 70, 1, 29000] + - Exact: [2560, 71, 1, 29000] + - Exact: [2560, 73, 1, 29000] + - Exact: [2560, 74, 1, 29000] + - Exact: [2560, 75, 1, 29000] + - Exact: [2560, 77, 1, 29000] + - Exact: [2560, 78, 1, 29000] + - Exact: [2560, 80, 1, 29000] + - Exact: [2560, 81, 1, 29000] + - Exact: [2560, 82, 1, 29000] + - Exact: [2560, 83, 1, 29000] + - Exact: [2560, 84, 1, 29000] + - Exact: [2560, 88, 1, 29000] + - Exact: [2560, 89, 1, 29000] + - Exact: [2560, 90, 1, 29000] + - Exact: [2560, 92, 1, 29000] + - Exact: [2560, 95, 
1, 29000] + - Exact: [2560, 98, 1, 29000] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 200, 1, 32] + - Exact: [1024, 200, 1, 1] + - Exact: [512, 200, 1, 1] + - Exact: [768, 320, 1, 768] + - Exact: [768, 160, 1, 768] + - Exact: [1024, 120, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [2368, 64, 1, 3328] + - Exact: [64, 3584, 1, 1280] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [448, 448, 1, 3328] + - Exact: [1024, 256, 1, 3328] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 3584, 1, 3328] + - Exact: [1760, 128, 1, 1760] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [35, 8457, 1, 1760] + - Exact: [64, 2944, 1, 128] + - Exact: [8448, 32, 1, 2816] + - Exact: 
[1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [2048, 128, 1, 2048] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [35, 8457, 1, 2560] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1024, 128, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [1408, 128, 1, 128] + - Exact: [128, 1024, 1, 1280] + - Exact: [2944, 64, 1, 128] + - Exact: [1024, 128, 1, 3328] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [64, 4288, 1, 3328] + - Exact: [2944, 64, 1, 3328] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [64, 4288, 1, 128] + - Exact: [3584, 64, 1, 1280] + - Exact: [1408, 128, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [128, 1500, 1, 1280] + - Exact: [35, 8457, 1, 4096] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [35, 8457, 1, 2048] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [4096, 64, 1, 4096] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - 
Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [1024, 256, 1, 128] + - Exact: [256, 704, 1, 128] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [1856, 64, 1, 256] + - Exact: [2560, 32, 1, 2560] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [176, 1500, 1, 1408] + - Exact: [1856, 128, 1, 256] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [1024, 256, 1, 1024] + - Exact: [512, 200, 1, 512] + - Exact: [1024, 200, 1, 1024] + - Exact: [512, 256, 1, 1024] + - Exact: [1024, 256, 1, 2048] + - Exact: [1024, 200, 1, 4096] + - Exact: [1024, 200, 1, 512] + - Exact: [512, 200, 1, 1024] + - Exact: [512, 256, 1, 512] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 200, 1, 2048] + - Exact: [1024, 256, 1, 512] + - Exact: [512, 200, 1, 2048] + - Exact: [64, 32, 1984, 32] + - Exact: [64, 38, 1680, 38] + - Exact: [64, 59, 1088, 59] + - Exact: [64, 54, 1184, 54] + - Exact: [64, 49, 1296, 49] + - Exact: [64, 45, 1424, 45] + - Exact: [64, 35, 1808, 35] + - Exact: [64, 41, 1552, 41] + - Exact: [512, 512, 1, 1024] + - Exact: [512, 512, 1, 2000] + - Exact: [100, 1024, 1, 2048] + - Exact: [100, 2000, 1, 1024] + - Exact: [128, 2000, 1, 100] + - Exact: [64, 2000, 1, 1024] + - Exact: [100, 1024, 1, 1024] + - Exact: [128, 1024, 1, 512] + - Exact: [512, 500, 1, 2000] + - Exact: [500, 512, 1, 100] + - Exact: [100, 1024, 1, 500] + - Exact: [128, 2000, 1, 512] + - Exact: [256, 1024, 1, 
100] + - Exact: [200, 500, 1, 1024] + - Exact: [100, 2000, 1, 512] + - Exact: [200, 512, 1, 100] + - Exact: [64, 2048, 1, 10] + - Exact: [64, 2048, 1, 500] + - Exact: [512, 512, 1, 512] + - Exact: [500, 500, 1, 2000] + - Exact: [256, 500, 1, 10] + - Exact: [512, 500, 1, 512] + - Exact: [128, 1024, 1, 2000] + - Exact: [100, 2000, 1, 2048] + - Exact: [256, 512, 1, 10] + - Exact: [64, 2000, 1, 2048] + - Exact: [64, 2048, 1, 512] + - Exact: [64, 2000, 1, 10] + - Exact: [128, 1024, 1, 500] + - Exact: [200, 512, 1, 1024] + - Exact: [128, 2048, 1, 10] + - Exact: [64, 2048, 1, 100] + - Exact: [64, 2000, 1, 100] + - Exact: [200, 500, 1, 100] + - Exact: [500, 500, 1, 500] + - Exact: [128, 2048, 1, 512] + - Exact: [100, 2048, 1, 500] + - Exact: [500, 500, 1, 2048] + - Exact: [128, 2000, 1, 2000] + - Exact: [256, 500, 1, 1024] + - Exact: [64, 2048, 1, 2000] + - Exact: [100, 2048, 1, 1024] + - Exact: [128, 1024, 1, 100] + - Exact: [256, 1024, 1, 2048] + - Exact: [500, 512, 1, 512] + - Exact: [256, 500, 1, 2000] + - Exact: [256, 512, 1, 100] + - Exact: [128, 2000, 1, 500] + - Exact: [200, 512, 1, 2048] + - Exact: [64, 2048, 1, 2048] + - Exact: [200, 1024, 1, 2048] + - Exact: [512, 512, 1, 10] + - Exact: [512, 500, 1, 10] + - Exact: [200, 512, 1, 10] + - Exact: [500, 500, 1, 1024] + - Exact: [256, 1024, 1, 512] + - Exact: [256, 500, 1, 512] + - Exact: [200, 500, 1, 2048] + - Exact: [100, 2000, 1, 10] + - Exact: [100, 2048, 1, 2048] + - Exact: [128, 1024, 1, 2048] + - Exact: [100, 2000, 1, 500] + - Exact: [100, 2048, 1, 100] + - Exact: [100, 1024, 1, 10] + - Exact: [100, 1024, 1, 2000] + - Exact: [256, 512, 1, 500] + - Exact: [100, 2000, 1, 100] + - Exact: [128, 1024, 1, 10] + - Exact: [100, 2048, 1, 10] + - Exact: [512, 500, 1, 100] + - Exact: [128, 2000, 1, 1024] + - Exact: [200, 1024, 1, 500] + - Exact: [256, 512, 1, 2000] + - Exact: [256, 1024, 1, 2000] + - Exact: [200, 512, 1, 500] + - Exact: [64, 2000, 1, 512] + - Exact: [200, 1024, 1, 100] + - Exact: [200, 1024, 1, 1024] + 
- Exact: [500, 512, 1, 2000] + - Exact: [200, 500, 1, 512] + - Exact: [256, 512, 1, 512] + - Exact: [512, 512, 1, 500] + - Exact: [100, 1024, 1, 512] + - Exact: [128, 1024, 1, 1024] + - Exact: [200, 512, 1, 2000] + - Exact: [256, 1024, 1, 500] + - Exact: [200, 1024, 1, 512] + - Exact: [256, 500, 1, 500] + - Exact: [256, 500, 1, 2048] + - Exact: [512, 500, 1, 1024] + - Exact: [256, 512, 1, 1024] + - Exact: [128, 2048, 1, 1024] + - Exact: [500, 512, 1, 500] + - Exact: [200, 500, 1, 500] + - Exact: [64, 2000, 1, 2000] + - Exact: [128, 2000, 1, 2048] + - Exact: [256, 1024, 1, 10] + - Exact: [256, 1024, 1, 1024] + - Exact: [500, 500, 1, 10] + - Exact: [256, 500, 1, 100] + - Exact: [256, 512, 1, 2048] + - Exact: [200, 1024, 1, 2000] + - Exact: [100, 2048, 1, 512] + - Exact: [512, 500, 1, 2048] + - Exact: [128, 2048, 1, 2000] + - Exact: [500, 512, 1, 2048] + - Exact: [200, 500, 1, 2000] + - Exact: [500, 512, 1, 1024] + - Exact: [100, 1024, 1, 100] + - Exact: [64, 2000, 1, 500] + - Exact: [128, 2048, 1, 2048] + - Exact: [128, 2000, 1, 10] + - Exact: [500, 512, 1, 10] + - Exact: [200, 512, 1, 512] + - Exact: [512, 500, 1, 500] + - Exact: [512, 512, 1, 100] + - Exact: [500, 500, 1, 512] + - Exact: [128, 2048, 1, 500] + - Exact: [200, 500, 1, 10] + - Exact: [100, 2048, 1, 2000] + - Exact: [200, 1024, 1, 10] + - Exact: [64, 2048, 1, 1024] + - Exact: [100, 2000, 1, 2000] + - Exact: [500, 500, 1, 100] + - Exact: [128, 2048, 1, 100] + - Exact: [4096, 64, 1, 2048] + - Exact: [4096, 91, 1, 2048] + - Exact: [4096, 86, 1, 3072] + - Exact: [4096, 49, 1, 2048] + - Exact: [4096, 91, 1, 3072] + - Exact: [4096, 64, 1, 3072] + - Exact: [4096, 63, 1, 3072] + - Exact: [4096, 96, 1, 2048] + - Exact: [4096, 32, 1, 2048] + - Exact: [4096, 49, 1, 3072] + - Exact: [1024, 96, 1, 1024] + - Exact: [4096, 86, 1, 2048] + - Exact: [4096, 96, 1, 3072] + - Exact: [4096, 35, 1, 3072] + - Exact: [4096, 50, 1, 2048] + - Exact: [36548, 32, 1, 1024] + - Exact: [4096, 32, 1, 3072] + - Exact: [1024, 243, 1, 
1024] + - Exact: [4096, 50, 1, 3072] + - Exact: [1024, 128, 1, 1024] + - Exact: [1024, 216, 1, 1024] + - Exact: [4096, 35, 1, 2048] + - Exact: [4096, 63, 1, 2048] + - Exact: [289, 256, 1, 1568] + - Exact: [3025, 64, 1, 363] + - Exact: [784, 32, 32, 192] + - Exact: [289, 256, 1, 2016] + - Exact: [21609, 32, 1, 288] + - Exact: [1225, 192, 1, 1728] + - Exact: [784, 96, 1, 800] + - Exact: [1225, 64, 1, 1200] + - Exact: [729, 192, 1, 1600] + - Exact: [6272, 32, 1, 528] + - Exact: [1568, 160, 1, 832] + - Exact: [289, 256, 1, 1792] + - Exact: [784, 32, 32, 256] + - Exact: [6272, 32, 1, 512] + - Exact: [289, 384, 1, 3456] + - Exact: [289, 384, 1, 2592] + - Exact: [1225, 32, 32, 192] + - Exact: [1568, 128, 1, 832] + - Exact: [1225, 48, 32, 288] + - Exact: [1001, 128, 1, 2048] + - Exact: [2048, 174, 1, 512] + - Exact: [2048, 189, 1, 512] + - Exact: [64, 35, 904, 35] + - Exact: [64, 103, 16, 103] + - Exact: [64, 104, 16, 103] + - Exact: [64, 123, 16, 112] + - Exact: [64, 123, 16, 123] + - Exact: [512, 540, 1, 512] + - Exact: [512, 540, 1, 2048] + - Exact: [512, 550, 1, 512] + - Exact: [512, 550, 1, 2048] + - Exact: [512, 560, 1, 512] + - Exact: [512, 560, 1, 2048] + - Exact: [2048, 160, 1, 512] + - Exact: [2048, 184, 1, 512] + - Exact: [512, 160, 1, 2048] + - Exact: [512, 174, 1, 2048] + - Exact: [512, 182, 1, 512] + - Exact: [512, 184, 1, 512] + - Exact: [512, 184, 1, 2048] + - Exact: [512, 189, 1, 512] + - Exact: [512, 189, 1, 2048] + - Exact: [512, 198, 1, 2048] + - Exact: [512, 206, 1, 512] + - Exact: [512, 207, 1, 2048] + - Exact: [512, 208, 1, 512] + - Exact: [512, 208, 1, 2048] + - Exact: [512, 224, 1, 512] + - Exact: [512, 245, 1, 2048] + - Exact: [512, 246, 1, 512] + - Exact: [512, 246, 1, 2048] + - Exact: [512, 264, 1, 512] + - Exact: [512, 264, 1, 2048] + - Exact: [512, 401, 1, 2048] + - Exact: [512, 439, 1, 2048] + - Exact: [512, 443, 1, 2048] + - Exact: [512, 446, 1, 2048] + - Exact: [512, 455, 1, 512] + - Exact: [512, 465, 1, 512] + - Exact: [512, 465, 1, 2048] 
+ - Exact: [512, 468, 1, 512] + - Exact: [512, 468, 1, 2048] + - Exact: [512, 476, 1, 512] + - Exact: [512, 493, 1, 512] + - Exact: [512, 493, 1, 2048] + - Exact: [512, 495, 1, 2048] + - Exact: [512, 511, 1, 2048] + - Exact: [512, 512, 1, 2048] + - Exact: [64, 59, 512, 59] + - Exact: [64, 59, 544, 59] + - Exact: [256, 1024, 1, 1] + - Exact: [257, 1024, 1, 4096] + - Exact: [512, 215, 1, 2048] + - Exact: [512, 256, 1, 2048] + - Exact: [560, 200, 1, 1024] + - Exact: [768, 215, 1, 2048] + - Exact: [768, 256, 1, 2048] + - Exact: [32, 33, 1600, 33] + - Exact: [512, 512, 1, 64] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - Exact: [1225, 48, 64, 288] + - Exact: [49, 2048, 64, 512] + - Exact: [49, 512, 64, 2048] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [384, 384, 1, 384] + - Exact: [100, 128, 18, 512] + - Exact: [100, 128, 19, 512] + - Exact: [1444, 128, 1, 576] + - Exact: [361, 512, 1, 2304] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 32] + - Exact: [64, 35, 4608, 35] + - Exact: [256, 864, 1, 128] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [3136, 64, 1, 576] + - Exact: [784, 128, 1, 1152] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 512, 256, 2048] + - Exact: [1024, 128, 1, 2] + - Exact: [1024, 96, 1, 2] + - Exact: [1909283, 40, 1, 40] + - Exact: [3818566, 40, 1, 40] + - Exact: [2560, 35, 1, 29000] + - Exact: [2560, 36, 1, 29000] + - Exact: [2560, 39, 1, 29000] + - Exact: [2560, 40, 1, 29000] + - Exact: [2560, 42, 1, 29000] + - Exact: [2560, 43, 1, 29000] + - Exact: [2560, 44, 1, 29000] + - Exact: [2560, 46, 
1, 29000] + - Exact: [2560, 48, 1, 29000] + - Exact: [2560, 49, 1, 29000] + - Exact: [2560, 50, 1, 29000] + - Exact: [2560, 51, 1, 29000] + - Exact: [2560, 53, 1, 29000] + - Exact: [2560, 54, 1, 29000] + - Exact: [2560, 55, 1, 29000] + - Exact: [2560, 56, 1, 29000] + - Exact: [2560, 57, 1, 29000] + - Exact: [2560, 58, 1, 29000] + - Exact: [2560, 59, 1, 29000] + - Exact: [2560, 61, 1, 29000] + - Exact: [2560, 63, 1, 29000] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1760, 32, 1, 1760] + - Exact: [3584, 4, 1, 1280] + - Exact: [2560, 16, 1, 2560] + - Exact: [2944, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [1760, 16, 1, 1760] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [8448, 4, 1, 2816] + - Exact: [1856, 4, 1, 1280] + - Exact: [4608, 1, 1, 1536] + - Exact: [7680, 4, 1, 2560] + - Exact: [8448, 16, 1, 2816] + - Exact: [3072, 2, 1, 1024] + - Exact: [2368, 4, 1, 256] + - Exact: [7680, 1, 1, 2560] + - Exact: [4608, 2, 1, 1536] + - Exact: [4608, 4, 1, 1536] + - Exact: [3072, 1, 1, 128] + - Exact: [2048, 32, 1, 2048] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 1280] + - Exact: [2048, 16, 1, 2048] + - Exact: [5888, 4, 1, 128] + - Exact: [8448, 1, 1, 2816] + - Exact: [1408, 4, 1, 256] + - Exact: 
[6144, 4, 1, 2560] + - Exact: [3072, 1, 1, 1024] + - Exact: [5056, 4, 1, 1280] + - Exact: [3072, 16, 1, 1024] + - Exact: [1408, 4, 1, 3328] + - Exact: [6144, 1, 1, 2560] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [1408, 4, 1, 128] + - Exact: [1856, 4, 1, 256] + - Exact: [6784, 4, 1, 128] + - Exact: [2944, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [3072, 4, 1, 1024] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [7680, 2, 1, 2560] + - Exact: [7680, 16, 1, 2560] + - Exact: [4224, 1, 1, 128] + - Exact: [8448, 2, 1, 2816] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [3584, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [4608, 16, 1, 1536] + - Exact: [6144, 2, 1, 2560] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [4096, 29, 1, 2048] + - Exact: [4096, 25, 1, 2048] + - Exact: [4096, 29, 1, 3072] + - Exact: [4096, 24, 1, 2048] + - Exact: [36548, 1, 1, 1024] + - Exact: [4096, 27, 1, 2048] + - Exact: [4096, 1, 1, 2048] + - Exact: [4096, 24, 1, 3072] + - Exact: [4096, 27, 1, 3072] + - Exact: [36548, 25, 1, 1024] + - Exact: [4096, 1, 1, 3072] + - Exact: [4096, 25, 1, 3072] + - Exact: [36548, 24, 1, 1024] + - Exact: [6272, 16, 1, 480] + - Exact: [1568, 32, 1, 832] + - Exact: [1568, 48, 1, 832] + - Exact: [6272, 24, 1, 512] + - Exact: [2048, 1, 1, 512] + - Exact: [2048, 2, 1, 2] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2] + - Exact: [2560, 4, 1, 2560] + - Exact: [12288, 12, 2, 256] + - Exact: [12288, 3, 2, 256] + - Exact: [51520, 12, 2, 256] + - Exact: [51520, 3, 2, 256] + - Exact: [15200, 12, 2, 256] + - Exact: [15200, 3, 2, 256] + - Exact: [3456, 3, 2, 256] + - Exact: [13600, 12, 2, 256] 
+ - Exact: [12880, 3, 2, 256] + - Exact: [3400, 3, 2, 256] + - Exact: [12880, 12, 2, 256] + - Exact: [13824, 12, 2, 256] + - Exact: [13824, 3, 2, 256] + - Exact: [13600, 3, 2, 256] + - Exact: [3456, 12, 2, 256] + - Exact: [3800, 3, 2, 256] + - Exact: [3400, 12, 2, 256] + - Exact: [3800, 12, 2, 256] + - Exact: [55296, 3, 2, 256] + - Exact: [3220, 3, 2, 256] + - Exact: [3072, 3, 2, 256] + - Exact: [3220, 12, 2, 256] + - Exact: [3072, 12, 2, 256] + - Exact: [54400, 3, 2, 256] + - Exact: [60800, 12, 2, 256] + - Exact: [60800, 3, 2, 256] + - Exact: [1909283, 11, 1, 11] + - Exact: [3818566, 11, 1, 11] + - Exact: [2048, 8, 1, 2] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2] + - Exact: [2560, 2, 1, 2560] + - Exact: [2560, 27, 1, 29000] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [35, 1500, 1, 2560] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 1280] + - Exact: [35, 1500, 1, 2048] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 
1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 2368, 1, 128] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 1408, 1, 128] + - Exact: [16, 2000, 1, 2048] + - Exact: [2, 2048, 1, 2000] + - Exact: [32, 2000, 1, 2048] + - Exact: [10, 2000, 1, 1024] + - Exact: [2, 2000, 1, 100] + - Exact: [10, 2000, 1, 512] + - Exact: [32, 2000, 1, 500] + - Exact: [32, 2000, 1, 1024] + - Exact: [4, 2048, 1, 500] + - Exact: [16, 2000, 1, 500] + - Exact: [4, 2048, 1, 100] + - Exact: [16, 2000, 1, 100] + - Exact: [4, 2000, 1, 10] + - Exact: [10, 2000, 1, 10] + - Exact: [2, 2048, 1, 512] + - Exact: [10, 2048, 1, 100] + - Exact: [8, 2048, 1, 100] + - Exact: [2, 2048, 1, 1024] + - Exact: [16, 2000, 1, 1024] + - Exact: [10, 2000, 1, 2000] + - Exact: [8, 2000, 1, 500] + - Exact: [16, 2000, 1, 2000] + - Exact: [10, 2048, 1, 2048] + - Exact: [8, 2000, 1, 512] + - Exact: [2, 2000, 1, 2048] + - Exact: [16, 2048, 1, 500] + - Exact: [8, 2048, 1, 1024] + - Exact: [2, 2000, 1, 500] + - Exact: [32, 2048, 1, 100] + - Exact: [10, 2048, 1, 500] + - Exact: [4, 2000, 1, 2048] + - Exact: [8, 2000, 1, 1024] + - Exact: [32, 2048, 1, 512] + - Exact: [32, 2048, 1, 1024] + - Exact: [32, 2048, 1, 500] + - Exact: [10, 2048, 1, 1024] + - Exact: [8, 2048, 1, 2048] + - Exact: [16, 2048, 1, 2048] + - Exact: [8, 2000, 1, 10] + - Exact: [4, 2000, 1, 2000] + - Exact: [8, 2048, 1, 512] + - Exact: [8, 2000, 1, 2048] + - Exact: [32, 2048, 1, 2000] + - Exact: [16, 2000, 1, 10] + - Exact: [8, 2048, 1, 2000] + - Exact: [4, 2048, 1, 
2048] + - Exact: [10, 2048, 1, 2000] + - Exact: [8, 2000, 1, 100] + - Exact: [2, 2000, 1, 2000] + - Exact: [16, 2048, 1, 1024] + - Exact: [32, 2000, 1, 2000] + - Exact: [32, 2048, 1, 2048] + - Exact: [2, 2048, 1, 10] + - Exact: [4, 2048, 1, 512] + - Exact: [4, 2048, 1, 10] + - Exact: [16, 2048, 1, 100] + - Exact: [4, 2000, 1, 500] + - Exact: [10, 2000, 1, 500] + - Exact: [32, 2000, 1, 512] + - Exact: [2, 2000, 1, 1024] + - Exact: [2, 2000, 1, 512] + - Exact: [4, 2048, 1, 1024] + - Exact: [8, 2048, 1, 500] + - Exact: [4, 2048, 1, 2000] + - Exact: [8, 2000, 1, 2000] + - Exact: [4, 2000, 1, 1024] + - Exact: [32, 2000, 1, 100] + - Exact: [2, 2048, 1, 100] + - Exact: [8, 2048, 1, 10] + - Exact: [2, 2048, 1, 2048] + - Exact: [10, 2000, 1, 2048] + - Exact: [16, 2048, 1, 2000] + - Exact: [10, 2048, 1, 512] + - Exact: [16, 2048, 1, 512] + - Exact: [2, 2000, 1, 10] + - Exact: [4, 2000, 1, 100] + - Exact: [16, 2000, 1, 512] + - Exact: [32, 2048, 1, 10] + - Exact: [10, 2048, 1, 10] + - Exact: [4, 2000, 1, 512] + - Exact: [16, 2048, 1, 10] + - Exact: [32, 2000, 1, 10] + - Exact: [10, 2000, 1, 100] + - Exact: [2, 2048, 1, 500] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1, 1, 500000] + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 2, 1, 500000] + - Exact: [512, 1, 1, 500000] + - Exact: [1024, 
8, 1, 500000] + - Exact: [1024, 4, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 2, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [512, 4, 1, 500000] + - Exact: [1024, 20, 1, 30522] + - Exact: [49, 512, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 512, 1, 1] + - Exact: [1024, 32, 1, 2] + - Exact: [1024, 32, 1, 1024] + - Exact: [768, 32, 1, 768] + - Exact: [768, 32, 1, 2] + - Exact: [768, 64, 1, 768] + - Exact: [768, 64, 1, 2] + - Exact: [1024, 20, 1, 1024] + - Exact: [1024, 80, 1, 1024] + - Exact: [32, 200, 1, 1] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 4, 1, 2] + - Exact: [768, 16, 1, 768] + - Exact: [768, 16, 1, 2] + - Exact: [768, 8, 1, 768] + - Exact: [1024, 6, 1, 1024] + - Exact: [1024, 6, 1, 2] + - Exact: [1024, 8, 1, 1024] + - Exact: [4, 704, 1, 1280] + - Exact: [512, 4, 1, 512] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [64, 64, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [64, 1024, 1, 3328] + - Exact: [128, 1, 1, 1408] + - Exact: [1024, 64, 1, 1280] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 256, 1, 128] + - Exact: [256, 256, 1, 3328] + - Exact: 
[64, 1024, 1, 128] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [512, 32, 1, 512] + - Exact: [64, 64, 1, 3328] + - Exact: [512, 1, 1, 512] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 4, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 256] + - Exact: [4, 704, 1, 256] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [4, 448, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 1, 1, 1216] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [64, 704, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [448, 4, 1, 1280] + - Exact: [1024, 2, 1, 512] + - Exact: [256, 64, 1, 3328] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 256] + - Exact: [64, 704, 1, 3328] + - Exact: [256, 256, 1, 256] + - Exact: [4, 1024, 1, 3328] + - Exact: [4, 704, 1, 128] + - Exact: [64, 128, 1, 128] + - Exact: [704, 4, 1, 128] + - Exact: [64, 448, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 64, 1, 1280] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [64, 4, 1, 128] + - Exact: [256, 64, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [512, 2, 1, 512] + - Exact: [64, 128, 1, 256] + - Exact: [1024, 4, 1, 1280] + - Exact: [35, 700, 1, 
2048] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [4, 256, 1, 128] + - Exact: [512, 16, 1, 512] + - Exact: [1024, 64, 1, 256] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 64, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [128, 4, 1, 128] + - Exact: [128, 1, 1, 1024] + - Exact: [4, 128, 1, 3328] + - Exact: [256, 256, 1, 128] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [256, 4, 1, 128] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [4, 448, 1, 1280] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [35, 700, 1, 2560] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [1024, 4, 1, 512] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [1024, 1, 1, 512] + - Exact: [128, 128, 1, 256] + - Exact: [64, 12, 5040, 12] + - Exact: [64, 17, 3632, 17] + - Exact: [64, 19, 3264, 19] + - Exact: [64, 9, 6544, 9] + - Exact: [64, 7, 8192, 7] + - Exact: [64, 16, 3840, 16] + - Exact: [64, 8, 7280, 8] + - Exact: [64, 27, 2336, 27] + - Exact: [64, 11, 5456, 11] + - Exact: [64, 21, 2976, 21] + - Exact: [64, 10, 5952, 10] + - Exact: [64, 14, 4368, 14] + - Exact: [64, 25, 2512, 25] + - Exact: [64, 13, 4672, 13] + - Exact: [64, 15, 4096, 15] + - Exact: [64, 29, 2176, 29] + - Exact: [64, 18, 3440, 18] + - Exact: [64, 23, 2720, 23] + - Exact: [8, 500, 1, 512] + 
- Exact: [32, 512, 1, 512] + - Exact: [8, 512, 1, 500] + - Exact: [8, 500, 1, 1024] + - Exact: [64, 1024, 1, 100] + - Exact: [64, 1024, 1, 500] + - Exact: [64, 1024, 1, 1024] + - Exact: [2, 500, 1, 2048] + - Exact: [16, 512, 1, 10] + - Exact: [8, 512, 1, 10] + - Exact: [16, 500, 1, 2048] + - Exact: [10, 100, 1, 500] + - Exact: [16, 100, 1, 10] + - Exact: [2, 100, 1, 2000] + - Exact: [256, 100, 1, 2048] + - Exact: [2, 512, 1, 512] + - Exact: [2, 100, 1, 10] + - Exact: [200, 100, 1, 100] + - Exact: [500, 100, 1, 100] + - Exact: [4, 100, 1, 10] + - Exact: [32, 100, 1, 512] + - Exact: [16, 1024, 1, 512] + - Exact: [4, 1024, 1, 1024] + - Exact: [4, 512, 1, 10] + - Exact: [128, 100, 1, 10] + - Exact: [4, 512, 1, 2048] + - Exact: [10, 1024, 1, 2000] + - Exact: [256, 100, 1, 100] + - Exact: [64, 1024, 1, 2048] + - Exact: [16, 1024, 1, 100] + - Exact: [32, 1024, 1, 1024] + - Exact: [8, 100, 1, 500] + - Exact: [10, 512, 1, 512] + - Exact: [8, 500, 1, 10] + - Exact: [16, 1024, 1, 10] + - Exact: [16, 512, 1, 2048] + - Exact: [128, 512, 1, 2048] + - Exact: [128, 512, 1, 100] + - Exact: [64, 500, 1, 2048] + - Exact: [500, 100, 1, 10] + - Exact: [64, 100, 1, 2048] + - Exact: [64, 100, 1, 10] + - Exact: [16, 512, 1, 500] + - Exact: [200, 100, 1, 2000] + - Exact: [2, 100, 1, 512] + - Exact: [32, 512, 1, 100] + - Exact: [16, 512, 1, 1024] + - Exact: [4, 1024, 1, 512] + - Exact: [2, 500, 1, 500] + - Exact: [32, 100, 1, 100] + - Exact: [100, 500, 1, 2000] + - Exact: [10, 512, 1, 10] + - Exact: [100, 500, 1, 2048] + - Exact: [2, 100, 1, 1024] + - Exact: [32, 512, 1, 1024] + - Exact: [256, 100, 1, 1024] + - Exact: [128, 100, 1, 100] + - Exact: [32, 512, 1, 10] + - Exact: [128, 100, 1, 1024] + - Exact: [16, 500, 1, 2000] + - Exact: [64, 500, 1, 500] + - Exact: [128, 512, 1, 1024] + - Exact: [128, 512, 1, 2000] + - Exact: [2, 512, 1, 10] + - Exact: [10, 512, 1, 500] + - Exact: [4, 1024, 1, 2000] + - Exact: [256, 100, 1, 2000] + - Exact: [100, 100, 1, 10] + - Exact: [128, 512, 1, 10] + - 
Exact: [256, 100, 1, 500] + - Exact: [64, 100, 1, 512] + - Exact: [64, 512, 1, 500] + - Exact: [8, 100, 1, 512] + - Exact: [32, 100, 1, 500] + - Exact: [32, 500, 1, 2048] + - Exact: [128, 500, 1, 2000] + - Exact: [8, 1024, 1, 10] + - Exact: [2, 500, 1, 100] + - Exact: [10, 500, 1, 512] + - Exact: [32, 500, 1, 500] + - Exact: [100, 500, 1, 100] + - Exact: [10, 1024, 1, 512] + - Exact: [512, 100, 1, 512] + - Exact: [4, 500, 1, 500] + - Exact: [64, 100, 1, 1024] + - Exact: [2, 500, 1, 2000] + - Exact: [32, 512, 1, 2048] + - Exact: [10, 100, 1, 2000] + - Exact: [4, 100, 1, 512] + - Exact: [2, 512, 1, 2048] + - Exact: [100, 100, 1, 2000] + - Exact: [10, 500, 1, 500] + - Exact: [2, 100, 1, 2048] + - Exact: [32, 100, 1, 2048] + - Exact: [16, 100, 1, 1024] + - Exact: [2, 500, 1, 10] + - Exact: [500, 100, 1, 2048] + - Exact: [16, 1024, 1, 2000] + - Exact: [10, 1024, 1, 1024] + - Exact: [500, 100, 1, 512] + - Exact: [32, 512, 1, 500] + - Exact: [100, 500, 1, 512] + - Exact: [8, 500, 1, 2000] + - Exact: [4, 100, 1, 1024] + - Exact: [2, 500, 1, 1024] + - Exact: [100, 500, 1, 1024] + - Exact: [32, 100, 1, 1024] + - Exact: [64, 100, 1, 2000] + - Exact: [64, 500, 1, 10] + - Exact: [64, 500, 1, 512] + - Exact: [10, 100, 1, 1024] + - Exact: [16, 512, 1, 100] + - Exact: [4, 100, 1, 2000] + - Exact: [2, 512, 1, 1024] + - Exact: [64, 512, 1, 1024] + - Exact: [512, 100, 1, 2048] + - Exact: [32, 100, 1, 2000] + - Exact: [4, 512, 1, 500] + - Exact: [4, 500, 1, 1024] + - Exact: [32, 100, 1, 10] + - Exact: [10, 1024, 1, 2048] + - Exact: [8, 500, 1, 100] + - Exact: [200, 100, 1, 1024] + - Exact: [16, 100, 1, 100] + - Exact: [8, 1024, 1, 2000] + - Exact: [4, 512, 1, 100] + - Exact: [16, 500, 1, 100] + - Exact: [8, 1024, 1, 2048] + - Exact: [16, 1024, 1, 2048] + - Exact: [64, 512, 1, 100] + - Exact: [2, 100, 1, 500] + - Exact: [2, 500, 1, 512] + - Exact: [128, 500, 1, 1024] + - Exact: [10, 100, 1, 10] + - Exact: [64, 1024, 1, 10] + - Exact: [500, 100, 1, 500] + - Exact: [2, 512, 1, 100] + - 
Exact: [16, 100, 1, 500] + - Exact: [128, 100, 1, 500] + - Exact: [512, 100, 1, 1024] + - Exact: [16, 100, 1, 2000] + - Exact: [10, 512, 1, 100] + - Exact: [8, 512, 1, 100] + - Exact: [128, 100, 1, 2000] + - Exact: [2, 1024, 1, 2000] + - Exact: [100, 512, 1, 512] + - Exact: [32, 1024, 1, 2000] + - Exact: [128, 500, 1, 100] + - Exact: [100, 100, 1, 100] + - Exact: [8, 512, 1, 1024] + - Exact: [200, 100, 1, 500] + - Exact: [2, 1024, 1, 2048] + - Exact: [512, 100, 1, 2000] + - Exact: [16, 512, 1, 2000] + - Exact: [64, 500, 1, 1024] + - Exact: [10, 512, 1, 1024] + - Exact: [512, 100, 1, 100] + - Exact: [8, 100, 1, 1024] + - Exact: [10, 100, 1, 100] + - Exact: [10, 500, 1, 2000] + - Exact: [500, 100, 1, 2000] + - Exact: [100, 512, 1, 2000] + - Exact: [64, 1024, 1, 512] + - Exact: [32, 500, 1, 100] + - Exact: [10, 100, 1, 2048] + - Exact: [64, 100, 1, 100] + - Exact: [2, 1024, 1, 100] + - Exact: [64, 500, 1, 2000] + - Exact: [8, 512, 1, 512] + - Exact: [8, 512, 1, 2048] + - Exact: [100, 100, 1, 1024] + - Exact: [8, 100, 1, 2000] + - Exact: [2, 1024, 1, 1024] + - Exact: [16, 512, 1, 512] + - Exact: [32, 500, 1, 512] + - Exact: [32, 500, 1, 1024] + - Exact: [32, 500, 1, 10] + - Exact: [4, 1024, 1, 500] + - Exact: [256, 100, 1, 512] + - Exact: [8, 1024, 1, 500] + - Exact: [4, 1024, 1, 100] + - Exact: [100, 500, 1, 500] + - Exact: [2, 1024, 1, 500] + - Exact: [64, 100, 1, 500] + - Exact: [2, 512, 1, 500] + - Exact: [10, 1024, 1, 500] + - Exact: [128, 500, 1, 512] + - Exact: [10, 500, 1, 2048] + - Exact: [128, 512, 1, 512] + - Exact: [64, 512, 1, 10] + - Exact: [32, 500, 1, 2000] + - Exact: [100, 100, 1, 2048] + - Exact: [200, 100, 1, 512] + - Exact: [200, 100, 1, 2048] + - Exact: [8, 100, 1, 10] + - Exact: [100, 100, 1, 500] + - Exact: [100, 500, 1, 10] + - Exact: [10, 500, 1, 1024] + - Exact: [256, 100, 1, 10] + - Exact: [10, 512, 1, 2048] + - Exact: [2, 1024, 1, 512] + - Exact: [4, 500, 1, 2048] + - Exact: [100, 512, 1, 100] + - Exact: [16, 500, 1, 512] + - Exact: [10, 
1024, 1, 100] + - Exact: [8, 1024, 1, 100] + - Exact: [64, 1024, 1, 2000] + - Exact: [10, 100, 1, 512] + - Exact: [4, 500, 1, 2000] + - Exact: [4, 100, 1, 100] + - Exact: [32, 1024, 1, 512] + - Exact: [8, 512, 1, 2000] + - Exact: [100, 100, 1, 512] + - Exact: [2, 512, 1, 2000] + - Exact: [16, 500, 1, 10] + - Exact: [10, 500, 1, 100] + - Exact: [4, 100, 1, 500] + - Exact: [64, 500, 1, 100] + - Exact: [2, 100, 1, 100] + - Exact: [10, 512, 1, 2000] + - Exact: [8, 500, 1, 500] + - Exact: [4, 500, 1, 512] + - Exact: [10, 500, 1, 10] + - Exact: [64, 512, 1, 2000] + - Exact: [32, 512, 1, 2000] + - Exact: [128, 500, 1, 2048] + - Exact: [4, 512, 1, 512] + - Exact: [16, 500, 1, 1024] + - Exact: [10, 1024, 1, 10] + - Exact: [16, 500, 1, 500] + - Exact: [500, 100, 1, 1024] + - Exact: [16, 100, 1, 512] + - Exact: [64, 512, 1, 2048] + - Exact: [32, 1024, 1, 10] + - Exact: [8, 1024, 1, 512] + - Exact: [4, 1024, 1, 2048] + - Exact: [128, 500, 1, 500] + - Exact: [100, 512, 1, 1024] + - Exact: [16, 1024, 1, 500] + - Exact: [128, 100, 1, 2048] + - Exact: [100, 512, 1, 500] + - Exact: [8, 1024, 1, 1024] + - Exact: [4, 500, 1, 10] + - Exact: [128, 500, 1, 10] + - Exact: [32, 1024, 1, 100] + - Exact: [8, 500, 1, 2048] + - Exact: [16, 1024, 1, 1024] + - Exact: [200, 100, 1, 10] + - Exact: [512, 100, 1, 500] + - Exact: [4, 500, 1, 100] + - Exact: [8, 100, 1, 2048] + - Exact: [512, 100, 1, 10] + - Exact: [4, 512, 1, 1024] + - Exact: [32, 1024, 1, 2048] + - Exact: [128, 100, 1, 512] + - Exact: [32, 1024, 1, 500] + - Exact: [4, 1024, 1, 10] + - Exact: [100, 512, 1, 10] + - Exact: [8, 100, 1, 100] + - Exact: [128, 512, 1, 500] + - Exact: [16, 100, 1, 2048] + - Exact: [2, 1024, 1, 10] + - Exact: [4, 100, 1, 2048] + - Exact: [4, 512, 1, 2000] + - Exact: [1024, 29, 1, 1024] + - Exact: [1024, 1, 1, 21] + - Exact: [1024, 49, 1, 1024] + - Exact: [1024, 35, 1, 1024] + - Exact: [1024, 24, 1, 1024] + - Exact: [1024, 21, 1, 1024] + - Exact: [1024, 1, 1, 14] + - Exact: [1024, 91, 1, 1024] + - Exact: 
[1024, 14, 1, 1024] + - Exact: [1024, 25, 1, 1024] + - Exact: [1024, 27, 1, 1024] + - Exact: [1024, 50, 1, 1024] + - Exact: [1024, 64, 1, 1024] + - Exact: [1024, 13, 1, 1024] + - Exact: [1024, 63, 1, 1024] + - Exact: [1024, 86, 1, 1024] + - Exact: [1024, 1, 1, 13] + - Exact: [289, 192, 1, 1344] + - Exact: [196, 128, 1, 800] + - Exact: [64, 512, 1, 1344] + - Exact: [289, 224, 1, 1568] + - Exact: [64, 256, 1, 1536] + - Exact: [289, 160, 1, 1120] + - Exact: [64, 256, 1, 1152] + - Exact: [289, 224, 1, 1344] + - Exact: [289, 192, 1, 896] + - Exact: [784, 16, 32, 192] + - Exact: [49, 128, 1, 1200] + - Exact: [289, 128, 1, 896] + - Exact: [1001, 32, 1, 1024] + - Exact: [64, 448, 1, 1152] + - Exact: [1001, 32, 1, 2048] + - Exact: [289, 192, 1, 1120] + - Exact: [64, 320, 1, 1728] + - Exact: [289, 96, 1, 864] + - Exact: [196, 64, 1, 800] + - Exact: [784, 32, 1, 400] + - Exact: [64, 320, 1, 2880] + - Exact: [1001, 32, 1, 1536] + - Exact: [64, 384, 1, 1152] + - Exact: [64, 192, 1, 1728] + - Exact: [1001, 64, 1, 1536] + - Exact: [1001, 64, 1, 2048] + - Exact: [1024, 64, 1, 4096] + - Exact: [64, 10, 448, 10] + - Exact: [64, 18, 648, 18] + - Exact: [64, 18, 1720, 18] + - Exact: [64, 19, 1632, 19] + - Exact: [64, 21, 1472, 21] + - Exact: [64, 23, 64, 23] + - Exact: [64, 26, 56, 26] + - Exact: [1024, 1, 1, 2] + - Exact: [1024, 1, 1, 1024] + - Exact: [64, 27, 56, 26] + - Exact: [64, 17, 1, 17] + - Exact: [64, 30, 1, 30] + - Exact: [64, 31, 1, 30] + - Exact: [64, 31, 1, 31] + - Exact: [64, 14, 1, 14] + - Exact: [64, 14, 1, 15] + - Exact: [64, 15, 1, 15] + - Exact: [64, 15, 1, 17] + - Exact: [100, 512, 1, 2048] + - Exact: [1024, 1, 1, 1600] + - Exact: [1024, 1, 1, 200] + - Exact: [1, 200, 1, 1] + - Exact: [1, 512, 1, 1] + - Exact: [67, 512, 1, 2048] + - Exact: [74, 512, 1, 2048] + - Exact: [64, 3, 512, 3] + - Exact: [64, 5, 512, 5] + - Exact: [64, 9, 512, 9] + - Exact: [64, 512, 1, 512] + - Exact: [25, 128, 120, 256] + - Exact: [25, 128, 139, 256] + - Exact: [25, 128, 160, 256] + - 
Exact: [25, 128, 18, 256] + - Exact: [25, 128, 19, 256] + - Exact: [9, 128, 120, 256] + - Exact: [9, 128, 139, 256] + - Exact: [9, 128, 160, 256] + - Exact: [9, 128, 18, 256] + - Exact: [9, 128, 19, 256] + - Exact: [1, 256, 1, 1152] + - Exact: [100, 512, 1, 2304] + - Exact: [25, 256, 1, 1152] + - Exact: [9, 256, 1, 1152] + - Exact: [1024, 77, 1, 1024] + - Exact: [1024, 10, 1, 2] + - Exact: [1024, 10, 1, 1024] + - Exact: [1024, 39, 1, 2] + - Exact: [1024, 39, 1, 1024] + - Exact: [1024, 40, 1, 2] + - Exact: [1024, 40, 1, 1024] + - Exact: [1024, 41, 1, 2] + - Exact: [1024, 41, 1, 1024] + - Exact: [1024, 5, 1, 2] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 8, 1, 2] + - Exact: [1024, 9, 1, 2] + - Exact: [1024, 9, 1, 1024] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 14, 10880, 15] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 15, 7680, 17] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 17, 6144, 21] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 24, 4736, 34] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 31, 2048, 30] + - Exact: [64, 31, 2048, 31] + - Exact: [128, 128, 1, 64] + - Exact: [64, 5, 1, 5] + - Exact: [32, 33, 1, 33] + - Exact: [64, 5, 960, 5] + - Exact: [74, 960, 1, 2048] + - Exact: [128, 27, 32768, 27] + - Exact: [1024, 16, 1, 1024] + - Exact: [1024, 16, 1, 2] + - Exact: [1024, 64, 1, 2] + - Exact: [1024, 80, 1, 2] + - Exact: [1024, 82, 1, 1024] + - Exact: [1024, 82, 1, 2] + - Exact: [1024, 12, 1, 1024] + - Exact: [1024, 12, 1, 2] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [196, 256, 1, 2304] + - Exact: [850, 3, 2, 256] + - Exact: [850, 12, 2, 256] + - Exact: [805, 12, 2, 256] + - Exact: [805, 3, 2, 256] + - Exact: [768, 3, 2, 256] + - Exact: [768, 12, 2, 256] + - Exact: [864, 12, 2, 256] + - Exact: [864, 3, 2, 256] + - Exact: [247, 3, 2, 256] + - Exact: [216, 3, 2, 256] + - 
Exact: [950, 3, 2, 256] + - Exact: [187, 12, 2, 256] + - Exact: [176, 12, 2, 256] + - Exact: [247, 12, 2, 256] + - Exact: [187, 3, 2, 256] + - Exact: [228, 12, 2, 256] + - Exact: [221, 12, 2, 256] + - Exact: [176, 3, 2, 256] + - Exact: [950, 12, 2, 256] + - Exact: [192, 12, 2, 256] + - Exact: [228, 3, 2, 256] + - Exact: [221, 3, 2, 256] + - Exact: [192, 3, 2, 256] + - Exact: [216, 12, 2, 256] + - Exact: [2, 6, 1, 1024] + - Exact: [1024, 20, 1, 2] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml new file mode 100644 index 000000000..3d52c693c --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml @@ -0,0 +1,4923 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 
16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2048, 2048, 1, 512] + - Exact: [1600, 1024, 1, 512] + - Exact: [4096, 1024, 1, 4096] + - Exact: [4096, 1024, 1, 2048] + - Exact: [3072, 768, 1, 4096] + - Exact: [3072, 1024, 1, 2048] + - Exact: [3072, 1024, 1, 3072] + - Exact: [3072, 1024, 1, 512] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [448, 5056, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 
256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [5888, 2944, 1, 128] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [1024, 4288, 1, 128] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [6784, 5888, 1, 3328] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [6784, 2368, 1, 1280] + - Exact: [3584, 2944, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 
1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [2048, 7133, 1, 2048] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 1408, 1, 128] + - Exact: [4288, 5056, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 128] + - Exact: [1408, 5056, 1, 1280] + - Exact: [5888, 1856, 1, 128] + - Exact: [2368, 1024, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [1856, 4288, 1, 128] + - Exact: [2368, 
2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [2944, 4288, 1, 3328] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [1856, 2368, 1, 128] + - Exact: [4096, 7133, 1, 4096] + - Exact: [3584, 704, 1, 1280] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [5056, 5056, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [2368, 5056, 1, 128] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 
1, 1280] + - Exact: [5888, 3584, 1, 128] + - Exact: [1856, 1856, 1, 128] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [3072, 7435, 1, 1024] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: 
[2944, 2944, 1, 128] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [1024, 2368, 1, 128] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: 
[256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 4288, 1, 3328] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2560, 7133, 1, 2560] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [704, 4288, 1, 128] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1408, 5888, 1, 128] + - Exact: [1408, 1024, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - 
Exact: [6784, 5056, 1, 128] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [1760, 7133, 1, 1760] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 6784, 1, 128] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [2368, 6784, 1, 128] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [1856, 5056, 1, 128] + - 
Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [7680, 5481, 1, 2560] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5888, 1408, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [4288, 1024, 1, 128] + - Exact: [196, 256, 256, 1024] + - Exact: [784, 512, 256, 128] + - Exact: [784, 128, 128, 512] + - Exact: [3136, 256, 256, 64] + - Exact: [784, 128, 256, 512] + - Exact: [196, 256, 128, 1024] + - Exact: [3136, 256, 128, 64] + - Exact: [784, 512, 128, 128] + - Exact: [196, 1024, 128, 256] + - Exact: [196, 1024, 256, 256] + - Exact: [5329, 160, 64, 64] + - Exact: [1225, 384, 64, 192] + - Exact: [289, 1024, 64, 256] + - Exact: [1225, 384, 64, 64] + - Exact: [1225, 384, 64, 96] + - Exact: [289, 1024, 64, 384] + 
- Exact: [289, 1024, 64, 192] + - Exact: [289, 1024, 64, 128] + - Exact: [4096, 1024, 1, 2984] + - Exact: [1024, 4096, 1, 3437] + - Exact: [1024, 4096, 1, 3235] + - Exact: [4096, 1024, 1, 4032] + - Exact: [1024, 4096, 1, 3334] + - Exact: [4096, 1024, 1, 3288] + - Exact: [1024, 4096, 1, 3515] + - Exact: [4096, 1024, 1, 3437] + - Exact: [1024, 4096, 1, 3259] + - Exact: [1024, 4096, 1, 3384] + - Exact: [4096, 1024, 1, 3458] + - Exact: [1024, 4096, 1, 3412] + - Exact: [1024, 4096, 1, 3529] + - Exact: [1024, 4096, 1, 4032] + - Exact: [4096, 1024, 1, 3999] + - Exact: [1024, 4096, 1, 3079] + - Exact: [1024, 4096, 1, 3876] + - Exact: [1024, 4096, 1, 3450] + - Exact: [1024, 4096, 1, 3256] + - Exact: [4096, 1024, 1, 3403] + - Exact: [1024, 4096, 1, 3359] + - Exact: [4096, 1024, 1, 3549] + - Exact: [4096, 1024, 1, 3176] + - Exact: [1024, 4096, 1, 3504] + - Exact: [4096, 1024, 1, 3314] + - Exact: [4096, 1024, 1, 3183] + - Exact: [1024, 4096, 1, 3209] + - Exact: [1024, 4096, 1, 3720] + - Exact: [1024, 4096, 1, 3859] + - Exact: [1024, 33708, 1, 4059] + - Exact: [4096, 1024, 1, 3477] + - Exact: [4096, 1024, 1, 3233] + - Exact: [4096, 1024, 1, 3409] + - Exact: [4096, 1024, 1, 3564] + - Exact: [4096, 1024, 1, 3190] + - Exact: [1024, 4096, 1, 3288] + - Exact: [4096, 1024, 1, 3451] + - Exact: [1024, 4096, 1, 3348] + - Exact: [1024, 4096, 1, 3465] + - Exact: [1024, 33708, 1, 4032] + - Exact: [1024, 33708, 1, 3840] + - Exact: [4096, 1024, 1, 3391] + - Exact: [1024, 4096, 1, 3530] + - Exact: [4096, 1024, 1, 3209] + - Exact: [1024, 4096, 1, 3457] + - Exact: [1024, 4096, 1, 3386] + - Exact: [4096, 1024, 1, 3350] + - Exact: [1024, 4096, 1, 3184] + - Exact: [1024, 4096, 1, 3093] + - Exact: [1024, 4096, 1, 3400] + - Exact: [1024, 4096, 1, 3214] + - Exact: [4096, 1024, 1, 3406] + - Exact: [1024, 4096, 1, 3565] + - Exact: [4096, 1024, 1, 3536] + - Exact: [1024, 4096, 1, 3183] + - Exact: [1024, 4096, 1, 3462] + - Exact: [4096, 1024, 1, 3130] + - Exact: [4096, 1024, 1, 3381] + - Exact: [4096, 
1024, 1, 3298] + - Exact: [1024, 4096, 1, 3292] + - Exact: [4096, 1024, 1, 3289] + - Exact: [1024, 4096, 1, 3379] + - Exact: [1024, 4096, 1, 3990] + - Exact: [1024, 4096, 1, 3540] + - Exact: [4096, 1024, 1, 3412] + - Exact: [1024, 4096, 1, 3555] + - Exact: [1024, 4096, 1, 3518] + - Exact: [4096, 1024, 1, 3189] + - Exact: [1024, 4096, 1, 3298] + - Exact: [4096, 1024, 1, 3072] + - Exact: [1024, 4096, 1, 3393] + - Exact: [1024, 4096, 1, 3207] + - Exact: [4096, 1024, 1, 3487] + - Exact: [4096, 1024, 1, 3431] + - Exact: [4096, 1024, 1, 3378] + - Exact: [4096, 1024, 1, 3529] + - Exact: [4096, 1024, 1, 3460] + - Exact: [1024, 4096, 1, 3336] + - Exact: [1024, 4096, 1, 3501] + - Exact: [1024, 4096, 1, 3584] + - Exact: [4096, 1024, 1, 2499] + - Exact: [4096, 1024, 1, 3352] + - Exact: [1024, 4096, 1, 3543] + - Exact: [1024, 4096, 1, 3476] + - Exact: [1024, 33708, 1, 3822] + - Exact: [1024, 4096, 1, 3436] + - Exact: [1024, 4096, 1, 3594] + - Exact: [4096, 1024, 1, 3514] + - Exact: [1024, 4096, 1, 3064] + - Exact: [4096, 1024, 1, 3371] + - Exact: [4096, 1024, 1, 3558] + - Exact: [4096, 1024, 1, 3517] + - Exact: [4096, 1024, 1, 3144] + - Exact: [1024, 4096, 1, 3312] + - Exact: [4096, 1024, 1, 3079] + - Exact: [1024, 4096, 1, 3415] + - Exact: [1024, 4096, 1, 3221] + - Exact: [1024, 4096, 1, 3978] + - Exact: [4096, 1024, 1, 3876] + - Exact: [1024, 4096, 1, 3528] + - Exact: [1024, 4096, 1, 3181] + - Exact: [4096, 1024, 1, 3445] + - Exact: [4096, 1024, 1, 3450] + - Exact: [4096, 1024, 1, 3377] + - Exact: [1024, 4096, 1, 3532] + - Exact: [1024, 33708, 1, 3944] + - Exact: [4096, 1024, 1, 3483] + - Exact: [1024, 4096, 1, 3358] + - Exact: [4096, 1024, 1, 3464] + - Exact: [4096, 1024, 1, 3282] + - Exact: [4096, 1024, 1, 3256] + - Exact: [1024, 4096, 1, 3057] + - Exact: [4096, 1024, 1, 3481] + - Exact: [4096, 1024, 1, 3340] + - Exact: [1024, 4096, 1, 3273] + - Exact: [4096, 1024, 1, 3392] + - Exact: [4096, 1024, 1, 3337] + - Exact: [4096, 1024, 1, 3359] + - Exact: [4096, 1024, 1, 3498] + 
- Exact: [4096, 1024, 1, 3169] + - Exact: [1024, 33708, 1, 3859] + - Exact: [1024, 4096, 1, 3103] + - Exact: [4096, 1024, 1, 3900] + - Exact: [1024, 4096, 1, 3442] + - Exact: [1024, 4096, 1, 3248] + - Exact: [1024, 4096, 1, 3351] + - Exact: [4096, 1024, 1, 3593] + - Exact: [1024, 4096, 1, 3780] + - Exact: [1024, 33708, 1, 3681] + - Exact: [4096, 1024, 1, 3374] + - Exact: [1024, 4096, 1, 3557] + - Exact: [4096, 1024, 1, 3906] + - Exact: [4096, 1024, 1, 3504] + - Exact: [1024, 4096, 1, 3270] + - Exact: [4096, 1024, 1, 3098] + - Exact: [4096, 1024, 1, 3216] + - Exact: [1024, 4096, 1, 3550] + - Exact: [4096, 1024, 1, 3449] + - Exact: [1024, 4096, 1, 3403] + - Exact: [1024, 4096, 1, 3523] + - Exact: [1024, 4096, 1, 3486] + - Exact: [1024, 4096, 1, 3564] + - Exact: [1024, 33708, 1, 4005] + - Exact: [4096, 1024, 1, 3296] + - Exact: [1024, 4096, 1, 3263] + - Exact: [1024, 4096, 1, 3130] + - Exact: [1024, 4096, 1, 3295] + - Exact: [1024, 33708, 1, 3925] + - Exact: [1024, 4096, 1, 3378] + - Exact: [4096, 1024, 1, 3720] + - Exact: [4096, 1024, 1, 3399] + - Exact: [4096, 1024, 1, 3543] + - Exact: [4096, 1024, 1, 3497] + - Exact: [4096, 1024, 1, 3594] + - Exact: [1024, 4096, 1, 3144] + - Exact: [1024, 4096, 1, 3975] + - Exact: [4096, 1024, 1, 3205] + - Exact: [1024, 33708, 1, 3995] + - Exact: [1024, 4096, 1, 3392] + - Exact: [1024, 4096, 1, 3055] + - Exact: [1024, 4096, 1, 4026] + - Exact: [4096, 1024, 1, 3557] + - Exact: [4096, 1024, 1, 3515] + - Exact: [4096, 1024, 1, 3486] + - Exact: [4096, 1024, 1, 3457] + - Exact: [1024, 4096, 1, 3511] + - Exact: [4096, 1024, 1, 3138] + - Exact: [1024, 4096, 1, 3339] + - Exact: [1024, 4096, 1, 3939] + - Exact: [4096, 1024, 1, 3500] + - Exact: [4096, 1024, 1, 3395] + - Exact: [4096, 1024, 1, 4020] + - Exact: [4096, 1024, 1, 3942] + - Exact: [4096, 1024, 1, 3349] + - Exact: [1024, 4096, 1, 3322] + - Exact: [4096, 1024, 1, 3452] + - Exact: [1024, 4096, 1, 3417] + - Exact: [1024, 4096, 1, 3526] + - Exact: [4096, 1024, 1, 3485] + - Exact: 
[4096, 1024, 1, 3303] + - Exact: [4096, 1024, 1, 3344] + - Exact: [1024, 4096, 1, 3479] + - Exact: [4096, 1024, 1, 3300] + - Exact: [1024, 4096, 1, 3439] + - Exact: [4096, 1024, 1, 3280] + - Exact: [1024, 4096, 1, 3245] + - Exact: [1024, 4096, 1, 3328] + - Exact: [4096, 1024, 1, 3418] + - Exact: [1024, 4096, 1, 3493] + - Exact: [1024, 4096, 1, 3500] + - Exact: [1024, 4096, 1, 3166] + - Exact: [4096, 1024, 1, 3126] + - Exact: [1024, 4096, 1, 3277] + - Exact: [1024, 4096, 1, 3315] + - Exact: [1024, 4096, 1, 3414] + - Exact: [4096, 1024, 1, 3531] + - Exact: [4096, 1024, 1, 3484] + - Exact: [1024, 4096, 1, 3180] + - Exact: [4096, 1024, 1, 3360] + - Exact: [1024, 33708, 1, 3990] + - Exact: [4096, 1024, 1, 3466] + - Exact: [1024, 4096, 1, 3428] + - Exact: [1024, 4096, 1, 3137] + - Exact: [4096, 1024, 1, 4059] + - Exact: [1024, 4096, 1, 3353] + - Exact: [1024, 4096, 1, 3942] + - Exact: [4096, 1024, 1, 3506] + - Exact: [4096, 1024, 1, 3508] + - Exact: [4096, 1024, 1, 3956] + - Exact: [1024, 4096, 1, 3272] + - Exact: [1024, 4096, 1, 3443] + - Exact: [1024, 4096, 1, 3375] + - Exact: [1024, 4096, 1, 3525] + - Exact: [4096, 1024, 1, 3472] + - Exact: [1024, 4096, 1, 3520] + - Exact: [4096, 1024, 1, 3322] + - Exact: [4096, 1024, 1, 3387] + - Exact: [1024, 33708, 1, 3939] + - Exact: [4096, 1024, 1, 3345] + - Exact: [4096, 1024, 1, 2967] + - Exact: [1024, 4096, 1, 3453] + - Exact: [1024, 4096, 1, 3640] + - Exact: [4096, 1024, 1, 3291] + - Exact: [1024, 4096, 1, 3350] + - Exact: [4096, 1024, 1, 3417] + - Exact: [1024, 4096, 1, 3467] + - Exact: [1024, 4096, 1, 3491] + - Exact: [1024, 4096, 1, 3822] + - Exact: [4096, 1024, 1, 3292] + - Exact: [1024, 4096, 1, 3231] + - Exact: [1024, 4096, 1, 3364] + - Exact: [1024, 4096, 1, 3995] + - Exact: [1024, 4096, 1, 3545] + - Exact: [1024, 4096, 1, 3186] + - Exact: [4096, 1024, 1, 3432] + - Exact: [4096, 1024, 1, 3367] + - Exact: [4096, 1024, 1, 3503] + - Exact: [1024, 4096, 1, 3095] + - Exact: [4096, 1024, 1, 3465] + - Exact: [1024, 4096, 1, 
3402] + - Exact: [4096, 1024, 1, 3140] + - Exact: [4096, 1024, 1, 3424] + - Exact: [4096, 1024, 1, 3257] + - Exact: [4096, 1024, 1, 2917] + - Exact: [1024, 33708, 1, 3640] + - Exact: [1024, 4096, 1, 3456] + - Exact: [1024, 4096, 1, 3014] + - Exact: [4096, 1024, 1, 3372] + - Exact: [1024, 4096, 1, 3294] + - Exact: [4096, 1024, 1, 3446] + - Exact: [1024, 4096, 1, 3389] + - Exact: [4096, 1024, 1, 3259] + - Exact: [4096, 1024, 1, 3544] + - Exact: [4096, 1024, 1, 3479] + - Exact: [4096, 1024, 1, 3542] + - Exact: [4096, 1024, 1, 3321] + - Exact: [1024, 4096, 1, 3147] + - Exact: [1024, 4096, 1, 3944] + - Exact: [4096, 1024, 1, 3870] + - Exact: [1024, 4096, 1, 3308] + - Exact: [4096, 1024, 1, 3401] + - Exact: [1024, 4096, 1, 3395] + - Exact: [1024, 4096, 1, 3563] + - Exact: [1024, 33708, 1, 3870] + - Exact: [4096, 1024, 1, 3494] + - Exact: [1024, 4096, 1, 3271] + - Exact: [1024, 33708, 1, 3910] + - Exact: [1024, 4096, 1, 3287] + - Exact: [1024, 33708, 1, 3860] + - Exact: [4096, 1024, 1, 3341] + - Exact: [1024, 4096, 1, 3136] + - Exact: [4096, 1024, 1, 3439] + - Exact: [1024, 4096, 1, 3751] + - Exact: [1024, 4096, 1, 3301] + - Exact: [4096, 1024, 1, 3468] + - Exact: [1024, 4096, 1, 3416] + - Exact: [4096, 1024, 1, 3163] + - Exact: [1024, 4096, 1, 3230] + - Exact: [1024, 4096, 1, 3581] + - Exact: [4096, 1024, 1, 3463] + - Exact: [1024, 4096, 1, 3478] + - Exact: [4096, 1024, 1, 3262] + - Exact: [1024, 4096, 1, 3438] + - Exact: [1024, 4096, 1, 3244] + - Exact: [1024, 4096, 1, 3445] + - Exact: [4096, 1024, 1, 3328] + - Exact: [1024, 4096, 1, 3492] + - Exact: [4096, 1024, 1, 3211] + - Exact: [1024, 4096, 1, 3910] + - Exact: [1024, 4096, 1, 3314] + - Exact: [4096, 1024, 1, 3859] + - Exact: [4096, 1024, 1, 3383] + - Exact: [1024, 4096, 1, 3409] + - Exact: [1024, 4096, 1, 4020] + - Exact: [4096, 1024, 1, 3530] + - Exact: [4096, 1024, 1, 3411] + - Exact: [1024, 4096, 1, 3566] + - Exact: [4096, 1024, 1, 3493] + - Exact: [4096, 1024, 1, 3184] + - Exact: [1024, 4096, 1, 3072] + - 
Exact: [1024, 4096, 1, 3431] + - Exact: [4096, 1024, 1, 3306] + - Exact: [1024, 4096, 1, 3352] + - Exact: [4096, 1024, 1, 3295] + - Exact: [1024, 4096, 1, 3517] + - Exact: [4096, 1024, 1, 3426] + - Exact: [4096, 1024, 1, 3385] + - Exact: [4096, 1024, 1, 3572] + - Exact: [4096, 1024, 1, 3459] + - Exact: [1024, 4096, 1, 3374] + - Exact: [4096, 1024, 1, 3166] + - Exact: [4096, 1024, 1, 3093] + - Exact: [4096, 1024, 1, 3523] + - Exact: [4096, 1024, 1, 3413] + - Exact: [1024, 4096, 1, 3996] + - Exact: [1024, 4096, 1, 3452] + - Exact: [4096, 1024, 1, 3232] + - Exact: [4096, 1024, 1, 3400] + - Exact: [4096, 1024, 1, 3334] + - Exact: [1024, 4096, 1, 3345] + - Exact: [1024, 4096, 1, 3538] + - Exact: [1024, 4096, 1, 3466] + - Exact: [4096, 1024, 1, 3315] + - Exact: [4096, 1024, 1, 3214] + - Exact: [1024, 33708, 1, 3900] + - Exact: [1024, 4096, 1, 3367] + - Exact: [1024, 4096, 1, 2917] + - Exact: [1024, 4096, 1, 3544] + - Exact: [4096, 1024, 1, 3414] + - Exact: [4096, 1024, 1, 3565] + - Exact: [1024, 4096, 1, 3512] + - Exact: [1024, 4096, 1, 3191] + - Exact: [1024, 4096, 1, 3289] + - Exact: [4096, 1024, 1, 3290] + - Exact: [1024, 4096, 1, 3211] + - Exact: [1024, 33708, 1, 3969] + - Exact: [4096, 1024, 1, 3566] + - Exact: [1024, 4096, 1, 3459] + - Exact: [1024, 4096, 1, 3372] + - Exact: [4096, 1024, 1, 3339] + - Exact: [4096, 1024, 1, 3425] + - Exact: [4096, 1024, 1, 3388] + - Exact: [1024, 4096, 1, 3531] + - Exact: [4096, 1024, 1, 3286] + - Exact: [4096, 1024, 1, 3462] + - Exact: [1024, 4096, 1, 3388] + - Exact: [4096, 1024, 1, 3165] + - Exact: [4096, 1024, 1, 3304] + - Exact: [1024, 4096, 1, 2736] + - Exact: [4096, 1024, 1, 3397] + - Exact: [1024, 4096, 1, 3311] + - Exact: [1024, 4096, 1, 3394] + - Exact: [4096, 1024, 1, 2736] + - Exact: [1024, 4096, 1, 3559] + - Exact: [4096, 1024, 1, 3180] + - Exact: [1024, 4096, 1, 3480] + - Exact: [4096, 1024, 1, 3318] + - Exact: [4096, 1024, 1, 3213] + - Exact: [1024, 4096, 1, 3286] + - Exact: [4096, 1024, 1, 3471] + - Exact: [1024, 
4096, 1, 3381] + - Exact: [4096, 1024, 1, 3502] + - Exact: [1024, 4096, 1, 3552] + - Exact: [4096, 1024, 1, 3519] + - Exact: [1024, 4096, 1, 3300] + - Exact: [1024, 4096, 1, 3419] + - Exact: [4096, 1024, 1, 4030] + - Exact: [4096, 1024, 1, 3976] + - Exact: [1024, 4096, 1, 3473] + - Exact: [4096, 1024, 1, 3428] + - Exact: [1024, 4096, 1, 3433] + - Exact: [4096, 1024, 1, 3534] + - Exact: [4096, 1024, 1, 3461] + - Exact: [4096, 1024, 1, 3681] + - Exact: [4096, 1024, 1, 3495] + - Exact: [4096, 1024, 1, 3351] + - Exact: [1024, 4096, 1, 4059] + - Exact: [4096, 1024, 1, 3990] + - Exact: [1024, 4096, 1, 3325] + - Exact: [1024, 4096, 1, 3408] + - Exact: [4096, 1024, 1, 3394] + - Exact: [1024, 4096, 1, 3573] + - Exact: [4096, 1024, 1, 3386] + - Exact: [4096, 1024, 1, 3540] + - Exact: [1024, 4096, 1, 3182] + - Exact: [1024, 4096, 1, 3430] + - Exact: [1024, 4096, 1, 3236] + - Exact: [4096, 1024, 1, 2977] + - Exact: [1024, 4096, 1, 3355] + - Exact: [4096, 1024, 1, 3139] + - Exact: [4096, 1024, 1, 3516] + - Exact: [4096, 1024, 1, 3368] + - Exact: [4096, 1024, 1, 3559] + - Exact: [1024, 4096, 1, 3506] + - Exact: [1024, 4096, 1, 3145] + - Exact: [1024, 4096, 1, 3369] + - Exact: [4096, 1024, 1, 3522] + - Exact: [1024, 33708, 1, 3894] + - Exact: [4096, 1024, 1, 3336] + - Exact: [1024, 4096, 1, 3382] + - Exact: [4096, 1024, 1, 3533] + - Exact: [4096, 1024, 1, 4050] + - Exact: [4096, 1024, 1, 3480] + - Exact: [1024, 4096, 1, 3344] + - Exact: [1024, 4096, 1, 3509] + - Exact: [1024, 4096, 1, 3956] + - Exact: [4096, 1024, 1, 3616] + - Exact: [1024, 4096, 1, 3366] + - Exact: [4096, 1024, 1, 2935] + - Exact: [4096, 1024, 1, 3393] + - Exact: [4096, 1024, 1, 3547] + - Exact: [1024, 4096, 1, 3499] + - Exact: [4096, 1024, 1, 3357] + - Exact: [4096, 1024, 1, 3272] + - Exact: [4096, 1024, 1, 3207] + - Exact: [4096, 1024, 1, 3894] + - Exact: [1024, 4096, 1, 3444] + - Exact: [4096, 1024, 1, 3561] + - Exact: [4096, 1024, 1, 3376] + - Exact: [1024, 4096, 1, 3458] + - Exact: [4096, 1024, 1, 3231] + - 
Exact: [1024, 4096, 1, 3505] + - Exact: [4096, 1024, 1, 3277] + - Exact: [1024, 4096, 1, 3391] + - Exact: [1024, 4096, 1, 3536] + - Exact: [1024, 4096, 1, 3063] + - Exact: [1024, 4096, 1, 3189] + - Exact: [1024, 4096, 1, 2505] + - Exact: [4096, 1024, 1, 3454] + - Exact: [1024, 4096, 1, 3405] + - Exact: [1024, 33708, 1, 4050] + - Exact: [4096, 1024, 1, 3520] + - Exact: [1024, 4096, 1, 3487] + - Exact: [1024, 4096, 1, 3558] + - Exact: [4096, 1024, 1, 3297] + - Exact: [1024, 4096, 1, 3483] + - Exact: [1024, 33708, 1, 3751] + - Exact: [4096, 1024, 1, 3380] + - Exact: [1024, 4096, 1, 3380] + - Exact: [1024, 4096, 1, 3396] + - Exact: [1024, 4096, 1, 3497] + - Exact: [1024, 4096, 1, 3502] + - Exact: [1024, 4096, 1, 3138] + - Exact: [4096, 1024, 1, 3939] + - Exact: [1024, 4096, 1, 3303] + - Exact: [1024, 4096, 1, 3418] + - Exact: [1024, 4096, 1, 3224] + - Exact: [4096, 1024, 1, 3978] + - Exact: [1024, 4096, 1, 3472] + - Exact: [4096, 1024, 1, 3353] + - Exact: [4096, 1024, 1, 3362] + - Exact: [1024, 33708, 1, 3978] + - Exact: [1024, 4096, 1, 3432] + - Exact: [1024, 4096, 1, 3139] + - Exact: [1024, 4096, 1, 3341] + - Exact: [1024, 4096, 1, 3494] + - Exact: [1024, 4096, 1, 3969] + - Exact: [1024, 4096, 1, 3163] + - Exact: [4096, 1024, 1, 3405] + - Exact: [4096, 1024, 1, 3453] + - Exact: [1024, 4096, 1, 3411] + - Exact: [1024, 4096, 1, 3527] + - Exact: [4096, 1024, 1, 3474] + - Exact: [1024, 4096, 1, 3572] + - Exact: [4096, 1024, 1, 3293] + - Exact: [4096, 1024, 1, 3247] + - Exact: [1024, 4096, 1, 3425] + - Exact: [1024, 4096, 1, 3354] + - Exact: [4096, 1024, 1, 3382] + - Exact: [4096, 1024, 1, 3236] + - Exact: [1024, 4096, 1, 3519] + - Exact: [4096, 1024, 1, 3354] + - Exact: [4096, 1024, 1, 3501] + - Exact: [4096, 1024, 1, 3266] + - Exact: [1024, 4096, 1, 3368] + - Exact: [1024, 4096, 1, 4030] + - Exact: [1024, 4096, 1, 3533] + - Exact: [4096, 1024, 1, 3332] + - Exact: [4096, 1024, 1, 3584] + - Exact: [1024, 4096, 1, 3616] + - Exact: [4096, 1024, 1, 3265] + - Exact: [4096, 
1024, 1, 3361] + - Exact: [4096, 1024, 1, 3467] + - Exact: [1024, 4096, 1, 3454] + - Exact: [1024, 4096, 1, 3101] + - Exact: [1024, 4096, 1, 3508] + - Exact: [4096, 1024, 1, 3267] + - Exact: [4096, 1024, 1, 3419] + - Exact: [4096, 1024, 1, 3822] + - Exact: [1024, 4096, 1, 3266] + - Exact: [4096, 1024, 1, 3440] + - Exact: [1024, 4096, 1, 3361] + - Exact: [1024, 4096, 1, 3546] + - Exact: [4096, 1024, 1, 3473] + - Exact: [4096, 1024, 1, 3546] + - Exact: [1024, 4096, 1, 3088] + - Exact: [1024, 4096, 1, 3535] + - Exact: [1024, 4096, 1, 3447] + - Exact: [1024, 4096, 1, 3560] + - Exact: [1024, 4096, 1, 3422] + - Exact: [1024, 4096, 1, 3469] + - Exact: [4096, 1024, 1, 3488] + - Exact: [1024, 4096, 1, 3110] + - Exact: [1024, 4096, 1, 3265] + - Exact: [1024, 4096, 1, 3291] + - Exact: [1024, 4096, 1, 3390] + - Exact: [4096, 1024, 1, 3046] + - Exact: [1024, 4096, 1, 3539] + - Exact: [4096, 1024, 1, 3221] + - Exact: [4096, 1024, 1, 3433] + - Exact: [4096, 1024, 1, 3364] + - Exact: [4096, 1024, 1, 3470] + - Exact: [1024, 4096, 1, 3404] + - Exact: [1024, 33708, 1, 3968] + - Exact: [4096, 1024, 1, 3088] + - Exact: [1024, 4096, 1, 3247] + - Exact: [1024, 33708, 1, 3996] + - Exact: [4096, 1024, 1, 3482] + - Exact: [4096, 1024, 1, 3995] + - Exact: [1024, 4096, 1, 3280] + - Exact: [4096, 1024, 1, 3271] + - Exact: [4096, 1024, 1, 3545] + - Exact: [4096, 1024, 1, 3476] + - Exact: [4096, 1024, 1, 3496] + - Exact: [4096, 1024, 1, 3191] + - Exact: [4096, 1024, 1, 3311] + - Exact: [1024, 4096, 1, 3302] + - Exact: [1024, 4096, 1, 3681] + - Exact: [4096, 1024, 1, 3582] + - Exact: [4096, 1024, 1, 3421] + - Exact: [4096, 1024, 1, 3560] + - Exact: [1024, 4096, 1, 3495] + - Exact: [4096, 1024, 1, 3186] + - Exact: [4096, 1024, 1, 3925] + - Exact: [1024, 4096, 1, 3435] + - Exact: [4096, 1024, 1, 3434] + - Exact: [1024, 33708, 1, 4012] + - Exact: [1024, 4096, 1, 3340] + - Exact: [4096, 1024, 1, 3489] + - Exact: [1024, 4096, 1, 3162] + - Exact: [4096, 1024, 1, 3436] + - Exact: [4096, 1024, 1, 3574] + 
- Exact: [4096, 1024, 1, 3469] + - Exact: [1024, 4096, 1, 3410] + - Exact: [1024, 4096, 1, 3216] + - Exact: [4096, 1024, 1, 3095] + - Exact: [4096, 1024, 1, 3448] + - Exact: [1024, 4096, 1, 3176] + - Exact: [4096, 1024, 1, 2918] + - Exact: [1024, 4096, 1, 3424] + - Exact: [4096, 1024, 1, 3402] + - Exact: [4096, 1024, 1, 3145] + - Exact: [1024, 33708, 1, 3976] + - Exact: [4096, 1024, 1, 3518] + - Exact: [4096, 1024, 1, 3110] + - Exact: [4096, 1024, 1, 3325] + - Exact: [1024, 33708, 1, 3999] + - Exact: [4096, 1024, 1, 2985] + - Exact: [1024, 4096, 1, 3371] + - Exact: [4096, 1024, 1, 3342] + - Exact: [4096, 1024, 1, 3141] + - Exact: [4096, 1024, 1, 3532] + - Exact: [1024, 4096, 1, 3169] + - Exact: [1024, 4096, 1, 3514] + - Exact: [4096, 1024, 1, 3780] + - Exact: [1024, 4096, 1, 3098] + - Exact: [1024, 4096, 1, 3449] + - Exact: [1024, 4096, 1, 3222] + - Exact: [1024, 4096, 1, 3346] + - Exact: [4096, 1024, 1, 3064] + - Exact: [4096, 1024, 1, 3511] + - Exact: [4096, 1024, 1, 3384] + - Exact: [4096, 1024, 1, 3356] + - Exact: [1024, 4096, 1, 3796] + - Exact: [4096, 1024, 1, 3427] + - Exact: [4096, 1024, 1, 3390] + - Exact: [4096, 1024, 1, 3573] + - Exact: [4096, 1024, 1, 3456] + - Exact: [1024, 4096, 1, 3360] + - Exact: [1024, 33708, 1, 3977] + - Exact: [1024, 4096, 1, 2918] + - Exact: [4096, 1024, 1, 3975] + - Exact: [4096, 1024, 1, 3525] + - Exact: [4096, 1024, 1, 3398] + - Exact: [4096, 1024, 1, 3640] + - Exact: [4096, 1024, 1, 3014] + - Exact: [1024, 4096, 1, 3446] + - Exact: [1024, 33708, 1, 3796] + - Exact: [4096, 1024, 1, 3101] + - Exact: [4096, 1024, 1, 3563] + - Exact: [4096, 1024, 1, 3539] + - Exact: [4096, 1024, 1, 3182] + - Exact: [1024, 4096, 1, 3468] + - Exact: [4096, 1024, 1, 3312] + - Exact: [4096, 1024, 1, 3215] + - Exact: [4096, 1024, 1, 3910] + - Exact: [1024, 33708, 1, 3780] + - Exact: [1024, 4096, 1, 3290] + - Exact: [1024, 4096, 1, 4012] + - Exact: [1024, 4096, 1, 3385] + - Exact: [1024, 33708, 1, 3975] + - Exact: [4096, 1024, 1, 3996] + - Exact: 
[4096, 1024, 1, 2765] + - Exact: [4096, 1024, 1, 3538] + - Exact: [4096, 1024, 1, 3415] + - Exact: [1024, 4096, 1, 3554] + - Exact: [4096, 1024, 1, 3513] + - Exact: [1024, 4096, 1, 3304] + - Exact: [4096, 1024, 1, 3294] + - Exact: [4096, 1024, 1, 3396] + - Exact: [1024, 4096, 1, 3213] + - Exact: [4096, 1024, 1, 3137] + - Exact: [4096, 1024, 1, 3552] + - Exact: [1024, 4096, 1, 3461] + - Exact: [4096, 1024, 1, 3263] + - Exact: [4096, 1024, 1, 3430] + - Exact: [4096, 1024, 1, 3389] + - Exact: [4096, 1024, 1, 3528] + - Exact: [1024, 4096, 1, 3463] + - Exact: [4096, 1024, 1, 3526] + - Exact: [4096, 1024, 1, 3154] + - Exact: [4096, 1024, 1, 3499] + - Exact: [4096, 1024, 1, 3955] + - Exact: [1024, 4096, 1, 3297] + - Exact: [1024, 4096, 1, 3233] + - Exact: [1024, 4096, 1, 3226] + - Exact: [4096, 1024, 1, 3404] + - Exact: [4096, 1024, 1, 3355] + - Exact: [1024, 4096, 1, 3542] + - Exact: [4096, 1024, 1, 3181] + - Exact: [1024, 4096, 1, 3474] + - Exact: [4096, 1024, 1, 3319] + - Exact: [1024, 4096, 1, 3434] + - Exact: [1024, 4096, 1, 3860] + - Exact: [1024, 4096, 1, 3343] + - Exact: [1024, 4096, 1, 3488] + - Exact: [1024, 4096, 1, 3046] + - Exact: [1024, 4096, 1, 3141] + - Exact: [1024, 4096, 1, 3516] + - Exact: [4096, 1024, 1, 3147] + - Exact: [1024, 4096, 1, 3421] + - Exact: [4096, 1024, 1, 3944] + - Exact: [1024, 4096, 1, 3574] + - Exact: [1024, 4096, 1, 3977] + - Exact: [1024, 4096, 1, 2985] + - Exact: [1024, 4096, 1, 3427] + - Exact: [1024, 4096, 1, 3482] + - Exact: [1024, 4096, 1, 3332] + - Exact: [4096, 1024, 1, 3308] + - Exact: [1024, 4096, 1, 3513] + - Exact: [1024, 4096, 1, 3154] + - Exact: [1024, 4096, 1, 3955] + - Exact: [1024, 4096, 1, 2967] + - Exact: [1024, 33708, 1, 3942] + - Exact: [1024, 4096, 1, 3319] + - Exact: [4096, 1024, 1, 3860] + - Exact: [1024, 4096, 1, 3548] + - Exact: [4096, 1024, 1, 3977] + - Exact: [4096, 1024, 1, 3535] + - Exact: [1024, 4096, 1, 3541] + - Exact: [1024, 33708, 1, 3584] + - Exact: [1024, 4096, 1, 3168] + - Exact: [1024, 4096, 1, 
3448] + - Exact: [4096, 1024, 1, 3343] + - Exact: [1024, 4096, 1, 3357] + - Exact: [4096, 1024, 1, 3510] + - Exact: [4096, 1024, 1, 3369] + - Exact: [4096, 1024, 1, 3379] + - Exact: [1024, 4096, 1, 3276] + - Exact: [1024, 4096, 1, 3363] + - Exact: [4096, 1024, 1, 3055] + - Exact: [1024, 4096, 1, 3524] + - Exact: [4096, 1024, 1, 3057] + - Exact: [1024, 33708, 1, 3720] + - Exact: [1024, 4096, 1, 3383] + - Exact: [1024, 4096, 1, 3522] + - Exact: [1024, 33708, 1, 3956] + - Exact: [1024, 4096, 1, 3481] + - Exact: [4096, 1024, 1, 3562] + - Exact: [4096, 1024, 1, 3299] + - Exact: [1024, 4096, 1, 3262] + - Exact: [1024, 33708, 1, 4026] + - Exact: [4096, 1024, 1, 3168] + - Exact: [1024, 4096, 1, 3999] + - Exact: [1024, 4096, 1, 3549] + - Exact: [4096, 1024, 1, 3375] + - Exact: [1024, 4096, 1, 3496] + - Exact: [1024, 4096, 1, 3190] + - Exact: [4096, 1024, 1, 3273] + - Exact: [1024, 4096, 1, 3406] + - Exact: [4096, 1024, 1, 4005] + - Exact: [4096, 1024, 1, 3555] + - Exact: [4096, 1024, 1, 2505] + - Exact: [1024, 4096, 1, 3460] + - Exact: [1024, 4096, 1, 3579] + - Exact: [1024, 33708, 1, 4030] + - Exact: [1024, 4096, 1, 3510] + - Exact: [1024, 4096, 1, 3282] + - Exact: [1024, 4096, 1, 3377] + - Exact: [1024, 4096, 1, 2935] + - Exact: [1024, 4096, 1, 3498] + - Exact: [1024, 4096, 1, 3593] + - Exact: [4096, 1024, 1, 3226] + - Exact: [1024, 4096, 1, 2499] + - Exact: [1024, 4096, 1, 3296] + - Exact: [1024, 4096, 1, 3455] + - Exact: [1024, 4096, 1, 3399] + - Exact: [1024, 4096, 1, 3205] + - Exact: [4096, 1024, 1, 4026] + - Exact: [1024, 4096, 1, 3484] + - Exact: [4096, 1024, 1, 3302] + - Exact: [1024, 4096, 1, 3485] + - Exact: [1024, 4096, 1, 3126] + - Exact: [1024, 4096, 1, 4050] + - Exact: [4096, 1024, 1, 3235] + - Exact: [1024, 33708, 1, 3955] + - Exact: [1024, 4096, 1, 3342] + - Exact: [1024, 4096, 1, 3397] + - Exact: [4096, 1024, 1, 3491] + - Exact: [1024, 4096, 1, 3503] + - Exact: [1024, 4096, 1, 3140] + - Exact: [4096, 1024, 1, 3121] + - Exact: [4096, 1024, 1, 3276] + - 
Exact: [1024, 4096, 1, 3321] + - Exact: [1024, 4096, 1, 3870] + - Exact: [4096, 1024, 1, 3475] + - Exact: [1024, 4096, 1, 2984] + - Exact: [4096, 1024, 1, 3363] + - Exact: [1024, 4096, 1, 3582] + - Exact: [4096, 1024, 1, 3509] + - Exact: [1024, 4096, 1, 3426] + - Exact: [4096, 1024, 1, 3136] + - Exact: [1024, 4096, 1, 3232] + - Exact: [4096, 1024, 1, 3103] + - Exact: [1024, 4096, 1, 3335] + - Exact: [1024, 4096, 1, 3900] + - Exact: [4096, 1024, 1, 3512] + - Exact: [4096, 1024, 1, 3222] + - Exact: [1024, 4096, 1, 3165] + - Exact: [4096, 1024, 1, 3408] + - Exact: [4096, 1024, 1, 3751] + - Exact: [1024, 4096, 1, 3318] + - Exact: [4096, 1024, 1, 3442] + - Exact: [1024, 4096, 1, 3413] + - Exact: [4096, 1024, 1, 3524] + - Exact: [1024, 4096, 1, 3976] + - Exact: [1024, 4096, 1, 3475] + - Exact: [1024, 4096, 1, 3534] + - Exact: [4096, 1024, 1, 3301] + - Exact: [4096, 1024, 1, 3248] + - Exact: [1024, 4096, 1, 2977] + - Exact: [4096, 1024, 1, 3346] + - Exact: [1024, 4096, 1, 3451] + - Exact: [1024, 4096, 1, 3257] + - Exact: [1024, 4096, 1, 3356] + - Exact: [4096, 1024, 1, 3348] + - Exact: [4096, 1024, 1, 3335] + - Exact: [4096, 1024, 1, 3505] + - Exact: [1024, 4096, 1, 3490] + - Exact: [4096, 1024, 1, 3447] + - Exact: [1024, 4096, 1, 3267] + - Exact: [4096, 1024, 1, 3230] + - Exact: [4096, 1024, 1, 3455] + - Exact: [1024, 4096, 1, 3925] + - Exact: [1024, 4096, 1, 3362] + - Exact: [4096, 1024, 1, 3969] + - Exact: [4096, 1024, 1, 3527] + - Exact: [1024, 4096, 1, 3585] + - Exact: [4096, 1024, 1, 3063] + - Exact: [4096, 1024, 1, 3435] + - Exact: [4096, 1024, 1, 3366] + - Exact: [4096, 1024, 1, 3581] + - Exact: [1024, 33708, 1, 3906] + - Exact: [1024, 4096, 1, 3464] + - Exact: [1024, 4096, 1, 3440] + - Exact: [4096, 1024, 1, 3143] + - Exact: [1024, 4096, 1, 3349] + - Exact: [4096, 1024, 1, 3416] + - Exact: [4096, 1024, 1, 3365] + - Exact: [1024, 4096, 1, 3470] + - Exact: [4096, 1024, 1, 3287] + - Exact: [1024, 4096, 1, 3441] + - Exact: [4096, 1024, 1, 3224] + - Exact: [1024, 
4096, 1, 3387] + - Exact: [1024, 4096, 1, 3547] + - Exact: [4096, 1024, 1, 3478] + - Exact: [4096, 1024, 1, 3548] + - Exact: [1024, 33708, 1, 4020] + - Exact: [4096, 1024, 1, 3320] + - Exact: [1024, 4096, 1, 3906] + - Exact: [4096, 1024, 1, 3796] + - Exact: [1024, 4096, 1, 3306] + - Exact: [1024, 4096, 1, 3401] + - Exact: [1024, 4096, 1, 3215] + - Exact: [4096, 1024, 1, 4012] + - Exact: [1024, 4096, 1, 2765] + - Exact: [4096, 1024, 1, 3554] + - Exact: [4096, 1024, 1, 3423] + - Exact: [1024, 4096, 1, 3562] + - Exact: [1024, 4096, 1, 3489] + - Exact: [4096, 1024, 1, 3358] + - Exact: [4096, 1024, 1, 3270] + - Exact: [1024, 4096, 1, 3293] + - Exact: [1024, 4096, 1, 3376] + - Exact: [4096, 1024, 1, 3245] + - Exact: [4096, 1024, 1, 3541] + - Exact: [4096, 1024, 1, 3443] + - Exact: [4096, 1024, 1, 3438] + - Exact: [4096, 1024, 1, 3244] + - Exact: [1024, 4096, 1, 3365] + - Exact: [1024, 4096, 1, 3299] + - Exact: [1024, 4096, 1, 3471] + - Exact: [1024, 4096, 1, 3398] + - Exact: [4096, 1024, 1, 3162] + - Exact: [1024, 4096, 1, 4005] + - Exact: [4096, 1024, 1, 3579] + - Exact: [1024, 4096, 1, 3121] + - Exact: [4096, 1024, 1, 3441] + - Exact: [4096, 1024, 1, 3422] + - Exact: [4096, 1024, 1, 3444] + - Exact: [1024, 4096, 1, 3337] + - Exact: [4096, 1024, 1, 3550] + - Exact: [1024, 4096, 1, 3477] + - Exact: [4096, 1024, 1, 3490] + - Exact: [4096, 1024, 1, 3585] + - Exact: [1024, 4096, 1, 3143] + - Exact: [1024, 33708, 1, 3876] + - Exact: [1024, 4096, 1, 3320] + - Exact: [1024, 4096, 1, 3423] + - Exact: [1024, 4096, 1, 3894] + - Exact: [4096, 1024, 1, 3410] + - Exact: [1024, 4096, 1, 3561] + - Exact: [4096, 1024, 1, 3492] + - Exact: [36548, 1024, 1, 3712] + - Exact: [4096, 2048, 1, 128] + - Exact: [4096, 3072, 1, 128] + - Exact: [768, 3072, 1, 4096] + - Exact: [768, 30522, 1, 1280] + - Exact: [768, 30522, 1, 320] + - Exact: [768, 30522, 1, 640] + - Exact: [256, 512, 36, 98] + - Exact: [256, 256, 64, 56] + - Exact: [512, 486, 36, 800] + - Exact: [512, 512, 36, 1568] + - Exact: 
[256, 384, 36, 4096] + - Exact: [128, 256, 64, 32] + - Exact: [128, 256, 64, 9] + - Exact: [256, 512, 36, 784] + - Exact: [256, 324, 36, 32] + - Exact: [512, 512, 36, 33] + - Exact: [192, 384, 64, 128] + - Exact: [512, 512, 64, 72] + - Exact: [512, 512, 36, 128] + - Exact: [192, 384, 64, 2304] + - Exact: [384, 256, 64, 450] + - Exact: [384, 256, 64, 2304] + - Exact: [512, 512, 64, 144] + - Exact: [256, 256, 36, 6272] + - Exact: [256, 384, 64, 2304] + - Exact: [512, 512, 36, 66] + - Exact: [128, 256, 64, 800] + - Exact: [192, 256, 36, 512] + - Exact: [256, 512, 64, 200] + - Exact: [256, 512, 64, 25] + - Exact: [128, 256, 36, 1568] + - Exact: [128, 256, 64, 288] + - Exact: [256, 384, 64, 1152] + - Exact: [160, 320, 64, 288] + - Exact: [128, 256, 36, 128] + - Exact: [512, 512, 36, 16] + - Exact: [384, 256, 36, 800] + - Exact: [192, 384, 36, 4096] + - Exact: [256, 384, 64, 576] + - Exact: [512, 512, 64, 14] + - Exact: [512, 512, 36, 8] + - Exact: [512, 486, 64, 128] + - Exact: [256, 256, 36, 128] + - Exact: [256, 256, 36, 32] + - Exact: [192, 256, 64, 288] + - Exact: [256, 256, 36, 16] + - Exact: [128, 256, 36, 3200] + - Exact: [160, 320, 64, 512] + - Exact: [160, 320, 36, 512] + - Exact: [256, 512, 36, 4] + - Exact: [256, 324, 64, 1568] + - Exact: [256, 256, 36, 3200] + - Exact: [256, 256, 36, 210] + - Exact: [192, 384, 64, 576] + - Exact: [512, 512, 64, 800] + - Exact: [256, 256, 64, 1152] + - Exact: [512, 486, 64, 512] + - Exact: [256, 512, 64, 1600] + - Exact: [512, 512, 64, 9] + - Exact: [256, 512, 36, 1568] + - Exact: [128, 256, 64, 3200] + - Exact: [256, 512, 64, 4] + - Exact: [256, 256, 64, 450] + - Exact: [256, 256, 64, 72] + - Exact: [128, 256, 36, 3136] + - Exact: [160, 320, 64, 242] + - Exact: [512, 512, 36, 512] + - Exact: [512, 512, 36, 256] + - Exact: [512, 512, 36, 1024] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 64, 896] + - Exact: [128, 256, 64, 242] + - Exact: [192, 384, 36, 1024] + - Exact: [128, 256, 64, 100] + - Exact: [384, 256, 64, 
1152] + - Exact: [192, 384, 36, 128] + - Exact: [128, 256, 64, 1568] + - Exact: [128, 256, 64, 72] + - Exact: [256, 256, 36, 12544] + - Exact: [256, 256, 36, 105] + - Exact: [128, 256, 36, 392] + - Exact: [384, 256, 36, 1024] + - Exact: [128, 256, 64, 1152] + - Exact: [256, 324, 64, 32] + - Exact: [256, 384, 36, 800] + - Exact: [512, 512, 64, 4] + - Exact: [192, 320, 36, 128] + - Exact: [192, 384, 64, 242] + - Exact: [256, 486, 64, 32] + - Exact: [512, 512, 64, 64] + - Exact: [128, 256, 36, 512] + - Exact: [512, 512, 64, 576] + - Exact: [256, 256, 64, 9] + - Exact: [128, 256, 36, 12544] + - Exact: [256, 512, 36, 3136] + - Exact: [144, 288, 36, 512] + - Exact: [384, 384, 36, 800] + - Exact: [512, 512, 64, 1600] + - Exact: [512, 512, 36, 4] + - Exact: [192, 384, 64, 450] + - Exact: [256, 256, 36, 1024] + - Exact: [256, 512, 64, 400] + - Exact: [128, 256, 36, 6272] + - Exact: [256, 256, 36, 512] + - Exact: [256, 256, 64, 112] + - Exact: [512, 512, 64, 18] + - Exact: [256, 256, 64, 18] + - Exact: [256, 256, 64, 1568] + - Exact: [384, 256, 36, 4096] + - Exact: [256, 512, 64, 800] + - Exact: [256, 384, 36, 2048] + - Exact: [384, 384, 64, 2304] + - Exact: [160, 320, 64, 128] + - Exact: [512, 512, 36, 528] + - Exact: [160, 320, 36, 128] + - Exact: [256, 512, 36, 49] + - Exact: [384, 384, 64, 450] + - Exact: [256, 256, 64, 3200] + - Exact: [512, 512, 64, 8] + - Exact: [512, 512, 64, 288] + - Exact: [384, 384, 36, 1024] + - Exact: [128, 256, 36, 16] + - Exact: [256, 256, 64, 288] + - Exact: [256, 384, 36, 1024] + - Exact: [256, 324, 36, 3200] + - Exact: [192, 384, 64, 512] + - Exact: [128, 256, 64, 1600] + - Exact: [512, 512, 36, 32] + - Exact: [512, 512, 36, 3136] + - Exact: [128, 256, 64, 6400] + - Exact: [256, 256, 36, 2048] + - Exact: [256, 256, 64, 6400] + - Exact: [256, 256, 36, 1680] + - Exact: [192, 384, 36, 2048] + - Exact: [256, 256, 64, 144] + - Exact: [384, 384, 36, 4096] + - Exact: [160, 320, 64, 1152] + - Exact: [384, 256, 36, 2048] + - Exact: [256, 512, 36, 
392] + - Exact: [256, 512, 64, 50] + - Exact: [384, 384, 36, 2048] + - Exact: [256, 384, 64, 450] + - Exact: [192, 320, 64, 128] + - Exact: [128, 256, 36, 32] + - Exact: [512, 512, 64, 256] + - Exact: [256, 512, 64, 32] + - Exact: [384, 384, 64, 576] + - Exact: [512, 486, 36, 288] + - Exact: [144, 288, 64, 242] + - Exact: [384, 256, 64, 576] + - Exact: [512, 512, 36, 64] + - Exact: [448, 384, 64, 128] + - Exact: [144, 288, 64, 288] + - Exact: [512, 512, 64, 224] + - Exact: [384, 384, 64, 1152] + - Exact: [448, 384, 36, 128] + - Exact: [256, 486, 36, 128] + - Exact: [256, 256, 36, 800] + - Exact: [192, 384, 36, 800] + - Exact: [256, 256, 36, 256] + - Exact: [192, 384, 64, 1152] + - Exact: [128, 256, 64, 200] + - Exact: [512, 512, 64, 28] + - Exact: [144, 288, 64, 1152] + - Exact: [256, 256, 64, 576] + - Exact: [256, 256, 64, 2304] + - Exact: [192, 384, 36, 512] + - Exact: [256, 512, 36, 32] + - Exact: [512, 512, 64, 128] + - Exact: [512, 512, 64, 32] + - Exact: [128, 256, 36, 196] + - Exact: [196, 528, 32, 32] + - Exact: [196, 512, 32, 24] + - Exact: [1225, 192, 32, 32] + - Exact: [1001, 1536, 1, 32] + - Exact: [196, 480, 32, 64] + - Exact: [289, 1024, 32, 384] + - Exact: [784, 192, 32, 96] + - Exact: [50176, 256, 1, 128] + - Exact: [289, 1024, 32, 256] + - Exact: [289, 1024, 32, 192] + - Exact: [12544, 512, 1, 256] + - Exact: [1225, 1728, 1, 192] + - Exact: [196, 480, 32, 96] + - Exact: [196, 512, 32, 144] + - Exact: [289, 768, 32, 128] + - Exact: [5329, 576, 1, 96] + - Exact: [196, 528, 32, 128] + - Exact: [5329, 448, 1, 64] + - Exact: [784, 256, 32, 64] + - Exact: [784, 192, 32, 32] + - Exact: [21609, 288, 1, 32] + - Exact: [784, 256, 32, 32] + - Exact: [5041, 720, 1, 192] + - Exact: [196, 512, 32, 128] + - Exact: [289, 768, 32, 160] + - Exact: [1001, 4096, 1, 512] + - Exact: [1225, 192, 32, 64] + - Exact: [784, 192, 32, 16] + - Exact: [3136, 1024, 1, 2048] + - Exact: [784, 256, 32, 128] + - Exact: [196, 512, 32, 32] + - Exact: [1225, 384, 32, 96] + - Exact: 
[5041, 576, 1, 96] + - Exact: [5329, 160, 32, 64] + - Exact: [1225, 288, 32, 48] + - Exact: [4096, 9216, 1, 512] + - Exact: [196, 480, 32, 192] + - Exact: [3136, 1024, 1, 512] + - Exact: [784, 192, 32, 64] + - Exact: [289, 1024, 32, 128] + - Exact: [289, 768, 32, 192] + - Exact: [196, 512, 32, 112] + - Exact: [1001, 2048, 1, 32] + - Exact: [1225, 288, 32, 64] + - Exact: [1225, 384, 32, 192] + - Exact: [50176, 256, 1, 512] + - Exact: [196, 512, 32, 160] + - Exact: [4096, 4096, 1, 512] + - Exact: [1225, 256, 32, 64] + - Exact: [196, 480, 32, 16] + - Exact: [1225, 256, 32, 48] + - Exact: [1225, 1200, 1, 64] + - Exact: [1225, 384, 32, 64] + - Exact: [12544, 512, 1, 1024] + - Exact: [196, 512, 32, 64] + - Exact: [196, 528, 32, 256] + - Exact: [196, 528, 32, 160] + - Exact: [1225, 192, 32, 48] + - Exact: [1001, 2048, 1, 64] + - Exact: [289, 768, 128, 128] + - Exact: [1225, 192, 128, 64] + - Exact: [1225, 288, 128, 48] + - Exact: [289, 768, 128, 192] + - Exact: [289, 768, 128, 160] + - Exact: [1225, 256, 128, 48] + - Exact: [1225, 192, 128, 48] + - Exact: [1225, 288, 128, 64] + - Exact: [1225, 256, 128, 64] + - Exact: [1001, 2048, 1, 128] + - Exact: [1225, 192, 128, 32] + - Exact: [1001, 1536, 1, 64] + - Exact: [1024, 4096, 1, 64] + - Exact: [1024, 4096, 1, 6336] + - Exact: [512, 33708, 1, 3780] + - Exact: [512, 33708, 1, 3968] + - Exact: [512, 33708, 1, 4030] + - Exact: [196, 256, 64, 1024] + - Exact: [196, 1024, 64, 256] + - Exact: [289, 768, 64, 128] + - Exact: [289, 768, 64, 160] + - Exact: [289, 768, 64, 192] + - Exact: [784, 128, 64, 512] + - Exact: [784, 512, 64, 128] + - Exact: [1225, 192, 64, 32] + - Exact: [1225, 192, 64, 48] + - Exact: [1225, 192, 64, 64] + - Exact: [1225, 256, 64, 48] + - Exact: [1225, 256, 64, 64] + - Exact: [1225, 288, 64, 48] + - Exact: [1225, 288, 64, 64] + - Exact: [3136, 256, 64, 64] + - Exact: [256, 44505, 1, 8976] + - Exact: [512, 33708, 1, 3796] + - Exact: [512, 33708, 1, 3822] + - Exact: [512, 33708, 1, 3840] + - Exact: [512, 33708, 
1, 3859] + - Exact: [512, 33708, 1, 3870] + - Exact: [512, 33708, 1, 3876] + - Exact: [512, 33708, 1, 3906] + - Exact: [512, 33708, 1, 3910] + - Exact: [512, 33708, 1, 3925] + - Exact: [512, 33708, 1, 3942] + - Exact: [512, 33708, 1, 3944] + - Exact: [512, 33708, 1, 3955] + - Exact: [512, 33708, 1, 3969] + - Exact: [512, 33708, 1, 3976] + - Exact: [512, 33708, 1, 3977] + - Exact: [512, 33708, 1, 3978] + - Exact: [512, 33708, 1, 3990] + - Exact: [512, 33708, 1, 3995] + - Exact: [512, 33708, 1, 3996] + - Exact: [512, 33708, 1, 3999] + - Exact: [512, 33708, 1, 4005] + - Exact: [512, 33708, 1, 4012] + - Exact: [512, 33708, 1, 4020] + - Exact: [512, 33708, 1, 4026] + - Exact: [512, 33708, 1, 4032] + - Exact: [1024, 3072, 1, 2048] + - Exact: [1024, 3072, 1, 3072] + - Exact: [1024, 30522, 1, 20] + - Exact: [1024, 30522, 1, 80] + - Exact: [1024, 30522, 1, 120] + - Exact: [1024, 4096, 1, 3840] + - Exact: [1024, 4096, 1, 3968] + - Exact: [1024, 4096, 1, 7200] + - Exact: [1024, 4096, 1, 8160] + - Exact: [1024, 4096, 1, 9520] + - Exact: [1024, 4096, 1, 10200] + - Exact: [1024, 42720, 1, 3968] + - Exact: [1024, 42720, 1, 7200] + - Exact: [1024, 42720, 1, 9520] + - Exact: [4096, 1024, 1, 3840] + - Exact: [4096, 1024, 1, 3968] + - Exact: [4096, 1024, 1, 7200] + - Exact: [4096, 1024, 1, 8160] + - Exact: [4096, 1024, 1, 9520] + - Exact: [4096, 1024, 1, 10200] + - Exact: [5760, 5760, 1, 5760] + - Exact: [7744, 7744, 1, 7744] + - Exact: [1152, 1152, 1, 384] + - Exact: [1536, 1536, 1, 384] + - Exact: [1920, 1920, 1, 384] + - Exact: [2304, 2304, 1, 384] + - Exact: [2688, 2688, 1, 384] + - Exact: [3072, 3072, 1, 384] + - Exact: [3456, 3456, 1, 384] + - Exact: [3840, 3840, 1, 384] + - Exact: [4224, 4224, 1, 384] + - Exact: [4608, 4608, 1, 384] + - Exact: [4992, 4992, 1, 384] + - Exact: [5376, 5376, 1, 384] + - Exact: [5760, 5760, 1, 384] + - Exact: [6144, 6144, 1, 384] + - Exact: [6528, 6528, 1, 384] + - Exact: [6912, 6912, 1, 384] + - Exact: [7296, 7296, 1, 384] + - Exact: [7680, 7680, 
1, 384] + - Exact: [1536, 768, 1, 384] + - Exact: [1920, 960, 1, 384] + - Exact: [2304, 1152, 1, 384] + - Exact: [2688, 1344, 1, 384] + - Exact: [3072, 1536, 1, 384] + - Exact: [3456, 1728, 1, 384] + - Exact: [3840, 1920, 1, 384] + - Exact: [4224, 2112, 1, 384] + - Exact: [4608, 2304, 1, 384] + - Exact: [4992, 2496, 1, 384] + - Exact: [5376, 2688, 1, 384] + - Exact: [5760, 2880, 1, 384] + - Exact: [6144, 3072, 1, 384] + - Exact: [6528, 3264, 1, 384] + - Exact: [6912, 3456, 1, 384] + - Exact: [7296, 3648, 1, 384] + - Exact: [7680, 3840, 1, 384] + - Exact: [768, 1536, 1, 384] + - Exact: [1152, 2304, 1, 384] + - Exact: [1536, 3072, 1, 384] + - Exact: [1920, 3840, 1, 384] + - Exact: [2304, 4608, 1, 384] + - Exact: [2688, 5376, 1, 384] + - Exact: [3072, 6144, 1, 384] + - Exact: [3456, 6912, 1, 384] + - Exact: [3840, 7680, 1, 384] + - Exact: [4224, 8448, 1, 384] + - Exact: [4608, 9216, 1, 384] + - Exact: [4992, 9984, 1, 384] + - Exact: [5376, 10752, 1, 384] + - Exact: [5760, 11520, 1, 384] + - Exact: [6144, 12288, 1, 384] + - Exact: [6528, 13056, 1, 384] + - Exact: [6912, 13824, 1, 384] + - Exact: [7296, 14592, 1, 384] + - Exact: [7680, 15360, 1, 384] + - Exact: [2048, 2048, 1, 1024] + - Exact: [256, 10240, 1, 8976] + - Exact: [256, 10496, 1, 8976] + - Exact: [256, 11008, 1, 8976] + - Exact: [256, 11264, 1, 8976] + - Exact: [256, 11520, 1, 8976] + - Exact: [256, 11776, 1, 8976] + - Exact: [256, 12544, 1, 8976] + - Exact: [256, 12800, 1, 8976] + - Exact: [256, 13312, 1, 8976] + - Exact: [256, 13568, 1, 8976] + - Exact: [256, 14336, 1, 8976] + - Exact: [256, 14848, 1, 8976] + - Exact: [256, 15104, 1, 8976] + - Exact: [256, 15872, 1, 8976] + - Exact: [256, 16128, 1, 8976] + - Exact: [256, 17152, 1, 8976] + - Exact: [256, 17408, 1, 8976] + - Exact: [256, 18688, 1, 8976] + - Exact: [256, 19968, 1, 8976] + - Exact: [256, 20480, 1, 8976] + - Exact: [256, 20992, 1, 8976] + - Exact: [256, 21248, 1, 8976] + - Exact: [256, 22016, 1, 8976] + - Exact: [256, 26112, 1, 8976] + - Exact: 
[256, 32512, 1, 8976] + - Exact: [256, 33536, 1, 8976] + - Exact: [256, 4864, 1, 8976] + - Exact: [256, 5120, 1, 8976] + - Exact: [256, 5632, 1, 8976] + - Exact: [256, 5888, 1, 8976] + - Exact: [256, 6144, 1, 8976] + - Exact: [256, 7168, 1, 8976] + - Exact: [256, 8192, 1, 8976] + - Exact: [256, 8960, 1, 8976] + - Exact: [256, 9728, 1, 8976] + - Exact: [256, 9984, 1, 8976] + - Exact: [3200, 2048, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [512, 3280, 1, 1600] + - Exact: [512, 3280, 1, 200] + - Exact: [768, 2048, 1, 256] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 2048, 1, 960] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 3072, 1, 512] + - Exact: [1024, 4096, 1, 2048] + - Exact: [1024, 30528, 1, 2048] + - Exact: [1024, 4096, 1, 4096] + - Exact: [1024, 30528, 1, 4096] + - Exact: [9216, 128, 1, 128] + - Exact: [9600, 128, 1, 128] + - Exact: [9984, 128, 1, 128] + - Exact: [10368, 128, 1, 128] + - Exact: [10752, 128, 1, 128] + - Exact: [11136, 128, 1, 128] + - Exact: [11520, 128, 1, 128] + - Exact: [11904, 128, 1, 128] + - Exact: [12288, 128, 1, 128] + - Exact: [12672, 128, 1, 128] + - Exact: [13056, 128, 1, 128] + - Exact: [13440, 128, 1, 128] + - Exact: [13824, 128, 1, 128] + - Exact: [14208, 128, 1, 128] + - Exact: [14592, 128, 1, 128] + - Exact: [14976, 128, 1, 128] + - Exact: [15360, 128, 1, 128] + - Exact: [15744, 128, 1, 128] + - Exact: [16128, 128, 1, 128] + - Exact: [16512, 128, 1, 128] + - Exact: [16896, 128, 1, 128] + - Exact: [17280, 128, 1, 128] + - Exact: [17664, 128, 1, 128] + - Exact: [18048, 128, 1, 128] + - Exact: [18432, 128, 1, 128] + - Exact: [18816, 128, 1, 128] + - Exact: [19200, 128, 1, 128] + - Exact: [19584, 128, 1, 128] + - Exact: [19968, 128, 1, 128] + - Exact: [20352, 128, 1, 128] + - Exact: [20736, 128, 1, 128] + - Exact: [21120, 128, 1, 128] + - Exact: [21504, 128, 1, 128] + - Exact: [21888, 128, 1, 128] + - Exact: [22272, 128, 1, 128] + - Exact: [22656, 128, 1, 128] + - Exact: [23040, 128, 1, 128] + - Exact: [9216, 
128, 1, 256] + - Exact: [9600, 128, 1, 256] + - Exact: [9984, 128, 1, 256] + - Exact: [10368, 128, 1, 256] + - Exact: [10752, 128, 1, 256] + - Exact: [11136, 128, 1, 256] + - Exact: [11520, 128, 1, 256] + - Exact: [11904, 128, 1, 256] + - Exact: [12288, 128, 1, 256] + - Exact: [12672, 128, 1, 256] + - Exact: [13056, 128, 1, 256] + - Exact: [13440, 128, 1, 256] + - Exact: [13824, 128, 1, 256] + - Exact: [14208, 128, 1, 256] + - Exact: [14592, 128, 1, 256] + - Exact: [14976, 128, 1, 256] + - Exact: [15360, 128, 1, 256] + - Exact: [15744, 128, 1, 256] + - Exact: [16128, 128, 1, 256] + - Exact: [16512, 128, 1, 256] + - Exact: [16896, 128, 1, 256] + - Exact: [17280, 128, 1, 256] + - Exact: [17664, 128, 1, 256] + - Exact: [18048, 128, 1, 256] + - Exact: [18432, 128, 1, 256] + - Exact: [18816, 128, 1, 256] + - Exact: [19200, 128, 1, 256] + - Exact: [19584, 128, 1, 256] + - Exact: [19968, 128, 1, 256] + - Exact: [20352, 128, 1, 256] + - Exact: [20736, 128, 1, 256] + - Exact: [21120, 128, 1, 256] + - Exact: [21504, 128, 1, 256] + - Exact: [21888, 128, 1, 256] + - Exact: [22272, 128, 1, 256] + - Exact: [22656, 128, 1, 256] + - Exact: [23040, 128, 1, 256] + - Exact: [8064, 8064, 1, 384] + - Exact: [8448, 8448, 1, 384] + - Exact: [8832, 8832, 1, 384] + - Exact: [9216, 9216, 1, 384] + - Exact: [9600, 9600, 1, 384] + - Exact: [9984, 9984, 1, 384] + - Exact: [10368, 10368, 1, 384] + - Exact: [10752, 10752, 1, 384] + - Exact: [11136, 11136, 1, 384] + - Exact: [11520, 11520, 1, 384] + - Exact: [11904, 11904, 1, 384] + - Exact: [12288, 12288, 1, 384] + - Exact: [12672, 12672, 1, 384] + - Exact: [13056, 13056, 1, 384] + - Exact: [13440, 13440, 1, 384] + - Exact: [13824, 13824, 1, 384] + - Exact: [14208, 14208, 1, 384] + - Exact: [14592, 14592, 1, 384] + - Exact: [14976, 14976, 1, 384] + - Exact: [15360, 15360, 1, 384] + - Exact: [15744, 15744, 1, 384] + - Exact: [16128, 16128, 1, 384] + - Exact: [16512, 16512, 1, 384] + - Exact: [16896, 16896, 1, 384] + - Exact: [17280, 17280, 1, 
384] + - Exact: [17664, 17664, 1, 384] + - Exact: [18048, 18048, 1, 384] + - Exact: [18432, 18432, 1, 384] + - Exact: [18816, 18816, 1, 384] + - Exact: [19200, 19200, 1, 384] + - Exact: [19584, 19584, 1, 384] + - Exact: [19968, 19968, 1, 384] + - Exact: [20352, 20352, 1, 384] + - Exact: [20736, 20736, 1, 384] + - Exact: [21120, 21120, 1, 384] + - Exact: [21504, 21504, 1, 384] + - Exact: [21888, 21888, 1, 384] + - Exact: [22272, 22272, 1, 384] + - Exact: [22656, 22656, 1, 384] + - Exact: [23040, 23040, 1, 384] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3072, 3072, 1, 3072] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [8064, 4032, 1, 384] + - Exact: [8448, 4224, 1, 384] + - Exact: [8832, 4416, 1, 384] + - Exact: [9216, 4608, 1, 384] + - Exact: [9600, 4800, 1, 384] + - Exact: [9984, 4992, 1, 384] + - Exact: [10368, 5184, 1, 384] + - Exact: [10752, 5376, 1, 384] + - Exact: [11136, 5568, 1, 384] + - Exact: [11520, 5760, 1, 384] + - Exact: [11904, 5952, 1, 384] + - Exact: [12288, 6144, 1, 384] + - Exact: [12672, 6336, 1, 384] + - Exact: [13056, 6528, 1, 384] + - Exact: [13440, 6720, 1, 384] + - Exact: [13824, 6912, 1, 384] + - Exact: [14208, 7104, 1, 384] + - Exact: [14592, 7296, 1, 384] + - Exact: [14976, 7488, 1, 384] + - Exact: [15360, 7680, 1, 384] + - Exact: [15744, 7872, 1, 384] + - Exact: [16128, 8064, 1, 384] + - Exact: [16512, 8256, 1, 384] + - Exact: [16896, 8448, 1, 384] + - Exact: [17280, 8640, 1, 384] + - Exact: [17664, 8832, 1, 384] + - Exact: [18048, 9024, 1, 384] + - Exact: [18432, 9216, 1, 384] + - 
Exact: [18816, 9408, 1, 384] + - Exact: [19200, 9600, 1, 384] + - Exact: [19584, 9792, 1, 384] + - Exact: [19968, 9984, 1, 384] + - Exact: [20352, 10176, 1, 384] + - Exact: [20736, 10368, 1, 384] + - Exact: [21120, 10560, 1, 384] + - Exact: [21504, 10752, 1, 384] + - Exact: [21888, 10944, 1, 384] + - Exact: [22272, 11136, 1, 384] + - Exact: [22656, 11328, 1, 384] + - Exact: [23040, 11520, 1, 384] + - Exact: [8064, 16128, 1, 384] + - Exact: [8448, 16896, 1, 384] + - Exact: [8832, 17664, 1, 384] + - Exact: [9216, 18432, 1, 384] + - Exact: [9600, 19200, 1, 384] + - Exact: [9984, 19968, 1, 384] + - Exact: [10368, 20736, 1, 384] + - Exact: [10752, 21504, 1, 384] + - Exact: [11136, 22272, 1, 384] + - Exact: [11520, 23040, 1, 384] + - Exact: [11904, 23808, 1, 384] + - Exact: [12288, 24576, 1, 384] + - Exact: [12672, 25344, 1, 384] + - Exact: [13056, 26112, 1, 384] + - Exact: [13440, 26880, 1, 384] + - Exact: [13824, 27648, 1, 384] + - Exact: [14208, 28416, 1, 384] + - Exact: [14592, 29184, 1, 384] + - Exact: [14976, 29952, 1, 384] + - Exact: [15360, 30720, 1, 384] + - Exact: [15744, 31488, 1, 384] + - Exact: [16128, 32256, 1, 384] + - Exact: [16512, 33024, 1, 384] + - Exact: [16896, 33792, 1, 384] + - Exact: [17280, 34560, 1, 384] + - Exact: [17664, 35328, 1, 384] + - Exact: [18048, 36096, 1, 384] + - Exact: [18432, 36864, 1, 384] + - Exact: [18816, 37632, 1, 384] + - Exact: [19200, 38400, 1, 384] + - Exact: [19584, 39168, 1, 384] + - Exact: [19968, 39936, 1, 384] + - Exact: [20352, 40704, 1, 384] + - Exact: [20736, 41472, 1, 384] + - Exact: [21120, 42240, 1, 384] + - Exact: [21504, 43008, 1, 384] + - Exact: [21888, 43776, 1, 384] + - Exact: [22272, 44544, 1, 384] + - Exact: [22656, 45312, 1, 384] + - Exact: [23040, 46080, 1, 384] + - Exact: [1152, 1536, 1, 384] + - Exact: [1920, 1536, 1, 384] + - Exact: [2304, 1536, 1, 384] + - Exact: [2688, 1536, 1, 384] + - Exact: [3456, 1536, 1, 384] + - Exact: [3840, 1536, 1, 384] + - Exact: [4224, 1536, 1, 384] + - Exact: [4608, 
1536, 1, 384] + - Exact: [4992, 1536, 1, 384] + - Exact: [5376, 1536, 1, 384] + - Exact: [5760, 1536, 1, 384] + - Exact: [6144, 1536, 1, 384] + - Exact: [6528, 1536, 1, 384] + - Exact: [6912, 1536, 1, 384] + - Exact: [7296, 1536, 1, 384] + - Exact: [7680, 1536, 1, 384] + - Exact: [8064, 1536, 1, 384] + - Exact: [8448, 1536, 1, 384] + - Exact: [8832, 1536, 1, 384] + - Exact: [9216, 1536, 1, 384] + - Exact: [9600, 1536, 1, 384] + - Exact: [9984, 1536, 1, 384] + - Exact: [10368, 1536, 1, 384] + - Exact: [10752, 1536, 1, 384] + - Exact: [11136, 1536, 1, 384] + - Exact: [11520, 1536, 1, 384] + - Exact: [11904, 1536, 1, 384] + - Exact: [12288, 1536, 1, 384] + - Exact: [12672, 1536, 1, 384] + - Exact: [13056, 1536, 1, 384] + - Exact: [13440, 1536, 1, 384] + - Exact: [13824, 1536, 1, 384] + - Exact: [14208, 1536, 1, 384] + - Exact: [14592, 1536, 1, 384] + - Exact: [14976, 1536, 1, 384] + - Exact: [15360, 1536, 1, 384] + - Exact: [15744, 1536, 1, 384] + - Exact: [16128, 1536, 1, 384] + - Exact: [16512, 1536, 1, 384] + - Exact: [16896, 1536, 1, 384] + - Exact: [17280, 1536, 1, 384] + - Exact: [17664, 1536, 1, 384] + - Exact: [18048, 1536, 1, 384] + - Exact: [18432, 1536, 1, 384] + - Exact: [18816, 1536, 1, 384] + - Exact: [19200, 1536, 1, 384] + - Exact: [19584, 1536, 1, 384] + - Exact: [19968, 1536, 1, 384] + - Exact: [20352, 1536, 1, 384] + - Exact: [20736, 1536, 1, 384] + - Exact: [21120, 1536, 1, 384] + - Exact: [21504, 1536, 1, 384] + - Exact: [21888, 1536, 1, 384] + - Exact: [22272, 1536, 1, 384] + - Exact: [22656, 1536, 1, 384] + - Exact: [23040, 1536, 1, 384] + - Exact: [768, 1920, 1, 384] + - Exact: [1152, 1920, 1, 384] + - Exact: [1536, 1920, 1, 384] + - Exact: [2304, 1920, 1, 384] + - Exact: [2688, 1920, 1, 384] + - Exact: [3072, 1920, 1, 384] + - Exact: [3456, 1920, 1, 384] + - Exact: [4224, 1920, 1, 384] + - Exact: [4608, 1920, 1, 384] + - Exact: [4992, 1920, 1, 384] + - Exact: [5376, 1920, 1, 384] + - Exact: [5760, 1920, 1, 384] + - Exact: [6144, 1920, 1, 384] 
+ - Exact: [6528, 1920, 1, 384] + - Exact: [6912, 1920, 1, 384] + - Exact: [7296, 1920, 1, 384] + - Exact: [7680, 1920, 1, 384] + - Exact: [8064, 1920, 1, 384] + - Exact: [8448, 1920, 1, 384] + - Exact: [8832, 1920, 1, 384] + - Exact: [9216, 1920, 1, 384] + - Exact: [9600, 1920, 1, 384] + - Exact: [9984, 1920, 1, 384] + - Exact: [10368, 1920, 1, 384] + - Exact: [10752, 1920, 1, 384] + - Exact: [11136, 1920, 1, 384] + - Exact: [11520, 1920, 1, 384] + - Exact: [11904, 1920, 1, 384] + - Exact: [12288, 1920, 1, 384] + - Exact: [12672, 1920, 1, 384] + - Exact: [13056, 1920, 1, 384] + - Exact: [13440, 1920, 1, 384] + - Exact: [13824, 1920, 1, 384] + - Exact: [14208, 1920, 1, 384] + - Exact: [14592, 1920, 1, 384] + - Exact: [14976, 1920, 1, 384] + - Exact: [15360, 1920, 1, 384] + - Exact: [15744, 1920, 1, 384] + - Exact: [16128, 1920, 1, 384] + - Exact: [16512, 1920, 1, 384] + - Exact: [16896, 1920, 1, 384] + - Exact: [17280, 1920, 1, 384] + - Exact: [17664, 1920, 1, 384] + - Exact: [18048, 1920, 1, 384] + - Exact: [18432, 1920, 1, 384] + - Exact: [18816, 1920, 1, 384] + - Exact: [19200, 1920, 1, 384] + - Exact: [19584, 1920, 1, 384] + - Exact: [19968, 1920, 1, 384] + - Exact: [20352, 1920, 1, 384] + - Exact: [20736, 1920, 1, 384] + - Exact: [21120, 1920, 1, 384] + - Exact: [21504, 1920, 1, 384] + - Exact: [21888, 1920, 1, 384] + - Exact: [22272, 1920, 1, 384] + - Exact: [22656, 1920, 1, 384] + - Exact: [23040, 1920, 1, 384] + - Exact: [768, 2304, 1, 384] + - Exact: [1536, 2304, 1, 384] + - Exact: [1920, 2304, 1, 384] + - Exact: [2688, 2304, 1, 384] + - Exact: [3072, 2304, 1, 384] + - Exact: [3456, 2304, 1, 384] + - Exact: [3840, 2304, 1, 384] + - Exact: [4224, 2304, 1, 384] + - Exact: [4992, 2304, 1, 384] + - Exact: [5376, 2304, 1, 384] + - Exact: [5760, 2304, 1, 384] + - Exact: [6144, 2304, 1, 384] + - Exact: [6528, 2304, 1, 384] + - Exact: [6912, 2304, 1, 384] + - Exact: [7296, 2304, 1, 384] + - Exact: [7680, 2304, 1, 384] + - Exact: [8064, 2304, 1, 384] + - Exact: 
[8448, 2304, 1, 384] + - Exact: [8832, 2304, 1, 384] + - Exact: [9216, 2304, 1, 384] + - Exact: [9600, 2304, 1, 384] + - Exact: [9984, 2304, 1, 384] + - Exact: [10368, 2304, 1, 384] + - Exact: [10752, 2304, 1, 384] + - Exact: [11136, 2304, 1, 384] + - Exact: [11520, 2304, 1, 384] + - Exact: [11904, 2304, 1, 384] + - Exact: [12288, 2304, 1, 384] + - Exact: [12672, 2304, 1, 384] + - Exact: [13056, 2304, 1, 384] + - Exact: [13440, 2304, 1, 384] + - Exact: [13824, 2304, 1, 384] + - Exact: [14208, 2304, 1, 384] + - Exact: [14592, 2304, 1, 384] + - Exact: [14976, 2304, 1, 384] + - Exact: [15360, 2304, 1, 384] + - Exact: [15744, 2304, 1, 384] + - Exact: [16128, 2304, 1, 384] + - Exact: [16512, 2304, 1, 384] + - Exact: [16896, 2304, 1, 384] + - Exact: [17280, 2304, 1, 384] + - Exact: [17664, 2304, 1, 384] + - Exact: [18048, 2304, 1, 384] + - Exact: [18432, 2304, 1, 384] + - Exact: [18816, 2304, 1, 384] + - Exact: [19200, 2304, 1, 384] + - Exact: [19584, 2304, 1, 384] + - Exact: [19968, 2304, 1, 384] + - Exact: [20352, 2304, 1, 384] + - Exact: [20736, 2304, 1, 384] + - Exact: [21120, 2304, 1, 384] + - Exact: [21504, 2304, 1, 384] + - Exact: [21888, 2304, 1, 384] + - Exact: [22272, 2304, 1, 384] + - Exact: [22656, 2304, 1, 384] + - Exact: [23040, 2304, 1, 384] + - Exact: [256, 32768, 1, 1] + - Exact: [289, 128, 64, 768] + - Exact: [289, 160, 64, 768] + - Exact: [289, 192, 64, 768] + - Exact: [3136, 256, 32, 64] + - Exact: [784, 512, 32, 128] + - Exact: [784, 128, 32, 512] + - Exact: [196, 1024, 32, 256] + - Exact: [1444, 128, 120, 256] + - Exact: [1444, 128, 18, 256] + - Exact: [1444, 128, 19, 256] + - Exact: [1444, 256, 120, 256] + - Exact: [1444, 256, 18, 256] + - Exact: [1444, 256, 19, 256] + - Exact: [361, 512, 120, 256] + - Exact: [361, 512, 18, 256] + - Exact: [361, 512, 19, 256] + - Exact: [7680, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4096] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 
2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2048] + - Exact: [1024, 4096, 1, 512] + - Exact: [1024, 30522, 1, 77] + - Exact: [4096, 1024, 1, 512] + - Exact: [1024, 4096, 1, 1280] + - Exact: [1024, 30522, 1, 200] + - Exact: [4096, 1024, 1, 1280] + - Exact: [1024, 4096, 1, 4992] + - Exact: [1024, 30522, 1, 780] + - Exact: [4096, 1024, 1, 4992] + - Exact: [1024, 30522, 1, 308] + - Exact: [1024, 4096, 1, 5120] + - Exact: [1024, 30522, 1, 800] + - Exact: [4096, 1024, 1, 5120] + - Exact: [1024, 4096, 1, 5248] + - Exact: [1024, 30522, 1, 820] + - Exact: [4096, 1024, 1, 5248] + - Exact: [1024, 4096, 1, 2560] + - Exact: [1024, 30522, 1, 385] + - Exact: [4096, 1024, 1, 2560] + - Exact: [1024, 30522, 1, 462] + - Exact: [1024, 4096, 1, 1024] + - Exact: [1024, 30522, 1, 160] + - Exact: [4096, 1024, 1, 1024] + - Exact: [1024, 4096, 1, 1152] + - Exact: [1024, 30522, 1, 180] + - Exact: [4096, 1024, 1, 1152] + - Exact: [1024, 4096, 1, 8192] + - Exact: [1024, 4096, 1, 9600] + - Exact: [1024, 33712, 1, 8192] + - Exact: [1024, 33712, 1, 9600] + - Exact: [4096, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 9600] + - Exact: [1024, 4096, 1, 10064] + - Exact: [1024, 4096, 1, 10080] + - Exact: [1024, 4096, 1, 6528] + - Exact: [1024, 4096, 1, 7104] + - Exact: [1024, 4096, 1, 8064] + - Exact: [1024, 4096, 1, 9216] + - Exact: [1024, 42720, 1, 10080] + - Exact: [1024, 42720, 1, 6528] + - Exact: [1024, 42720, 1, 7104] + - Exact: [4096, 1024, 1, 10064] + - Exact: [4096, 1024, 1, 10080] + - Exact: [4096, 1024, 1, 6528] + - Exact: [4096, 1024, 1, 7104] + - Exact: [4096, 1024, 1, 8064] + - Exact: [4096, 1024, 1, 9216] + - Exact: [1024, 1600, 1, 1] + - Exact: [2048, 960, 1, 1] + - Exact: [2048, 2048, 1, 2] + - Exact: [2048, 30592, 1, 1024] + - Exact: [2048, 6144, 1, 1024] + - Exact: [2048, 8192, 1, 1024] + - Exact: [8192, 2048, 1, 1024] + - Exact: [1024, 30592, 1, 8192] + - Exact: [1024, 3072, 1, 8192] + - Exact: [1024, 30592, 1, 2048] + - Exact: 
[1024, 30592, 1, 4096] + - Exact: [1024, 3072, 1, 4096] + - Exact: [2560, 1920, 1, 2048] + - Exact: [2560, 2560, 1, 2048] + - Exact: [2560, 2560, 1, 4] + - Exact: [2560, 7680, 1, 2048] + - Exact: [640, 2560, 1, 2048] + - Exact: [1536, 1536, 1, 4096] + - Exact: [1536, 4608, 1, 4096] + - Exact: [1536, 50304, 1, 4096] + - Exact: [1536, 6144, 1, 4096] + - Exact: [6144, 1536, 1, 4096] + - Exact: [1536, 1536, 1, 8192] + - Exact: [1536, 4608, 1, 8192] + - Exact: [1536, 50304, 1, 8192] + - Exact: [1536, 6144, 1, 8192] + - Exact: [6144, 1536, 1, 8192] + - Exact: [1024, 3072, 1, 16384] + - Exact: [1024, 4096, 1, 16384] + - Exact: [1024, 50304, 1, 16384] + - Exact: [4096, 1024, 1, 16384] + - Exact: [1024, 50304, 1, 2048] + - Exact: [1024, 50304, 1, 4096] + - Exact: [1024, 50304, 1, 8192] + - Exact: [1024, 30528, 1, 8192] + - Exact: [256, 6912, 1, 1] + - Exact: [30528, 1024, 1, 640] + - Exact: [30528, 1024, 1, 1280] + - Exact: [4096, 1024, 1, 10240] + - Exact: [1024, 4096, 1, 10240] + - Exact: [30528, 1024, 1, 1600] + - Exact: [1024, 4096, 1, 10496] + - Exact: [30528, 1024, 1, 1640] + - Exact: [4096, 1024, 1, 10496] + - Exact: [30528, 1024, 1, 160] + - Exact: [1024, 4096, 1, 6144] + - Exact: [30528, 1024, 1, 240] + - Exact: [4096, 1024, 1, 6144] + - Exact: [3136, 128, 64, 256] + - Exact: [784, 256, 64, 512] + - Exact: [3136, 256, 64, 128] + - Exact: [3136, 256, 64, 256] + - Exact: [196, 512, 64, 1024] + - Exact: [784, 512, 64, 256] + - Exact: [784, 512, 64, 512] + - Exact: [196, 1024, 64, 512] + - Exact: [196, 1024, 64, 1024] + - Exact: [3136, 128, 32, 256] + - Exact: [784, 256, 32, 512] + - Exact: [3136, 256, 32, 128] + - Exact: [3136, 256, 32, 256] + - Exact: [196, 512, 32, 1024] + - Exact: [784, 512, 32, 256] + - Exact: [784, 512, 32, 512] + - Exact: [196, 1024, 32, 512] + - Exact: [196, 1024, 32, 1024] + - Exact: [1024, 4096, 1, 10224] + - Exact: [4096, 1024, 1, 10224] + - Exact: [1024, 3072, 1, 10224] + - Exact: [1024, 3072, 1, 10240] + - Exact: [4096, 1024, 1, 10192] + - 
Exact: [1024, 3072, 1, 10192] + - Exact: [1024, 4096, 1, 10192] + - Exact: [1024, 3072, 1, 10200] + - Exact: [4096, 1024, 1, 10208] + - Exact: [1024, 3072, 1, 10208] + - Exact: [1024, 4096, 1, 10208] + - Exact: [1024, 2048, 1, 10224] + - Exact: [1024, 2048, 1, 10240] + - Exact: [1024, 2048, 1, 10192] + - Exact: [1024, 3072, 1, 10080] + - Exact: [100352, 256, 1, 512] + - Exact: [12544, 1024, 1, 2048] + - Exact: [12544, 147, 1, 64] + - Exact: [200704, 256, 1, 512] + - Exact: [25088, 512, 1, 1024] + - Exact: [3136, 576, 1, 64] + - Exact: [50176, 512, 1, 1024] + - Exact: [6272, 1024, 1, 2048] + - Exact: [196, 1024, 128, 512] + - Exact: [196, 1024, 256, 512] + - Exact: [3136, 256, 128, 128] + - Exact: [3136, 256, 256, 128] + - Exact: [784, 512, 128, 256] + - Exact: [784, 512, 256, 256] + - Exact: [30528, 1024, 1, 2560] + - Exact: [1024, 4096, 1, 12288] + - Exact: [30528, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [25600, 128, 25, 128] + - Exact: [12544, 128, 36, 128] + - Exact: [9216, 128, 49, 128] + - Exact: [6400, 128, 64, 128] + - Exact: [6400, 256, 25, 256] + - Exact: [4096, 256, 36, 256] + - Exact: [2304, 256, 49, 256] + - Exact: [2304, 256, 64, 256] + - Exact: [2304, 512, 25, 512] + - Exact: [1024, 512, 36, 512] + - Exact: [1024, 512, 49, 512] + - Exact: [1024, 512, 64, 512] + - Exact: [3072, 768, 1, 2048] + - Exact: [768, 3072, 1, 2048] + - Exact: [3072, 768, 1, 4608] + - Exact: [768, 3072, 1, 4608] + - Exact: [4096, 1024, 1, 4608] + - Exact: [1024, 4096, 1, 4608] + - Exact: [4880, 256, 49, 256] + - Exact: [3128, 256, 64, 256] + - Exact: [4680, 256, 49, 256] + - Exact: [5280, 256, 36, 256] + - Exact: [2640, 256, 64, 256] + - Exact: [5304, 256, 49, 256] + - Exact: [4524, 256, 49, 256] + - Exact: [2760, 256, 64, 256] + - Exact: [6440, 256, 36, 256] + - Exact: [5704, 256, 36, 256] + - Exact: [2666, 256, 64, 256] + - Exact: [2128, 256, 64, 256] + - Exact: [1160, 256, 49, 256] + - Exact: [4056, 256, 49, 256] + - Exact: [6144, 256, 36, 256] + - Exact: 
[950, 2048, 2, 512] + - Exact: [6336, 256, 36, 256] + - Exact: [13600, 512, 2, 128] + - Exact: [15200, 512, 2, 128] + - Exact: [15200, 128, 2, 512] + - Exact: [13600, 128, 2, 512] + - Exact: [5632, 256, 36, 256] + - Exact: [12288, 128, 2, 512] + - Exact: [12880, 128, 2, 512] + - Exact: [3220, 1024, 2, 256] + - Exact: [11408, 128, 2, 512] + - Exact: [782, 128, 64, 128] + - Exact: [13824, 512, 2, 128] + - Exact: [13824, 128, 2, 512] + - Exact: [10560, 128, 2, 512] + - Exact: [10752, 128, 2, 512] + - Exact: [13600, 512, 2, 256] + - Exact: [15200, 512, 2, 256] + - Exact: [850, 2048, 2, 512] + - Exact: [768, 2048, 2, 512] + - Exact: [12880, 512, 2, 128] + - Exact: [11616, 128, 2, 512] + - Exact: [14208, 512, 2, 128] + - Exact: [11408, 512, 2, 128] + - Exact: [805, 2048, 2, 512] + - Exact: [6912, 256, 36, 256] + - Exact: [713, 2048, 2, 512] + - Exact: [13824, 512, 2, 256] + - Exact: [11616, 512, 2, 128] + - Exact: [12288, 512, 2, 128] + - Exact: [14208, 128, 2, 512] + - Exact: [11968, 128, 2, 512] + - Exact: [864, 2048, 2, 512] + - Exact: [10560, 512, 2, 128] + - Exact: [672, 2048, 2, 512] + - Exact: [660, 2048, 2, 512] + - Exact: [9408, 128, 2, 512] + - Exact: [10752, 512, 2, 128] + - Exact: [726, 2048, 2, 512] + - Exact: [11968, 512, 2, 128] + - Exact: [1240, 256, 49, 256] + - Exact: [4032, 256, 2, 1024] + - Exact: [888, 2048, 2, 512] + - Exact: [12880, 512, 2, 256] + - Exact: [12288, 512, 2, 256] + - Exact: [13440, 128, 2, 512] + - Exact: [864, 2048, 2, 256] + - Exact: [12672, 128, 2, 512] + - Exact: [11264, 128, 2, 512] + - Exact: [11776, 128, 2, 512] + - Exact: [16128, 128, 2, 512] + - Exact: [4032, 1024, 2, 256] + - Exact: [14000, 128, 2, 512] + - Exact: [13440, 512, 2, 128] + - Exact: [805, 2048, 2, 256] + - Exact: [768, 2048, 2, 256] + - Exact: [3264, 1024, 2, 256] + - Exact: [1251, 256, 49, 256] + - Exact: [4200, 256, 2, 1024] + - Exact: [2352, 1024, 2, 256] + - Exact: [2400, 1024, 2, 256] + - Exact: [15200, 256, 2, 12] + - Exact: [12880, 256, 2, 12] + - Exact: 
[2520, 1024, 2, 256] + - Exact: [13600, 256, 2, 12] + - Exact: [15200, 256, 2, 3] + - Exact: [12880, 256, 2, 3] + - Exact: [4200, 1024, 2, 256] + - Exact: [12288, 256, 2, 12] + - Exact: [13824, 256, 2, 12] + - Exact: [13600, 256, 2, 3] + - Exact: [1900, 1024, 1, 2048] + - Exact: [7600, 512, 1, 256] + - Exact: [1610, 1024, 1, 2048] + - Exact: [6144, 512, 1, 256] + - Exact: [1900, 1024, 1, 512] + - Exact: [12544, 1024, 1, 1024] + - Exact: [3220, 256, 2, 12] + - Exact: [3220, 256, 2, 3] + - Exact: [3800, 256, 2, 3] + - Exact: [13824, 256, 2, 3] + - Exact: [12288, 256, 2, 3] + - Exact: [2688, 256, 2, 1024] + - Exact: [3072, 256, 2, 12] + - Exact: [3800, 256, 2, 12] + - Exact: [3072, 256, 2, 3] + - Exact: [2520, 256, 2, 1024] + - Exact: [16128, 512, 2, 128] + - Exact: [2400, 256, 2, 1024] + - Exact: [2352, 256, 2, 1024] + - Exact: [3036, 1024, 2, 256] + - Exact: [2944, 256, 2, 1024] + - Exact: [2992, 1024, 2, 256] + - Exact: [2816, 256, 2, 1024] + - Exact: [3036, 256, 2, 1024] + - Exact: [2904, 1024, 2, 256] + - Exact: [3456, 256, 2, 3] + - Exact: [3400, 256, 2, 3] + - Exact: [2816, 1024, 2, 256] + - Exact: [3456, 256, 2, 12] + - Exact: [2944, 1024, 2, 256] + - Exact: [3168, 256, 2, 1024] + - Exact: [850, 2048, 2, 256] + - Exact: [2992, 256, 2, 1024] + - Exact: [2852, 1024, 2, 256] + - Exact: [51520, 256, 2, 12] + - Exact: [3072, 256, 2, 1024] + - Exact: [2640, 1024, 2, 256] + - Exact: [2688, 1024, 2, 256] + - Exact: [2904, 256, 2, 1024] + - Exact: [3264, 256, 2, 1024] + - Exact: [54400, 256, 2, 12] + - Exact: [950, 2048, 2, 256] + - Exact: [55296, 256, 2, 3] + - Exact: [60800, 256, 2, 12] + - Exact: [51520, 256, 2, 3] + - Exact: [3700, 1024, 2, 256] + - Exact: [55296, 256, 2, 12] + - Exact: [2852, 256, 2, 1024] + - Exact: [3600, 1024, 2, 256] + - Exact: [3700, 256, 2, 1024] + - Exact: [60800, 256, 2, 3] + - Exact: [1269, 256, 49, 256] + - Exact: [1467, 256, 49, 256] + - Exact: [3500, 256, 2, 1024] + - Exact: [952, 256, 64, 256] + - Exact: [49152, 256, 2, 12] + - Exact: 
[1449, 256, 49, 256] + - Exact: [1278, 256, 49, 256] + - Exact: [3360, 256, 2, 1024] + - Exact: [736, 256, 64, 256] + - Exact: [1413, 256, 49, 256] + - Exact: [600, 256, 64, 256] + - Exact: [1341, 256, 49, 256] + - Exact: [1287, 256, 49, 256] + - Exact: [1332, 256, 49, 256] + - Exact: [1359, 256, 49, 256] + - Exact: [1440, 256, 49, 256] + - Exact: [1395, 256, 49, 256] + - Exact: [1323, 256, 49, 256] + - Exact: [1404, 256, 49, 256] + - Exact: [1386, 256, 49, 256] + - Exact: [3168, 1024, 2, 256] + - Exact: [1350, 256, 49, 256] + - Exact: [1368, 256, 49, 256] + - Exact: [49152, 256, 2, 3] + - Exact: [3600, 256, 2, 1024] + - Exact: [3500, 1024, 2, 256] + - Exact: [3360, 1024, 2, 256] + - Exact: [3220, 256, 2, 1024] + - Exact: [690, 256, 64, 256] + - Exact: [54400, 256, 2, 3] + - Exact: [3072, 1024, 2, 256] + - Exact: [2640, 256, 2, 1024] + - Exact: [616, 256, 64, 256] + - Exact: [3008, 256, 64, 256] + - Exact: [896, 256, 64, 256] + - Exact: [768, 256, 64, 256] + - Exact: [660, 256, 64, 256] + - Exact: [3552, 256, 2, 1024] + - Exact: [3552, 1024, 2, 256] + - Exact: [800, 256, 64, 256] + - Exact: [1120, 256, 49, 256] + - Exact: [2408, 256, 64, 256] + - Exact: [3456, 256, 2, 1024] + - Exact: [672, 256, 64, 256] + - Exact: [782, 256, 64, 256] + - Exact: [884, 256, 64, 256] + - Exact: [3456, 1024, 2, 256] + - Exact: [1064, 256, 49, 256] + - Exact: [3400, 256, 2, 1024] + - Exact: [704, 256, 64, 256] + - Exact: [3400, 1024, 2, 256] + - Exact: [3264, 256, 64, 256] + - Exact: [3800, 1024, 2, 256] + - Exact: [3800, 256, 2, 1024] + - Exact: [6440, 512, 1, 256] + - Exact: [6912, 512, 1, 256] + - Exact: [6800, 512, 1, 256] + - Exact: [6800, 512, 1, 1024] + - Exact: [6440, 512, 1, 1024] + - Exact: [6912, 512, 1, 1024] + - Exact: [1728, 1024, 1, 512] + - Exact: [1536, 1024, 1, 512] + - Exact: [1610, 1024, 1, 512] + - Exact: [7600, 512, 1, 1024] + - Exact: [6144, 512, 1, 1024] + - Exact: [1700, 1024, 1, 512] + - Exact: [1728, 1024, 1, 2048] + - Exact: [1536, 1024, 1, 2048] + - Exact: 
[1700, 1024, 1, 2048] + - Exact: [1920, 25216, 1, 16384] + - Exact: [3840, 1920, 1, 16384] + - Exact: [1920, 3840, 1, 16384] + - Exact: [960, 1920, 1, 16384] + - Exact: [1920, 2880, 1, 16384] + - Exact: [1920, 25216, 1, 4096] + - Exact: [3840, 1920, 1, 4096] + - Exact: [1920, 3840, 1, 4096] + - Exact: [960, 1920, 1, 4096] + - Exact: [1920, 2880, 1, 4096] + - Exact: [1920, 25216, 1, 8192] + - Exact: [3840, 1920, 1, 8192] + - Exact: [1920, 3840, 1, 8192] + - Exact: [960, 1920, 1, 8192] + - Exact: [1920, 2880, 1, 8192] + - Exact: [2304, 12672, 1, 16384] + - Exact: [2304, 2304, 1, 16384] + - Exact: [576, 2304, 1, 16384] + - Exact: [2304, 1728, 1, 16384] + - Exact: [2304, 12672, 1, 4096] + - Exact: [2304, 2304, 1, 4096] + - Exact: [576, 2304, 1, 4096] + - Exact: [2304, 1728, 1, 4096] + - Exact: [2304, 12672, 1, 8192] + - Exact: [2304, 2304, 1, 8192] + - Exact: [576, 2304, 1, 8192] + - Exact: [2304, 1728, 1, 8192] + - Exact: [3072, 6400, 1, 4096] + - Exact: [1536, 3072, 1, 4096] + - Exact: [3072, 1536, 1, 4096] + - Exact: [384, 3072, 1, 4096] + - Exact: [3072, 1152, 1, 4096] + - Exact: [3072, 6400, 1, 8192] + - Exact: [1536, 3072, 1, 8192] + - Exact: [3072, 1536, 1, 8192] + - Exact: [384, 3072, 1, 8192] + - Exact: [3072, 1152, 1, 8192] + - Exact: [2048, 2048, 1, 4096] + - Exact: [2048, 2048, 1, 8] + - Exact: [2048, 29000, 1, 199] + - Exact: [2048, 29000, 1, 221] + - Exact: [2048, 29000, 1, 224] + - Exact: [2048, 29000, 1, 229] + - Exact: [2048, 29000, 1, 234] + - Exact: [2048, 29000, 1, 242] + - Exact: [2048, 29000, 1, 246] + - Exact: [2048, 29000, 1, 247] + - Exact: [2048, 29000, 1, 256] + - Exact: [2048, 29000, 1, 262] + - Exact: [2048, 29000, 1, 264] + - Exact: [2048, 29000, 1, 265] + - Exact: [2048, 29000, 1, 274] + - Exact: [2048, 29000, 1, 277] + - Exact: [2048, 29000, 1, 279] + - Exact: [2048, 29000, 1, 288] + - Exact: [2048, 29000, 1, 296] + - Exact: [2048, 29000, 1, 315] + - Exact: [2048, 29000, 1, 335] + - Exact: [2048, 4096, 1, 4096] + - Exact: [4096, 2048, 1, 
4096] + - Exact: [1024, 29000, 1, 2283] + - Exact: [1024, 29000, 1, 2296] + - Exact: [1024, 29000, 1, 2306] + - Exact: [1024, 29000, 1, 2309] + - Exact: [1024, 29000, 1, 2318] + - Exact: [1024, 29000, 1, 2320] + - Exact: [1024, 29000, 1, 2324] + - Exact: [1024, 29000, 1, 2325] + - Exact: [1024, 29000, 1, 2329] + - Exact: [1024, 29000, 1, 2338] + - Exact: [1024, 29000, 1, 2345] + - Exact: [1024, 29000, 1, 2350] + - Exact: [1024, 29000, 1, 2362] + - Exact: [1024, 29000, 1, 2366] + - Exact: [1024, 29000, 1, 2368] + - Exact: [1024, 29000, 1, 2374] + - Exact: [1024, 29000, 1, 2390] + - Exact: [1024, 29000, 1, 561] + - Exact: [1024, 29000, 1, 574] + - Exact: [1024, 29000, 1, 600] + - Exact: [1024, 29000, 1, 608] + - Exact: [1024, 29000, 1, 615] + - Exact: [1024, 29000, 1, 622] + - Exact: [1024, 29000, 1, 625] + - Exact: [1024, 29000, 1, 626] + - Exact: [1024, 29000, 1, 628] + - Exact: [1024, 29000, 1, 636] + - Exact: [1024, 29000, 1, 651] + - Exact: [1024, 29000, 1, 658] + - Exact: [1024, 29000, 1, 669] + - Exact: [1024, 29000, 1, 670] + - Exact: [1024, 29000, 1, 672] + - Exact: [1024, 29000, 1, 684] + - Exact: [1024, 29000, 1, 716] + - Exact: [1024, 29000, 1, 730] + - Exact: [2560, 2560, 1, 1024] + - Exact: [2560, 2560, 1, 2] + - Exact: [2560, 29000, 1, 109] + - Exact: [2560, 29000, 1, 121] + - Exact: [2560, 29000, 1, 27] + - Exact: [2560, 29000, 1, 35] + - Exact: [2560, 29000, 1, 36] + - Exact: [2560, 29000, 1, 39] + - Exact: [2560, 29000, 1, 40] + - Exact: [2560, 29000, 1, 42] + - Exact: [2560, 29000, 1, 43] + - Exact: [2560, 29000, 1, 44] + - Exact: [2560, 29000, 1, 46] + - Exact: [2560, 29000, 1, 48] + - Exact: [2560, 29000, 1, 49] + - Exact: [2560, 29000, 1, 50] + - Exact: [2560, 29000, 1, 51] + - Exact: [2560, 29000, 1, 53] + - Exact: [2560, 29000, 1, 54] + - Exact: [2560, 29000, 1, 55] + - Exact: [2560, 29000, 1, 56] + - Exact: [2560, 29000, 1, 57] + - Exact: [2560, 29000, 1, 58] + - Exact: [2560, 29000, 1, 59] + - Exact: [2560, 29000, 1, 61] + - Exact: [2560, 
29000, 1, 63] + - Exact: [2560, 29000, 1, 65] + - Exact: [2560, 29000, 1, 66] + - Exact: [2560, 29000, 1, 67] + - Exact: [2560, 29000, 1, 69] + - Exact: [2560, 29000, 1, 70] + - Exact: [2560, 29000, 1, 71] + - Exact: [2560, 29000, 1, 73] + - Exact: [2560, 29000, 1, 74] + - Exact: [2560, 29000, 1, 75] + - Exact: [2560, 29000, 1, 77] + - Exact: [2560, 29000, 1, 78] + - Exact: [2560, 29000, 1, 80] + - Exact: [2560, 29000, 1, 81] + - Exact: [2560, 29000, 1, 82] + - Exact: [2560, 29000, 1, 83] + - Exact: [2560, 29000, 1, 84] + - Exact: [2560, 29000, 1, 88] + - Exact: [2560, 29000, 1, 89] + - Exact: [2560, 29000, 1, 90] + - Exact: [2560, 29000, 1, 92] + - Exact: [2560, 29000, 1, 95] + - Exact: [2560, 29000, 1, 98] + - Exact: [2560, 4096, 1, 1024] + - Exact: [4096, 2560, 1, 1024] + - Exact: [1024, 3072, 1, 32768] + - Exact: [1024, 4096, 1, 32768] + - Exact: [1024, 50304, 1, 32768] + - Exact: [4096, 1024, 1, 32768] + - Exact: [1024, 128, 24, 1024] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 256, 1, 33536] + - Exact: [1024, 1024, 1, 9520] + - Exact: [1024, 1024, 1, 10200] + - Exact: [1024, 256, 1, 21248] + - Exact: [1024, 256, 1, 21504] + - Exact: [1024, 256, 1, 22016] + - Exact: [1024, 256, 1, 28672] + - Exact: [256, 2560, 1, 8976] + 
- Exact: [256, 2816, 1, 8976] + - Exact: [256, 3328, 1, 8976] + - Exact: [256, 3584, 1, 8976] + - Exact: [256, 3840, 1, 8976] + - Exact: [256, 4096, 1, 8976] + - Exact: [256, 4352, 1, 8976] + - Exact: [1024, 1024, 1, 32768] + - Exact: [1024, 512, 1, 32768] + - Exact: [479, 1024, 1, 32768] + - Exact: [512, 256, 1, 55296] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 1024, 1, 9600] + - Exact: [1024, 1024, 1, 10064] + - Exact: [1024, 1024, 1, 10080] + - Exact: [1024, 1024, 1, 9216] + - Exact: [480, 1024, 1, 32768] + - Exact: [1024, 1024, 1, 16384] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 10496] + - Exact: [1024, 1024, 1, 10224] + - Exact: [1024, 1024, 1, 10192] + - Exact: [1024, 1024, 1, 10208] + - Exact: [1024, 1024, 1, 10184] + - Exact: [1024, 1024, 1, 10120] + - Exact: [1024, 1024, 1, 10152] + - Exact: [1024, 1024, 1, 12288] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 512] + - Exact: [1024, 1024, 1, 200] + - Exact: [1024, 1024, 1, 4096] + - Exact: [1024, 1024, 1, 2048] + - Exact: [768, 768, 1, 16] + - Exact: [768, 768, 1, 320] + - Exact: [768, 768, 1, 4096] + - Exact: [768, 768, 1, 32] + - Exact: [768, 768, 1, 640] + - Exact: [768, 768, 1, 64] + - Exact: [768, 768, 1, 1280] + - Exact: [1024, 1024, 1, 3072] + - Exact: 
[1024, 1024, 1, 120] + - Exact: [1024, 1024, 1, 1] + - Exact: [1024, 1024, 1, 20] + - Exact: [1024, 1024, 1, 4] + - Exact: [1024, 1024, 1, 6] + - Exact: [1024, 1024, 1, 80] + - Exact: [128, 64, 512, 128] + - Exact: [512, 64, 64, 512] + - Exact: [64, 64, 768, 64] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [448, 2944, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - 
Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 
128] + - Exact: [128, 5056, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 128, 1, 1280] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [64, 6784, 1, 1280] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [1024, 1024, 1, 1024] + - Exact: [448, 1024, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [64, 6784, 1, 128] + - Exact: [5888, 64, 1, 1280] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [64, 5056, 1, 128] + - Exact: [5888, 64, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 
128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [3136, 64, 128, 64] + - Exact: [3136, 64, 128, 256] + - Exact: [3136, 64, 256, 256] + - Exact: [3136, 64, 256, 64] + - Exact: [64, 1536, 64, 384] + - Exact: [64, 1536, 64, 256] + - Exact: [64, 92, 688, 92] + - Exact: [1024, 1024, 1, 3975] + - Exact: [64, 123, 528, 123] + - Exact: [64, 102, 624, 100] + - Exact: [64, 112, 576, 111] + - Exact: [64, 102, 624, 102] + - Exact: [64, 133, 480, 135] + - Exact: [1024, 1024, 1, 4026] + - Exact: [64, 160, 400, 159] + - Exact: [1024, 1024, 1, 3780] + - Exact: [64, 228, 272, 232] + - Exact: [1024, 1024, 1, 3822] + - Exact: [64, 77, 816, 77] + - Exact: [64, 159, 400, 159] + - Exact: [64, 135, 480, 134] + - Exact: [64, 99, 624, 99] + - Exact: [1024, 1024, 1, 3942] + - Exact: [1024, 1024, 1, 3861] + - Exact: [1024, 1024, 1, 4000] + - Exact: [1024, 1024, 1, 3870] + - Exact: [64, 65, 992, 65] + - Exact: [64, 133, 480, 133] + - Exact: [64, 232, 272, 232] + - Exact: [64, 148, 432, 148] + - Exact: [1024, 1024, 1, 4032] + - Exact: [1024, 1024, 1, 4012] + - Exact: [1024, 1024, 1, 3681] + - Exact: [1024, 1024, 1, 3927] + - Exact: [1024, 1024, 1, 3894] + - Exact: [64, 132, 480, 135] + - Exact: [64, 135, 480, 135] + - Exact: [1024, 1024, 1, 3876] + - 
Exact: [64, 84, 752, 85] + - Exact: [1024, 1024, 1, 4050] + - Exact: [64, 132, 480, 132] + - Exact: [64, 99, 624, 102] + - Exact: [64, 143, 432, 148] + - Exact: [1024, 1024, 1, 3584] + - Exact: [64, 162, 400, 162] + - Exact: [64, 148, 432, 147] + - Exact: [1024, 1024, 1, 3960] + - Exact: [64, 123, 528, 122] + - Exact: [64, 102, 624, 101] + - Exact: [1024, 1024, 1, 3978] + - Exact: [64, 160, 400, 160] + - Exact: [1024, 1024, 1, 3995] + - Exact: [64, 132, 480, 134] + - Exact: [64, 111, 576, 111] + - Exact: [64, 100, 624, 100] + - Exact: [1024, 1024, 1, 3977] + - Exact: [64, 112, 576, 112] + - Exact: [64, 159, 400, 162] + - Exact: [64, 122, 528, 122] + - Exact: [64, 228, 272, 228] + - Exact: [1024, 1024, 1, 3925] + - Exact: [64, 93, 688, 93] + - Exact: [1024, 1024, 1, 3956] + - Exact: [1024, 1024, 1, 3976] + - Exact: [64, 111, 576, 112] + - Exact: [64, 100, 624, 102] + - Exact: [1024, 1024, 1, 3955] + - Exact: [1024, 1024, 1, 4030] + - Exact: [1024, 1024, 1, 3906] + - Exact: [64, 101, 624, 102] + - Exact: [1024, 1024, 1, 3796] + - Exact: [1024, 1024, 1, 3859] + - Exact: [64, 71, 896, 71] + - Exact: [1024, 1024, 1, 3860] + - Exact: [1024, 1024, 1, 4005] + - Exact: [64, 84, 752, 84] + - Exact: [1024, 1024, 1, 3990] + - Exact: [64, 134, 480, 134] + - Exact: [64, 78, 816, 78] + - Exact: [1024, 1024, 1, 3999] + - Exact: [1024, 1024, 1, 4020] + - Exact: [1024, 1024, 1, 3939] + - Exact: [64, 77, 816, 78] + - Exact: [1024, 1024, 1, 4059] + - Exact: [1024, 1024, 1, 3944] + - Exact: [64, 193, 320, 193] + - Exact: [1024, 1024, 1, 3720] + - Exact: [1024, 1024, 1, 3910] + - Exact: [64, 143, 432, 143] + - Exact: [64, 92, 688, 93] + - Exact: [64, 101, 624, 101] + - Exact: [1024, 1024, 1, 3969] + - Exact: [1024, 1024, 1, 3948] + - Exact: [1024, 1024, 1, 3996] + - Exact: [1024, 1024, 1, 3900] + - Exact: [1024, 1024, 1, 3640] + - Exact: [64, 147, 432, 147] + - Exact: [1024, 1024, 1, 3751] + - Exact: [64, 177, 352, 177] + - Exact: [64, 85, 752, 85] + - Exact: [1024, 1024, 1, 3712] + - 
Exact: [1024, 1024, 1, 128] + - Exact: [64, 256, 192, 256] + - Exact: [64, 128, 384, 128] + - Exact: [64, 192, 36, 25088] + - Exact: [128, 128, 64, 25] + - Exact: [64, 192, 64, 3200] + - Exact: [64, 128, 64, 23104] + - Exact: [128, 128, 64, 1600] + - Exact: [80, 192, 64, 4608] + - Exact: [64, 128, 36, 30] + - Exact: [64, 128, 64, 11552] + - Exact: [128, 192, 64, 946] + - Exact: [64, 192, 64, 12800] + - Exact: [224, 224, 64, 128] + - Exact: [128, 128, 64, 3360] + - Exact: [128, 128, 64, 420] + - Exact: [64, 128, 64, 361] + - Exact: [64, 128, 36, 53824] + - Exact: [128, 160, 36, 512] + - Exact: [147, 64, 36, 18816] + - Exact: [96, 128, 64, 946] + - Exact: [128, 128, 64, 50] + - Exact: [160, 224, 36, 128] + - Exact: [192, 224, 64, 1152] + - Exact: [128, 128, 36, 784] + - Exact: [96, 128, 64, 288] + - Exact: [128, 128, 64, 400] + - Exact: [128, 128, 64, 800] + - Exact: [96, 128, 36, 512] + - Exact: [96, 128, 64, 800] + - Exact: [192, 224, 64, 128] + - Exact: [128, 128, 64, 288] + - Exact: [96, 208, 36, 512] + - Exact: [64, 128, 36, 1568] + - Exact: [192, 192, 36, 512] + - Exact: [128, 128, 36, 512] + - Exact: [96, 208, 64, 1152] + - Exact: [128, 192, 64, 3200] + - Exact: [160, 160, 64, 288] + - Exact: [128, 128, 36, 440] + - Exact: [96, 128, 36, 1568] + - Exact: [112, 224, 36, 2048] + - Exact: [128, 128, 36, 7040] + - Exact: [128, 128, 36, 1568] + - Exact: [160, 224, 64, 128] + - Exact: [192, 224, 36, 2592] + - Exact: [64, 128, 64, 2888] + - Exact: [64, 128, 36, 480] + - Exact: [147, 64, 64, 9702] + - Exact: [64, 192, 64, 3698] + - Exact: [73, 192, 64, 10439] + - Exact: [128, 128, 36, 880] + - Exact: [192, 224, 36, 128] + - Exact: [64, 128, 36, 12544] + - Exact: [160, 160, 36, 512] + - Exact: [128, 128, 36, 3136] + - Exact: [112, 224, 36, 512] + - Exact: [128, 128, 36, 49] + - Exact: [112, 224, 64, 1152] + - Exact: [128, 192, 36, 1568] + - Exact: [128, 192, 36, 512] + - Exact: [192, 192, 64, 288] + - Exact: [96, 208, 64, 242] + - Exact: [64, 128, 64, 5776] + - Exact: 
[128, 192, 64, 288] + - Exact: [96, 128, 36, 6272] + - Exact: [96, 128, 64, 3200] + - Exact: [128, 192, 64, 800] + - Exact: [64, 128, 64, 10] + - Exact: [96, 208, 64, 288] + - Exact: [64, 128, 64, 160] + - Exact: [128, 128, 64, 1568] + - Exact: [112, 224, 64, 242] + - Exact: [160, 192, 64, 288] + - Exact: [128, 160, 64, 288] + - Exact: [128, 128, 64, 210] + - Exact: [73, 192, 36, 23360] + - Exact: [160, 192, 36, 512] + - Exact: [64, 128, 64, 722] + - Exact: [112, 224, 64, 288] + - Exact: [64, 192, 36, 6272] + - Exact: [64, 128, 36, 6272] + - Exact: [128, 128, 36, 3200] + - Exact: [128, 128, 36, 392] + - Exact: [80, 192, 36, 10368] + - Exact: [224, 224, 36, 128] + - Exact: [64, 128, 36, 784] + - Exact: [128, 128, 64, 200] + - Exact: [5329, 64, 32, 80] + - Exact: [64, 2048, 32, 384] + - Exact: [289, 1792, 1, 320] + - Exact: [1001, 1024, 1, 32] + - Exact: [784, 400, 1, 32] + - Exact: [64, 1536, 32, 256] + - Exact: [289, 2592, 1, 384] + - Exact: [64, 2048, 32, 448] + - Exact: [289, 2016, 1, 256] + - Exact: [64, 1536, 32, 384] + - Exact: [64, 1280, 32, 320] + - Exact: [289, 3456, 1, 384] + - Exact: [64, 1280, 32, 384] + - Exact: [729, 1600, 1, 192] + - Exact: [289, 1344, 1, 192] + - Exact: [64, 2048, 32, 320] + - Exact: [64, 1280, 32, 448] + - Exact: [64, 1280, 32, 192] + - Exact: [289, 1792, 1, 256] + - Exact: [64, 2048, 32, 192] + - Exact: [5329, 64, 128, 80] + - Exact: [64, 1280, 128, 448] + - Exact: [64, 2048, 128, 192] + - Exact: [64, 1280, 128, 384] + - Exact: [64, 1280, 128, 320] + - Exact: [64, 1280, 128, 192] + - Exact: [256, 4096, 1, 6400] + - Exact: [512, 2048, 1, 3427] + - Exact: [512, 2048, 1, 3552] + - Exact: [512, 2048, 1, 3840] + - Exact: [2048, 512, 1, 3427] + - Exact: [2048, 512, 1, 3452] + - Exact: [2048, 512, 1, 3472] + - Exact: [2048, 512, 1, 3475] + - Exact: [64, 64, 496, 64] + - Exact: [64, 64, 496, 65] + - Exact: [64, 65, 496, 65] + - Exact: [64, 71, 448, 71] + - Exact: [64, 77, 408, 77] + - Exact: [64, 77, 408, 78] + - Exact: [64, 78, 408, 78] + 
- Exact: [64, 85, 376, 85] + - Exact: [64, 93, 344, 93] + - Exact: [64, 112, 288, 112] + - Exact: [64, 122, 264, 122] + - Exact: [64, 123, 264, 122] + - Exact: [64, 123, 264, 123] + - Exact: [64, 134, 240, 134] + - Exact: [64, 135, 240, 134] + - Exact: [64, 135, 240, 135] + - Exact: [64, 1280, 64, 192] + - Exact: [64, 1280, 64, 320] + - Exact: [64, 1280, 64, 384] + - Exact: [64, 1280, 64, 448] + - Exact: [64, 2048, 64, 192] + - Exact: [64, 2048, 64, 320] + - Exact: [64, 2048, 64, 384] + - Exact: [64, 2048, 64, 448] + - Exact: [3136, 64, 64, 64] + - Exact: [3136, 64, 64, 256] + - Exact: [5329, 64, 64, 80] + - Exact: [257, 4096, 1, 1024] + - Exact: [512, 2048, 1, 2790] + - Exact: [512, 2048, 1, 2864] + - Exact: [512, 2048, 1, 3092] + - Exact: [512, 2048, 1, 3113] + - Exact: [512, 2048, 1, 3137] + - Exact: [512, 2048, 1, 3165] + - Exact: [512, 2048, 1, 3166] + - Exact: [512, 2048, 1, 3194] + - Exact: [512, 2048, 1, 3219] + - Exact: [512, 2048, 1, 3222] + - Exact: [512, 2048, 1, 3234] + - Exact: [512, 2048, 1, 3237] + - Exact: [512, 2048, 1, 3242] + - Exact: [512, 2048, 1, 3246] + - Exact: [512, 2048, 1, 3249] + - Exact: [512, 2048, 1, 3251] + - Exact: [512, 2048, 1, 3257] + - Exact: [512, 2048, 1, 3262] + - Exact: [512, 2048, 1, 3268] + - Exact: [512, 2048, 1, 3282] + - Exact: [512, 2048, 1, 3286] + - Exact: [512, 2048, 1, 3287] + - Exact: [512, 2048, 1, 3293] + - Exact: [512, 2048, 1, 3297] + - Exact: [512, 2048, 1, 3307] + - Exact: [512, 2048, 1, 3314] + - Exact: [512, 2048, 1, 3315] + - Exact: [512, 2048, 1, 3319] + - Exact: [512, 2048, 1, 3322] + - Exact: [512, 2048, 1, 3323] + - Exact: [512, 2048, 1, 3324] + - Exact: [512, 2048, 1, 3325] + - Exact: [512, 2048, 1, 3327] + - Exact: [512, 2048, 1, 3329] + - Exact: [512, 2048, 1, 3332] + - Exact: [512, 2048, 1, 3336] + - Exact: [512, 2048, 1, 3339] + - Exact: [512, 2048, 1, 3342] + - Exact: [512, 2048, 1, 3344] + - Exact: [512, 2048, 1, 3358] + - Exact: [512, 2048, 1, 3360] + - Exact: [512, 2048, 1, 3364] + - Exact: 
[512, 2048, 1, 3365] + - Exact: [512, 2048, 1, 3369] + - Exact: [512, 2048, 1, 3371] + - Exact: [512, 2048, 1, 3374] + - Exact: [512, 2048, 1, 3376] + - Exact: [512, 2048, 1, 3377] + - Exact: [512, 2048, 1, 3378] + - Exact: [512, 2048, 1, 3381] + - Exact: [512, 2048, 1, 3382] + - Exact: [512, 2048, 1, 3383] + - Exact: [512, 2048, 1, 3384] + - Exact: [512, 2048, 1, 3385] + - Exact: [512, 2048, 1, 3386] + - Exact: [512, 2048, 1, 3388] + - Exact: [512, 2048, 1, 3390] + - Exact: [512, 2048, 1, 3391] + - Exact: [512, 2048, 1, 3396] + - Exact: [512, 2048, 1, 3399] + - Exact: [512, 2048, 1, 3402] + - Exact: [512, 2048, 1, 3410] + - Exact: [512, 2048, 1, 3412] + - Exact: [512, 2048, 1, 3414] + - Exact: [512, 2048, 1, 3415] + - Exact: [512, 2048, 1, 3418] + - Exact: [512, 2048, 1, 3420] + - Exact: [512, 2048, 1, 3422] + - Exact: [512, 2048, 1, 3425] + - Exact: [512, 2048, 1, 3426] + - Exact: [512, 2048, 1, 3428] + - Exact: [512, 2048, 1, 3430] + - Exact: [512, 2048, 1, 3431] + - Exact: [512, 2048, 1, 3432] + - Exact: [512, 2048, 1, 3438] + - Exact: [512, 2048, 1, 3439] + - Exact: [512, 2048, 1, 3440] + - Exact: [512, 2048, 1, 3443] + - Exact: [512, 2048, 1, 3445] + - Exact: [512, 2048, 1, 3447] + - Exact: [512, 2048, 1, 3448] + - Exact: [512, 2048, 1, 3450] + - Exact: [512, 2048, 1, 3451] + - Exact: [512, 2048, 1, 3452] + - Exact: [512, 2048, 1, 3453] + - Exact: [512, 2048, 1, 3455] + - Exact: [512, 2048, 1, 3456] + - Exact: [512, 2048, 1, 3457] + - Exact: [512, 2048, 1, 3458] + - Exact: [512, 2048, 1, 3459] + - Exact: [512, 2048, 1, 3460] + - Exact: [512, 2048, 1, 3461] + - Exact: [512, 2048, 1, 3462] + - Exact: [512, 2048, 1, 3466] + - Exact: [512, 2048, 1, 3467] + - Exact: [512, 2048, 1, 3468] + - Exact: [512, 2048, 1, 3470] + - Exact: [512, 2048, 1, 3471] + - Exact: [512, 2048, 1, 3472] + - Exact: [512, 2048, 1, 3475] + - Exact: [512, 2048, 1, 3476] + - Exact: [512, 2048, 1, 3477] + - Exact: [512, 2048, 1, 3478] + - Exact: [512, 2048, 1, 3479] + - Exact: [512, 2048, 1, 
3480] + - Exact: [512, 2048, 1, 3481] + - Exact: [512, 2048, 1, 3483] + - Exact: [512, 2048, 1, 3484] + - Exact: [512, 2048, 1, 3487] + - Exact: [512, 2048, 1, 3489] + - Exact: [512, 2048, 1, 3490] + - Exact: [512, 2048, 1, 3491] + - Exact: [512, 2048, 1, 3493] + - Exact: [512, 2048, 1, 3494] + - Exact: [512, 2048, 1, 3495] + - Exact: [512, 2048, 1, 3497] + - Exact: [512, 2048, 1, 3498] + - Exact: [512, 2048, 1, 3499] + - Exact: [512, 2048, 1, 3501] + - Exact: [512, 2048, 1, 3503] + - Exact: [512, 2048, 1, 3507] + - Exact: [512, 2048, 1, 3508] + - Exact: [512, 2048, 1, 3509] + - Exact: [512, 2048, 1, 3511] + - Exact: [512, 2048, 1, 3514] + - Exact: [512, 2048, 1, 3515] + - Exact: [512, 2048, 1, 3517] + - Exact: [512, 2048, 1, 3518] + - Exact: [512, 2048, 1, 3519] + - Exact: [512, 2048, 1, 3520] + - Exact: [512, 2048, 1, 3523] + - Exact: [512, 2048, 1, 3528] + - Exact: [512, 2048, 1, 3529] + - Exact: [512, 2048, 1, 3530] + - Exact: [512, 2048, 1, 3532] + - Exact: [512, 2048, 1, 3533] + - Exact: [512, 2048, 1, 3534] + - Exact: [512, 2048, 1, 3538] + - Exact: [512, 2048, 1, 3539] + - Exact: [512, 2048, 1, 3541] + - Exact: [512, 2048, 1, 3547] + - Exact: [512, 2048, 1, 3548] + - Exact: [512, 2048, 1, 3564] + - Exact: [512, 2048, 1, 3575] + - Exact: [512, 2048, 1, 3598] + - Exact: [512, 2048, 1, 3599] + - Exact: [512, 2048, 1, 3608] + - Exact: [512, 2048, 1, 3780] + - Exact: [512, 2048, 1, 3796] + - Exact: [512, 2048, 1, 3822] + - Exact: [512, 2048, 1, 3859] + - Exact: [512, 2048, 1, 3870] + - Exact: [512, 2048, 1, 3876] + - Exact: [512, 2048, 1, 3906] + - Exact: [512, 2048, 1, 3910] + - Exact: [512, 2048, 1, 3925] + - Exact: [512, 2048, 1, 3942] + - Exact: [512, 2048, 1, 3944] + - Exact: [512, 2048, 1, 3955] + - Exact: [512, 2048, 1, 3968] + - Exact: [512, 2048, 1, 3969] + - Exact: [512, 2048, 1, 3976] + - Exact: [512, 2048, 1, 3977] + - Exact: [512, 2048, 1, 3978] + - Exact: [512, 2048, 1, 3990] + - Exact: [512, 2048, 1, 3995] + - Exact: [512, 2048, 1, 3996] + - 
Exact: [512, 2048, 1, 3999] + - Exact: [512, 2048, 1, 4005] + - Exact: [512, 2048, 1, 4012] + - Exact: [512, 2048, 1, 4020] + - Exact: [512, 2048, 1, 4026] + - Exact: [512, 2048, 1, 4030] + - Exact: [512, 2048, 1, 4032] + - Exact: [2048, 512, 1, 2790] + - Exact: [2048, 512, 1, 2864] + - Exact: [2048, 512, 1, 3092] + - Exact: [2048, 512, 1, 3113] + - Exact: [2048, 512, 1, 3137] + - Exact: [2048, 512, 1, 3165] + - Exact: [2048, 512, 1, 3166] + - Exact: [2048, 512, 1, 3194] + - Exact: [2048, 512, 1, 3219] + - Exact: [2048, 512, 1, 3222] + - Exact: [2048, 512, 1, 3234] + - Exact: [2048, 512, 1, 3237] + - Exact: [2048, 512, 1, 3242] + - Exact: [2048, 512, 1, 3246] + - Exact: [2048, 512, 1, 3249] + - Exact: [2048, 512, 1, 3251] + - Exact: [2048, 512, 1, 3257] + - Exact: [2048, 512, 1, 3262] + - Exact: [2048, 512, 1, 3268] + - Exact: [2048, 512, 1, 3282] + - Exact: [2048, 512, 1, 3286] + - Exact: [2048, 512, 1, 3287] + - Exact: [2048, 512, 1, 3293] + - Exact: [2048, 512, 1, 3297] + - Exact: [2048, 512, 1, 3307] + - Exact: [2048, 512, 1, 3314] + - Exact: [2048, 512, 1, 3315] + - Exact: [2048, 512, 1, 3319] + - Exact: [2048, 512, 1, 3322] + - Exact: [2048, 512, 1, 3323] + - Exact: [2048, 512, 1, 3324] + - Exact: [2048, 512, 1, 3325] + - Exact: [2048, 512, 1, 3327] + - Exact: [2048, 512, 1, 3329] + - Exact: [2048, 512, 1, 3332] + - Exact: [2048, 512, 1, 3336] + - Exact: [2048, 512, 1, 3339] + - Exact: [2048, 512, 1, 3342] + - Exact: [2048, 512, 1, 3344] + - Exact: [2048, 512, 1, 3358] + - Exact: [2048, 512, 1, 3360] + - Exact: [2048, 512, 1, 3364] + - Exact: [2048, 512, 1, 3365] + - Exact: [2048, 512, 1, 3369] + - Exact: [2048, 512, 1, 3371] + - Exact: [2048, 512, 1, 3374] + - Exact: [2048, 512, 1, 3376] + - Exact: [2048, 512, 1, 3377] + - Exact: [2048, 512, 1, 3378] + - Exact: [2048, 512, 1, 3381] + - Exact: [2048, 512, 1, 3382] + - Exact: [2048, 512, 1, 3383] + - Exact: [2048, 512, 1, 3384] + - Exact: [2048, 512, 1, 3385] + - Exact: [2048, 512, 1, 3386] + - Exact: [2048, 
512, 1, 3388] + - Exact: [2048, 512, 1, 3390] + - Exact: [2048, 512, 1, 3391] + - Exact: [2048, 512, 1, 3396] + - Exact: [2048, 512, 1, 3399] + - Exact: [2048, 512, 1, 3402] + - Exact: [2048, 512, 1, 3410] + - Exact: [2048, 512, 1, 3412] + - Exact: [2048, 512, 1, 3414] + - Exact: [2048, 512, 1, 3415] + - Exact: [2048, 512, 1, 3418] + - Exact: [2048, 512, 1, 3420] + - Exact: [2048, 512, 1, 3422] + - Exact: [2048, 512, 1, 3425] + - Exact: [2048, 512, 1, 3426] + - Exact: [2048, 512, 1, 3428] + - Exact: [2048, 512, 1, 3430] + - Exact: [2048, 512, 1, 3431] + - Exact: [2048, 512, 1, 3432] + - Exact: [2048, 512, 1, 3438] + - Exact: [2048, 512, 1, 3439] + - Exact: [2048, 512, 1, 3440] + - Exact: [2048, 512, 1, 3443] + - Exact: [2048, 512, 1, 3445] + - Exact: [2048, 512, 1, 3447] + - Exact: [2048, 512, 1, 3448] + - Exact: [2048, 512, 1, 3450] + - Exact: [2048, 512, 1, 3451] + - Exact: [2048, 512, 1, 3453] + - Exact: [2048, 512, 1, 3455] + - Exact: [2048, 512, 1, 3456] + - Exact: [2048, 512, 1, 3457] + - Exact: [2048, 512, 1, 3458] + - Exact: [2048, 512, 1, 3459] + - Exact: [2048, 512, 1, 3460] + - Exact: [2048, 512, 1, 3461] + - Exact: [2048, 512, 1, 3462] + - Exact: [2048, 512, 1, 3466] + - Exact: [2048, 512, 1, 3467] + - Exact: [2048, 512, 1, 3468] + - Exact: [2048, 512, 1, 3470] + - Exact: [2048, 512, 1, 3471] + - Exact: [2048, 512, 1, 3476] + - Exact: [2048, 512, 1, 3477] + - Exact: [2048, 512, 1, 3478] + - Exact: [2048, 512, 1, 3479] + - Exact: [2048, 512, 1, 3480] + - Exact: [2048, 512, 1, 3481] + - Exact: [2048, 512, 1, 3483] + - Exact: [2048, 512, 1, 3484] + - Exact: [2048, 512, 1, 3487] + - Exact: [2048, 512, 1, 3489] + - Exact: [2048, 512, 1, 3490] + - Exact: [2048, 512, 1, 3491] + - Exact: [2048, 512, 1, 3493] + - Exact: [2048, 512, 1, 3494] + - Exact: [2048, 512, 1, 3495] + - Exact: [2048, 512, 1, 3497] + - Exact: [2048, 512, 1, 3498] + - Exact: [2048, 512, 1, 3499] + - Exact: [2048, 512, 1, 3501] + - Exact: [2048, 512, 1, 3503] + - Exact: [2048, 512, 1, 3507] + 
- Exact: [2048, 512, 1, 3508] + - Exact: [2048, 512, 1, 3509] + - Exact: [2048, 512, 1, 3511] + - Exact: [2048, 512, 1, 3514] + - Exact: [2048, 512, 1, 3515] + - Exact: [2048, 512, 1, 3517] + - Exact: [2048, 512, 1, 3518] + - Exact: [2048, 512, 1, 3519] + - Exact: [2048, 512, 1, 3520] + - Exact: [2048, 512, 1, 3523] + - Exact: [2048, 512, 1, 3528] + - Exact: [2048, 512, 1, 3529] + - Exact: [2048, 512, 1, 3530] + - Exact: [2048, 512, 1, 3532] + - Exact: [2048, 512, 1, 3533] + - Exact: [2048, 512, 1, 3534] + - Exact: [2048, 512, 1, 3538] + - Exact: [2048, 512, 1, 3539] + - Exact: [2048, 512, 1, 3541] + - Exact: [2048, 512, 1, 3547] + - Exact: [2048, 512, 1, 3548] + - Exact: [2048, 512, 1, 3552] + - Exact: [2048, 512, 1, 3564] + - Exact: [2048, 512, 1, 3575] + - Exact: [2048, 512, 1, 3598] + - Exact: [2048, 512, 1, 3599] + - Exact: [2048, 512, 1, 3608] + - Exact: [2048, 512, 1, 3780] + - Exact: [2048, 512, 1, 3796] + - Exact: [2048, 512, 1, 3822] + - Exact: [2048, 512, 1, 3840] + - Exact: [2048, 512, 1, 3859] + - Exact: [2048, 512, 1, 3870] + - Exact: [2048, 512, 1, 3876] + - Exact: [2048, 512, 1, 3906] + - Exact: [2048, 512, 1, 3910] + - Exact: [2048, 512, 1, 3925] + - Exact: [2048, 512, 1, 3942] + - Exact: [2048, 512, 1, 3944] + - Exact: [2048, 512, 1, 3955] + - Exact: [2048, 512, 1, 3968] + - Exact: [2048, 512, 1, 3969] + - Exact: [2048, 512, 1, 3976] + - Exact: [2048, 512, 1, 3977] + - Exact: [2048, 512, 1, 3978] + - Exact: [2048, 512, 1, 3990] + - Exact: [2048, 512, 1, 3995] + - Exact: [2048, 512, 1, 3996] + - Exact: [2048, 512, 1, 3999] + - Exact: [2048, 512, 1, 4005] + - Exact: [2048, 512, 1, 4012] + - Exact: [2048, 512, 1, 4020] + - Exact: [2048, 512, 1, 4026] + - Exact: [2048, 512, 1, 4030] + - Exact: [2048, 512, 1, 4032] + - Exact: [64, 102, 312, 102] + - Exact: [64, 512, 16, 512] + - Exact: [64, 512, 96, 512] + - Exact: [1024, 1024, 1, 3840] + - Exact: [1024, 1024, 1, 3968] + - Exact: [1024, 1024, 1, 7200] + - Exact: [1024, 1024, 1, 8160] + - Exact: [768, 
768, 1, 384] + - Exact: [768, 384, 1, 384] + - Exact: [1152, 576, 1, 384] + - Exact: [384, 768, 1, 384] + - Exact: [1024, 1024, 1, 32] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [1024, 1024, 1, 1600] + - Exact: [2048, 256, 1, 1024] + - Exact: [256, 1280, 1, 8976] + - Exact: [512, 2048, 1, 256] + - Exact: [560, 1024, 1, 1600] + - Exact: [560, 1024, 1, 200] + - Exact: [1024, 1024, 1, 960] + - Exact: [2304, 128, 1, 128] + - Exact: [2688, 128, 1, 128] + - Exact: [3072, 128, 1, 128] + - Exact: [3456, 128, 1, 128] + - Exact: [3840, 128, 1, 128] + - Exact: [4224, 128, 1, 128] + - Exact: [4608, 128, 1, 128] + - Exact: [4992, 128, 1, 128] + - Exact: [5376, 128, 1, 128] + - Exact: [5760, 128, 1, 128] + - Exact: [6144, 128, 1, 128] + - Exact: [6528, 128, 1, 128] + - Exact: [6912, 128, 1, 128] + - Exact: [7296, 128, 1, 128] + - Exact: [7680, 128, 1, 128] + - Exact: [8064, 128, 1, 128] + - Exact: [8448, 128, 1, 128] + - Exact: [8832, 128, 1, 128] + - Exact: [2304, 128, 1, 256] + - Exact: [2688, 128, 1, 256] + - Exact: [3072, 128, 1, 256] + - Exact: [3456, 128, 1, 256] + - Exact: [3840, 128, 1, 256] + - Exact: [4224, 128, 1, 256] + - Exact: [4608, 128, 1, 256] + - Exact: [4992, 128, 1, 256] + - Exact: [5376, 128, 1, 256] + - Exact: [5760, 128, 1, 256] + - Exact: [6144, 128, 1, 256] + - Exact: [6528, 128, 1, 256] + - Exact: [6912, 128, 1, 256] + - Exact: [7296, 128, 1, 256] + - Exact: [7680, 128, 1, 256] + - Exact: [8064, 128, 1, 256] + - Exact: [8448, 128, 1, 256] + - Exact: [8832, 128, 1, 256] + - Exact: [768, 768, 1, 768] + - Exact: [384, 1536, 1, 384] + - Exact: [384, 1920, 1, 384] + - Exact: [384, 2304, 1, 384] + - Exact: [64, 192, 64, 1280] + - Exact: [64, 320, 64, 1280] + - Exact: [64, 384, 64, 1280] + - Exact: [64, 448, 64, 1280] + - Exact: [64, 192, 64, 2048] + - Exact: [64, 320, 64, 2048] + - Exact: [64, 384, 64, 2048] + - Exact: [64, 448, 64, 2048] + - Exact: [1225, 64, 64, 192] + - Exact: [1225, 64, 64, 256] + - Exact: [1225, 64, 64, 288] 
+ - Exact: [5329, 80, 64, 64] + - Exact: [64, 192, 32, 1280] + - Exact: [64, 320, 32, 1280] + - Exact: [64, 384, 32, 1280] + - Exact: [64, 448, 32, 1280] + - Exact: [64, 192, 32, 2048] + - Exact: [64, 320, 32, 2048] + - Exact: [64, 384, 32, 2048] + - Exact: [64, 448, 32, 2048] + - Exact: [1225, 64, 32, 192] + - Exact: [1225, 64, 32, 256] + - Exact: [1225, 64, 32, 288] + - Exact: [5329, 80, 32, 64] + - Exact: [289, 128, 32, 768] + - Exact: [289, 160, 32, 768] + - Exact: [289, 192, 32, 768] + - Exact: [3136, 64, 32, 64] + - Exact: [3136, 64, 32, 256] + - Exact: [196, 256, 32, 1024] + - Exact: [1024, 1024, 1, 6912] + - Exact: [1024, 512, 1, 4096] + - Exact: [480, 1024, 1, 4096] + - Exact: [1024, 512, 1, 6912] + - Exact: [480, 1024, 1, 6912] + - Exact: [100, 512, 120, 128] + - Exact: [100, 512, 18, 128] + - Exact: [100, 512, 19, 128] + - Exact: [1444, 576, 1, 128] + - Exact: [173280, 64, 1, 128] + - Exact: [25992, 64, 1, 128] + - Exact: [27436, 64, 1, 128] + - Exact: [361, 2304, 1, 512] + - Exact: [960, 1024, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 1024, 1, 77] + - Exact: [64, 128, 160, 128] + - Exact: [1024, 1024, 1, 10] + - Exact: [64, 128, 624, 128] + - Exact: [1024, 1024, 1, 39] + - Exact: [1024, 1024, 1, 780] + - Exact: [1024, 1024, 1, 4992] + - Exact: [1024, 1024, 1, 308] + - Exact: [64, 128, 640, 128] + - Exact: [1024, 1024, 1, 40] + - Exact: [1024, 1024, 1, 800] + - Exact: [1024, 1024, 1, 5120] + - Exact: [64, 128, 656, 128] + - Exact: [1024, 1024, 1, 41] + - Exact: [1024, 1024, 1, 820] + - Exact: [1024, 1024, 1, 5248] + - Exact: [64, 512, 80, 512] + - Exact: [1024, 1024, 1, 5] + - Exact: [1024, 1024, 1, 385] + - Exact: [1024, 1024, 1, 2560] + - Exact: [1024, 1024, 1, 462] + - Exact: [64, 128, 128, 128] + - Exact: [1024, 1024, 1, 8] + - Exact: [1024, 1024, 1, 160] + - Exact: [64, 128, 144, 128] + - Exact: [1024, 1024, 1, 9] + - Exact: [1024, 1024, 1, 180] + - Exact: [1024, 1024, 1, 1152] + - Exact: [1024, 1024, 1, 6528] + - Exact: [1024, 
1024, 1, 7104] + - Exact: [1024, 1024, 1, 8064] + - Exact: [2048, 512, 1, 1] + - Exact: [1024, 1024, 1, 16] + - Exact: [512, 64, 256, 512] + - Exact: [64, 512, 256, 512] + - Exact: [512, 64, 128, 512] + - Exact: [64, 512, 128, 512] + - Exact: [512, 64, 40, 512] + - Exact: [64, 512, 40, 512] + - Exact: [1024, 96, 64, 1024] + - Exact: [96, 1024, 64, 1024] + - Exact: [1024, 96, 128, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [1024, 64, 256, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [1024, 64, 32, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [1024, 64, 64, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [1024, 64, 128, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [1024, 1024, 1, 64] + - Exact: [64, 128, 1024, 128] + - Exact: [128, 64, 1024, 128] + - Exact: [1024, 1024, 1, 3456] + - Exact: [1024, 1024, 1, 864] + - Exact: [1024, 512, 1, 3456] + - Exact: [1024, 512, 1, 864] + - Exact: [256, 3456, 1, 1] + - Exact: [256, 4096, 1, 1] + - Exact: [480, 1024, 1, 3456] + - Exact: [480, 1024, 1, 864] + - Exact: [64, 128, 1280, 128] + - Exact: [128, 64, 1280, 128] + - Exact: [1024, 1024, 1, 82] + - Exact: [128, 64, 1312, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [1024, 1024, 1, 12] + - Exact: [1024, 1024, 1, 6144] + - Exact: [64, 512, 192, 512] + - Exact: [512, 64, 192, 512] + - Exact: [3136, 64, 64, 128] + - Exact: [3136, 64, 32, 128] + - Exact: [196, 2304, 1, 256] + - Exact: [784, 1152, 1, 128] + - Exact: [64, 128, 2048, 128] + - Exact: [128, 64, 2048, 128] + - Exact: [128, 64, 1536, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [1024, 1024, 1, 96] + - Exact: [92416, 64, 25, 64] + - Exact: [50176, 64, 36, 64] + - Exact: [36864, 64, 49, 64] + - Exact: [25600, 64, 64, 64] + - Exact: [64, 128, 192, 128] + - Exact: [128, 64, 192, 128] + - Exact: [768, 768, 1, 2048] + - Exact: [64, 384, 144, 384] + - Exact: [384, 64, 144, 384] + - Exact: [768, 768, 1, 4608] + - Exact: [64, 512, 48, 512] + - Exact: [512, 64, 48, 512] + - Exact: [64, 128, 256, 128] + - 
Exact: [128, 64, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [384, 64, 192, 384] + - Exact: [1024, 1024, 1, 4608] + - Exact: [768, 512, 2, 2048] + - Exact: [713, 512, 2, 2048] + - Exact: [672, 512, 2, 2048] + - Exact: [660, 512, 2, 2048] + - Exact: [726, 512, 2, 2048] + - Exact: [1008, 512, 2, 2048] + - Exact: [748, 512, 2, 2048] + - Exact: [864, 512, 2, 2048] + - Exact: [888, 512, 2, 2048] + - Exact: [805, 512, 2, 2048] + - Exact: [850, 512, 2, 2048] + - Exact: [840, 512, 2, 2048] + - Exact: [850, 256, 2, 3] + - Exact: [805, 256, 2, 12] + - Exact: [805, 256, 2, 3] + - Exact: [850, 256, 2, 12] + - Exact: [768, 256, 2, 12] + - Exact: [864, 256, 2, 3] + - Exact: [950, 256, 2, 12] + - Exact: [864, 256, 2, 12] + - Exact: [950, 256, 2, 3] + - Exact: [768, 256, 2, 3] + - Exact: [1024, 320, 1, 1024] + - Exact: [96, 1024, 160, 1024] + - Exact: [1024, 96, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [1024, 96, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [1024, 96, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [1024, 96, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [1024, 96, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [1024, 96, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [1024, 96, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [1024, 96, 32, 1024] + - Exact: [512, 64, 320, 512] + - Exact: [64, 512, 320, 512] + - Exact: [512, 64, 80, 512] + - Exact: [1024, 64, 512, 1024] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - 
StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 64, 13216] + - Exact: [64, 96, 36, 10368] + - Exact: [64, 64, 36, 12544] + - Exact: [64, 64, 36, 11552] + - Exact: [1024, 256, 1, 10496] + - Exact: [1024, 256, 1, 11520] + - Exact: [1024, 256, 1, 12032] + - Exact: [1024, 256, 1, 13568] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 14848] + - Exact: [1024, 256, 1, 15104] + - Exact: [1024, 256, 1, 15872] + - Exact: [1024, 256, 1, 16128] + - Exact: [1024, 256, 1, 17152] + - Exact: [1024, 256, 1, 17408] + - Exact: [1024, 256, 1, 18944] + - Exact: [1024, 256, 1, 19712] + - Exact: [1024, 256, 1, 19968] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 256, 1, 8448] + - Exact: [1024, 256, 1, 9728] + - Exact: [1024, 256, 1, 9984] + - Exact: [512, 256, 1, 32768] + - Exact: [256, 128, 1, 55296] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 200] + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 
128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [4288, 64, 1, 1280] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [64, 2944, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [128, 1408, 1, 256] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1408, 128, 1, 128] + - Exact: [1024, 256, 1, 128] + - Exact: [2944, 64, 1, 128] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 
2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 2944, 1, 1280] + - Exact: [256, 704, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [448, 448, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [49, 512, 128, 2048] + - Exact: [49, 2048, 128, 512] + - Exact: [49, 2048, 256, 512] + - Exact: [49, 512, 256, 2048] + - Exact: [64, 38, 1680, 38] + - Exact: [64, 59, 1088, 59] + - Exact: [64, 32, 1984, 32] + - Exact: [64, 54, 1184, 54] + - Exact: [64, 49, 1296, 49] + - Exact: [64, 45, 1424, 45] + - Exact: [64, 35, 1808, 35] + - Exact: [64, 41, 1552, 41] + - Exact: [64, 64, 36, 3136] + - Exact: [64, 64, 64, 826] + - Exact: [64, 64, 64, 1600] + - Exact: [64, 96, 64, 288] + - Exact: [96, 96, 36, 1568] + - Exact: [96, 96, 36, 2592] + - Exact: [64, 96, 64, 800] + - Exact: [35, 96, 36, 8960] + - Exact: [32, 64, 36, 43808] + - Exact: [64, 64, 64, 81] + - Exact: [64, 96, 36, 512] + - Exact: [64, 64, 64, 3200] + - Exact: [64, 64, 36, 3520] + - Exact: [64, 64, 64, 5408] + - Exact: [35, 96, 36, 13440] + - Exact: [96, 96, 64, 1152] + - Exact: [32, 64, 36, 90] + - Exact: [64, 64, 64, 800] + - Exact: [64, 64, 36, 1568] + - 
Exact: [64, 64, 36, 196] + - Exact: [35, 96, 64, 4235] + - Exact: [149, 32, 36, 19072] + - Exact: [64, 96, 36, 1568] + - Exact: [96, 96, 64, 800] + - Exact: [32, 64, 64, 640] + - Exact: [64, 64, 36, 392] + - Exact: [64, 64, 64, 1652] + - Exact: [64, 96, 36, 2592] + - Exact: [64, 64, 36, 6272] + - Exact: [32, 64, 64, 20000] + - Exact: [64, 64, 64, 648] + - Exact: [32, 64, 36, 1440] + - Exact: [64, 64, 64, 100] + - Exact: [64, 96, 64, 4608] + - Exact: [64, 64, 64, 200] + - Exact: [32, 64, 64, 40] + - Exact: [64, 96, 64, 1152] + - Exact: [149, 32, 64, 8195] + - Exact: [35, 96, 64, 6160] + - Exact: [64, 64, 36, 1760] + - Exact: [64, 2880, 1, 320] + - Exact: [49, 832, 32, 256] + - Exact: [289, 1120, 1, 160] + - Exact: [64, 1728, 1, 320] + - Exact: [49, 832, 32, 160] + - Exact: [49, 832, 32, 384] + - Exact: [289, 896, 1, 192] + - Exact: [289, 896, 1, 128] + - Exact: [196, 800, 1, 64] + - Exact: [64, 1344, 1, 512] + - Exact: [64, 1152, 1, 384] + - Exact: [64, 1152, 1, 448] + - Exact: [49, 832, 32, 128] + - Exact: [49, 832, 32, 48] + - Exact: [64, 1152, 1, 256] + - Exact: [49, 832, 32, 32] + - Exact: [289, 1120, 1, 192] + - Exact: [196, 600, 1, 64] + - Exact: [49, 832, 32, 192] + - Exact: [64, 1728, 1, 192] + - Exact: [64, 38, 840, 38] + - Exact: [64, 49, 648, 49] + - Exact: [64, 32, 992, 32] + - Exact: [64, 35, 904, 35] + - Exact: [64, 41, 776, 41] + - Exact: [64, 45, 712, 45] + - Exact: [64, 54, 592, 54] + - Exact: [64, 59, 544, 59] + - Exact: [49, 512, 64, 2048] + - Exact: [49, 2048, 64, 512] + - Exact: [33, 32, 1600, 33] + - Exact: [33, 32, 200, 33] + - Exact: [67, 2048, 1, 512] + - Exact: [512, 512, 1, 3780] + - Exact: [512, 512, 1, 3796] + - Exact: [512, 512, 1, 3822] + - Exact: [512, 512, 1, 3840] + - Exact: [512, 512, 1, 3859] + - Exact: [512, 512, 1, 3870] + - Exact: [512, 512, 1, 3876] + - Exact: [512, 512, 1, 3906] + - Exact: [512, 512, 1, 3910] + - Exact: [512, 512, 1, 3925] + - Exact: [512, 512, 1, 3927] + - Exact: [512, 512, 1, 3942] + - Exact: [512, 512, 1, 
3944] + - Exact: [512, 512, 1, 3955] + - Exact: [512, 512, 1, 3968] + - Exact: [512, 512, 1, 3969] + - Exact: [512, 512, 1, 3976] + - Exact: [512, 512, 1, 3977] + - Exact: [512, 512, 1, 3978] + - Exact: [512, 512, 1, 3990] + - Exact: [512, 512, 1, 3995] + - Exact: [512, 512, 1, 3996] + - Exact: [512, 512, 1, 3999] + - Exact: [512, 512, 1, 4005] + - Exact: [512, 512, 1, 4012] + - Exact: [512, 512, 1, 4020] + - Exact: [512, 512, 1, 4026] + - Exact: [512, 512, 1, 4030] + - Exact: [512, 512, 1, 4032] + - Exact: [512, 512, 1, 4050] + - Exact: [512, 512, 1, 4059] + - Exact: [384, 384, 1, 384] + - Exact: [384, 192, 1, 384] + - Exact: [1024, 256, 1, 1024] + - Exact: [1024, 256, 1, 1280] + - Exact: [1024, 256, 1, 2304] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 256, 1, 3072] + - Exact: [1024, 256, 1, 3328] + - Exact: [1024, 256, 1, 3584] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 256, 1, 4352] + - Exact: [1024, 256, 1, 4608] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 256, 1, 5376] + - Exact: [1024, 256, 1, 5632] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 256, 1, 6400] + - Exact: [1024, 256, 1, 7680] + - Exact: [1024, 256, 1, 7936] + - Exact: [512, 512, 1, 1600] + - Exact: [100, 2048, 1, 512] + - Exact: [74, 2048, 1, 512] + - Exact: [74, 2048, 1, 960] + - Exact: [768, 128, 1, 128] + - Exact: [1152, 128, 1, 128] + - Exact: [1536, 128, 1, 128] + - Exact: [1920, 128, 1, 128] + - Exact: [768, 128, 1, 256] + - Exact: [1152, 128, 1, 256] + - Exact: [1536, 128, 1, 256] + - Exact: [1920, 128, 1, 256] + - Exact: [448, 448, 1, 448] + - Exact: [1225, 32, 64, 192] + - Exact: [1225, 48, 64, 192] + - Exact: [1225, 48, 64, 256] + - Exact: [1225, 48, 64, 288] + - Exact: [1225, 32, 32, 192] + - Exact: [1225, 48, 32, 192] + - Exact: [1225, 48, 32, 256] + - Exact: [1225, 48, 32, 288] + - Exact: [49, 2048, 32, 512] + - Exact: [49, 512, 32, 2048] + - Exact: [512, 256, 1, 4096] + - Exact: [512, 256, 1, 6912] + - Exact: [100, 2304, 1, 512] + - Exact: [480, 512, 1, 512] 
+ - Exact: [512, 480, 1, 512] + - Exact: [512, 512, 1, 512] + - Exact: [32, 64, 4608, 32] + - Exact: [32, 64, 4608, 35] + - Exact: [34, 64, 4736, 24] + - Exact: [34, 64, 4736, 34] + - Exact: [35, 64, 4608, 35] + - Exact: [64, 32, 4608, 32] + - Exact: [64, 32, 4608, 35] + - Exact: [64, 34, 4736, 24] + - Exact: [64, 34, 4736, 34] + - Exact: [64, 35, 4608, 35] + - Exact: [256, 864, 1, 1] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 864] + - Exact: [49, 1024, 64, 2048] + - Exact: [49, 2048, 64, 1024] + - Exact: [49, 1024, 32, 2048] + - Exact: [49, 2048, 32, 1024] + - Exact: [49, 4608, 1, 512] + - Exact: [56, 512, 64, 512] + - Exact: [228, 256, 2, 12] + - Exact: [228, 256, 2, 3] + - Exact: [187, 256, 2, 12] + - Exact: [247, 256, 2, 12] + - Exact: [176, 256, 2, 3] + - Exact: [187, 256, 2, 3] + - Exact: [221, 256, 2, 3] + - Exact: [221, 256, 2, 12] + - Exact: [176, 256, 2, 12] + - Exact: [247, 256, 2, 3] + - Exact: [216, 256, 2, 3] + - Exact: [192, 256, 2, 12] + - Exact: [192, 256, 2, 3] + - Exact: [216, 256, 2, 12] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [32, 32, 36, 43808] + - Exact: [32, 32, 64, 20000] + - Exact: [256, 128, 1, 32768] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + 
ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 128] + - Exact: [2368, 4, 1, 256] + - Exact: [1408, 4, 1, 256] + - Exact: [5056, 4, 1, 1280] + - Exact: [1408, 4, 1, 3328] + - Exact: [6784, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [1856, 4, 1, 256] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [2048, 1, 1, 512] + - Exact: [2048, 1, 1, 960] + - Exact: [2048, 2, 1, 2] + - Exact: [2560, 2, 1, 4] + - Exact: [2048, 2, 1, 8] + - Exact: [2560, 2, 1, 2] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - 
WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + - Exact: [49, 1200, 1, 128] + - Exact: [1, 1152, 1, 256] + - Exact: [25, 1152, 1, 256] + - Exact: [9, 1152, 1, 256] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - 
EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [16, 32, 36, 5760] + - Exact: [3, 64, 36, 6272] + - Exact: [3, 64, 64, 46208] + - Exact: [3, 64, 64, 92416] + - Exact: [1, 16, 36, 23040] + - Exact: [1, 16, 64, 10240] + - Exact: [3, 64, 36, 25088] + - Exact: [3, 64, 64, 11552] + - Exact: [3, 64, 36, 200704] + - Exact: [3, 64, 64, 23104] + - Exact: [3, 64, 36, 100352] + - Exact: [3, 64, 36, 50176] + - Exact: [8, 384, 64, 6600] + - Exact: [65, 1024, 1, 6400] + - Exact: [13, 512, 1, 32768] + - Exact: [256, 1, 1, 32768] + - Exact: [256, 4, 1, 6912] + - Exact: [13, 512, 1, 55296] + - Exact: [1024, 2, 1, 4992] + - Exact: [1024, 2, 1, 5120] + - Exact: [1024, 2, 1, 5248] + - Exact: [13, 512, 1, 6912] + - Exact: [256, 1, 1, 6912] + - Exact: [256, 128, 1, 6912] + - Exact: [768, 2, 1, 4608] + - Exact: [1024, 2, 1, 4608] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + 
BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 64, 1, 512] + - Exact: [512, 32, 1, 200] + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 1024, 1, 128] + - Exact: [4, 64, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [448, 4, 1, 1280] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [512, 32, 1, 512] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 256, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - Exact: [512, 16, 1, 512] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [64, 1024, 1, 3328] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [4, 448, 
1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [1024, 32, 1, 512] + - Exact: [4, 128, 1, 3328] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [64, 64, 1, 3328] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [1024, 16, 1, 512] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 128, 1, 128] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [64, 23, 2720, 23] + - Exact: [64, 19, 3264, 19] + - Exact: [64, 25, 2512, 25] + - Exact: [64, 9, 6544, 9] + - Exact: [64, 7, 8192, 7] + - Exact: [64, 8, 7280, 8] + - Exact: [64, 27, 2336, 27] + - Exact: [64, 16, 3840, 16] + - Exact: [64, 11, 5456, 11] + - 
Exact: [64, 21, 2976, 21] + - Exact: [64, 15, 4096, 15] + - Exact: [64, 10, 5952, 10] + - Exact: [64, 14, 4368, 14] + - Exact: [64, 13, 4672, 13] + - Exact: [64, 12, 5040, 12] + - Exact: [64, 29, 2176, 29] + - Exact: [64, 17, 3632, 17] + - Exact: [64, 18, 3440, 18] + - Exact: [768, 2, 1, 16] + - Exact: [768, 2, 1, 32] + - Exact: [3, 64, 64, 2888] + - Exact: [1, 16, 64, 640] + - Exact: [512, 24, 36, 800] + - Exact: [16, 32, 36, 360] + - Exact: [1, 16, 36, 1440] + - Exact: [512, 24, 64, 512] + - Exact: [3, 64, 36, 3136] + - Exact: [256, 24, 64, 32] + - Exact: [256, 16, 36, 3200] + - Exact: [256, 16, 36, 32] + - Exact: [512, 24, 36, 288] + - Exact: [512, 24, 64, 128] + - Exact: [3, 64, 64, 1444] + - Exact: [16, 32, 64, 160] + - Exact: [256, 16, 64, 32] + - Exact: [256, 16, 64, 1568] + - Exact: [256, 24, 36, 128] + - Exact: [16, 32, 64, 2560] + - Exact: [49, 800, 1, 128] + - Exact: [64, 12, 2520, 12] + - Exact: [64, 13, 2336, 13] + - Exact: [64, 14, 2184, 14] + - Exact: [64, 15, 2048, 15] + - Exact: [64, 16, 1920, 16] + - Exact: [64, 17, 1816, 17] + - Exact: [64, 18, 1720, 18] + - Exact: [64, 19, 1632, 19] + - Exact: [64, 21, 1488, 21] + - Exact: [64, 23, 1360, 23] + - Exact: [64, 25, 1256, 25] + - Exact: [64, 27, 1168, 27] + - Exact: [64, 29, 1088, 29] + - Exact: [1024, 2, 1, 512] + - Exact: [1024, 2, 1, 3072] + - Exact: [1024, 2, 1, 6] + - Exact: [3, 64, 512, 3] + - Exact: [9, 64, 512, 9] + - Exact: [1024, 1, 1, 200] + - Exact: [5, 64, 512, 5] + - Exact: [1024, 2, 1, 1] + - Exact: [1024, 2, 1, 2048] + - Exact: [17, 64, 1, 15] + - Exact: [17, 64, 1, 17] + - Exact: [30, 64, 1, 30] + - Exact: [30, 64, 1, 31] + - Exact: [31, 64, 1, 31] + - Exact: [64, 17, 1, 15] + - Exact: [64, 17, 1, 17] + - Exact: [64, 30, 1, 30] + - Exact: [64, 30, 1, 31] + - Exact: [64, 31, 1, 31] + - Exact: [14, 64, 1, 14] + - Exact: [15, 64, 1, 14] + - Exact: [15, 64, 1, 15] + - Exact: [64, 14, 1, 14] + - Exact: [64, 15, 1, 14] + - Exact: [64, 15, 1, 15] + - Exact: [1024, 2, 1, 32] + - Exact: 
[1024, 2, 1, 4] + - Exact: [512, 32, 1, 1600] + - Exact: [1024, 64, 1, 960] + - Exact: [512, 64, 1, 512] + - Exact: [384, 128, 1, 128] + - Exact: [384, 128, 1, 256] + - Exact: [64, 64, 1, 64] + - Exact: [256, 4, 1, 4096] + - Exact: [25, 256, 120, 128] + - Exact: [25, 256, 18, 128] + - Exact: [25, 256, 19, 128] + - Exact: [9, 256, 120, 128] + - Exact: [9, 256, 18, 128] + - Exact: [9, 256, 19, 128] + - Exact: [1024, 2, 1, 10] + - Exact: [1024, 2, 1, 1280] + - Exact: [1024, 2, 1, 39] + - Exact: [1024, 2, 1, 40] + - Exact: [1024, 2, 1, 41] + - Exact: [1024, 2, 1, 5] + - Exact: [1024, 2, 1, 2560] + - Exact: [1024, 2, 1, 8] + - Exact: [1024, 2, 1, 1024] + - Exact: [1024, 2, 1, 9] + - Exact: [1024, 2, 1, 1152] + - Exact: [4, 64, 32768, 4] + - Exact: [4, 64, 38400, 4] + - Exact: [64, 4, 32768, 4] + - Exact: [64, 4, 38400, 4] + - Exact: [14, 64, 10880, 14] + - Exact: [15, 64, 10880, 14] + - Exact: [15, 64, 7680, 15] + - Exact: [15, 64, 10880, 15] + - Exact: [17, 64, 7680, 15] + - Exact: [17, 64, 6144, 17] + - Exact: [17, 64, 7680, 17] + - Exact: [21, 64, 6144, 17] + - Exact: [21, 64, 6144, 21] + - Exact: [24, 64, 4736, 24] + - Exact: [30, 64, 2048, 30] + - Exact: [30, 64, 2048, 31] + - Exact: [31, 64, 2048, 31] + - Exact: [64, 14, 10880, 14] + - Exact: [64, 15, 10880, 14] + - Exact: [64, 15, 7680, 15] + - Exact: [64, 15, 10880, 15] + - Exact: [64, 17, 7680, 15] + - Exact: [64, 17, 6144, 17] + - Exact: [64, 17, 7680, 17] + - Exact: [64, 21, 6144, 17] + - Exact: [64, 21, 6144, 21] + - Exact: [64, 24, 4736, 24] + - Exact: [64, 30, 2048, 30] + - Exact: [64, 30, 2048, 31] + - Exact: [64, 31, 2048, 31] + - Exact: [64, 512, 1, 512] + - Exact: [5, 64, 1, 5] + - Exact: [33, 32, 1, 33] + - Exact: [1024, 1, 1, 1600] + - Exact: [5, 64, 960, 5] + - Exact: [27, 128, 32768, 27] + - Exact: [1024, 2, 1, 16] + - Exact: [1024, 2, 1, 64] + - Exact: [13, 512, 1, 3456] + - Exact: [13, 512, 1, 4096] + - Exact: [13, 512, 1, 864] + - Exact: [256, 1, 1, 3456] + - Exact: [256, 1, 1, 4096] + - Exact: 
[256, 1, 1, 864] + - Exact: [256, 128, 1, 3456] + - Exact: [256, 128, 1, 4096] + - Exact: [256, 128, 1, 864] + - Exact: [1024, 2, 1, 80] + - Exact: [1024, 2, 1, 82] + - Exact: [1024, 2, 1, 12] + - Exact: [64, 24, 6816, 24] + - Exact: [64, 26, 6272, 26] + - Exact: [1024, 2, 1, 128] + - Exact: [1024, 2, 1, 96] + - Exact: [768, 2, 1, 2048] + - Exact: [1024, 81, 1, 1024] + - Exact: [2, 1024, 1, 6] + - Exact: [1024, 2, 1, 20] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml new file mode 100644 index 000000000..83aad34df --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml @@ -0,0 +1,5583 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 
1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 4096, 1, 1024] + - Exact: [4096, 4096, 1, 1024] + - Exact: [1024, 4096, 1, 4096] + - Exact: [30528, 4096, 1, 1024] + - Exact: [1024, 2048, 1, 1024] + - Exact: [4096, 2048, 1, 1024] + - Exact: [1024, 2048, 1, 4096] + - Exact: [30528, 2048, 1, 1024] + - Exact: [30522, 320, 1, 768] + - Exact: [3072, 4096, 1, 768] + - Exact: [768, 4096, 1, 3072] + - Exact: [768, 4096, 1, 768] + - Exact: [30522, 160, 1, 768] + - Exact: [30522, 640, 1, 768] + - Exact: [30522, 1280, 1, 768] + - Exact: [1024, 3072, 1, 1024] + - Exact: [1024, 2048, 1, 3072] + - Exact: [1024, 3072, 1, 3072] + - Exact: [3072, 2048, 1, 1024] + - Exact: [3072, 3072, 1, 1024] + - Exact: [3072, 512, 1, 1024] + - Exact: [30522, 160, 1, 1024] + - Exact: [128, 128, 512, 64] + - Exact: [512, 512, 64, 64] + - Exact: [256, 256, 192, 64] + - Exact: [256, 256, 96, 64] + - Exact: [128, 128, 384, 64] + - Exact: [128, 128, 96, 64] + - Exact: [512, 512, 16, 64] + - Exact: [512, 512, 96, 64] + - Exact: [512, 512, 128, 64] + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [512, 24000, 1, 1536] + - Exact: [5888, 1408, 1, 256] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [2368, 6784, 1, 128] + - Exact: [1024, 3584, 1, 3328] + - Exact: [512, 48000, 1, 2048] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [6144, 24000, 1, 2048] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 
256] + - Exact: [1760, 6400, 1, 1760] + - Exact: [1856, 1408, 1, 128] + - Exact: [6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [16384, 3200, 1, 4096] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [3584, 2368, 1, 1280] + - Exact: [704, 5888, 1, 256] + - Exact: [6784, 2944, 1, 128] + - Exact: [2560, 1600, 1, 2560] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2944, 6784, 1, 3328] + - Exact: [6144, 5984, 1, 2048] + - Exact: [3584, 704, 1, 3328] + - Exact: [2048, 1600, 1, 512] + - Exact: [448, 4288, 1, 256] + - Exact: [1856, 4288, 1, 128] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [2048, 7000, 1, 2048] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [2368, 3584, 
1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: [1024, 5888, 1, 3328] + - Exact: [6144, 24000, 1, 2560] + - Exact: [5056, 4288, 1, 1280] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1408, 5056, 1, 1280] + - Exact: [2368, 2368, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [1856, 5056, 1, 128] + - Exact: [8192, 3200, 1, 2048] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [512, 24000, 1, 2048] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2560, 800, 1, 2560] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [5888, 1024, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [512, 48000, 1, 2560] + - Exact: [704, 6784, 1, 3328] + - Exact: [2560, 6400, 1, 2560] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 1024, 1, 128] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - 
Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [8448, 12000, 1, 2816] + - Exact: [4288, 1408, 1, 128] + - Exact: [1856, 5888, 1, 3328] + - Exact: [4288, 5056, 1, 256] + - Exact: [4096, 800, 1, 1024] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [4096, 400, 1, 1024] + - Exact: [3072, 24000, 1, 1024] + - Exact: [5888, 1856, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [512, 48000, 1, 1536] + - Exact: [3584, 448, 1, 1280] + - Exact: [8448, 5984, 1, 2816] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [8448, 48000, 1, 2816] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [5056, 256, 1, 1280] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - 
Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [4608, 5984, 1, 1536] + - Exact: [1760, 3200, 1, 1760] + - Exact: [1024, 1856, 1, 256] + - Exact: [4096, 1600, 1, 1024] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [7680, 24000, 1, 2560] + - Exact: [4608, 48000, 1, 1536] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [8192, 1600, 1, 2048] + - Exact: [512, 24000, 1, 2560] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [704, 6784, 1, 128] + - Exact: [3584, 704, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [2944, 6784, 1, 128] + - Exact: [3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2048, 3200, 1, 512] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [4608, 12000, 1, 1536] + - Exact: [5056, 5056, 1, 128] + - Exact: [8192, 800, 1, 2048] + - Exact: [448, 5056, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [8448, 24000, 1, 2816] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [2048, 800, 1, 512] + - Exact: [5888, 4288, 1, 1280] + - Exact: [1024, 24000, 1, 2048] + - Exact: [5888, 3584, 1, 128] + - Exact: [1024, 2944, 1, 128] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 
3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [3584, 6784, 1, 256] + - Exact: [6144, 12000, 1, 2048] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: [1856, 2368, 1, 128] + - Exact: [6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [2048, 1600, 1, 2048] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [5124, 9124, 1, 2048] + - Exact: [2944, 1408, 1, 128] + - Exact: [3584, 1408, 1, 1280] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: 
[2944, 4288, 1, 128] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [4096, 3200, 1, 1024] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 1280] + - Exact: [2560, 7000, 1, 2560] + - Exact: [2944, 1024, 1, 256] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [1856, 1856, 1, 128] + - Exact: [3584, 5056, 1, 3328] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2944, 2944, 1, 128] + - Exact: [1856, 3584, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [4608, 24000, 1, 1536] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [704, 4288, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [2048, 6400, 1, 2048] + - Exact: [512, 48000, 1, 2816] + - Exact: [5124, 9124, 1, 2560] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [1024, 24000, 1, 1536] + - Exact: [704, 4288, 1, 256] + - Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - 
Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [2048, 800, 1, 2048] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [2368, 5056, 1, 128] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [4096, 7000, 1, 4096] + - Exact: [704, 2944, 1, 256] + - Exact: [6784, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [16384, 400, 1, 4096] + - Exact: [448, 4288, 1, 3328] + - Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2560] + - Exact: [5888, 2368, 1, 3328] + - Exact: [5124, 9124, 1, 1760] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1408, 5888, 1, 128] + - Exact: [7680, 12000, 1, 2560] + - Exact: [1408, 1024, 1, 256] + - Exact: [8192, 400, 1, 2048] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + 
- Exact: [5056, 3584, 1, 1280] + - Exact: [5124, 9124, 1, 4096] + - Exact: [7680, 48000, 1, 2560] + - Exact: [1856, 2944, 1, 1280] + - Exact: [1024, 48000, 1, 2816] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [5888, 2944, 1, 128] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 5056, 1, 128] + - Exact: [1760, 1600, 1, 1760] + - Exact: [5888, 1408, 1, 3328] + - Exact: [2368, 1856, 1, 256] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 5888, 1, 128] + - Exact: [2368, 1408, 1, 3328] + - Exact: [1024, 48000, 1, 1536] + - Exact: [5888, 448, 1, 256] + - Exact: [2560, 3200, 1, 2560] + - Exact: [5888, 6784, 1, 128] + - Exact: [6144, 48000, 1, 2048] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [1024, 48000, 1, 2560] + - Exact: [1024, 2368, 1, 128] + - Exact: [16384, 800, 1, 4096] + - Exact: [5888, 5056, 1, 1280] + - Exact: [3072, 48000, 1, 1024] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [6784, 448, 1, 3328] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [3584, 1024, 1, 3328] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2944, 4288, 1, 3328] + - Exact: [2368, 3584, 1, 3328] + - Exact: [2944, 704, 1, 1280] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [2048, 
3200, 1, 2048] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + - Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [1024, 24000, 1, 2816] + - Exact: [7680, 5984, 1, 2560] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [1760, 800, 1, 1760] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1408, 2368, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [6144, 48000, 1, 2560] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [448, 4288, 1, 1280] + - Exact: [1024, 4288, 1, 128] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [4288, 704, 1, 1280] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [448, 5056, 1, 3328] + - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [6784, 1856, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [512, 24000, 1, 2816] + - Exact: [256, 5888, 1, 1280] + - Exact: [16384, 1600, 1, 4096] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [1760, 7000, 1, 1760] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [704, 4288, 1, 128] + - 
Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5056, 1024, 1, 3328] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] + - Exact: [704, 4288, 1, 1280] + - Exact: [1024, 48000, 1, 2048] + - Exact: [4288, 1024, 1, 128] + - Exact: [4096, 512, 1, 32] + - Exact: [2048, 1024, 1, 1664] + - Exact: [4096, 512, 1, 1408] + - Exact: [4096, 1024, 1, 1280] + - Exact: [2048, 1024, 1, 640] + - Exact: [4096, 1024, 1, 13312] + - Exact: [2048, 1024, 1, 13312] + - Exact: [2048, 1024, 1, 3584] + - Exact: [4096, 1024, 1, 1920] + - Exact: [4096, 1024, 1, 12288] + - Exact: [4096, 1024, 1, 8320] + - Exact: [4096, 1024, 1, 15360] + - Exact: [4096, 512, 1, 3072] + - Exact: [4096, 512, 1, 13312] + - Exact: [4096, 1024, 1, 3840] + - Exact: [2048, 1024, 1, 3200] + - Exact: [4096, 512, 1, 3840] + - Exact: [4096, 512, 1, 5632] + - Exact: [4096, 512, 1, 64] + - Exact: [2048, 1024, 1, 512] + - Exact: [4096, 512, 1, 8192] + - Exact: [4096, 512, 1, 2304] + - Exact: [4096, 512, 1, 2816] + - Exact: [2048, 1024, 1, 7680] + - Exact: [4096, 512, 1, 1920] + - Exact: [4096, 1024, 1, 32] + - Exact: [4096, 512, 1, 16640] + - Exact: [2048, 1024, 1, 1024] + - Exact: [4096, 512, 1, 1792] + - Exact: [4096, 1024, 1, 8192] + - Exact: [2048, 1024, 1, 4160] + - Exact: [4096, 512, 1, 10240] + - Exact: [4096, 512, 1, 512] + - Exact: [2048, 1024, 1, 6656] + - Exact: [2048, 1024, 1, 14336] + - Exact: [4096, 512, 1, 11264] + - Exact: [4096, 512, 1, 128] + - Exact: [4096, 512, 1, 768] + - Exact: [4096, 1024, 1, 11264] + - Exact: [4096, 1024, 1, 16640] + - Exact: [2048, 1024, 1, 5632] + - Exact: [4096, 512, 1, 12288] + - Exact: [4096, 1024, 1, 5632] + - Exact: [2048, 1024, 1, 10240] + - Exact: [4096, 1024, 1, 640] + - Exact: [2048, 1024, 1, 12288] + - Exact: [4096, 1024, 1, 10240] + - Exact: [2048, 1024, 1, 4608] + - Exact: [4096, 512, 1, 3584] + - Exact: [4096, 1024, 1, 4608] + - Exact: 
[4096, 1024, 1, 3328] + - Exact: [2048, 1024, 1, 9216] + - Exact: [2048, 1024, 1, 2304] + - Exact: [4096, 512, 1, 6144] + - Exact: [4096, 512, 1, 15360] + - Exact: [4096, 1024, 1, 7168] + - Exact: [4096, 1024, 1, 9216] + - Exact: [4096, 1024, 1, 7680] + - Exact: [2048, 1024, 1, 8192] + - Exact: [4096, 1024, 1, 64] + - Exact: [2048, 1024, 1, 1280] + - Exact: [2048, 1024, 1, 3328] + - Exact: [4096, 512, 1, 14336] + - Exact: [4096, 512, 1, 8320] + - Exact: [4096, 1024, 1, 6656] + - Exact: [2048, 1024, 1, 256] + - Exact: [4096, 512, 1, 1024] + - Exact: [4096, 1024, 1, 1536] + - Exact: [2048, 1024, 1, 32] + - Exact: [4096, 512, 1, 640] + - Exact: [4096, 512, 1, 16384] + - Exact: [4096, 1024, 1, 512] + - Exact: [2048, 1024, 1, 1152] + - Exact: [4096, 1024, 1, 2080] + - Exact: [4096, 1024, 1, 768] + - Exact: [4096, 1024, 1, 2560] + - Exact: [2048, 1024, 1, 64] + - Exact: [4096, 1024, 1, 16384] + - Exact: [4096, 512, 1, 6656] + - Exact: [2048, 1024, 1, 128] + - Exact: [2048, 1024, 1, 2080] + - Exact: [2048, 1024, 1, 16640] + - Exact: [2048, 1024, 1, 3072] + - Exact: [4096, 1024, 1, 1408] + - Exact: [4096, 1024, 1, 2048] + - Exact: [2048, 1024, 1, 2560] + - Exact: [4096, 1024, 1, 128] + - Exact: [4096, 1024, 1, 14336] + - Exact: [4096, 512, 1, 9216] + - Exact: [2048, 1024, 1, 2048] + - Exact: [4096, 512, 1, 1536] + - Exact: [2048, 1024, 1, 16384] + - Exact: [4096, 1024, 1, 1024] + - Exact: [4096, 1024, 1, 1664] + - Exact: [4096, 512, 1, 384] + - Exact: [4096, 512, 1, 3328] + - Exact: [4096, 1024, 1, 256] + - Exact: [2048, 1024, 1, 7168] + - Exact: [2048, 1024, 1, 1536] + - Exact: [4096, 512, 1, 7168] + - Exact: [4096, 1024, 1, 896] + - Exact: [4096, 1024, 1, 4096] + - Exact: [2048, 1024, 1, 6144] + - Exact: [4096, 512, 1, 4160] + - Exact: [4096, 512, 1, 2080] + - Exact: [4096, 1024, 1, 5120] + - Exact: [2048, 1024, 1, 1920] + - Exact: [2048, 1024, 1, 15360] + - Exact: [4096, 1024, 1, 2816] + - Exact: [4096, 512, 1, 256] + - Exact: [2048, 1024, 1, 5120] + - Exact: [2048, 
1024, 1, 4096] + - Exact: [4096, 512, 1, 4608] + - Exact: [4096, 512, 1, 1664] + - Exact: [2048, 1024, 1, 896] + - Exact: [4096, 1024, 1, 4160] + - Exact: [2048, 1024, 1, 11264] + - Exact: [2048, 1024, 1, 384] + - Exact: [2048, 1024, 1, 3840] + - Exact: [4096, 512, 1, 1280] + - Exact: [4096, 1024, 1, 1152] + - Exact: [2048, 1024, 1, 1408] + - Exact: [4096, 512, 1, 896] + - Exact: [4096, 1024, 1, 3072] + - Exact: [2048, 1024, 1, 2816] + - Exact: [4096, 1024, 1, 1792] + - Exact: [4096, 512, 1, 1152] + - Exact: [4096, 512, 1, 7680] + - Exact: [4096, 1024, 1, 384] + - Exact: [2048, 1024, 1, 1792] + - Exact: [4096, 1024, 1, 3584] + - Exact: [2048, 1024, 1, 768] + - Exact: [2048, 1024, 1, 8320] + - Exact: [4096, 512, 1, 2048] + - Exact: [4096, 512, 1, 2560] + - Exact: [4096, 1024, 1, 2304] + - Exact: [4096, 512, 1, 5120] + - Exact: [4096, 1024, 1, 6144] + - Exact: [1024, 3392, 1, 4096] + - Exact: [1024, 3301, 1, 4096] + - Exact: [1024, 3443, 1, 4096] + - Exact: [132, 134, 480, 64] + - Exact: [162, 162, 400, 64] + - Exact: [4096, 3548, 1, 1024] + - Exact: [4096, 2977, 1, 1024] + - Exact: [132, 135, 480, 64] + - Exact: [1024, 2985, 1, 4096] + - Exact: [33708, 3681, 1, 1024] + - Exact: [4096, 3443, 1, 1024] + - Exact: [1024, 3400, 1, 4096] + - Exact: [4096, 3995, 1, 1024] + - Exact: [4096, 3190, 1, 1024] + - Exact: [4096, 3594, 1, 1024] + - Exact: [159, 162, 400, 64] + - Exact: [1024, 3565, 1, 4096] + - Exact: [4096, 3422, 1, 1024] + - Exact: [1024, 3214, 1, 4096] + - Exact: [33708, 3584, 1, 1024] + - Exact: [33708, 3640, 1, 1024] + - Exact: [4096, 3263, 1, 1024] + - Exact: [4096, 3296, 1, 1024] + - Exact: [1024, 3557, 1, 4096] + - Exact: [4096, 3463, 1, 1024] + - Exact: [4096, 3528, 1, 1024] + - Exact: [4096, 3226, 1, 1024] + - Exact: [4096, 3439, 1, 1024] + - Exact: [1024, 3523, 1, 4096] + - Exact: [1024, 3098, 1, 4096] + - Exact: [4096, 3121, 1, 1024] + - Exact: [33708, 3894, 1, 1024] + - Exact: [1024, 3548, 1, 4096] + - Exact: [1024, 3451, 1, 4096] + - Exact: [4096, 
3353, 1, 1024] + - Exact: [4096, 3402, 1, 1024] + - Exact: [4096, 3939, 1, 1024] + - Exact: [133, 133, 480, 64] + - Exact: [1024, 3559, 1, 4096] + - Exact: [1024, 2977, 1, 4096] + - Exact: [1024, 3478, 1, 4096] + - Exact: [134, 134, 480, 64] + - Exact: [1024, 3368, 1, 4096] + - Exact: [4096, 4012, 1, 1024] + - Exact: [4096, 3486, 1, 1024] + - Exact: [1024, 3479, 1, 4096] + - Exact: [1024, 3505, 1, 4096] + - Exact: [4096, 3381, 1, 1024] + - Exact: [4096, 3430, 1, 1024] + - Exact: [1024, 3554, 1, 4096] + - Exact: [4096, 3271, 1, 1024] + - Exact: [1024, 3063, 1, 4096] + - Exact: [1024, 3209, 1, 4096] + - Exact: [4096, 3503, 1, 1024] + - Exact: [4096, 3344, 1, 1024] + - Exact: [1024, 3147, 1, 4096] + - Exact: [1024, 3322, 1, 4096] + - Exact: [1024, 3341, 1, 4096] + - Exact: [1024, 3516, 1, 4096] + - Exact: [1024, 3454, 1, 4096] + - Exact: [4096, 3969, 1, 1024] + - Exact: [4096, 3466, 1, 1024] + - Exact: [1024, 3999, 1, 1024] + - Exact: [1024, 4032, 1, 1024] + - Exact: [1024, 3403, 1, 4096] + - Exact: [4096, 3361, 1, 1024] + - Exact: [1024, 3527, 1, 4096] + - Exact: [1024, 3822, 1, 4096] + - Exact: [4096, 3315, 1, 1024] + - Exact: [232, 232, 272, 64] + - Exact: [1024, 3336, 1, 4096] + - Exact: [228, 232, 272, 64] + - Exact: [4096, 3547, 1, 1024] + - Exact: [4096, 3340, 1, 1024] + - Exact: [1024, 3906, 1, 1024] + - Exact: [1024, 3295, 1, 4096] + - Exact: [4096, 3294, 1, 1024] + - Exact: [33708, 3968, 1, 1024] + - Exact: [1024, 3473, 1, 4096] + - Exact: [1024, 3072, 1, 4096] + - Exact: [4096, 3189, 1, 1024] + - Exact: [4096, 3494, 1, 1024] + - Exact: [1024, 3522, 1, 4096] + - Exact: [33708, 3944, 1, 1024] + - Exact: [135, 135, 480, 64] + - Exact: [4096, 3421, 1, 1024] + - Exact: [4096, 3311, 1, 1024] + - Exact: [1024, 3990, 1, 1024] + - Exact: [1024, 3290, 1, 4096] + - Exact: [4096, 3565, 1, 1024] + - Exact: [1024, 3484, 1, 4096] + - Exact: [4096, 3384, 1, 1024] + - Exact: [1024, 3422, 1, 4096] + - Exact: [4096, 3681, 1, 1024] + - Exact: [1024, 3584, 1, 1024] + - Exact: 
[4096, 4050, 1, 1024] + - Exact: [1024, 3996, 1, 4096] + - Exact: [4096, 3169, 1, 1024] + - Exact: [4096, 3538, 1, 1024] + - Exact: [1024, 3495, 1, 4096] + - Exact: [4096, 3401, 1, 1024] + - Exact: [1024, 3560, 1, 4096] + - Exact: [133, 135, 480, 64] + - Exact: [1024, 3263, 1, 4096] + - Exact: [1024, 3870, 1, 4096] + - Exact: [4096, 3555, 1, 1024] + - Exact: [4096, 3412, 1, 1024] + - Exact: [1024, 3296, 1, 4096] + - Exact: [1024, 3379, 1, 4096] + - Exact: [4096, 3302, 1, 1024] + - Exact: [1024, 3490, 1, 4096] + - Exact: [1024, 3428, 1, 4096] + - Exact: [1024, 3976, 1, 4096] + - Exact: [4096, 3485, 1, 1024] + - Exact: [4096, 3534, 1, 1024] + - Exact: [1024, 3064, 1, 4096] + - Exact: [4096, 3216, 1, 1024] + - Exact: [1024, 3450, 1, 4096] + - Exact: [1024, 3533, 1, 4096] + - Exact: [1024, 4030, 1, 1024] + - Exact: [1024, 3311, 1, 4096] + - Exact: [1024, 3468, 1, 4096] + - Exact: [4096, 3359, 1, 1024] + - Exact: [4096, 3392, 1, 1024] + - Exact: [1024, 3925, 1, 1024] + - Exact: [4096, 3233, 1, 1024] + - Exact: [4096, 3956, 1, 1024] + - Exact: [1024, 3463, 1, 4096] + - Exact: [1024, 3126, 1, 4096] + - Exact: [1024, 3363, 1, 4096] + - Exact: [4096, 3465, 1, 1024] + - Exact: [33708, 3996, 1, 1024] + - Exact: [1024, 3231, 1, 4096] + - Exact: [33708, 3978, 1, 1024] + - Exact: [4096, 3476, 1, 1024] + - Exact: [4096, 3339, 1, 1024] + - Exact: [4096, 3452, 1, 1024] + - Exact: [1024, 3396, 1, 4096] + - Exact: [4096, 3293, 1, 1024] + - Exact: [1024, 3432, 1, 4096] + - Exact: [4096, 3493, 1, 1024] + - Exact: [4096, 3350, 1, 1024] + - Exact: [1024, 3079, 1, 4096] + - Exact: [1024, 3101, 1, 4096] + - Exact: [33708, 3939, 1, 1024] + - Exact: [4096, 3256, 1, 1024] + - Exact: [1024, 3439, 1, 4096] + - Exact: [1024, 3510, 1, 4096] + - Exact: [4096, 3900, 1, 1024] + - Exact: [1024, 3470, 1, 4096] + - Exact: [4096, 3456, 1, 1024] + - Exact: [4096, 3014, 1, 1024] + - Exact: [4096, 3367, 1, 1024] + - Exact: [4096, 3432, 1, 1024] + - Exact: [33708, 4026, 1, 1024] + - Exact: [4096, 3273, 1, 
1024] + - Exact: [4096, 3130, 1, 1024] + - Exact: [1024, 3496, 1, 4096] + - Exact: [1024, 3995, 1, 4096] + - Exact: [1024, 3939, 1, 4096] + - Exact: [1024, 3121, 1, 4096] + - Exact: [1024, 3232, 1, 4096] + - Exact: [4096, 3147, 1, 1024] + - Exact: [4096, 3516, 1, 1024] + - Exact: [1024, 3969, 1, 1024] + - Exact: [1024, 3364, 1, 4096] + - Exact: [4096, 3411, 1, 1024] + - Exact: [147, 147, 432, 64] + - Exact: [4096, 3301, 1, 1024] + - Exact: [1024, 3513, 1, 4096] + - Exact: [1024, 3469, 1, 4096] + - Exact: [1024, 3095, 1, 4096] + - Exact: [4096, 3533, 1, 1024] + - Exact: [4096, 3390, 1, 1024] + - Exact: [4096, 3582, 1, 1024] + - Exact: [1024, 3956, 1, 1024] + - Exact: [4096, 3585, 1, 1024] + - Exact: [4096, 3231, 1, 1024] + - Exact: [1024, 3205, 1, 4096] + - Exact: [4096, 3496, 1, 1024] + - Exact: [1024, 3143, 1, 4096] + - Exact: [1024, 3318, 1, 4096] + - Exact: [1024, 3353, 1, 4096] + - Exact: [1024, 3464, 1, 4096] + - Exact: [4096, 2736, 1, 1024] + - Exact: [1024, 3402, 1, 4096] + - Exact: [4096, 3138, 1, 1024] + - Exact: [1024, 3860, 1, 4096] + - Exact: [148, 148, 432, 64] + - Exact: [1024, 3539, 1, 4096] + - Exact: [4096, 3211, 1, 1024] + - Exact: [1024, 3332, 1, 4096] + - Exact: [1024, 3466, 1, 4096] + - Exact: [4096, 3475, 1, 1024] + - Exact: [4096, 3524, 1, 1024] + - Exact: [4096, 2985, 1, 1024] + - Exact: [4096, 3222, 1, 1024] + - Exact: [4096, 3451, 1, 1024] + - Exact: [1024, 3181, 1, 4096] + - Exact: [1024, 3640, 1, 4096] + - Exact: [1024, 3375, 1, 4096] + - Exact: [1024, 3550, 1, 4096] + - Exact: [1024, 4020, 1, 1024] + - Exact: [4096, 3349, 1, 1024] + - Exact: [4096, 3398, 1, 1024] + - Exact: [33708, 3976, 1, 1024] + - Exact: [1024, 2917, 1, 4096] + - Exact: [33708, 3910, 1, 1024] + - Exact: [4096, 3860, 1, 1024] + - Exact: [4096, 3304, 1, 1024] + - Exact: [1024, 3286, 1, 4096] + - Exact: [1024, 3460, 1, 4096] + - Exact: [1024, 4026, 1, 4096] + - Exact: [4096, 3471, 1, 1024] + - Exact: [193, 193, 320, 64] + - Exact: [1024, 3894, 1, 1024] + - Exact: [1024, 
3506, 1, 4096] + - Exact: [1024, 4000, 1, 1024] + - Exact: [1024, 3900, 1, 4096] + - Exact: [1024, 3445, 1, 4096] + - Exact: [4096, 3442, 1, 1024] + - Exact: [1024, 3358, 1, 4096] + - Exact: [1024, 3211, 1, 4096] + - Exact: [4096, 3515, 1, 1024] + - Exact: [1024, 3564, 1, 4096] + - Exact: [4096, 3057, 1, 1024] + - Exact: [1024, 3343, 1, 4096] + - Exact: [4096, 3262, 1, 1024] + - Exact: [1024, 3518, 1, 4096] + - Exact: [33708, 3876, 1, 1024] + - Exact: [4096, 3462, 1, 1024] + - Exact: [1024, 3265, 1, 4096] + - Exact: [4096, 3389, 1, 1024] + - Exact: [4096, 3438, 1, 1024] + - Exact: [1024, 3955, 1, 1024] + - Exact: [1024, 3545, 1, 4096] + - Exact: [1024, 3144, 1, 4096] + - Exact: [1024, 3417, 1, 4096] + - Exact: [4096, 3543, 1, 1024] + - Exact: [4096, 3352, 1, 1024] + - Exact: [33708, 3975, 1, 1024] + - Exact: [148, 147, 432, 64] + - Exact: [4096, 3137, 1, 1024] + - Exact: [4096, 3506, 1, 1024] + - Exact: [1024, 3975, 1, 1024] + - Exact: [1024, 3859, 1, 4096] + - Exact: [4096, 3369, 1, 1024] + - Exact: [1024, 3434, 1, 4096] + - Exact: [1024, 3292, 1, 4096] + - Exact: [4096, 3523, 1, 1024] + - Exact: [4096, 3380, 1, 1024] + - Exact: [1024, 3408, 1, 4096] + - Exact: [4096, 3221, 1, 1024] + - Exact: [4096, 3270, 1, 1024] + - Exact: [143, 143, 432, 64] + - Exact: [1024, 3303, 1, 4096] + - Exact: [4096, 3502, 1, 1024] + - Exact: [1024, 3222, 1, 4096] + - Exact: [4096, 2505, 1, 1024] + - Exact: [4096, 3397, 1, 1024] + - Exact: [4096, 3562, 1, 1024] + - Exact: [4096, 3095, 1, 1024] + - Exact: [1024, 3226, 1, 4096] + - Exact: [177, 177, 352, 64] + - Exact: [4096, 3360, 1, 1024] + - Exact: [1024, 3942, 1, 1024] + - Exact: [1024, 3298, 1, 4096] + - Exact: [1024, 3381, 1, 4096] + - Exact: [4096, 3314, 1, 1024] + - Exact: [1024, 3492, 1, 4096] + - Exact: [1024, 3430, 1, 4096] + - Exact: [4096, 3977, 1, 1024] + - Exact: [4096, 3546, 1, 1024] + - Exact: [4096, 3640, 1, 1024] + - Exact: [4096, 3441, 1, 1024] + - Exact: [33708, 4059, 1, 1024] + - Exact: [1024, 3978, 1, 1024] + - 
Exact: [1024, 3376, 1, 4096] + - Exact: [1024, 3482, 1, 4096] + - Exact: [1024, 3563, 1, 4096] + - Exact: [4096, 4020, 1, 1024] + - Exact: [1024, 3271, 1, 4096] + - Exact: [1024, 3291, 1, 4096] + - Exact: [1024, 3431, 1, 4096] + - Exact: [1024, 3481, 1, 4096] + - Exact: [4096, 3461, 1, 1024] + - Exact: [1024, 3574, 1, 4096] + - Exact: [1024, 4059, 1, 1024] + - Exact: [1024, 3421, 1, 4096] + - Exact: [4096, 3224, 1, 1024] + - Exact: [4096, 3437, 1, 1024] + - Exact: [4096, 3168, 1, 1024] + - Exact: [33708, 3990, 1, 1024] + - Exact: [1024, 3349, 1, 4096] + - Exact: [4096, 3335, 1, 1024] + - Exact: [4096, 3400, 1, 1024] + - Exact: [160, 159, 400, 64] + - Exact: [1024, 3398, 1, 4096] + - Exact: [1024, 3780, 1, 4096] + - Exact: [4096, 3098, 1, 1024] + - Exact: [1024, 4012, 1, 4096] + - Exact: [4096, 3505, 1, 1024] + - Exact: [4096, 3554, 1, 1024] + - Exact: [4096, 3063, 1, 1024] + - Exact: [1024, 3503, 1, 4096] + - Exact: [1024, 3166, 1, 4096] + - Exact: [1024, 3425, 1, 4096] + - Exact: [1024, 3344, 1, 4096] + - Exact: [4096, 3484, 1, 1024] + - Exact: [1024, 3681, 1, 1024] + - Exact: [1024, 4050, 1, 1024] + - Exact: [4096, 3379, 1, 1024] + - Exact: [4096, 3428, 1, 1024] + - Exact: [1024, 3304, 1, 4096] + - Exact: [1024, 3387, 1, 4096] + - Exact: [4096, 3126, 1, 1024] + - Exact: [1024, 3498, 1, 4096] + - Exact: [1024, 3436, 1, 4096] + - Exact: [4096, 3501, 1, 1024] + - Exact: [4096, 3358, 1, 1024] + - Exact: [4096, 3232, 1, 1024] + - Exact: [1024, 3585, 1, 4096] + - Exact: [4096, 3143, 1, 1024] + - Exact: [4096, 3464, 1, 1024] + - Exact: [1024, 3366, 1, 4096] + - Exact: [4096, 3375, 1, 1024] + - Exact: [4096, 2917, 1, 1024] + - Exact: [4096, 4026, 1, 1024] + - Exact: [1024, 3277, 1, 4096] + - Exact: [1024, 3103, 1, 4096] + - Exact: [33708, 3995, 1, 1024] + - Exact: [1024, 3297, 1, 4096] + - Exact: [4096, 3545, 1, 1024] + - Exact: [1024, 3399, 1, 4096] + - Exact: [33708, 3796, 1, 1024] + - Exact: [4096, 3292, 1, 1024] + - Exact: [33708, 3859, 1, 1024] + - Exact: [4096, 
3566, 1, 1024] + - Exact: [4096, 3894, 1, 1024] + - Exact: [4096, 3492, 1, 1024] + - Exact: [1024, 3977, 1, 1024] + - Exact: [1024, 3272, 1, 4096] + - Exact: [135, 134, 480, 64] + - Exact: [1024, 3355, 1, 4096] + - Exact: [4096, 3419, 1, 1024] + - Exact: [1024, 3404, 1, 4096] + - Exact: [4096, 3999, 1, 1024] + - Exact: [4096, 3166, 1, 1024] + - Exact: [33708, 3840, 1, 1024] + - Exact: [4096, 4032, 1, 1024] + - Exact: [1024, 3573, 1, 4096] + - Exact: [4096, 3366, 1, 1024] + - Exact: [1024, 3541, 1, 4096] + - Exact: [4096, 3207, 1, 1024] + - Exact: [4096, 3272, 1, 1024] + - Exact: [1024, 3334, 1, 4096] + - Exact: [228, 228, 272, 64] + - Exact: [4096, 3183, 1, 1024] + - Exact: [4096, 3536, 1, 1024] + - Exact: [1024, 4005, 1, 1024] + - Exact: [1024, 3245, 1, 4096] + - Exact: [4096, 3447, 1, 1024] + - Exact: [1024, 3183, 1, 4096] + - Exact: [1024, 3361, 1, 4096] + - Exact: [33708, 3870, 1, 1024] + - Exact: [1024, 3321, 1, 4096] + - Exact: [1024, 3486, 1, 4096] + - Exact: [4096, 4005, 1, 1024] + - Exact: [4096, 3410, 1, 1024] + - Exact: [1024, 3944, 1, 1024] + - Exact: [4096, 3300, 1, 1024] + - Exact: [4096, 3579, 1, 1024] + - Exact: [4096, 3483, 1, 1024] + - Exact: [4096, 3532, 1, 1024] + - Exact: [1024, 3140, 1, 4096] + - Exact: [1024, 3372, 1, 4096] + - Exact: [1024, 3224, 1, 4096] + - Exact: [4096, 3230, 1, 1024] + - Exact: [4096, 3427, 1, 1024] + - Exact: [1024, 3796, 1, 1024] + - Exact: [143, 148, 432, 64] + - Exact: [1024, 3616, 1, 4096] + - Exact: [1024, 3315, 1, 4096] + - Exact: [1024, 3476, 1, 4096] + - Exact: [1024, 3509, 1, 4096] + - Exact: [4096, 3357, 1, 1024] + - Exact: [4096, 3406, 1, 1024] + - Exact: [1024, 3558, 1, 4096] + - Exact: [4096, 3593, 1, 1024] + - Exact: [4096, 3247, 1, 1024] + - Exact: [4096, 3088, 1, 1024] + - Exact: [1024, 3213, 1, 4096] + - Exact: [4096, 3511, 1, 1024] + - Exact: [1024, 3365, 1, 4096] + - Exact: [1024, 3504, 1, 4096] + - Exact: [1024, 3442, 1, 4096] + - Exact: [4096, 3474, 1, 1024] + - Exact: [4096, 2984, 1, 1024] + - 
Exact: [1024, 3876, 1, 4096] + - Exact: [4096, 3337, 1, 1024] + - Exact: [4096, 3450, 1, 1024] + - Exact: [1024, 3547, 1, 4096] + - Exact: [4096, 3291, 1, 1024] + - Exact: [1024, 3340, 1, 4096] + - Exact: [4096, 3491, 1, 1024] + - Exact: [4096, 3348, 1, 1024] + - Exact: [4096, 3906, 1, 1024] + - Exact: [1024, 3477, 1, 4096] + - Exact: [1024, 3397, 1, 4096] + - Exact: [4096, 3165, 1, 1024] + - Exact: [4096, 3470, 1, 1024] + - Exact: [1024, 3526, 1, 4096] + - Exact: [4096, 3365, 1, 1024] + - Exact: [4096, 3319, 1, 1024] + - Exact: [1024, 3401, 1, 4096] + - Exact: [1024, 3294, 1, 4096] + - Exact: [159, 159, 400, 64] + - Exact: [1024, 3472, 1, 4096] + - Exact: [4096, 3328, 1, 1024] + - Exact: [1024, 3861, 1, 1024] + - Exact: [1024, 3910, 1, 1024] + - Exact: [1024, 3410, 1, 4096] + - Exact: [1024, 3395, 1, 4096] + - Exact: [4096, 3282, 1, 1024] + - Exact: [1024, 3751, 1, 1024] + - Exact: [4096, 3145, 1, 1024] + - Exact: [4096, 3514, 1, 1024] + - Exact: [4096, 3944, 1, 1024] + - Exact: [1024, 3515, 1, 4096] + - Exact: [4096, 3409, 1, 1024] + - Exact: [4096, 3564, 1, 1024] + - Exact: [4096, 3299, 1, 1024] + - Exact: [1024, 3057, 1, 4096] + - Exact: [4096, 3531, 1, 1024] + - Exact: [4096, 3388, 1, 1024] + - Exact: [1024, 3189, 1, 4096] + - Exact: [1024, 3300, 1, 4096] + - Exact: [1024, 3720, 1, 4096] + - Exact: [1024, 3383, 1, 4096] + - Exact: [1024, 3494, 1, 4096] + - Exact: [1024, 3448, 1, 4096] + - Exact: [4096, 3542, 1, 1024] + - Exact: [1024, 3488, 1, 4096] + - Exact: [4096, 3405, 1, 1024] + - Exact: [1024, 3262, 1, 4096] + - Exact: [33708, 4005, 1, 1024] + - Exact: [1024, 3594, 1, 4096] + - Exact: [4096, 3103, 1, 1024] + - Exact: [4096, 3136, 1, 1024] + - Exact: [1024, 3378, 1, 4096] + - Exact: [4096, 3559, 1, 1024] + - Exact: [4096, 3368, 1, 1024] + - Exact: [4096, 3209, 1, 1024] + - Exact: [4096, 3322, 1, 1024] + - Exact: [1024, 3483, 1, 4096] + - Exact: [4096, 3473, 1, 1024] + - Exact: [4096, 3522, 1, 1024] + - Exact: [1024, 3532, 1, 4096] + - Exact: [4096, 3449, 
1, 1024] + - Exact: [1024, 3351, 1, 4096] + - Exact: [1024, 3462, 1, 4096] + - Exact: [4096, 3396, 1, 1024] + - Exact: [132, 132, 480, 64] + - Exact: [1024, 3416, 1, 4096] + - Exact: [4096, 3469, 1, 1024] + - Exact: [1024, 3582, 1, 4096] + - Exact: [1024, 3230, 1, 4096] + - Exact: [1024, 3489, 1, 4096] + - Exact: [1024, 3427, 1, 4096] + - Exact: [1024, 3346, 1, 4096] + - Exact: [33708, 3977, 1, 1024] + - Exact: [4096, 3796, 1, 1024] + - Exact: [4096, 3176, 1, 1024] + - Exact: [4096, 3990, 1, 1024] + - Exact: [1024, 3257, 1, 4096] + - Exact: [4096, 3343, 1, 1024] + - Exact: [4096, 3440, 1, 1024] + - Exact: [33708, 4030, 1, 1024] + - Exact: [1024, 3190, 1, 4096] + - Exact: [1024, 3389, 1, 4096] + - Exact: [1024, 3500, 1, 4096] + - Exact: [1024, 3471, 1, 4096] + - Exact: [1024, 3438, 1, 4096] + - Exact: [4096, 3513, 1, 1024] + - Exact: [1024, 3562, 1, 4096] + - Exact: [4096, 3616, 1, 1024] + - Exact: [4096, 3955, 1, 1024] + - Exact: [1024, 3441, 1, 4096] + - Exact: [1024, 3236, 1, 4096] + - Exact: [1024, 3524, 1, 4096] + - Exact: [4096, 3460, 1, 1024] + - Exact: [1024, 3384, 1, 4096] + - Exact: [4096, 3387, 1, 1024] + - Exact: [4096, 3436, 1, 1024] + - Exact: [4096, 3277, 1, 1024] + - Exact: [1024, 3457, 1, 4096] + - Exact: [1024, 3999, 1, 4096] + - Exact: [1024, 4032, 1, 4096] + - Exact: [4096, 3541, 1, 1024] + - Exact: [4096, 3334, 1, 1024] + - Exact: [1024, 3393, 1, 4096] + - Exact: [1024, 3411, 1, 4096] + - Exact: [1024, 3822, 1, 1024] + - Exact: [1024, 3593, 1, 4096] + - Exact: [33708, 3822, 1, 1024] + - Exact: [4096, 3504, 1, 1024] + - Exact: [1024, 3163, 1, 4096] + - Exact: [1024, 3357, 1, 4096] + - Exact: [1024, 3906, 1, 4096] + - Exact: [4096, 3415, 1, 1024] + - Exact: [1024, 3406, 1, 4096] + - Exact: [4096, 3321, 1, 1024] + - Exact: [4096, 3584, 1, 1024] + - Exact: [1024, 2736, 1, 4096] + - Exact: [1024, 3110, 1, 4096] + - Exact: [33708, 3999, 1, 1024] + - Exact: [1024, 3093, 1, 4096] + - Exact: [4096, 3378, 1, 1024] + - Exact: [1024, 3543, 1, 4096] + - 
Exact: [33708, 3925, 1, 1024] + - Exact: [1024, 3352, 1, 4096] + - Exact: [4096, 3780, 1, 1024] + - Exact: [1024, 3990, 1, 4096] + - Exact: [4096, 3500, 1, 1024] + - Exact: [4096, 3996, 1, 1024] + - Exact: [1024, 3247, 1, 4096] + - Exact: [4096, 3395, 1, 1024] + - Exact: [1024, 3169, 1, 4096] + - Exact: [1024, 3088, 1, 4096] + - Exact: [1024, 3584, 1, 4096] + - Exact: [4096, 3093, 1, 1024] + - Exact: [1024, 3538, 1, 4096] + - Exact: [1024, 3996, 1, 1024] + - Exact: [1024, 3581, 1, 4096] + - Exact: [4096, 3374, 1, 1024] + - Exact: [33708, 3751, 1, 1024] + - Exact: [4096, 3215, 1, 1024] + - Exact: [4096, 3312, 1, 1024] + - Exact: [4096, 3581, 1, 1024] + - Exact: [4096, 3479, 1, 1024] + - Exact: [4096, 3544, 1, 1024] + - Exact: [1024, 3870, 1, 1024] + - Exact: [1024, 3374, 1, 4096] + - Exact: [1024, 2967, 1, 4096] + - Exact: [4096, 3455, 1, 1024] + - Exact: [4096, 3942, 1, 1024] + - Exact: [1024, 3528, 1, 4096] + - Exact: [4096, 3186, 1, 1024] + - Exact: [1024, 3976, 1, 1024] + - Exact: [1024, 3511, 1, 4096] + - Exact: [4096, 3573, 1, 1024] + - Exact: [4096, 3561, 1, 1024] + - Exact: [4096, 3418, 1, 1024] + - Exact: [33708, 3906, 1, 1024] + - Exact: [4096, 3259, 1, 1024] + - Exact: [4096, 3308, 1, 1024] + - Exact: [1024, 3419, 1, 4096] + - Exact: [1024, 3215, 1, 4096] + - Exact: [1024, 4030, 1, 4096] + - Exact: [4096, 3459, 1, 1024] + - Exact: [1024, 3572, 1, 4096] + - Exact: [1024, 3137, 1, 4096] + - Exact: [1024, 3312, 1, 4096] + - Exact: [1024, 3925, 1, 4096] + - Exact: [1024, 3453, 1, 4096] + - Exact: [4096, 3435, 1, 1024] + - Exact: [1024, 3176, 1, 4096] + - Exact: [1024, 3444, 1, 4096] + - Exact: [4096, 3975, 1, 1024] + - Exact: [4096, 3182, 1, 1024] + - Exact: [1024, 3475, 1, 4096] + - Exact: [33708, 3955, 1, 1024] + - Exact: [4096, 3446, 1, 1024] + - Exact: [1024, 3138, 1, 4096] + - Exact: [1024, 3549, 1, 4096] + - Exact: [4096, 3287, 1, 1024] + - Exact: [1024, 3342, 1, 4096] + - Exact: [4096, 3519, 1, 1024] + - Exact: [4096, 3552, 1, 1024] + - Exact: [4096, 
3859, 1, 1024] + - Exact: [33708, 3969, 1, 1024] + - Exact: [1024, 3369, 1, 4096] + - Exact: [4096, 3482, 1, 1024] + - Exact: [1024, 3306, 1, 4096] + - Exact: [1024, 3474, 1, 4096] + - Exact: [4096, 3377, 1, 1024] + - Exact: [4096, 3426, 1, 1024] + - Exact: [4096, 2935, 1, 1024] + - Exact: [4096, 3267, 1, 1024] + - Exact: [1024, 3299, 1, 4096] + - Exact: [1024, 3456, 1, 4096] + - Exact: [1024, 3280, 1, 4096] + - Exact: [1024, 3555, 1, 4096] + - Exact: [4096, 3499, 1, 1024] + - Exact: [4096, 3356, 1, 1024] + - Exact: [1024, 3412, 1, 4096] + - Exact: [1024, 2984, 1, 4096] + - Exact: [4096, 3141, 1, 1024] + - Exact: [4096, 3510, 1, 1024] + - Exact: [1024, 3995, 1, 1024] + - Exact: [1024, 3517, 1, 4096] + - Exact: [1024, 3455, 1, 4096] + - Exact: [1024, 3939, 1, 1024] + - Exact: [1024, 3447, 1, 4096] + - Exact: [1024, 3969, 1, 4096] + - Exact: [4096, 3527, 1, 1024] + - Exact: [4096, 3336, 1, 1024] + - Exact: [1024, 3191, 1, 4096] + - Exact: [1024, 3302, 1, 4096] + - Exact: [1024, 3337, 1, 4096] + - Exact: [4096, 3290, 1, 1024] + - Exact: [1024, 3512, 1, 4096] + - Exact: [1024, 3433, 1, 4096] + - Exact: [4096, 3876, 1, 1024] + - Exact: [4096, 3490, 1, 1024] + - Exact: [4096, 3064, 1, 1024] + - Exact: [1024, 3508, 1, 4096] + - Exact: [1024, 3956, 1, 4096] + - Exact: [4096, 3417, 1, 1024] + - Exact: [1024, 3248, 1, 4096] + - Exact: [1024, 2499, 1, 4096] + - Exact: [1024, 3186, 1, 4096] + - Exact: [1024, 3180, 1, 4096] + - Exact: [4096, 3364, 1, 1024] + - Exact: [4096, 3976, 1, 1024] + - Exact: [4096, 3205, 1, 1024] + - Exact: [4096, 3318, 1, 1024] + - Exact: [1024, 3377, 1, 4096] + - Exact: [1024, 3485, 1, 4096] + - Exact: [4096, 3181, 1, 1024] + - Exact: [4096, 3550, 1, 1024] + - Exact: [1024, 3534, 1, 4096] + - Exact: [1024, 3860, 1, 1024] + - Exact: [160, 160, 400, 64] + - Exact: [4096, 3445, 1, 1024] + - Exact: [1024, 3391, 1, 4096] + - Exact: [1024, 3221, 1, 4096] + - Exact: [4096, 3079, 1, 1024] + - Exact: [4096, 3144, 1, 1024] + - Exact: [1024, 3270, 1, 4096] + - 
Exact: [1024, 3561, 1, 4096] + - Exact: [1024, 3480, 1, 4096] + - Exact: [4096, 3408, 1, 1024] + - Exact: [1024, 3418, 1, 4096] + - Exact: [4096, 3298, 1, 1024] + - Exact: [1024, 3640, 1, 1024] + - Exact: [1024, 3449, 1, 4096] + - Exact: [1024, 4020, 1, 4096] + - Exact: [4096, 3481, 1, 1024] + - Exact: [4096, 3530, 1, 1024] + - Exact: [1024, 3216, 1, 4096] + - Exact: [1024, 3491, 1, 4096] + - Exact: [1024, 3154, 1, 4096] + - Exact: [4096, 3425, 1, 1024] + - Exact: [1024, 3348, 1, 4096] + - Exact: [1024, 3415, 1, 4096] + - Exact: [1024, 4026, 1, 1024] + - Exact: [1024, 3367, 1, 4096] + - Exact: [1024, 3259, 1, 4096] + - Exact: [1024, 3894, 1, 4096] + - Exact: [4096, 3355, 1, 1024] + - Exact: [4096, 3404, 1, 1024] + - Exact: [1024, 3308, 1, 4096] + - Exact: [4096, 3245, 1, 1024] + - Exact: [1024, 3502, 1, 4096] + - Exact: [33708, 4032, 1, 1024] + - Exact: [1024, 3424, 1, 4096] + - Exact: [4096, 3509, 1, 1024] + - Exact: [4096, 3558, 1, 1024] + - Exact: [1024, 3900, 1, 1024] + - Exact: [1024, 2505, 1, 4096] + - Exact: [4096, 3472, 1, 1024] + - Exact: [1024, 3386, 1, 4096] + - Exact: [4096, 3383, 1, 1024] + - Exact: [4096, 3448, 1, 1024] + - Exact: [4096, 4030, 1, 1024] + - Exact: [4096, 3289, 1, 1024] + - Exact: [1024, 3459, 1, 4096] + - Exact: [1024, 2918, 1, 4096] + - Exact: [4096, 3489, 1, 1024] + - Exact: [4096, 3346, 1, 1024] + - Exact: [4096, 3572, 1, 1024] + - Exact: [1024, 3955, 1, 4096] + - Exact: [4096, 3236, 1, 1024] + - Exact: [4096, 3163, 1, 1024] + - Exact: [4096, 3468, 1, 1024] + - Exact: [1024, 3165, 1, 4096] + - Exact: [1024, 3276, 1, 4096] + - Exact: [1024, 3359, 1, 4096] + - Exact: [4096, 3363, 1, 1024] + - Exact: [1024, 3385, 1, 4096] + - Exact: [1024, 3207, 1, 4096] + - Exact: [1024, 3458, 1, 4096] + - Exact: [4096, 3110, 1, 1024] + - Exact: [4096, 3925, 1, 1024] + - Exact: [1024, 3975, 1, 4096] + - Exact: [4096, 3549, 1, 1024] + - Exact: [4096, 3342, 1, 1024] + - Exact: [1024, 3859, 1, 1024] + - Exact: [1024, 3497, 1, 4096] + - Exact: [4096, 
3280, 1, 1024] + - Exact: [1024, 3435, 1, 4096] + - Exact: [1024, 3354, 1, 4096] + - Exact: [4096, 3191, 1, 1024] + - Exact: [4096, 3512, 1, 1024] + - Exact: [1024, 3055, 1, 4096] + - Exact: [4096, 2499, 1, 1024] + - Exact: [1024, 3233, 1, 4096] + - Exact: [4096, 3423, 1, 1024] + - Exact: [1024, 3319, 1, 4096] + - Exact: [4096, 3297, 1, 1024] + - Exact: [4096, 3154, 1, 1024] + - Exact: [1024, 3540, 1, 4096] + - Exact: [1024, 3289, 1, 4096] + - Exact: [4096, 3529, 1, 1024] + - Exact: [4096, 3386, 1, 1024] + - Exact: [4096, 3276, 1, 1024] + - Exact: [1024, 3244, 1, 4096] + - Exact: [1024, 3182, 1, 4096] + - Exact: [4096, 3540, 1, 1024] + - Exact: [1024, 3360, 1, 4096] + - Exact: [1024, 3942, 1, 4096] + - Exact: [4096, 3403, 1, 1024] + - Exact: [4096, 3101, 1, 1024] + - Exact: [4096, 2918, 1, 1024] + - Exact: [1024, 3465, 1, 4096] + - Exact: [33708, 3780, 1, 1024] + - Exact: [4096, 3557, 1, 1024] + - Exact: [4096, 3414, 1, 1024] + - Exact: [1024, 3948, 1, 1024] + - Exact: [4096, 3320, 1, 1024] + - Exact: [4096, 2765, 1, 1024] + - Exact: [1024, 3978, 1, 4096] + - Exact: [4096, 3487, 1, 1024] + - Exact: [4096, 3520, 1, 1024] + - Exact: [1024, 3139, 1, 4096] + - Exact: [1024, 3314, 1, 4096] + - Exact: [4096, 3431, 1, 1024] + - Exact: [1024, 3446, 1, 4096] + - Exact: [1024, 4059, 1, 4096] + - Exact: [4096, 3345, 1, 1024] + - Exact: [4096, 3394, 1, 1024] + - Exact: [1024, 3927, 1, 1024] + - Exact: [4096, 3235, 1, 1024] + - Exact: [1024, 3328, 1, 4096] + - Exact: [33708, 3956, 1, 1024] + - Exact: [4096, 3467, 1, 1024] + - Exact: [1024, 3287, 1, 4096] + - Exact: [4096, 3214, 1, 1024] + - Exact: [4096, 3910, 1, 1024] + - Exact: [1024, 3780, 1, 1024] + - Exact: [1024, 3371, 1, 4096] + - Exact: [4096, 3478, 1, 1024] + - Exact: [1024, 3546, 1, 4096] + - Exact: [1024, 4012, 1, 1024] + - Exact: [4096, 3341, 1, 1024] + - Exact: [4096, 3454, 1, 1024] + - Exact: [4096, 3295, 1, 1024] + - Exact: [4096, 3072, 1, 1024] + - Exact: [1024, 3282, 1, 4096] + - Exact: [33708, 3720, 1, 1024] + 
- Exact: [1024, 3681, 1, 4096] + - Exact: [1024, 4050, 1, 4096] + - Exact: [4096, 3495, 1, 1024] + - Exact: [4096, 3560, 1, 1024] + - Exact: [4096, 3751, 1, 1024] + - Exact: [1024, 3414, 1, 4096] + - Exact: [33708, 3860, 1, 1024] + - Exact: [1024, 3325, 1, 4096] + - Exact: [4096, 3458, 1, 1024] + - Exact: [4096, 2967, 1, 1024] + - Exact: [1024, 3519, 1, 4096] + - Exact: [4096, 3385, 1, 1024] + - Exact: [4096, 3434, 1, 1024] + - Exact: [1024, 3552, 1, 4096] + - Exact: [4096, 3822, 1, 1024] + - Exact: [1024, 3544, 1, 4096] + - Exact: [4096, 3539, 1, 1024] + - Exact: [4096, 3332, 1, 1024] + - Exact: [1024, 3145, 1, 4096] + - Exact: [1024, 3535, 1, 4096] + - Exact: [1024, 3320, 1, 4096] + - Exact: [33708, 4012, 1, 1024] + - Exact: [4096, 3286, 1, 1024] + - Exact: [1024, 3514, 1, 4096] + - Exact: [1024, 2765, 1, 4096] + - Exact: [1024, 3452, 1, 4096] + - Exact: [4096, 3518, 1, 1024] + - Exact: [1024, 3529, 1, 4096] + - Exact: [4096, 3413, 1, 1024] + - Exact: [33708, 4050, 1, 1024] + - Exact: [1024, 3525, 1, 4096] + - Exact: [4096, 3303, 1, 1024] + - Exact: [1024, 3382, 1, 4096] + - Exact: [1024, 3390, 1, 4096] + - Exact: [1024, 3977, 1, 4096] + - Exact: [1024, 3184, 1, 4096] + - Exact: [4096, 3535, 1, 1024] + - Exact: [4096, 3376, 1, 1024] + - Exact: [4096, 3978, 1, 1024] + - Exact: [1024, 3136, 1, 4096] + - Exact: [1024, 3293, 1, 4096] + - Exact: [4096, 3266, 1, 1024] + - Exact: [1024, 3487, 1, 4096] + - Exact: [1024, 3409, 1, 4096] + - Exact: [4096, 3498, 1, 1024] + - Exact: [1024, 3520, 1, 4096] + - Exact: [1024, 3530, 1, 4096] + - Exact: [4096, 3393, 1, 1024] + - Exact: [4096, 3140, 1, 1024] + - Exact: [1024, 3536, 1, 4096] + - Exact: [1024, 3288, 1, 4096] + - Exact: [1024, 4005, 1, 4096] + - Exact: [1024, 3579, 1, 4096] + - Exact: [4096, 3372, 1, 1024] + - Exact: [1024, 3440, 1, 4096] + - Exact: [4096, 3213, 1, 1024] + - Exact: [4096, 3477, 1, 1024] + - Exact: [4096, 3526, 1, 1024] + - Exact: [1024, 3493, 1, 4096] + - Exact: [1024, 3944, 1, 4096] + - Exact: [4096, 
3453, 1, 1024] + - Exact: [1024, 3350, 1, 4096] + - Exact: [4096, 3184, 1, 1024] + - Exact: [1024, 3423, 1, 4096] + - Exact: [4096, 3351, 1, 1024] + - Exact: [4096, 3416, 1, 1024] + - Exact: [1024, 3796, 1, 4096] + - Exact: [4096, 3257, 1, 1024] + - Exact: [4096, 3306, 1, 1024] + - Exact: [33708, 4020, 1, 1024] + - Exact: [1024, 3426, 1, 4096] + - Exact: [4096, 3457, 1, 1024] + - Exact: [1024, 2935, 1, 4096] + - Exact: [1024, 3046, 1, 4096] + - Exact: [4096, 3433, 1, 1024] + - Exact: [1024, 3256, 1, 4096] + - Exact: [1024, 3531, 1, 4096] + - Exact: [4096, 3180, 1, 1024] + - Exact: [1024, 3388, 1, 4096] + - Exact: [4096, 3444, 1, 1024] + - Exact: [1024, 3501, 1, 4096] + - Exact: [1024, 3266, 1, 4096] + - Exact: [1024, 3267, 1, 4096] + - Exact: [1024, 3461, 1, 4096] + - Exact: [4096, 3870, 1, 1024] + - Exact: [4096, 3517, 1, 1024] + - Exact: [1024, 3566, 1, 4096] + - Exact: [4096, 3574, 1, 1024] + - Exact: [1024, 3876, 1, 1024] + - Exact: [4096, 3720, 1, 1024] + - Exact: [4096, 3248, 1, 1024] + - Exact: [4096, 4059, 1, 1024] + - Exact: [1024, 3380, 1, 4096] + - Exact: [4096, 3480, 1, 1024] + - Exact: [1024, 3335, 1, 4096] + - Exact: [1024, 3345, 1, 4096] + - Exact: [4096, 3391, 1, 1024] + - Exact: [4096, 3424, 1, 1024] + - Exact: [1024, 3394, 1, 4096] + - Exact: [4096, 3265, 1, 1024] + - Exact: [1024, 3014, 1, 4096] + - Exact: [4096, 3497, 1, 1024] + - Exact: [4096, 3354, 1, 1024] + - Exact: [4096, 3055, 1, 1024] + - Exact: [1024, 3499, 1, 4096] + - Exact: [1024, 3162, 1, 4096] + - Exact: [4096, 3244, 1, 1024] + - Exact: [1024, 3437, 1, 4096] + - Exact: [1024, 3356, 1, 4096] + - Exact: [4096, 3139, 1, 1024] + - Exact: [4096, 3508, 1, 1024] + - Exact: [1024, 3235, 1, 4096] + - Exact: [1024, 3910, 1, 4096] + - Exact: [4096, 3371, 1, 1024] + - Exact: [1024, 3751, 1, 4096] + - Exact: [4096, 3325, 1, 1024] + - Exact: [1024, 3413, 1, 4096] + - Exact: [1024, 3542, 1, 4096] + - Exact: [33708, 3900, 1, 1024] + - Exact: [4096, 3525, 1, 1024] + - Exact: [4096, 3382, 1, 1024] + 
- Exact: [1024, 3339, 1, 4096] + - Exact: [4096, 3288, 1, 1024] + - Exact: [1024, 3141, 1, 4096] + - Exact: [1024, 3168, 1, 4096] + - Exact: [4096, 3488, 1, 1024] + - Exact: [4096, 3046, 1, 1024] + - Exact: [1024, 3362, 1, 4096] + - Exact: [33708, 3942, 1, 1024] + - Exact: [4096, 3399, 1, 1024] + - Exact: [1024, 3720, 1, 1024] + - Exact: [4096, 3563, 1, 1024] + - Exact: [1024, 3273, 1, 4096] + - Exact: [4096, 3162, 1, 1024] + - Exact: [1024, 3467, 1, 4096] + - Exact: [1024, 3130, 1, 4096] + - Exact: [1024, 3405, 1, 4096] + - Exact: [4096, 3362, 1, 1024] + - Exact: [1024, 3960, 1, 1024] + - Exact: [1024, 3712, 1, 36548] + - Exact: [1024, 3712, 1, 1024] + - Exact: [4032, 384, 1, 64] + - Exact: [1024, 2048, 1, 49] + - Exact: [4608, 512, 1, 49] + - Exact: [9216, 512, 1, 4096] + - Exact: [3456, 384, 1, 289] + - Exact: [3456, 384, 1, 169] + - Exact: [4096, 512, 1, 1001] + - Exact: [384, 448, 49, 512] + - Exact: [384, 448, 64, 256] + - Exact: [384, 448, 36, 256] + - Exact: [384, 448, 49, 256] + - Exact: [384, 448, 64, 512] + - Exact: [384, 448, 36, 512] + - Exact: [1024, 6400, 1, 65] + - Exact: [4096, 6400, 1, 256] + - Exact: [512, 3194, 1, 2048] + - Exact: [512, 3222, 1, 2048] + - Exact: [512, 3234, 1, 2048] + - Exact: [512, 3242, 1, 2048] + - Exact: [512, 3257, 1, 2048] + - Exact: [512, 3332, 1, 2048] + - Exact: [512, 3336, 1, 2048] + - Exact: [512, 3378, 1, 2048] + - Exact: [512, 3396, 1, 2048] + - Exact: [512, 3399, 1, 2048] + - Exact: [512, 3451, 1, 2048] + - Exact: [512, 3456, 1, 2048] + - Exact: [512, 3458, 1, 2048] + - Exact: [512, 3467, 1, 2048] + - Exact: [512, 3468, 1, 2048] + - Exact: [512, 3470, 1, 2048] + - Exact: [512, 3477, 1, 2048] + - Exact: [512, 3478, 1, 2048] + - Exact: [512, 3495, 1, 2048] + - Exact: [512, 3507, 1, 2048] + - Exact: [512, 3515, 1, 2048] + - Exact: [512, 3517, 1, 2048] + - Exact: [2048, 2864, 1, 512] + - Exact: [2048, 3287, 1, 512] + - Exact: [2048, 3412, 1, 512] + - Exact: [2048, 3456, 1, 512] + - Exact: [2048, 3466, 1, 512] + - 
Exact: [2048, 3476, 1, 512] + - Exact: [2048, 3999, 1, 512] + - Exact: [33708, 189, 1, 512] + - Exact: [33708, 2496, 1, 512] + - Exact: [33708, 3864, 1, 512] + - Exact: [33708, 3969, 1, 512] + - Exact: [33708, 3995, 1, 512] + - Exact: [134, 134, 240, 64] + - Exact: [135, 134, 240, 64] + - Exact: [135, 135, 240, 64] + - Exact: [512, 2790, 1, 2048] + - Exact: [512, 2864, 1, 2048] + - Exact: [512, 3092, 1, 2048] + - Exact: [512, 3113, 1, 2048] + - Exact: [512, 3137, 1, 2048] + - Exact: [512, 3165, 1, 2048] + - Exact: [512, 3166, 1, 2048] + - Exact: [512, 3219, 1, 2048] + - Exact: [512, 3237, 1, 2048] + - Exact: [512, 3246, 1, 2048] + - Exact: [512, 3249, 1, 2048] + - Exact: [512, 3251, 1, 2048] + - Exact: [512, 3262, 1, 2048] + - Exact: [512, 3268, 1, 2048] + - Exact: [512, 3282, 1, 2048] + - Exact: [512, 3286, 1, 2048] + - Exact: [512, 3287, 1, 2048] + - Exact: [512, 3293, 1, 2048] + - Exact: [512, 3297, 1, 2048] + - Exact: [512, 3307, 1, 2048] + - Exact: [512, 3314, 1, 2048] + - Exact: [512, 3315, 1, 2048] + - Exact: [512, 3319, 1, 2048] + - Exact: [512, 3322, 1, 2048] + - Exact: [512, 3323, 1, 2048] + - Exact: [512, 3324, 1, 2048] + - Exact: [512, 3325, 1, 2048] + - Exact: [512, 3327, 1, 2048] + - Exact: [512, 3329, 1, 2048] + - Exact: [512, 3339, 1, 2048] + - Exact: [512, 3342, 1, 2048] + - Exact: [512, 3344, 1, 2048] + - Exact: [512, 3358, 1, 2048] + - Exact: [512, 3360, 1, 2048] + - Exact: [512, 3364, 1, 2048] + - Exact: [512, 3365, 1, 2048] + - Exact: [512, 3369, 1, 2048] + - Exact: [512, 3371, 1, 2048] + - Exact: [512, 3374, 1, 2048] + - Exact: [512, 3376, 1, 2048] + - Exact: [512, 3377, 1, 2048] + - Exact: [512, 3381, 1, 2048] + - Exact: [512, 3382, 1, 2048] + - Exact: [512, 3383, 1, 2048] + - Exact: [512, 3384, 1, 2048] + - Exact: [512, 3385, 1, 2048] + - Exact: [512, 3386, 1, 2048] + - Exact: [512, 3388, 1, 2048] + - Exact: [512, 3390, 1, 2048] + - Exact: [512, 3391, 1, 2048] + - Exact: [512, 3402, 1, 2048] + - Exact: [512, 3410, 1, 2048] + - Exact: [512, 
3412, 1, 2048] + - Exact: [512, 3414, 1, 2048] + - Exact: [512, 3415, 1, 2048] + - Exact: [512, 3418, 1, 2048] + - Exact: [512, 3420, 1, 2048] + - Exact: [512, 3422, 1, 2048] + - Exact: [512, 3425, 1, 2048] + - Exact: [512, 3426, 1, 2048] + - Exact: [512, 3427, 1, 2048] + - Exact: [512, 3428, 1, 2048] + - Exact: [512, 3430, 1, 2048] + - Exact: [512, 3431, 1, 2048] + - Exact: [512, 3432, 1, 2048] + - Exact: [512, 3438, 1, 2048] + - Exact: [512, 3439, 1, 2048] + - Exact: [512, 3440, 1, 2048] + - Exact: [512, 3443, 1, 2048] + - Exact: [512, 3445, 1, 2048] + - Exact: [512, 3447, 1, 2048] + - Exact: [512, 3448, 1, 2048] + - Exact: [512, 3450, 1, 2048] + - Exact: [512, 3452, 1, 2048] + - Exact: [512, 3453, 1, 2048] + - Exact: [512, 3455, 1, 2048] + - Exact: [512, 3457, 1, 2048] + - Exact: [512, 3459, 1, 2048] + - Exact: [512, 3460, 1, 2048] + - Exact: [512, 3461, 1, 2048] + - Exact: [512, 3462, 1, 2048] + - Exact: [512, 3466, 1, 2048] + - Exact: [512, 3471, 1, 2048] + - Exact: [512, 3472, 1, 2048] + - Exact: [512, 3475, 1, 2048] + - Exact: [512, 3476, 1, 2048] + - Exact: [512, 3479, 1, 2048] + - Exact: [512, 3480, 1, 2048] + - Exact: [512, 3481, 1, 2048] + - Exact: [512, 3483, 1, 2048] + - Exact: [512, 3484, 1, 2048] + - Exact: [512, 3487, 1, 2048] + - Exact: [512, 3489, 1, 2048] + - Exact: [512, 3490, 1, 2048] + - Exact: [512, 3491, 1, 2048] + - Exact: [512, 3493, 1, 2048] + - Exact: [512, 3494, 1, 2048] + - Exact: [512, 3497, 1, 2048] + - Exact: [512, 3498, 1, 2048] + - Exact: [512, 3499, 1, 2048] + - Exact: [512, 3501, 1, 2048] + - Exact: [512, 3503, 1, 2048] + - Exact: [512, 3508, 1, 2048] + - Exact: [512, 3509, 1, 2048] + - Exact: [512, 3511, 1, 2048] + - Exact: [512, 3514, 1, 2048] + - Exact: [512, 3518, 1, 2048] + - Exact: [512, 3519, 1, 2048] + - Exact: [512, 3520, 1, 2048] + - Exact: [512, 3523, 1, 2048] + - Exact: [512, 3528, 1, 2048] + - Exact: [512, 3529, 1, 2048] + - Exact: [512, 3530, 1, 2048] + - Exact: [512, 3532, 1, 2048] + - Exact: [512, 3533, 1, 2048] 
+ - Exact: [512, 3534, 1, 2048] + - Exact: [512, 3538, 1, 2048] + - Exact: [512, 3539, 1, 2048] + - Exact: [512, 3541, 1, 2048] + - Exact: [512, 3547, 1, 2048] + - Exact: [512, 3548, 1, 2048] + - Exact: [512, 3552, 1, 2048] + - Exact: [512, 3564, 1, 2048] + - Exact: [512, 3575, 1, 2048] + - Exact: [512, 3598, 1, 2048] + - Exact: [512, 3599, 1, 2048] + - Exact: [512, 3608, 1, 2048] + - Exact: [512, 3780, 1, 512] + - Exact: [512, 3780, 1, 2048] + - Exact: [512, 3796, 1, 512] + - Exact: [512, 3796, 1, 2048] + - Exact: [512, 3822, 1, 512] + - Exact: [512, 3822, 1, 2048] + - Exact: [512, 3840, 1, 512] + - Exact: [512, 3840, 1, 2048] + - Exact: [512, 3859, 1, 512] + - Exact: [512, 3859, 1, 2048] + - Exact: [512, 3870, 1, 512] + - Exact: [512, 3870, 1, 2048] + - Exact: [512, 3876, 1, 512] + - Exact: [512, 3876, 1, 2048] + - Exact: [512, 3906, 1, 512] + - Exact: [512, 3906, 1, 2048] + - Exact: [512, 3910, 1, 512] + - Exact: [512, 3910, 1, 2048] + - Exact: [512, 3925, 1, 512] + - Exact: [512, 3925, 1, 2048] + - Exact: [512, 3927, 1, 512] + - Exact: [512, 3942, 1, 512] + - Exact: [512, 3942, 1, 2048] + - Exact: [512, 3944, 1, 512] + - Exact: [512, 3944, 1, 2048] + - Exact: [512, 3955, 1, 512] + - Exact: [512, 3955, 1, 2048] + - Exact: [512, 3968, 1, 512] + - Exact: [512, 3968, 1, 2048] + - Exact: [512, 3969, 1, 512] + - Exact: [512, 3969, 1, 2048] + - Exact: [512, 3976, 1, 512] + - Exact: [512, 3976, 1, 2048] + - Exact: [512, 3977, 1, 512] + - Exact: [512, 3977, 1, 2048] + - Exact: [512, 3978, 1, 512] + - Exact: [512, 3978, 1, 2048] + - Exact: [512, 3990, 1, 512] + - Exact: [512, 3990, 1, 2048] + - Exact: [512, 3995, 1, 512] + - Exact: [512, 3995, 1, 2048] + - Exact: [512, 3996, 1, 512] + - Exact: [512, 3996, 1, 2048] + - Exact: [512, 3999, 1, 512] + - Exact: [512, 3999, 1, 2048] + - Exact: [512, 4005, 1, 512] + - Exact: [512, 4005, 1, 2048] + - Exact: [512, 4012, 1, 512] + - Exact: [512, 4012, 1, 2048] + - Exact: [512, 4020, 1, 512] + - Exact: [512, 4020, 1, 2048] + - 
Exact: [512, 4026, 1, 512] + - Exact: [512, 4026, 1, 2048] + - Exact: [512, 4030, 1, 512] + - Exact: [512, 4030, 1, 2048] + - Exact: [512, 4032, 1, 512] + - Exact: [512, 4032, 1, 2048] + - Exact: [512, 4050, 1, 512] + - Exact: [512, 4059, 1, 512] + - Exact: [2048, 2790, 1, 512] + - Exact: [2048, 3092, 1, 512] + - Exact: [2048, 3113, 1, 512] + - Exact: [2048, 3137, 1, 512] + - Exact: [2048, 3165, 1, 512] + - Exact: [2048, 3166, 1, 512] + - Exact: [2048, 3194, 1, 512] + - Exact: [2048, 3219, 1, 512] + - Exact: [2048, 3222, 1, 512] + - Exact: [2048, 3234, 1, 512] + - Exact: [2048, 3237, 1, 512] + - Exact: [2048, 3242, 1, 512] + - Exact: [2048, 3246, 1, 512] + - Exact: [2048, 3249, 1, 512] + - Exact: [2048, 3251, 1, 512] + - Exact: [2048, 3257, 1, 512] + - Exact: [2048, 3262, 1, 512] + - Exact: [2048, 3268, 1, 512] + - Exact: [2048, 3282, 1, 512] + - Exact: [2048, 3286, 1, 512] + - Exact: [2048, 3293, 1, 512] + - Exact: [2048, 3297, 1, 512] + - Exact: [2048, 3307, 1, 512] + - Exact: [2048, 3314, 1, 512] + - Exact: [2048, 3315, 1, 512] + - Exact: [2048, 3319, 1, 512] + - Exact: [2048, 3322, 1, 512] + - Exact: [2048, 3323, 1, 512] + - Exact: [2048, 3324, 1, 512] + - Exact: [2048, 3325, 1, 512] + - Exact: [2048, 3327, 1, 512] + - Exact: [2048, 3329, 1, 512] + - Exact: [2048, 3332, 1, 512] + - Exact: [2048, 3336, 1, 512] + - Exact: [2048, 3339, 1, 512] + - Exact: [2048, 3342, 1, 512] + - Exact: [2048, 3344, 1, 512] + - Exact: [2048, 3358, 1, 512] + - Exact: [2048, 3360, 1, 512] + - Exact: [2048, 3364, 1, 512] + - Exact: [2048, 3365, 1, 512] + - Exact: [2048, 3369, 1, 512] + - Exact: [2048, 3371, 1, 512] + - Exact: [2048, 3374, 1, 512] + - Exact: [2048, 3376, 1, 512] + - Exact: [2048, 3377, 1, 512] + - Exact: [2048, 3378, 1, 512] + - Exact: [2048, 3381, 1, 512] + - Exact: [2048, 3382, 1, 512] + - Exact: [2048, 3383, 1, 512] + - Exact: [2048, 3384, 1, 512] + - Exact: [2048, 3385, 1, 512] + - Exact: [2048, 3386, 1, 512] + - Exact: [2048, 3388, 1, 512] + - Exact: [2048, 3390, 
1, 512] + - Exact: [2048, 3391, 1, 512] + - Exact: [2048, 3396, 1, 512] + - Exact: [2048, 3399, 1, 512] + - Exact: [2048, 3402, 1, 512] + - Exact: [2048, 3410, 1, 512] + - Exact: [2048, 3414, 1, 512] + - Exact: [2048, 3415, 1, 512] + - Exact: [2048, 3418, 1, 512] + - Exact: [2048, 3420, 1, 512] + - Exact: [2048, 3422, 1, 512] + - Exact: [2048, 3425, 1, 512] + - Exact: [2048, 3426, 1, 512] + - Exact: [2048, 3427, 1, 512] + - Exact: [2048, 3428, 1, 512] + - Exact: [2048, 3430, 1, 512] + - Exact: [2048, 3431, 1, 512] + - Exact: [2048, 3432, 1, 512] + - Exact: [2048, 3438, 1, 512] + - Exact: [2048, 3439, 1, 512] + - Exact: [2048, 3440, 1, 512] + - Exact: [2048, 3443, 1, 512] + - Exact: [2048, 3445, 1, 512] + - Exact: [2048, 3447, 1, 512] + - Exact: [2048, 3448, 1, 512] + - Exact: [2048, 3450, 1, 512] + - Exact: [2048, 3451, 1, 512] + - Exact: [2048, 3452, 1, 512] + - Exact: [2048, 3453, 1, 512] + - Exact: [2048, 3455, 1, 512] + - Exact: [2048, 3457, 1, 512] + - Exact: [2048, 3458, 1, 512] + - Exact: [2048, 3459, 1, 512] + - Exact: [2048, 3460, 1, 512] + - Exact: [2048, 3461, 1, 512] + - Exact: [2048, 3462, 1, 512] + - Exact: [2048, 3467, 1, 512] + - Exact: [2048, 3468, 1, 512] + - Exact: [2048, 3470, 1, 512] + - Exact: [2048, 3471, 1, 512] + - Exact: [2048, 3472, 1, 512] + - Exact: [2048, 3475, 1, 512] + - Exact: [2048, 3477, 1, 512] + - Exact: [2048, 3478, 1, 512] + - Exact: [2048, 3479, 1, 512] + - Exact: [2048, 3480, 1, 512] + - Exact: [2048, 3481, 1, 512] + - Exact: [2048, 3483, 1, 512] + - Exact: [2048, 3484, 1, 512] + - Exact: [2048, 3487, 1, 512] + - Exact: [2048, 3489, 1, 512] + - Exact: [2048, 3490, 1, 512] + - Exact: [2048, 3491, 1, 512] + - Exact: [2048, 3493, 1, 512] + - Exact: [2048, 3494, 1, 512] + - Exact: [2048, 3495, 1, 512] + - Exact: [2048, 3497, 1, 512] + - Exact: [2048, 3498, 1, 512] + - Exact: [2048, 3499, 1, 512] + - Exact: [2048, 3501, 1, 512] + - Exact: [2048, 3503, 1, 512] + - Exact: [2048, 3507, 1, 512] + - Exact: [2048, 3508, 1, 512] + - 
Exact: [2048, 3509, 1, 512] + - Exact: [2048, 3511, 1, 512] + - Exact: [2048, 3514, 1, 512] + - Exact: [2048, 3515, 1, 512] + - Exact: [2048, 3517, 1, 512] + - Exact: [2048, 3518, 1, 512] + - Exact: [2048, 3519, 1, 512] + - Exact: [2048, 3520, 1, 512] + - Exact: [2048, 3523, 1, 512] + - Exact: [2048, 3528, 1, 512] + - Exact: [2048, 3529, 1, 512] + - Exact: [2048, 3530, 1, 512] + - Exact: [2048, 3532, 1, 512] + - Exact: [2048, 3533, 1, 512] + - Exact: [2048, 3534, 1, 512] + - Exact: [2048, 3538, 1, 512] + - Exact: [2048, 3539, 1, 512] + - Exact: [2048, 3541, 1, 512] + - Exact: [2048, 3547, 1, 512] + - Exact: [2048, 3548, 1, 512] + - Exact: [2048, 3552, 1, 512] + - Exact: [2048, 3564, 1, 512] + - Exact: [2048, 3575, 1, 512] + - Exact: [2048, 3598, 1, 512] + - Exact: [2048, 3599, 1, 512] + - Exact: [2048, 3608, 1, 512] + - Exact: [2048, 3780, 1, 512] + - Exact: [2048, 3796, 1, 512] + - Exact: [2048, 3822, 1, 512] + - Exact: [2048, 3840, 1, 512] + - Exact: [2048, 3859, 1, 512] + - Exact: [2048, 3870, 1, 512] + - Exact: [2048, 3876, 1, 512] + - Exact: [2048, 3906, 1, 512] + - Exact: [2048, 3910, 1, 512] + - Exact: [2048, 3925, 1, 512] + - Exact: [2048, 3942, 1, 512] + - Exact: [2048, 3944, 1, 512] + - Exact: [2048, 3955, 1, 512] + - Exact: [2048, 3968, 1, 512] + - Exact: [2048, 3969, 1, 512] + - Exact: [2048, 3976, 1, 512] + - Exact: [2048, 3977, 1, 512] + - Exact: [2048, 3978, 1, 512] + - Exact: [2048, 3990, 1, 512] + - Exact: [2048, 3995, 1, 512] + - Exact: [2048, 3996, 1, 512] + - Exact: [2048, 4005, 1, 512] + - Exact: [2048, 4012, 1, 512] + - Exact: [2048, 4020, 1, 512] + - Exact: [2048, 4026, 1, 512] + - Exact: [2048, 4030, 1, 512] + - Exact: [2048, 4032, 1, 512] + - Exact: [33708, 184, 1, 512] + - Exact: [33708, 208, 1, 512] + - Exact: [33708, 246, 1, 512] + - Exact: [33708, 264, 1, 512] + - Exact: [33708, 465, 1, 512] + - Exact: [33708, 468, 1, 512] + - Exact: [33708, 493, 1, 512] + - Exact: [33708, 540, 1, 512] + - Exact: [33708, 550, 1, 512] + - Exact: [33708, 
560, 1, 512] + - Exact: [33708, 644, 1, 512] + - Exact: [33708, 714, 1, 512] + - Exact: [33708, 720, 1, 512] + - Exact: [33708, 781, 1, 512] + - Exact: [33708, 936, 1, 512] + - Exact: [33708, 980, 1, 512] + - Exact: [33708, 1232, 1, 512] + - Exact: [33708, 1290, 1, 512] + - Exact: [33708, 1350, 1, 512] + - Exact: [33708, 1424, 1, 512] + - Exact: [33708, 1458, 1, 512] + - Exact: [33708, 1462, 1, 512] + - Exact: [33708, 1520, 1, 512] + - Exact: [33708, 1596, 1, 512] + - Exact: [33708, 1599, 1, 512] + - Exact: [33708, 1615, 1, 512] + - Exact: [33708, 1680, 1, 512] + - Exact: [33708, 1917, 1, 512] + - Exact: [33708, 2205, 1, 512] + - Exact: [33708, 2418, 1, 512] + - Exact: [33708, 3776, 1, 512] + - Exact: [33708, 3780, 1, 512] + - Exact: [33708, 3796, 1, 512] + - Exact: [33708, 3822, 1, 512] + - Exact: [33708, 3835, 1, 512] + - Exact: [33708, 3840, 1, 512] + - Exact: [33708, 3859, 1, 512] + - Exact: [33708, 3870, 1, 512] + - Exact: [33708, 3876, 1, 512] + - Exact: [33708, 3906, 1, 512] + - Exact: [33708, 3910, 1, 512] + - Exact: [33708, 3925, 1, 512] + - Exact: [33708, 3942, 1, 512] + - Exact: [33708, 3944, 1, 512] + - Exact: [33708, 3955, 1, 512] + - Exact: [33708, 3968, 1, 512] + - Exact: [33708, 3976, 1, 512] + - Exact: [33708, 3977, 1, 512] + - Exact: [33708, 3978, 1, 512] + - Exact: [33708, 3990, 1, 512] + - Exact: [33708, 3996, 1, 512] + - Exact: [33708, 3999, 1, 512] + - Exact: [33708, 4005, 1, 512] + - Exact: [33708, 4012, 1, 512] + - Exact: [33708, 4020, 1, 512] + - Exact: [33708, 4026, 1, 512] + - Exact: [33708, 4030, 1, 512] + - Exact: [33708, 4032, 1, 512] + - Exact: [3072, 512, 1, 3072] + - Exact: [511, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8193, 1, 8192] + - Exact: [3072, 3072, 1, 3071] + - Exact: [8192, 8192, 1, 8193] + - Exact: [7681, 8192, 1, 8192] + - Exact: [7680, 8192, 1, 8193] + - Exact: [513, 4096, 1, 4096] + - Exact: [3073, 512, 1, 3072] + - Exact: [7680, 8192, 1, 8192] + - Exact: [4096, 4096, 1, 4097] + - Exact: 
[8192, 8191, 1, 8192] + - Exact: [8192, 512, 1, 8193] + - Exact: [2880, 3071, 1, 3072] + - Exact: [2880, 3072, 1, 3072] + - Exact: [4096, 511, 1, 4096] + - Exact: [512, 3072, 1, 3072] + - Exact: [512, 8191, 1, 8192] + - Exact: [4096, 4095, 1, 4096] + - Exact: [8192, 511, 1, 8192] + - Exact: [8192, 512, 1, 8192] + - Exact: [511, 3072, 1, 3072] + - Exact: [7680, 8193, 1, 8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 512, 1, 3073] + - Exact: [513, 8192, 1, 8192] + - Exact: [7679, 8192, 1, 8192] + - Exact: [3840, 4096, 1, 4097] + - Exact: [512, 3072, 1, 3071] + - Exact: [7680, 8192, 1, 8191] + - Exact: [3072, 511, 1, 3072] + - Exact: [8193, 8192, 1, 8192] + - Exact: [512, 4096, 1, 4095] + - Exact: [512, 3071, 1, 3072] + - Exact: [3073, 3072, 1, 3072] + - Exact: [512, 3073, 1, 3072] + - Exact: [4096, 4096, 1, 4095] + - Exact: [1920, 2048, 1, 2047] + - Exact: [1920, 2049, 1, 2048] + - Exact: [512, 8192, 1, 8191] + - Exact: [3840, 4096, 1, 4096] + - Exact: [8191, 512, 1, 8192] + - Exact: [2881, 3072, 1, 3072] + - Exact: [512, 4096, 1, 4096] + - Exact: [3841, 4096, 1, 4096] + - Exact: [2880, 3072, 1, 3073] + - Exact: [4095, 512, 1, 4096] + - Exact: [1919, 2048, 1, 2048] + - Exact: [1920, 2048, 1, 2048] + - Exact: [8192, 8192, 1, 8192] + - Exact: [511, 4096, 1, 4096] + - Exact: [8192, 513, 1, 8192] + - Exact: [513, 3072, 1, 3072] + - Exact: [7680, 8191, 1, 8192] + - Exact: [512, 4097, 1, 4096] + - Exact: [2047, 2048, 1, 2048] + - Exact: [2049, 2048, 1, 2048] + - Exact: [3840, 4095, 1, 4096] + - Exact: [2880, 3072, 1, 3071] + - Exact: [3072, 3072, 1, 3073] + - Exact: [2880, 3073, 1, 3072] + - Exact: [4096, 513, 1, 4096] + - Exact: [4097, 512, 1, 4096] + - Exact: [8192, 512, 1, 8191] + - Exact: [1921, 2048, 1, 2048] + - Exact: [512, 3072, 1, 3073] + - Exact: [2048, 2049, 1, 2048] + - Exact: [3072, 512, 1, 3071] + - Exact: [3071, 3072, 1, 3072] + - Exact: [3840, 4097, 1, 4096] + - Exact: [2048, 2047, 1, 2048] + - Exact: [2879, 3072, 1, 3072] + - Exact: [3072, 513, 
1, 3072] + - Exact: [512, 4095, 1, 4096] + - Exact: [3071, 512, 1, 3072] + - Exact: [4096, 512, 1, 4096] + - Exact: [4097, 4096, 1, 4096] + - Exact: [2048, 2048, 1, 2047] + - Exact: [3839, 4096, 1, 4096] + - Exact: [512, 4096, 1, 4097] + - Exact: [3072, 3073, 1, 3072] + - Exact: [2048, 2048, 1, 2049] + - Exact: [8191, 8192, 1, 8192] + - Exact: [3072, 3071, 1, 3072] + - Exact: [4096, 512, 1, 4097] + - Exact: [3840, 4096, 1, 4095] + - Exact: [1920, 2047, 1, 2048] + - Exact: [8192, 8192, 1, 8191] + - Exact: [3072, 3072, 1, 3072] + - Exact: [512, 8193, 1, 8192] + - Exact: [4096, 512, 1, 4095] + - Exact: [8193, 512, 1, 8192] + - Exact: [4095, 4096, 1, 4096] + - Exact: [4096, 4097, 1, 4096] + - Exact: [512, 8192, 1, 8192] + - Exact: [512, 8192, 1, 8193] + - Exact: [1920, 2048, 1, 2049] + - Exact: [479, 3072, 1, 3072] + - Exact: [479, 4096, 1, 4096] + - Exact: [479, 8192, 1, 8192] + - Exact: [480, 3072, 1, 3071] + - Exact: [480, 3072, 1, 3073] + - Exact: [480, 3073, 1, 3072] + - Exact: [480, 4095, 1, 4096] + - Exact: [480, 4096, 1, 4095] + - Exact: [480, 4096, 1, 4097] + - Exact: [480, 4097, 1, 4096] + - Exact: [480, 8191, 1, 8192] + - Exact: [480, 8192, 1, 8191] + - Exact: [480, 8192, 1, 8193] + - Exact: [480, 8193, 1, 8192] + - Exact: [481, 3072, 1, 3072] + - Exact: [481, 4096, 1, 4096] + - Exact: [481, 8192, 1, 8192] + - Exact: [3072, 479, 1, 3072] + - Exact: [3072, 480, 1, 3071] + - Exact: [3072, 480, 1, 3073] + - Exact: [3072, 481, 1, 3072] + - Exact: [3073, 480, 1, 3072] + - Exact: [480, 3072, 1, 3072] + - Exact: [480, 4096, 1, 4096] + - Exact: [480, 8192, 1, 8192] + - Exact: [3072, 480, 1, 3072] + - Exact: [4096, 480, 1, 4096] + - Exact: [8192, 480, 1, 8192] + - Exact: [1024, 3840, 1, 1024] + - Exact: [1024, 3840, 1, 4096] + - Exact: [1024, 3968, 1, 1024] + - Exact: [1024, 3968, 1, 4096] + - Exact: [1024, 7200, 1, 1024] + - Exact: [1024, 7200, 1, 4096] + - Exact: [1024, 8160, 1, 1024] + - Exact: [1024, 8160, 1, 4096] + - Exact: [1024, 9520, 1, 1024] + - Exact: 
[1024, 9520, 1, 4096] + - Exact: [1024, 10200, 1, 1024] + - Exact: [1024, 10200, 1, 4096] + - Exact: [4096, 3840, 1, 1024] + - Exact: [4096, 3968, 1, 1024] + - Exact: [4096, 7200, 1, 1024] + - Exact: [4096, 8160, 1, 1024] + - Exact: [4096, 9520, 1, 1024] + - Exact: [4096, 10200, 1, 1024] + - Exact: [42720, 3968, 1, 1024] + - Exact: [42720, 7200, 1, 1024] + - Exact: [42720, 9520, 1, 1024] + - Exact: [2048, 960, 1, 2048] + - Exact: [2048, 960, 1, 74] + - Exact: [1600, 1024, 1, 960] + - Exact: [2048, 2048, 1, 960] + - Exact: [4096, 1024, 1, 257] + - Exact: [10240, 8976, 1, 256] + - Exact: [1024, 1600, 1, 1024] + - Exact: [1024, 1600, 1, 560] + - Exact: [10496, 8976, 1, 256] + - Exact: [11264, 8976, 1, 256] + - Exact: [11776, 8976, 1, 256] + - Exact: [12544, 8976, 1, 256] + - Exact: [1280, 8976, 1, 256] + - Exact: [13312, 8976, 1, 256] + - Exact: [13568, 8976, 1, 256] + - Exact: [13824, 8976, 1, 256] + - Exact: [15104, 8976, 1, 256] + - Exact: [15360, 8976, 1, 256] + - Exact: [15872, 8976, 1, 256] + - Exact: [16128, 8976, 1, 256] + - Exact: [17152, 8976, 1, 256] + - Exact: [1792, 8976, 1, 256] + - Exact: [18176, 8976, 1, 256] + - Exact: [18688, 8976, 1, 256] + - Exact: [18944, 8976, 1, 256] + - Exact: [19712, 8976, 1, 256] + - Exact: [19968, 8976, 1, 256] + - Exact: [20480, 8976, 1, 256] + - Exact: [2048, 1536, 1, 512] + - Exact: [2048, 1536, 1, 768] + - Exact: [2048, 684, 1, 512] + - Exact: [2048, 684, 1, 768] + - Exact: [2048, 8976, 1, 256] + - Exact: [20992, 8976, 1, 256] + - Exact: [21248, 8976, 1, 256] + - Exact: [2304, 8976, 1, 256] + - Exact: [23552, 8976, 1, 256] + - Exact: [2560, 8976, 1, 256] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + - Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - Exact: 
[256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [2816, 8976, 1, 256] + - Exact: [28672, 8976, 1, 256] + - Exact: [3072, 8976, 1, 256] + - Exact: [31488, 8976, 1, 256] + - Exact: [3328, 8976, 1, 256] + - Exact: [33536, 8976, 1, 256] + - Exact: [3840, 8976, 1, 256] + - Exact: [4096, 8976, 1, 256] + - Exact: [4352, 8976, 1, 256] + - Exact: [44505, 8976, 1, 256] + - Exact: [4608, 8976, 1, 256] + - Exact: [4864, 8976, 1, 256] + - Exact: [5120, 8976, 1, 256] + - Exact: [5376, 8976, 1, 256] + - Exact: [5632, 8976, 1, 256] + - Exact: [5888, 8976, 1, 256] + - Exact: [6144, 8976, 1, 256] + - Exact: [6400, 8976, 1, 256] + - Exact: [684, 8976, 1, 256] + - Exact: [7168, 8976, 1, 256] + - Exact: [7936, 8976, 1, 256] + - Exact: [8192, 8976, 1, 256] + - Exact: [8448, 8976, 1, 256] + - Exact: [8960, 8976, 1, 256] + - Exact: [9472, 8976, 1, 256] + - Exact: [9728, 8976, 1, 256] + - Exact: [9984, 8976, 1, 256] + - Exact: [256, 10496, 1, 1024] + - Exact: [256, 11264, 1, 1024] + - Exact: [256, 11520, 1, 1024] + - Exact: [256, 11776, 1, 1024] + - Exact: [256, 12544, 1, 1024] + - Exact: [256, 13312, 1, 1024] + - Exact: [256, 14336, 1, 1024] + 
- Exact: [256, 14592, 1, 1024] + - Exact: [256, 14848, 1, 1024] + - Exact: [256, 15104, 1, 1024] + - Exact: [256, 16128, 1, 1024] + - Exact: [256, 18176, 1, 1024] + - Exact: [256, 18944, 1, 1024] + - Exact: [256, 19200, 1, 1024] + - Exact: [256, 20480, 1, 1024] + - Exact: [256, 20992, 1, 1024] + - Exact: [256, 21248, 1, 1024] + - Exact: [256, 21504, 1, 1024] + - Exact: [256, 22016, 1, 1024] + - Exact: [256, 22344, 1, 1024] + - Exact: [256, 23296, 1, 1024] + - Exact: [256, 23552, 1, 1024] + - Exact: [256, 31488, 1, 1024] + - Exact: [256, 33536, 1, 1024] + - Exact: [256, 44505, 1, 1024] + - Exact: [256, 4608, 1, 1024] + - Exact: [256, 4864, 1, 1024] + - Exact: [256, 5376, 1, 1024] + - Exact: [256, 5888, 1, 1024] + - Exact: [256, 6144, 1, 1024] + - Exact: [256, 6400, 1, 1024] + - Exact: [256, 6656, 1, 1024] + - Exact: [256, 7168, 1, 1024] + - Exact: [256, 7424, 1, 1024] + - Exact: [256, 7936, 1, 1024] + - Exact: [256, 8192, 1, 1024] + - Exact: [256, 8448, 1, 1024] + - Exact: [256, 8960, 1, 1024] + - Exact: [256, 9984, 1, 1024] + - Exact: [512, 32768, 1, 13] + - Exact: [256, 32768, 1, 512] + - Exact: [128, 32768, 1, 512] + - Exact: [1024, 32768, 1, 479] + - Exact: [1024, 32768, 1, 1024] + - Exact: [512, 32768, 1, 1024] + - Exact: [1023, 2048, 1, 4096] + - Exact: [1025, 2048, 1, 4096] + - Exact: [1024, 2047, 1, 4096] + - Exact: [1024, 2049, 1, 4096] + - Exact: [1024, 2048, 1, 4095] + - Exact: [1024, 2048, 1, 4097] + - Exact: [1023, 3072, 1, 1024] + - Exact: [1025, 3072, 1, 1024] + - Exact: [1024, 3071, 1, 1024] + - Exact: [1024, 3073, 1, 1024] + - Exact: [1024, 3072, 1, 1023] + - Exact: [1024, 3072, 1, 1025] + - Exact: [3071, 512, 1, 1024] + - Exact: [3073, 512, 1, 1024] + - Exact: [3072, 511, 1, 1024] + - Exact: [3072, 513, 1, 1024] + - Exact: [3072, 512, 1, 1023] + - Exact: [3072, 512, 1, 1025] + - Exact: [128, 32768, 1, 256] + - Exact: [1024, 4096, 1, 480] + - Exact: [512, 4096, 1, 1024] + - Exact: [512, 55296, 1, 13] + - Exact: [256, 55296, 1, 512] + - Exact: [128, 
55296, 1, 256] + - Exact: [1024, 6912, 1, 480] + - Exact: [1024, 6912, 1, 1024] + - Exact: [512, 6912, 1, 1024] + - Exact: [256, 6912, 1, 512] + - Exact: [1151, 1152, 1, 1152] + - Exact: [1153, 1152, 1, 1152] + - Exact: [1152, 1151, 1, 1152] + - Exact: [1152, 1153, 1, 1152] + - Exact: [1152, 1152, 1, 1151] + - Exact: [1152, 1152, 1, 1153] + - Exact: [1535, 1536, 1, 1536] + - Exact: [1537, 1536, 1, 1536] + - Exact: [1536, 1535, 1, 1536] + - Exact: [1536, 1537, 1, 1536] + - Exact: [1536, 1536, 1, 1535] + - Exact: [1536, 1536, 1, 1537] + - Exact: [1919, 1920, 1, 1920] + - Exact: [1921, 1920, 1, 1920] + - Exact: [1920, 1919, 1, 1920] + - Exact: [1920, 1921, 1, 1920] + - Exact: [1920, 1920, 1, 1919] + - Exact: [1920, 1920, 1, 1921] + - Exact: [2303, 2304, 1, 2304] + - Exact: [2305, 2304, 1, 2304] + - Exact: [2304, 2303, 1, 2304] + - Exact: [2304, 2305, 1, 2304] + - Exact: [2304, 2304, 1, 2303] + - Exact: [2304, 2304, 1, 2305] + - Exact: [2687, 2688, 1, 2688] + - Exact: [2689, 2688, 1, 2688] + - Exact: [2688, 2687, 1, 2688] + - Exact: [2688, 2689, 1, 2688] + - Exact: [2688, 2688, 1, 2687] + - Exact: [2688, 2688, 1, 2689] + - Exact: [3455, 3456, 1, 3456] + - Exact: [3457, 3456, 1, 3456] + - Exact: [3456, 3455, 1, 3456] + - Exact: [3456, 3457, 1, 3456] + - Exact: [3456, 3456, 1, 3455] + - Exact: [3456, 3456, 1, 3457] + - Exact: [3839, 3840, 1, 3840] + - Exact: [3841, 3840, 1, 3840] + - Exact: [3840, 3839, 1, 3840] + - Exact: [3840, 3841, 1, 3840] + - Exact: [3840, 3840, 1, 3839] + - Exact: [3840, 3840, 1, 3841] + - Exact: [4223, 4224, 1, 4224] + - Exact: [4225, 4224, 1, 4224] + - Exact: [4224, 4223, 1, 4224] + - Exact: [4224, 4225, 1, 4224] + - Exact: [4224, 4224, 1, 4223] + - Exact: [4224, 4224, 1, 4225] + - Exact: [4607, 4608, 1, 4608] + - Exact: [4609, 4608, 1, 4608] + - Exact: [4608, 4607, 1, 4608] + - Exact: [4608, 4609, 1, 4608] + - Exact: [4608, 4608, 1, 4607] + - Exact: [4608, 4608, 1, 4609] + - Exact: [4991, 4992, 1, 4992] + - Exact: [4993, 4992, 1, 4992] + - 
Exact: [4992, 4991, 1, 4992] + - Exact: [4992, 4993, 1, 4992] + - Exact: [4992, 4992, 1, 4991] + - Exact: [4992, 4992, 1, 4993] + - Exact: [5375, 5376, 1, 5376] + - Exact: [5377, 5376, 1, 5376] + - Exact: [5376, 5375, 1, 5376] + - Exact: [5376, 5377, 1, 5376] + - Exact: [5376, 5376, 1, 5375] + - Exact: [5376, 5376, 1, 5377] + - Exact: [5759, 5760, 1, 5760] + - Exact: [5761, 5760, 1, 5760] + - Exact: [5760, 5759, 1, 5760] + - Exact: [5760, 5761, 1, 5760] + - Exact: [5760, 5760, 1, 5759] + - Exact: [5760, 5760, 1, 5761] + - Exact: [6143, 6144, 1, 6144] + - Exact: [6145, 6144, 1, 6144] + - Exact: [6144, 6143, 1, 6144] + - Exact: [6144, 6145, 1, 6144] + - Exact: [6144, 6144, 1, 6143] + - Exact: [6144, 6144, 1, 6145] + - Exact: [6527, 6528, 1, 6528] + - Exact: [6529, 6528, 1, 6528] + - Exact: [6528, 6527, 1, 6528] + - Exact: [6528, 6529, 1, 6528] + - Exact: [6528, 6528, 1, 6527] + - Exact: [6528, 6528, 1, 6529] + - Exact: [6911, 6912, 1, 6912] + - Exact: [6913, 6912, 1, 6912] + - Exact: [6912, 6911, 1, 6912] + - Exact: [6912, 6913, 1, 6912] + - Exact: [6912, 6912, 1, 6911] + - Exact: [6912, 6912, 1, 6913] + - Exact: [7295, 7296, 1, 7296] + - Exact: [7297, 7296, 1, 7296] + - Exact: [7296, 7295, 1, 7296] + - Exact: [7296, 7297, 1, 7296] + - Exact: [7296, 7296, 1, 7295] + - Exact: [7296, 7296, 1, 7297] + - Exact: [7679, 7680, 1, 7680] + - Exact: [7681, 7680, 1, 7680] + - Exact: [7680, 7679, 1, 7680] + - Exact: [7680, 7681, 1, 7680] + - Exact: [7680, 7680, 1, 7679] + - Exact: [7680, 7680, 1, 7681] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 
1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + - Exact: [256, 128, 49, 1152] + - Exact: [256, 128, 121, 120] + - Exact: [256, 128, 169, 120] + - Exact: [256, 128, 36, 120] + - Exact: [256, 128, 49, 120] + - Exact: [256, 128, 64, 120] + - Exact: [256, 128, 36, 12000] + - Exact: [256, 128, 49, 1216] + - Exact: [256, 128, 121, 18] + - Exact: [256, 128, 169, 18] + - Exact: [256, 128, 36, 18] + - Exact: [256, 128, 49, 18] + - Exact: [256, 128, 64, 18] + - Exact: [256, 128, 36, 1800] + - Exact: [256, 128, 121, 19] + - Exact: [256, 128, 169, 19] + - Exact: [256, 128, 36, 19] + - Exact: [256, 128, 49, 19] + - Exact: [256, 128, 64, 19] + - Exact: [256, 128, 36, 1900] + - Exact: [256, 128, 49, 480] + - Exact: [256, 128, 81, 480] + - Exact: [256, 128, 64, 5880] + - Exact: [256, 128, 49, 72] + - Exact: [256, 128, 81, 72] + - Exact: [256, 128, 49, 76] + - Exact: [256, 128, 81, 76] + - Exact: [256, 128, 49, 7680] + - Exact: [256, 128, 64, 882] + - Exact: [256, 128, 64, 931] + - Exact: [256, 256, 49, 1152] + - Exact: [256, 256, 36, 12000] + - Exact: [256, 256, 49, 1216] + - Exact: [256, 256, 36, 1800] + - Exact: [256, 256, 36, 1900] + - Exact: [256, 256, 64, 5880] + - Exact: [256, 256, 49, 7680] + - Exact: [256, 256, 64, 882] + - Exact: [256, 256, 64, 931] + - Exact: [340, 256, 49, 1152] + - Exact: [340, 256, 36, 120] + - Exact: [340, 256, 49, 120] + - Exact: [340, 256, 64, 120] + - Exact: [340, 256, 36, 12000] + - Exact: [340, 256, 49, 1216] + - Exact: [340, 256, 36, 18] + - Exact: [340, 256, 49, 18] + - Exact: [340, 256, 64, 18] + - Exact: [340, 256, 36, 1800] + - Exact: [340, 256, 36, 19] + - Exact: [340, 256, 49, 19] + - Exact: [340, 256, 64, 19] + - Exact: [340, 256, 36, 1900] + - Exact: [340, 256, 64, 5880] + - Exact: [340, 256, 49, 7680] + - Exact: [340, 256, 64, 882] + - Exact: [340, 256, 64, 931] + - Exact: [510, 256, 49, 120] + - Exact: [510, 256, 64, 120] + - Exact: [510, 256, 49, 18] + - Exact: [510, 256, 64, 18] + - Exact: [510, 256, 49, 
19] + - Exact: [510, 256, 64, 19] + - Exact: [510, 256, 36, 480] + - Exact: [510, 256, 36, 72] + - Exact: [510, 256, 36, 76] + - Exact: [510, 512, 36, 1080] + - Exact: [510, 512, 36, 162] + - Exact: [510, 512, 36, 171] + - Exact: [510, 512, 49, 1920] + - Exact: [510, 512, 64, 1920] + - Exact: [510, 512, 49, 288] + - Exact: [510, 512, 64, 288] + - Exact: [510, 512, 36, 3000] + - Exact: [510, 512, 49, 304] + - Exact: [510, 512, 64, 304] + - Exact: [510, 512, 36, 450] + - Exact: [510, 512, 36, 475] + - Exact: [510, 512, 49, 480] + - Exact: [510, 512, 64, 480] + - Exact: [510, 512, 49, 72] + - Exact: [510, 512, 64, 72] + - Exact: [510, 512, 49, 76] + - Exact: [510, 512, 64, 76] + - Exact: [512, 256, 81, 1080] + - Exact: [512, 256, 25, 12000] + - Exact: [512, 256, 81, 162] + - Exact: [512, 256, 81, 171] + - Exact: [512, 256, 25, 1800] + - Exact: [512, 256, 25, 1900] + - Exact: [512, 256, 121, 1920] + - Exact: [512, 256, 169, 1920] + - Exact: [512, 256, 49, 1920] + - Exact: [512, 256, 121, 288] + - Exact: [512, 256, 169, 288] + - Exact: [512, 256, 49, 288] + - Exact: [512, 256, 25, 3000] + - Exact: [512, 256, 81, 3000] + - Exact: [512, 256, 121, 304] + - Exact: [512, 256, 169, 304] + - Exact: [512, 256, 49, 304] + - Exact: [512, 256, 25, 450] + - Exact: [512, 256, 81, 450] + - Exact: [512, 256, 25, 475] + - Exact: [512, 256, 81, 475] + - Exact: [512, 256, 121, 480] + - Exact: [512, 256, 169, 480] + - Exact: [512, 256, 49, 5880] + - Exact: [512, 256, 121, 72] + - Exact: [512, 256, 169, 72] + - Exact: [512, 256, 121, 76] + - Exact: [512, 256, 169, 76] + - Exact: [512, 256, 49, 882] + - Exact: [512, 256, 49, 931] + - Exact: [2304, 512, 1, 100] + - Exact: [2304, 512, 1, 361] + - Exact: [4608, 510, 1, 100] + - Exact: [4608, 510, 1, 361] + - Exact: [8192, 7680, 1, 8192] + - Exact: [4096, 3840, 1, 4096] + - Exact: [2048, 1920, 1, 2048] + - Exact: [30522, 616, 1, 1024] + - Exact: [128, 128, 128, 64] + - Exact: [128, 128, 160, 64] + - Exact: [1024, 1280, 1, 1024] + - Exact: 
[1024, 1280, 1, 4096] + - Exact: [4096, 1280, 1, 1024] + - Exact: [30522, 200, 1, 1024] + - Exact: [128, 128, 624, 64] + - Exact: [1024, 4992, 1, 1024] + - Exact: [1024, 4992, 1, 4096] + - Exact: [4096, 4992, 1, 1024] + - Exact: [30522, 780, 1, 1024] + - Exact: [30522, 308, 1, 1024] + - Exact: [128, 128, 640, 64] + - Exact: [1024, 5120, 1, 1024] + - Exact: [1024, 5120, 1, 4096] + - Exact: [4096, 5120, 1, 1024] + - Exact: [30522, 800, 1, 1024] + - Exact: [128, 128, 656, 64] + - Exact: [1024, 5248, 1, 1024] + - Exact: [1024, 5248, 1, 4096] + - Exact: [4096, 5248, 1, 1024] + - Exact: [30522, 820, 1, 1024] + - Exact: [512, 512, 80, 64] + - Exact: [1024, 2560, 1, 1024] + - Exact: [1024, 2560, 1, 4096] + - Exact: [4096, 2560, 1, 1024] + - Exact: [30522, 385, 1, 1024] + - Exact: [30522, 462, 1, 1024] + - Exact: [128, 128, 144, 64] + - Exact: [1024, 1152, 1, 1024] + - Exact: [1024, 1152, 1, 4096] + - Exact: [4096, 1152, 1, 1024] + - Exact: [30522, 180, 1, 1024] + - Exact: [1024, 8192, 1, 1024] + - Exact: [1024, 8192, 1, 4096] + - Exact: [1024, 9600, 1, 1024] + - Exact: [1024, 9600, 1, 4096] + - Exact: [4096, 8192, 1, 1024] + - Exact: [4096, 9600, 1, 1024] + - Exact: [33712, 8192, 1, 1024] + - Exact: [33712, 9600, 1, 1024] + - Exact: [1024, 10064, 1, 1024] + - Exact: [1024, 10064, 1, 4096] + - Exact: [1024, 10080, 1, 1024] + - Exact: [1024, 10080, 1, 4096] + - Exact: [1024, 6528, 1, 1024] + - Exact: [1024, 6528, 1, 4096] + - Exact: [1024, 7104, 1, 1024] + - Exact: [1024, 7104, 1, 4096] + - Exact: [1024, 8064, 1, 1024] + - Exact: [1024, 8064, 1, 4096] + - Exact: [1024, 9216, 1, 1024] + - Exact: [1024, 9216, 1, 4096] + - Exact: [4096, 10064, 1, 1024] + - Exact: [4096, 10080, 1, 1024] + - Exact: [4096, 6528, 1, 1024] + - Exact: [4096, 7104, 1, 1024] + - Exact: [4096, 8064, 1, 1024] + - Exact: [4096, 9216, 1, 1024] + - Exact: [42720, 10080, 1, 1024] + - Exact: [42720, 6528, 1, 1024] + - Exact: [42720, 7104, 1, 1024] + - Exact: [1024, 32768, 1, 480] + - Exact: [30592, 1024, 1, 
2048] + - Exact: [6144, 1024, 1, 2048] + - Exact: [8192, 1024, 1, 2048] + - Exact: [30592, 8192, 1, 1024] + - Exact: [3072, 8192, 1, 1024] + - Exact: [512, 512, 256, 64] + - Exact: [30592, 2048, 1, 1024] + - Exact: [30592, 4096, 1, 1024] + - Exact: [3072, 4096, 1, 1024] + - Exact: [1920, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 2560] + - Exact: [2560, 2048, 1, 640] + - Exact: [7680, 2048, 1, 2560] + - Exact: [512, 512, 40, 64] + - Exact: [1536, 4096, 1, 1536] + - Exact: [1536, 4096, 1, 6144] + - Exact: [4608, 4096, 1, 1536] + - Exact: [50304, 4096, 1, 1536] + - Exact: [6144, 4096, 1, 1536] + - Exact: [1024, 1024, 64, 96] + - Exact: [1536, 8192, 1, 1536] + - Exact: [1536, 8192, 1, 6144] + - Exact: [4608, 8192, 1, 1536] + - Exact: [50304, 8192, 1, 1536] + - Exact: [6144, 8192, 1, 1536] + - Exact: [1024, 1024, 128, 96] + - Exact: [1024, 16384, 1, 1024] + - Exact: [1024, 16384, 1, 4096] + - Exact: [3072, 16384, 1, 1024] + - Exact: [4096, 16384, 1, 1024] + - Exact: [50304, 16384, 1, 1024] + - Exact: [1024, 1024, 256, 64] + - Exact: [50304, 2048, 1, 1024] + - Exact: [1024, 1024, 32, 64] + - Exact: [50304, 4096, 1, 1024] + - Exact: [1024, 1024, 64, 64] + - Exact: [50304, 8192, 1, 1024] + - Exact: [1024, 1024, 128, 64] + - Exact: [30528, 8192, 1, 1024] + - Exact: [128, 128, 1024, 64] + - Exact: [1024, 3456, 1, 1024] + - Exact: [1024, 3456, 1, 480] + - Exact: [512, 3456, 1, 1024] + - Exact: [512, 3456, 1, 13] + - Exact: [512, 4096, 1, 13] + - Exact: [512, 6912, 1, 13] + - Exact: [30528, 640, 1, 1024] + - Exact: [30528, 1280, 1, 1024] + - Exact: [30528, 1600, 1, 1024] + - Exact: [1024, 10240, 1, 1024] + - Exact: [4096, 10240, 1, 1024] + - Exact: [1024, 10240, 1, 4096] + - Exact: [128, 128, 1280, 64] + - Exact: [1024, 10496, 1, 4096] + - Exact: [30528, 1640, 1, 1024] + - Exact: [4096, 10496, 1, 1024] + - Exact: [1024, 10496, 1, 1024] + - Exact: [128, 128, 1312, 64] + - Exact: [30528, 160, 1, 1024] + - Exact: [30528, 240, 1, 1024] + - Exact: [1024, 6144, 1, 1024] + - Exact: 
[4096, 6144, 1, 1024] + - Exact: [1024, 6144, 1, 4096] + - Exact: [512, 512, 192, 64] + - Exact: [1024, 10224, 1, 1024] + - Exact: [1024, 10192, 1, 1024] + - Exact: [1024, 10208, 1, 1024] + - Exact: [1024, 10224, 1, 4096] + - Exact: [4096, 10224, 1, 1024] + - Exact: [3072, 10224, 1, 1024] + - Exact: [3072, 10240, 1, 1024] + - Exact: [1024, 10192, 1, 4096] + - Exact: [4096, 10192, 1, 1024] + - Exact: [3072, 10192, 1, 1024] + - Exact: [3072, 10200, 1, 1024] + - Exact: [1024, 10184, 1, 1024] + - Exact: [3072, 10208, 1, 1024] + - Exact: [1024, 10208, 1, 4096] + - Exact: [4096, 10208, 1, 1024] + - Exact: [2048, 10224, 1, 1024] + - Exact: [2048, 10240, 1, 1024] + - Exact: [1024, 10120, 1, 1024] + - Exact: [2048, 10192, 1, 1024] + - Exact: [1024, 10152, 1, 1024] + - Exact: [3072, 10080, 1, 1024] + - Exact: [256, 256, 25, 12544] + - Exact: [256, 256, 49, 3200] + - Exact: [256, 256, 25, 6272] + - Exact: [256, 256, 49, 6400] + - Exact: [512, 512, 49, 1152] + - Exact: [512, 512, 25, 2048] + - Exact: [512, 512, 49, 2304] + - Exact: [512, 512, 25, 4096] + - Exact: [128, 128, 2048, 64] + - Exact: [30528, 2560, 1, 1024] + - Exact: [128, 128, 1536, 64] + - Exact: [1024, 12288, 1, 1024] + - Exact: [1024, 12288, 1, 4096] + - Exact: [30528, 1920, 1, 1024] + - Exact: [4096, 12288, 1, 1024] + - Exact: [128, 128, 81, 12544] + - Exact: [128, 128, 121, 9216] + - Exact: [128, 128, 169, 6400] + - Exact: [256, 256, 36, 4096] + - Exact: [256, 256, 49, 2304] + - Exact: [256, 256, 64, 2304] + - Exact: [256, 256, 81, 4096] + - Exact: [256, 256, 121, 2304] + - Exact: [256, 256, 169, 2304] + - Exact: [512, 512, 81, 1024] + - Exact: [512, 512, 121, 1024] + - Exact: [512, 512, 169, 1024] + - Exact: [512, 512, 36, 1024] + - Exact: [512, 512, 49, 1024] + - Exact: [512, 512, 64, 1024] + - Exact: [128, 128, 192, 64] + - Exact: [768, 2048, 1, 768] + - Exact: [3072, 2048, 1, 768] + - Exact: [768, 2048, 1, 3072] + - Exact: [384, 384, 144, 64] + - Exact: [768, 4608, 1, 768] + - Exact: [3072, 4608, 1, 768] + 
- Exact: [768, 4608, 1, 3072] + - Exact: [512, 512, 48, 64] + - Exact: [128, 128, 256, 64] + - Exact: [384, 384, 192, 64] + - Exact: [1024, 4608, 1, 1024] + - Exact: [4096, 4608, 1, 1024] + - Exact: [1024, 4608, 1, 4096] + - Exact: [256, 256, 36, 432] + - Exact: [256, 256, 36, 456] + - Exact: [256, 256, 36, 504] + - Exact: [256, 256, 49, 1120] + - Exact: [256, 256, 36, 442] + - Exact: [256, 256, 49, 950] + - Exact: [256, 256, 64, 616] + - Exact: [256, 256, 64, 660] + - Exact: [256, 256, 36, 408] + - Exact: [256, 256, 49, 1008] + - Exact: [256, 256, 36, 462] + - Exact: [256, 256, 36, 468] + - Exact: [256, 256, 36, 494] + - Exact: [512, 512, 64, 48] + - Exact: [256, 256, 64, 140] + - Exact: [512, 512, 64, 56] + - Exact: [512, 512, 49, 90] + - Exact: [512, 512, 49, 60] + - Exact: [256, 256, 49, 864] + - Exact: [256, 256, 64, 224] + - Exact: [256, 256, 64, 176] + - Exact: [256, 256, 64, 154] + - Exact: [512, 512, 49, 80] + - Exact: [256, 256, 49, 1200] + - Exact: [256, 256, 64, 704] + - Exact: [256, 256, 64, 768] + - Exact: [256, 256, 49, 1160] + - Exact: [256, 256, 49, 320] + - Exact: [512, 512, 49, 70] + - Exact: [256, 256, 49, 1240] + - Exact: [256, 256, 36, 384] + - Exact: [1024, 2048, 1, 888] + - Exact: [1024, 2048, 1, 713] + - Exact: [1024, 2048, 1, 660] + - Exact: [1024, 2048, 1, 726] + - Exact: [1024, 2048, 1, 672] + - Exact: [1024, 2048, 1, 850] + - Exact: [1024, 2048, 1, 805] + - Exact: [1024, 2048, 1, 864] + - Exact: [1024, 2048, 1, 768] + - Exact: [1024, 2048, 1, 950] + - Exact: [1024, 1024, 160, 96] + - Exact: [2880, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 960] + - Exact: [3840, 16384, 1, 1920] + - Exact: [1920, 16384, 1, 3840] + - Exact: [25216, 16384, 1, 1920] + - Exact: [1024, 1024, 40, 96] + - Exact: [2880, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 960] + - Exact: [3840, 4096, 1, 1920] + - Exact: [1920, 4096, 1, 3840] + - Exact: [25216, 4096, 1, 1920] + - Exact: [1024, 1024, 80, 96] + - Exact: [2880, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 960] + 
- Exact: [3840, 8192, 1, 1920] + - Exact: [1920, 8192, 1, 3840] + - Exact: [25216, 8192, 1, 1920] + - Exact: [1024, 1024, 96, 96] + - Exact: [1728, 16384, 1, 2304] + - Exact: [2304, 16384, 1, 576] + - Exact: [2304, 16384, 1, 2304] + - Exact: [12672, 16384, 1, 2304] + - Exact: [1024, 1024, 24, 96] + - Exact: [1728, 4096, 1, 2304] + - Exact: [2304, 4096, 1, 576] + - Exact: [2304, 4096, 1, 2304] + - Exact: [12672, 4096, 1, 2304] + - Exact: [1024, 1024, 48, 96] + - Exact: [1728, 8192, 1, 2304] + - Exact: [2304, 8192, 1, 576] + - Exact: [2304, 8192, 1, 2304] + - Exact: [12672, 8192, 1, 2304] + - Exact: [1024, 1024, 16, 96] + - Exact: [1152, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 384] + - Exact: [1536, 4096, 1, 3072] + - Exact: [3072, 4096, 1, 1536] + - Exact: [6400, 4096, 1, 3072] + - Exact: [1024, 1024, 32, 96] + - Exact: [1152, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 384] + - Exact: [1536, 8192, 1, 3072] + - Exact: [3072, 8192, 1, 1536] + - Exact: [6400, 8192, 1, 3072] + - Exact: [2048, 4096, 1, 2048] + - Exact: [2048, 4096, 1, 4096] + - Exact: [29000, 199, 1, 2048] + - Exact: [29000, 221, 1, 2048] + - Exact: [29000, 224, 1, 2048] + - Exact: [29000, 229, 1, 2048] + - Exact: [29000, 234, 1, 2048] + - Exact: [29000, 242, 1, 2048] + - Exact: [29000, 246, 1, 2048] + - Exact: [29000, 247, 1, 2048] + - Exact: [29000, 256, 1, 2048] + - Exact: [29000, 262, 1, 2048] + - Exact: [29000, 264, 1, 2048] + - Exact: [29000, 265, 1, 2048] + - Exact: [29000, 274, 1, 2048] + - Exact: [29000, 277, 1, 2048] + - Exact: [29000, 279, 1, 2048] + - Exact: [29000, 288, 1, 2048] + - Exact: [29000, 296, 1, 2048] + - Exact: [29000, 315, 1, 2048] + - Exact: [29000, 335, 1, 2048] + - Exact: [4096, 4096, 1, 2048] + - Exact: [29000, 2283, 1, 1024] + - Exact: [29000, 2296, 1, 1024] + - Exact: [29000, 2306, 1, 1024] + - Exact: [29000, 2309, 1, 1024] + - Exact: [29000, 2318, 1, 1024] + - Exact: [29000, 2320, 1, 1024] + - Exact: [29000, 2324, 1, 1024] + - Exact: [29000, 2325, 1, 1024] + - Exact: 
[29000, 2329, 1, 1024] + - Exact: [29000, 2338, 1, 1024] + - Exact: [29000, 2345, 1, 1024] + - Exact: [29000, 2350, 1, 1024] + - Exact: [29000, 2362, 1, 1024] + - Exact: [29000, 2366, 1, 1024] + - Exact: [29000, 2368, 1, 1024] + - Exact: [29000, 2374, 1, 1024] + - Exact: [29000, 2390, 1, 1024] + - Exact: [512, 512, 320, 64] + - Exact: [29000, 561, 1, 1024] + - Exact: [29000, 574, 1, 1024] + - Exact: [29000, 600, 1, 1024] + - Exact: [29000, 608, 1, 1024] + - Exact: [29000, 615, 1, 1024] + - Exact: [29000, 622, 1, 1024] + - Exact: [29000, 625, 1, 1024] + - Exact: [29000, 626, 1, 1024] + - Exact: [29000, 628, 1, 1024] + - Exact: [29000, 636, 1, 1024] + - Exact: [29000, 651, 1, 1024] + - Exact: [29000, 658, 1, 1024] + - Exact: [29000, 669, 1, 1024] + - Exact: [29000, 670, 1, 1024] + - Exact: [29000, 672, 1, 1024] + - Exact: [29000, 684, 1, 1024] + - Exact: [29000, 716, 1, 1024] + - Exact: [29000, 730, 1, 1024] + - Exact: [2560, 1024, 1, 2560] + - Exact: [2560, 1024, 1, 4096] + - Exact: [1024, 1024, 512, 64] + - Exact: [1024, 32768, 1, 4096] + - Exact: [3072, 32768, 1, 1024] + - Exact: [4096, 32768, 1, 1024] + - Exact: [50304, 32768, 1, 1024] + - Exact: [1024, 1024, 24, 128] + - Exact: [128, 1024, 24, 1024] + +# bodys bigSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4096, 256, 1, 
12288] + - Exact: [2048, 256, 1, 13312] + - Exact: [4096, 256, 1, 15360] + - Exact: [2048, 512, 1, 16640] + - Exact: [4096, 256, 1, 14336] + - Exact: [1024, 1024, 1, 8192] + - Exact: [1024, 512, 1, 16384] + - Exact: [4096, 256, 1, 9216] + - Exact: [1024, 512, 1, 12288] + - Exact: [4096, 200, 1, 12288] + - Exact: [1024, 1024, 1, 13312] + - Exact: [2048, 256, 1, 16384] + - Exact: [2048, 512, 1, 16384] + - Exact: [1024, 1024, 1, 8320] + - Exact: [2048, 256, 1, 14336] + - Exact: [4096, 200, 1, 16640] + - Exact: [1024, 1024, 1, 16640] + - Exact: [1024, 1024, 1, 14336] + - Exact: [2048, 512, 1, 9216] + - Exact: [1024, 1024, 1, 15360] + - Exact: [2048, 512, 1, 8192] + - Exact: [2048, 512, 1, 13312] + - Exact: [1024, 1024, 1, 11264] + - Exact: [1024, 512, 1, 16640] + - Exact: [2048, 512, 1, 10240] + - Exact: [2048, 256, 1, 16640] + - Exact: [4096, 256, 1, 13312] + - Exact: [4096, 200, 1, 15360] + - Exact: [2048, 512, 1, 12288] + - Exact: [4096, 256, 1, 8192] + - Exact: [2048, 512, 1, 15360] + - Exact: [2048, 512, 1, 11264] + - Exact: [2048, 256, 1, 12288] + - Exact: [1024, 1024, 1, 12288] + - Exact: [4096, 256, 1, 16384] + - Exact: [2048, 256, 1, 15360] + - Exact: [2048, 512, 1, 8320] + - Exact: [1024, 1024, 1, 10240] + - Exact: [1024, 1024, 1, 9216] + - Exact: [4096, 200, 1, 16384] + - Exact: [2048, 512, 1, 14336] + - Exact: [1024, 512, 1, 13312] + - Exact: [4096, 256, 1, 8320] + - Exact: [4096, 200, 1, 13312] + - Exact: [1024, 512, 1, 14336] + - Exact: [4096, 256, 1, 11264] + - Exact: [4096, 256, 1, 10240] + - Exact: [4096, 200, 1, 14336] + - Exact: [4096, 256, 1, 16640] + - Exact: [1024, 512, 1, 15360] + - Exact: [1024, 1024, 1, 16384] + - Exact: [224, 192, 36, 10368] + - Exact: [320, 256, 9, 19584] + - Exact: [256, 256, 11, 13056] + - Exact: [320, 256, 9, 9792] + - Exact: [320, 256, 11, 13056] + - Exact: [256, 256, 9, 9792] + - Exact: [256, 224, 9, 19584] + - Exact: [256, 256, 9, 19584] + - Exact: [128, 128, 36, 12000] + - Exact: [128, 128, 49, 12800] + - Exact: [128, 
128, 25, 25088] + - Exact: [128, 128, 49, 25600] + - Exact: [128, 128, 25, 50176] + - Exact: [128, 128, 36, 12544] + - Exact: [128, 128, 49, 9216] + - Exact: [1024, 1024, 1, 12544] + - Exact: [1024, 1000, 1, 12544] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 512, 1, 1600] + - Exact: [2048, 512, 1, 100] + - Exact: [768, 640, 1, 768] + - Exact: [768, 1280, 1, 768] + - Exact: [1024, 512, 1, 1024] + - Exact: [1024, 512, 1, 3072] + - Exact: [30522, 120, 1, 1024] + - Exact: [30522, 80, 1, 1024] + - Exact: [64, 128, 512, 128] + - Exact: [64, 512, 64, 512] + - Exact: [64, 64, 768, 64] + - Exact: [64, 64, 96, 64] + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2048, 400, 1, 512] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [1408, 448, 1, 1280] 
+ - Exact: [1408, 704, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [3072, 128, 1, 1024] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [256, 1408, 1, 256] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [6784, 64, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 1280] + - Exact: [448, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [128, 5056, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [1024, 700, 1, 512] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [7680, 64, 1, 2560] + - Exact: [448, 1408, 1, 128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 
3328] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [704, 1856, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [64, 6784, 1, 256] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 3584, 1, 3328] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [7680, 128, 1, 2560] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [6784, 64, 1, 256] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [64, 6784, 1, 1280] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [704, 704, 1, 128] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [448, 1024, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 
1, 128] + - Exact: [2368, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [2560, 128, 1, 2560] + - Exact: [448, 1024, 1, 256] + - Exact: [1024, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [448, 2944, 1, 128] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [2368, 448, 1, 3328] + - Exact: [128, 5888, 1, 1280] + - Exact: [64, 5056, 1, 128] + - Exact: [64, 6784, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [5888, 64, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [5056, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: [704, 1408, 1, 256] + - Exact: [4096, 128, 1, 4096] + - Exact: [5888, 64, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: 
[2048, 200, 1, 3200] + - Exact: [2048, 256, 1, 3328] + - Exact: [4096, 200, 1, 11264] + - Exact: [2048, 512, 1, 1024] + - Exact: [1024, 1024, 1, 64] + - Exact: [512, 1024, 1, 1536] + - Exact: [1024, 512, 1, 512] + - Exact: [2048, 512, 1, 640] + - Exact: [1024, 1024, 1, 512] + - Exact: [2048, 256, 1, 2048] + - Exact: [1024, 512, 1, 128] + - Exact: [2048, 512, 1, 256] + - Exact: [4096, 200, 1, 2560] + - Exact: [1024, 1024, 1, 1152] + - Exact: [2048, 200, 1, 32] + - Exact: [512, 1024, 1, 2816] + - Exact: [2048, 200, 1, 2080] + - Exact: [2048, 200, 1, 1024] + - Exact: [4096, 200, 1, 4096] + - Exact: [1024, 512, 1, 11264] + - Exact: [1024, 1024, 1, 1792] + - Exact: [4096, 200, 1, 768] + - Exact: [4096, 256, 1, 1024] + - Exact: [1024, 512, 1, 256] + - Exact: [1024, 512, 1, 1408] + - Exact: [1024, 512, 1, 5632] + - Exact: [4096, 200, 1, 256] + - Exact: [512, 1024, 1, 3072] + - Exact: [1024, 1024, 1, 4160] + - Exact: [2048, 256, 1, 384] + - Exact: [4096, 200, 1, 640] + - Exact: [1024, 1024, 1, 7168] + - Exact: [4096, 256, 1, 768] + - Exact: [2048, 256, 1, 6656] + - Exact: [2048, 200, 1, 3072] + - Exact: [1024, 512, 1, 2816] + - Exact: [4096, 256, 1, 7680] + - Exact: [4096, 200, 1, 1024] + - Exact: [2048, 200, 1, 1792] + - Exact: [1024, 1024, 1, 2816] + - Exact: [2048, 512, 1, 1536] + - Exact: [4096, 256, 1, 3072] + - Exact: [2048, 256, 1, 5632] + - Exact: [1024, 512, 1, 6656] + - Exact: [4096, 200, 1, 2080] + - Exact: [2048, 200, 1, 13312] + - Exact: [4096, 256, 1, 3584] + - Exact: [2048, 256, 1, 8192] + - Exact: [2048, 512, 1, 512] + - Exact: [2048, 512, 1, 1152] + - Exact: [2048, 200, 1, 9216] + - Exact: [2048, 200, 1, 2560] + - Exact: [2048, 256, 1, 4608] + - Exact: [2048, 256, 1, 3584] + - Exact: [1024, 512, 1, 640] + - Exact: [2048, 512, 1, 768] + - Exact: [2048, 200, 1, 1408] + - Exact: [4096, 200, 1, 2048] + - Exact: [1024, 1024, 1, 5632] + - Exact: [2048, 512, 1, 3584] + - Exact: [1024, 512, 1, 64] + - Exact: [4096, 200, 1, 7680] + - Exact: [1024, 1024, 1, 1280] + 
- Exact: [2048, 200, 1, 896] + - Exact: [2048, 256, 1, 32] + - Exact: [2048, 256, 1, 1280] + - Exact: [4096, 256, 1, 4096] + - Exact: [2048, 256, 1, 11264] + - Exact: [4096, 200, 1, 9216] + - Exact: [1024, 512, 1, 4096] + - Exact: [4096, 200, 1, 3840] + - Exact: [1024, 1024, 1, 1920] + - Exact: [2048, 200, 1, 7168] + - Exact: [4096, 256, 1, 1152] + - Exact: [2048, 256, 1, 1920] + - Exact: [2048, 512, 1, 4160] + - Exact: [2048, 512, 1, 5632] + - Exact: [4096, 256, 1, 7168] + - Exact: [4096, 200, 1, 128] + - Exact: [2048, 200, 1, 5120] + - Exact: [1024, 1024, 1, 6656] + - Exact: [512, 1024, 1, 3200] + - Exact: [2048, 256, 1, 1536] + - Exact: [4096, 256, 1, 256] + - Exact: [2048, 512, 1, 1408] + - Exact: [1024, 512, 1, 2080] + - Exact: [2048, 512, 1, 2304] + - Exact: [4096, 200, 1, 512] + - Exact: [2048, 200, 1, 1280] + - Exact: [1024, 1024, 1, 2304] + - Exact: [2048, 512, 1, 4608] + - Exact: [4096, 256, 1, 6144] + - Exact: [4096, 256, 1, 896] + - Exact: [2048, 256, 1, 640] + - Exact: [2048, 512, 1, 384] + - Exact: [2048, 200, 1, 16384] + - Exact: [4096, 200, 1, 10240] + - Exact: [1024, 512, 1, 9216] + - Exact: [4096, 200, 1, 1920] + - Exact: [2048, 512, 1, 7680] + - Exact: [1024, 512, 1, 3584] + - Exact: [1024, 1024, 1, 32] + - Exact: [2048, 512, 1, 1664] + - Exact: [2048, 200, 1, 2048] + - Exact: [1024, 1024, 1, 3584] + - Exact: [4096, 256, 1, 6656] + - Exact: [4096, 256, 1, 4160] + - Exact: [2048, 256, 1, 3072] + - Exact: [2048, 256, 1, 8320] + - Exact: [1024, 512, 1, 3200] + - Exact: [1024, 512, 1, 896] + - Exact: [2048, 512, 1, 1280] + - Exact: [4096, 200, 1, 64] + - Exact: [1024, 1024, 1, 5120] + - Exact: [2048, 512, 1, 6656] + - Exact: [1024, 1024, 1, 128] + - Exact: [512, 1024, 1, 1792] + - Exact: [4096, 256, 1, 2816] + - Exact: [1024, 1024, 1, 4096] + - Exact: [2048, 200, 1, 4160] + - Exact: [1024, 512, 1, 768] + - Exact: [4096, 200, 1, 8320] + - Exact: [2048, 512, 1, 896] + - Exact: [4096, 200, 1, 7168] + - Exact: [2048, 200, 1, 3840] + - Exact: [1024, 1024, 
1, 768] + - Exact: [4096, 256, 1, 2304] + - Exact: [2048, 200, 1, 16640] + - Exact: [2048, 256, 1, 2816] + - Exact: [1024, 512, 1, 384] + - Exact: [2048, 200, 1, 7680] + - Exact: [1024, 512, 1, 4608] + - Exact: [4096, 200, 1, 32] + - Exact: [4096, 200, 1, 3328] + - Exact: [1024, 1024, 1, 1408] + - Exact: [2048, 200, 1, 15360] + - Exact: [512, 1024, 1, 2048] + - Exact: [4096, 256, 1, 5632] + - Exact: [2048, 256, 1, 1408] + - Exact: [2048, 256, 1, 6144] + - Exact: [4096, 256, 1, 3328] + - Exact: [2048, 512, 1, 6144] + - Exact: [2048, 512, 1, 3200] + - Exact: [2048, 200, 1, 4608] + - Exact: [1024, 1024, 1, 6144] + - Exact: [4096, 256, 1, 1664] + - Exact: [2048, 200, 1, 384] + - Exact: [4096, 256, 1, 1792] + - Exact: [2048, 512, 1, 2816] + - Exact: [4096, 256, 1, 384] + - Exact: [2048, 256, 1, 128] + - Exact: [1024, 1024, 1, 640] + - Exact: [4096, 200, 1, 5632] + - Exact: [2048, 200, 1, 1152] + - Exact: [4096, 256, 1, 512] + - Exact: [1024, 1024, 1, 384] + - Exact: [2048, 200, 1, 512] + - Exact: [2048, 256, 1, 9216] + - Exact: [2048, 256, 1, 1792] + - Exact: [4096, 200, 1, 1792] + - Exact: [2048, 200, 1, 1536] + - Exact: [1024, 1024, 1, 3072] + - Exact: [1024, 1024, 1, 2080] + - Exact: [2048, 200, 1, 2304] + - Exact: [2048, 256, 1, 7168] + - Exact: [2048, 512, 1, 1792] + - Exact: [1024, 1024, 1, 4608] + - Exact: [512, 1024, 1, 1280] + - Exact: [2048, 256, 1, 3200] + - Exact: [1024, 512, 1, 3328] + - Exact: [1024, 512, 1, 4160] + - Exact: [4096, 200, 1, 6656] + - Exact: [2048, 200, 1, 3328] + - Exact: [1024, 1024, 1, 256] + - Exact: [2048, 256, 1, 64] + - Exact: [2048, 256, 1, 2304] + - Exact: [4096, 200, 1, 8192] + - Exact: [1024, 512, 1, 7168] + - Exact: [1024, 512, 1, 1792] + - Exact: [4096, 200, 1, 2816] + - Exact: [1024, 1024, 1, 896] + - Exact: [4096, 256, 1, 5120] + - Exact: [4096, 256, 1, 2048] + - Exact: [2048, 256, 1, 5120] + - Exact: [2048, 256, 1, 7680] + - Exact: [2048, 200, 1, 3584] + - Exact: [1024, 512, 1, 1536] + - Exact: [2048, 200, 1, 64] + - Exact: 
[2048, 200, 1, 4096] + - Exact: [1024, 1024, 1, 1536] + - Exact: [4096, 256, 1, 32] + - Exact: [4096, 256, 1, 1280] + - Exact: [2048, 256, 1, 1024] + - Exact: [1024, 512, 1, 1152] + - Exact: [2048, 512, 1, 3328] + - Exact: [4096, 200, 1, 3584] + - Exact: [2048, 200, 1, 256] + - Exact: [4096, 256, 1, 1920] + - Exact: [2048, 256, 1, 1664] + - Exact: [4096, 200, 1, 5120] + - Exact: [1024, 512, 1, 8192] + - Exact: [4096, 200, 1, 896] + - Exact: [2048, 200, 1, 640] + - Exact: [4096, 200, 1, 1408] + - Exact: [2048, 200, 1, 5632] + - Exact: [1024, 512, 1, 2560] + - Exact: [4096, 200, 1, 1280] + - Exact: [1024, 1024, 1, 2560] + - Exact: [2048, 512, 1, 64] + - Exact: [2048, 200, 1, 8192] + - Exact: [2048, 512, 1, 3072] + - Exact: [4096, 256, 1, 640] + - Exact: [2048, 256, 1, 4096] + - Exact: [4096, 200, 1, 1664] + - Exact: [2048, 200, 1, 6656] + - Exact: [512, 1024, 1, 768] + - Exact: [2048, 200, 1, 8320] + - Exact: [4096, 256, 1, 3840] + - Exact: [1024, 1024, 1, 3200] + - Exact: [4096, 256, 1, 4608] + - Exact: [1024, 512, 1, 32] + - Exact: [1024, 512, 1, 3840] + - Exact: [2048, 512, 1, 1920] + - Exact: [4096, 200, 1, 6144] + - Exact: [2048, 200, 1, 2816] + - Exact: [1024, 1024, 1, 3840] + - Exact: [2048, 256, 1, 3840] + - Exact: [1024, 512, 1, 7680] + - Exact: [2048, 200, 1, 10240] + - Exact: [2048, 512, 1, 5120] + - Exact: [512, 1024, 1, 512] + - Exact: [2048, 512, 1, 32] + - Exact: [4096, 256, 1, 2560] + - Exact: [4096, 256, 1, 64] + - Exact: [2048, 200, 1, 768] + - Exact: [2048, 512, 1, 2560] + - Exact: [2048, 512, 1, 7168] + - Exact: [2048, 512, 1, 128] + - Exact: [4096, 200, 1, 2304] + - Exact: [2048, 512, 1, 4096] + - Exact: [2048, 256, 1, 2560] + - Exact: [2048, 256, 1, 4160] + - Exact: [1024, 512, 1, 1664] + - Exact: [2048, 512, 1, 2080] + - Exact: [2048, 512, 1, 3840] + - Exact: [4096, 200, 1, 3072] + - Exact: [1024, 1024, 1, 1664] + - Exact: [512, 1024, 1, 2304] + - Exact: [4096, 256, 1, 1408] + - Exact: [2048, 256, 1, 1152] + - Exact: [1024, 512, 1, 1280] + - 
Exact: [2048, 200, 1, 12288] + - Exact: [2048, 200, 1, 1664] + - Exact: [4096, 200, 1, 4608] + - Exact: [512, 1024, 1, 2560] + - Exact: [4096, 200, 1, 384] + - Exact: [2048, 200, 1, 128] + - Exact: [2048, 200, 1, 11264] + - Exact: [1024, 512, 1, 1920] + - Exact: [4096, 256, 1, 1536] + - Exact: [2048, 256, 1, 256] + - Exact: [2048, 256, 1, 10240] + - Exact: [1024, 512, 1, 5120] + - Exact: [1024, 512, 1, 8320] + - Exact: [1024, 512, 1, 10240] + - Exact: [1024, 1024, 1, 2048] + - Exact: [2048, 256, 1, 2080] + - Exact: [4096, 256, 1, 128] + - Exact: [2048, 256, 1, 896] + - Exact: [4096, 200, 1, 1152] + - Exact: [2048, 200, 1, 6144] + - Exact: [1024, 1024, 1, 7680] + - Exact: [2048, 200, 1, 1920] + - Exact: [4096, 256, 1, 2080] + - Exact: [2048, 200, 1, 14336] + - Exact: [1024, 512, 1, 6144] + - Exact: [1024, 512, 1, 2304] + - Exact: [4096, 200, 1, 4160] + - Exact: [4096, 200, 1, 1536] + - Exact: [2048, 320, 1, 64] + - Exact: [2048, 384, 1, 64] + - Exact: [1024, 384, 1, 289] + - Exact: [2048, 448, 1, 64] + - Exact: [102, 101, 624, 64] + - Exact: [101, 101, 624, 64] + - Exact: [85, 85, 752, 64] + - Exact: [112, 111, 576, 64] + - Exact: [65, 65, 992, 64] + - Exact: [77, 77, 816, 64] + - Exact: [111, 111, 576, 64] + - Exact: [84, 85, 752, 64] + - Exact: [84, 84, 752, 64] + - Exact: [71, 71, 896, 64] + - Exact: [122, 122, 528, 64] + - Exact: [78, 78, 816, 64] + - Exact: [112, 112, 576, 64] + - Exact: [77, 78, 816, 64] + - Exact: [111, 112, 576, 64] + - Exact: [92, 93, 688, 64] + - Exact: [102, 102, 624, 64] + - Exact: [99, 99, 624, 64] + - Exact: [100, 102, 624, 64] + - Exact: [123, 122, 528, 64] + - Exact: [99, 102, 624, 64] + - Exact: [93, 93, 688, 64] + - Exact: [123, 123, 528, 64] + - Exact: [100, 100, 624, 64] + - Exact: [101, 102, 624, 64] + - Exact: [102, 100, 624, 64] + - Exact: [92, 92, 688, 64] + - Exact: [3072, 128, 1, 4096] + - Exact: [1728, 320, 1, 64] + - Exact: [1440, 320, 1, 196] + - Exact: [2592, 384, 1, 289] + - Exact: [192, 80, 36, 10368] + - Exact: 
[1280, 384, 1, 64] + - Exact: [1280, 448, 1, 64] + - Exact: [3456, 256, 1, 169] + - Exact: [2304, 256, 1, 196] + - Exact: [224, 192, 36, 2592] + - Exact: [192, 128, 36, 1568] + - Exact: [1296, 288, 1, 196] + - Exact: [192, 64, 36, 6272] + - Exact: [1728, 224, 1, 1225] + - Exact: [1152, 384, 1, 64] + - Exact: [1792, 256, 1, 289] + - Exact: [1728, 384, 1, 169] + - Exact: [1568, 256, 1, 289] + - Exact: [1152, 448, 1, 64] + - Exact: [1536, 256, 1, 64] + - Exact: [1440, 320, 1, 49] + - Exact: [1344, 512, 1, 64] + - Exact: [1152, 256, 1, 196] + - Exact: [1728, 192, 1, 1225] + - Exact: [2048, 512, 1, 49] + - Exact: [512, 2048, 1, 49] + - Exact: [1728, 192, 1, 64] + - Exact: [1536, 384, 1, 64] + - Exact: [2048, 192, 1, 64] + - Exact: [128, 96, 36, 1568] + - Exact: [128, 128, 36, 3136] + - Exact: [1280, 320, 1, 64] + - Exact: [1792, 320, 1, 289] + - Exact: [2880, 320, 1, 64] + - Exact: [1728, 384, 1, 49] + - Exact: [512, 1024, 1, 196] + - Exact: [224, 192, 36, 5184] + - Exact: [192, 80, 36, 20736] + - Exact: [224, 192, 64, 4608] + - Exact: [224, 192, 64, 2304] + - Exact: [192, 80, 49, 14400] + - Exact: [224, 192, 49, 6272] + - Exact: [224, 192, 49, 3136] + - Exact: [192, 80, 36, 41472] + - Exact: [192, 80, 49, 28800] + - Exact: [192, 80, 64, 9216] + - Exact: [256, 224, 9, 9792] + - Exact: [256, 256, 9, 4896] + - Exact: [320, 256, 9, 4896] + - Exact: [224, 192, 9, 19584] + - Exact: [192, 192, 11, 3264] + - Exact: [192, 192, 11, 6528] + - Exact: [192, 192, 9, 4896] + - Exact: [224, 192, 11, 6528] + - Exact: [192, 192, 9, 19584] + - Exact: [256, 224, 11, 13056] + - Exact: [224, 192, 11, 13056] + - Exact: [256, 256, 11, 3264] + - Exact: [320, 256, 11, 6528] + - Exact: [192, 192, 9, 9792] + - Exact: [224, 224, 9, 9792] + - Exact: [224, 192, 11, 3264] + - Exact: [224, 224, 11, 6528] + - Exact: [224, 224, 9, 19584] + - Exact: [192, 192, 11, 13056] + - Exact: [224, 224, 9, 4896] + - Exact: [320, 256, 11, 3264] + - Exact: [256, 256, 11, 6528] + - Exact: [224, 192, 9, 4896] + - 
Exact: [224, 224, 11, 13056] + - Exact: [224, 224, 11, 3264] + - Exact: [256, 224, 11, 6528] + - Exact: [256, 224, 11, 3264] + - Exact: [224, 192, 9, 9792] + - Exact: [256, 224, 9, 4896] + - Exact: [64, 64, 496, 64] + - Exact: [135, 135, 32, 64] + - Exact: [64, 65, 496, 64] + - Exact: [65, 65, 472, 64] + - Exact: [65, 65, 496, 64] + - Exact: [70, 70, 216, 64] + - Exact: [70, 71, 216, 64] + - Exact: [71, 71, 216, 64] + - Exact: [71, 71, 448, 64] + - Exact: [77, 77, 248, 64] + - Exact: [77, 77, 408, 64] + - Exact: [77, 78, 248, 64] + - Exact: [77, 78, 408, 64] + - Exact: [78, 78, 248, 64] + - Exact: [78, 78, 408, 64] + - Exact: [80, 80, 152, 64] + - Exact: [80, 84, 152, 64] + - Exact: [84, 84, 152, 64] + - Exact: [85, 85, 376, 64] + - Exact: [93, 93, 344, 64] + - Exact: [102, 102, 312, 64] + - Exact: [112, 112, 288, 64] + - Exact: [122, 122, 264, 64] + - Exact: [123, 122, 264, 64] + - Exact: [123, 123, 264, 64] + - Exact: [511, 2048, 1, 2048] + - Exact: [1024, 512, 1, 1025] + - Exact: [512, 1023, 1, 1024] + - Exact: [1025, 1024, 1, 1024] + - Exact: [2048, 513, 1, 2048] + - Exact: [1024, 1024, 1, 1025] + - Exact: [960, 1024, 1, 1023] + - Exact: [1024, 1024, 1, 1024] + - Exact: [960, 1025, 1, 1024] + - Exact: [2049, 512, 1, 2048] + - Exact: [513, 1024, 1, 1024] + - Exact: [512, 2048, 1, 2048] + - Exact: [1024, 511, 1, 1024] + - Exact: [1024, 512, 1, 1023] + - Exact: [960, 1024, 1, 1025] + - Exact: [959, 1024, 1, 1024] + - Exact: [2048, 512, 1, 2049] + - Exact: [511, 1024, 1, 1024] + - Exact: [512, 2049, 1, 2048] + - Exact: [1024, 513, 1, 1024] + - Exact: [2048, 512, 1, 2047] + - Exact: [1025, 512, 1, 1024] + - Exact: [1024, 1024, 1, 1023] + - Exact: [513, 2048, 1, 2048] + - Exact: [1024, 1025, 1, 1024] + - Exact: [512, 2048, 1, 2049] + - Exact: [1024, 1023, 1, 1024] + - Exact: [960, 1023, 1, 1024] + - Exact: [2048, 511, 1, 2048] + - Exact: [1023, 512, 1, 1024] + - Exact: [2047, 512, 1, 2048] + - Exact: [512, 1024, 1, 1024] + - Exact: [512, 1024, 1, 1025] + - Exact: 
[512, 2047, 1, 2048] + - Exact: [512, 1025, 1, 1024] + - Exact: [512, 2048, 1, 2047] + - Exact: [960, 1024, 1, 1024] + - Exact: [961, 1024, 1, 1024] + - Exact: [512, 1024, 1, 1023] + - Exact: [1023, 1024, 1, 1024] + - Exact: [479, 1024, 1, 1024] + - Exact: [479, 2048, 1, 2048] + - Exact: [480, 1023, 1, 1024] + - Exact: [480, 1024, 1, 1023] + - Exact: [480, 1024, 1, 1025] + - Exact: [480, 1025, 1, 1024] + - Exact: [480, 2047, 1, 2048] + - Exact: [480, 2048, 1, 2047] + - Exact: [480, 2048, 1, 2049] + - Exact: [480, 2049, 1, 2048] + - Exact: [480, 3071, 1, 3072] + - Exact: [481, 1024, 1, 1024] + - Exact: [481, 2048, 1, 2048] + - Exact: [1023, 480, 1, 1024] + - Exact: [1024, 479, 1, 1024] + - Exact: [1024, 480, 1, 1023] + - Exact: [1024, 480, 1, 1025] + - Exact: [1024, 481, 1, 1024] + - Exact: [1025, 480, 1, 1024] + - Exact: [2047, 480, 1, 2048] + - Exact: [2048, 479, 1, 2048] + - Exact: [2048, 480, 1, 2047] + - Exact: [2048, 480, 1, 2049] + - Exact: [2048, 481, 1, 2048] + - Exact: [2049, 480, 1, 2048] + - Exact: [3071, 480, 1, 3072] + - Exact: [480, 1024, 1, 1024] + - Exact: [480, 2048, 1, 2048] + - Exact: [1024, 480, 1, 1024] + - Exact: [2048, 480, 1, 2048] + - Exact: [1024, 512, 1, 2048] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1024] + - Exact: [1024, 960, 1, 1600] + - Exact: [1024, 1024, 1, 960] + - Exact: [2048, 215, 1, 512] + - Exact: [2048, 215, 1, 768] + - Exact: [2048, 256, 1, 512] + - Exact: [2048, 256, 1, 768] + - Exact: [2048, 512, 1, 2048] + - Exact: [2048, 512, 1, 67] + - Exact: [2048, 512, 1, 74] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3328, 1, 1024] + - Exact: [256, 3584, 1, 1024] + - Exact: [512, 1600, 1, 512] + - Exact: [256, 1280, 1, 1024] + - Exact: [256, 1536, 1, 1024] + - Exact: [256, 2304, 1, 1024] + - Exact: [256, 2560, 1, 1024] + - Exact: [256, 2816, 1, 1024] + - Exact: [256, 3584, 1, 1024] + 
- Exact: [767, 1280, 1, 768] + - Exact: [769, 1280, 1, 768] + - Exact: [768, 1279, 1, 768] + - Exact: [768, 1281, 1, 768] + - Exact: [768, 1280, 1, 767] + - Exact: [768, 1280, 1, 769] + - Exact: [256, 4096, 1, 512] + - Exact: [767, 768, 1, 768] + - Exact: [769, 768, 1, 768] + - Exact: [768, 767, 1, 768] + - Exact: [768, 769, 1, 768] + - Exact: [768, 768, 1, 767] + - Exact: [768, 768, 1, 769] + - Exact: [768, 768, 1, 768] + - Exact: [128, 128, 49, 1152] + - Exact: [128, 128, 49, 1216] + - Exact: [128, 128, 36, 1800] + - Exact: [128, 128, 36, 1900] + - Exact: [128, 128, 64, 5880] + - Exact: [128, 128, 49, 7680] + - Exact: [128, 128, 64, 882] + - Exact: [128, 128, 64, 931] + - Exact: [128, 64, 121, 1152] + - Exact: [128, 64, 81, 12000] + - Exact: [128, 64, 121, 1216] + - Exact: [128, 64, 81, 1800] + - Exact: [128, 64, 81, 1900] + - Exact: [128, 64, 49, 20280] + - Exact: [128, 64, 49, 3042] + - Exact: [128, 64, 49, 3211] + - Exact: [128, 64, 169, 5880] + - Exact: [128, 64, 121, 7680] + - Exact: [128, 64, 169, 882] + - Exact: [128, 64, 169, 931] + - Exact: [256, 128, 25, 1080] + - Exact: [256, 128, 25, 162] + - Exact: [256, 128, 25, 171] + - Exact: [1152, 256, 1, 1] + - Exact: [1152, 256, 1, 1444] + - Exact: [1152, 256, 1, 25] + - Exact: [1152, 256, 1, 9] + - Exact: [2304, 256, 1, 1444] + - Exact: [2304, 340, 1, 1] + - Exact: [2304, 340, 1, 1444] + - Exact: [2304, 340, 1, 9] + - Exact: [2304, 510, 1, 25] + - Exact: [30522, 77, 1, 1024] + - Exact: [1024, 780, 1, 1024] + - Exact: [1024, 800, 1, 1024] + - Exact: [1024, 820, 1, 1024] + - Exact: [1024, 385, 1, 1024] + - Exact: [1024, 462, 1, 1024] + - Exact: [64, 512, 256, 512] + - Exact: [64, 512, 128, 512] + - Exact: [64, 512, 40, 512] + - Exact: [96, 1024, 64, 1024] + - Exact: [96, 1024, 128, 1024] + - Exact: [64, 1024, 256, 1024] + - Exact: [64, 1024, 32, 1024] + - Exact: [64, 1024, 64, 1024] + - Exact: [64, 1024, 128, 1024] + - Exact: [64, 128, 1024, 128] + - Exact: [1024, 864, 1, 1024] + - Exact: [1024, 864, 1, 480] + 
- Exact: [128, 3456, 1, 256] + - Exact: [128, 4096, 1, 256] + - Exact: [128, 6912, 1, 256] + - Exact: [256, 3456, 1, 512] + - Exact: [512, 864, 1, 1024] + - Exact: [512, 864, 1, 13] + - Exact: [64, 128, 1280, 128] + - Exact: [64, 128, 1312, 128] + - Exact: [64, 512, 192, 512] + - Exact: [1024, 512, 1, 196] + - Exact: [64, 128, 2048, 128] + - Exact: [64, 128, 1536, 128] + - Exact: [128, 128, 64, 6400] + - Exact: [64, 128, 192, 128] + - Exact: [64, 384, 144, 384] + - Exact: [64, 512, 48, 512] + - Exact: [64, 128, 256, 128] + - Exact: [64, 384, 192, 384] + - Exact: [128, 128, 49, 1120] + - Exact: [128, 128, 49, 1064] + - Exact: [128, 128, 49, 1040] + - Exact: [128, 128, 64, 600] + - Exact: [128, 128, 64, 616] + - Exact: [128, 128, 49, 950] + - Exact: [128, 128, 49, 972] + - Exact: [128, 128, 64, 560] + - Exact: [128, 128, 49, 1008] + - Exact: [128, 128, 64, 532] + - Exact: [128, 128, 49, 1080] + - Exact: [128, 128, 64, 588] + - Exact: [128, 128, 49, 1160] + - Exact: [128, 128, 49, 988] + - Exact: [128, 128, 49, 936] + - Exact: [512, 1024, 1, 3800] + - Exact: [512, 1024, 1, 3400] + - Exact: [512, 1024, 1, 3456] + - Exact: [2048, 512, 1, 950] + - Exact: [512, 1024, 1, 3552] + - Exact: [512, 1024, 1, 3220] + - Exact: [2048, 512, 1, 850] + - Exact: [512, 2048, 1, 864] + - Exact: [512, 2048, 1, 768] + - Exact: [2048, 512, 1, 805] + - Exact: [512, 1024, 1, 2852] + - Exact: [512, 2048, 1, 888] + - Exact: [2048, 512, 1, 864] + - Exact: [2048, 512, 1, 888] + - Exact: [2048, 256, 1, 950] + - Exact: [2048, 512, 1, 713] + - Exact: [512, 1024, 1, 2688] + - Exact: [512, 1024, 1, 2640] + - Exact: [512, 1024, 1, 2904] + - Exact: [1024, 512, 1, 950] + - Exact: [512, 2048, 1, 672] + - Exact: [512, 2048, 1, 660] + - Exact: [512, 2048, 1, 1008] + - Exact: [2048, 256, 1, 850] + - Exact: [2048, 512, 1, 726] + - Exact: [1024, 512, 1, 850] + - Exact: [2048, 512, 1, 660] + - Exact: [2048, 512, 1, 672] + - Exact: [512, 2048, 1, 840] + - Exact: [2048, 512, 1, 1008] + - Exact: [512, 2048, 1, 
792] + - Exact: [1024, 512, 1, 805] + - Exact: [512, 2048, 1, 1050] + - Exact: [2048, 512, 1, 748] + - Exact: [2048, 256, 1, 864] + - Exact: [1024, 512, 1, 864] + - Exact: [2048, 512, 1, 875] + - Exact: [2048, 512, 1, 840] + - Exact: [2048, 512, 1, 792] + - Exact: [512, 2048, 1, 736] + - Exact: [2048, 256, 1, 888] + - Exact: [512, 2048, 1, 704] + - Exact: [512, 2048, 1, 588] + - Exact: [1024, 512, 1, 888] + - Exact: [512, 2048, 1, 816] + - Exact: [1024, 512, 1, 713] + - Exact: [2048, 512, 1, 736] + - Exact: [2048, 512, 1, 588] + - Exact: [2048, 512, 1, 704] + - Exact: [1024, 512, 1, 660] + - Exact: [2048, 256, 1, 660] + - Exact: [2048, 256, 1, 672] + - Exact: [1024, 512, 1, 672] + - Exact: [1024, 512, 1, 726] + - Exact: [512, 2048, 1, 630] + - Exact: [512, 2048, 1, 600] + - Exact: [2048, 256, 1, 805] + - Exact: [2048, 256, 1, 713] + - Exact: [2048, 256, 1, 726] + - Exact: [320, 1024, 1, 1024] + - Exact: [1024, 1000, 1, 1024] + - Exact: [320, 1000, 1, 1024] + - Exact: [128, 128, 49, 1280] + - Exact: [128, 128, 49, 1360] + - Exact: [128, 128, 49, 1200] + - Exact: [128, 128, 49, 1240] + - Exact: [2304, 256, 1, 704] + - Exact: [2304, 256, 1, 736] + - Exact: [2304, 256, 1, 792] + - Exact: [2304, 256, 1, 748] + - Exact: [2304, 256, 1, 726] + - Exact: [2304, 256, 1, 713] + - Exact: [2304, 256, 1, 768] + - Exact: [512, 2048, 1, 759] + - Exact: [512, 2048, 1, 925] + - Exact: [2304, 256, 1, 805] + - Exact: [512, 2048, 1, 900] + - Exact: [512, 2048, 1, 875] + - Exact: [512, 2048, 1, 748] + - Exact: [512, 2048, 1, 726] + - Exact: [512, 2048, 1, 713] + - Exact: [512, 2048, 1, 805] + - Exact: [512, 2048, 1, 850] + - Exact: [512, 2048, 1, 950] + - Exact: [96, 1024, 160, 1024] + - Exact: [96, 1024, 40, 1024] + - Exact: [96, 1024, 80, 1024] + - Exact: [96, 1024, 96, 1024] + - Exact: [96, 1024, 24, 1024] + - Exact: [96, 1024, 48, 1024] + - Exact: [96, 1024, 16, 1024] + - Exact: [96, 1024, 32, 1024] + - Exact: [64, 512, 320, 512] + - Exact: [64, 512, 80, 512] + - Exact: [29000, 109, 
1, 2560] + - Exact: [29000, 121, 1, 2560] + - Exact: [29000, 65, 1, 2560] + - Exact: [29000, 66, 1, 2560] + - Exact: [29000, 67, 1, 2560] + - Exact: [29000, 69, 1, 2560] + - Exact: [29000, 70, 1, 2560] + - Exact: [29000, 71, 1, 2560] + - Exact: [29000, 73, 1, 2560] + - Exact: [29000, 74, 1, 2560] + - Exact: [29000, 75, 1, 2560] + - Exact: [29000, 77, 1, 2560] + - Exact: [29000, 78, 1, 2560] + - Exact: [29000, 80, 1, 2560] + - Exact: [29000, 81, 1, 2560] + - Exact: [29000, 82, 1, 2560] + - Exact: [29000, 83, 1, 2560] + - Exact: [29000, 84, 1, 2560] + - Exact: [29000, 88, 1, 2560] + - Exact: [29000, 89, 1, 2560] + - Exact: [29000, 90, 1, 2560] + - Exact: [29000, 92, 1, 2560] + - Exact: [29000, 95, 1, 2560] + - Exact: [29000, 98, 1, 2560] + - Exact: [64, 1024, 512, 1024] + +# bodys midSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 200, 1, 13312] + - Exact: [1024, 256, 1, 15360] + - Exact: [1024, 256, 1, 16384] + - Exact: [1024, 200, 1, 16384] + - Exact: [1024, 256, 1, 12288] + - Exact: [1024, 200, 1, 12288] + - Exact: [1024, 200, 1, 15360] + - Exact: [1024, 256, 1, 9216] + - Exact: [1024, 200, 1, 14336] + - Exact: [1024, 256, 1, 16640] + - Exact: [1024, 200, 1, 8192] + - Exact: [1024, 200, 1, 10240] + - Exact: [1024, 200, 1, 9216] + - Exact: [1024, 256, 1, 11264] + - 
Exact: [1024, 200, 1, 8320] + - Exact: [1024, 256, 1, 8320] + - Exact: [1024, 200, 1, 16640] + - Exact: [1024, 256, 1, 14336] + - Exact: [1024, 256, 1, 13312] + - Exact: [1024, 200, 1, 11264] + - Exact: [1024, 256, 1, 8192] + - Exact: [1024, 256, 1, 10240] + - Exact: [96, 64, 64, 18432] + - Exact: [96, 64, 36, 10368] + - Exact: [96, 64, 36, 20736] + - Exact: [96, 96, 36, 10368] + - Exact: [96, 64, 49, 28800] + - Exact: [96, 64, 36, 41472] + - Exact: [64, 64, 11, 233600] + - Exact: [64, 64, 11, 116800] + - Exact: [64, 64, 9, 172864] + - Exact: [64, 64, 11, 58400] + - Exact: [192, 160, 9, 19584] + - Exact: [128, 128, 9, 9792] + - Exact: [192, 160, 11, 13056] + - Exact: [64, 64, 9, 86432] + - Exact: [128, 128, 9, 19584] + - Exact: [160, 160, 11, 13056] + - Exact: [160, 160, 9, 19584] + - Exact: [192, 128, 9, 19584] + - Exact: [192, 160, 9, 9792] + - Exact: [64, 64, 9, 345728] + - Exact: [128, 128, 11, 13056] + - Exact: [160, 160, 9, 9792] + - Exact: [192, 128, 11, 13056] + - Exact: [192, 128, 9, 9792] + - Exact: [128, 64, 25, 43320] + - Exact: [64, 64, 64, 20280] + - Exact: [64, 64, 49, 27000] + - Exact: [64, 64, 36, 43320] + - Exact: [64, 64, 36, 50176] + - Exact: [64, 64, 49, 36864] + - Exact: [64, 64, 64, 25600] + - Exact: [256, 256, 1, 60800] + - Exact: [256, 256, 1, 54400] + - Exact: [256, 256, 1, 51520] + - Exact: [256, 256, 1, 55296] + - Exact: [256, 256, 1, 56832] + - Exact: [256, 256, 1, 45632] + - Exact: [256, 256, 1, 49152] + - Exact: [256, 512, 1, 13600] + - Exact: [256, 256, 1, 43008] + - Exact: [256, 512, 1, 15200] + - Exact: [256, 512, 1, 12880] + - Exact: [256, 512, 1, 13824] + - Exact: [512, 256, 1, 13824] + - Exact: [256, 512, 1, 14208] + - Exact: [512, 256, 1, 14208] + - Exact: [512, 256, 1, 15200] + - Exact: [256, 512, 1, 12288] + - Exact: [512, 256, 1, 12288] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: 
["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 200, 1, 560] + - Exact: [768, 320, 1, 768] + - Exact: [1024, 120, 1, 1024] + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 1280] + - Exact: [4096, 32, 1, 4096] + - Exact: [3072, 64, 1, 1024] + - Exact: [2944, 64, 1, 256] + - Exact: [6144, 32, 1, 2560] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [448, 448, 1, 256] + - Exact: [7680, 32, 1, 2560] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [3072, 32, 1, 1024] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [35, 8457, 1, 1760] + - Exact: [64, 2944, 1, 128] + - Exact: [8448, 32, 1, 2816] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [2560, 64, 1, 2560] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [2560, 32, 1, 2560] + - Exact: [128, 1408, 1, 256] + - Exact: [35, 8457, 1, 2560] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [704, 128, 1, 3328] + - Exact: [4288, 64, 1, 1280] + - Exact: [1408, 128, 1, 128] + - Exact: [128, 1024, 1, 
1280] + - Exact: [2944, 64, 1, 128] + - Exact: [1024, 128, 1, 3328] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [35, 8457, 1, 2048] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [35, 8457, 1, 4096] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [1408, 128, 1, 256] + - Exact: [64, 2944, 1, 1280] + - Exact: [128, 704, 1, 128] + - Exact: [64, 1408, 1, 3328] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [4096, 64, 1, 4096] + - Exact: [1760, 128, 1, 1760] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [64, 2944, 1, 3328] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 1408, 1, 128] + - Exact: [256, 704, 1, 128] + - Exact: [1408, 64, 1, 128] + - Exact: [448, 448, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [4608, 32, 1, 1536] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [2048, 128, 1, 2048] + - Exact: [1408, 128, 1, 3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 
256] + - Exact: [1856, 64, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - Exact: [2048, 64, 1, 2048] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [1760, 64, 1, 1760] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [256, 1024, 1, 196] + - Exact: [1024, 256, 1, 1536] + - Exact: [1024, 200, 1, 1408] + - Exact: [1024, 200, 1, 6144] + - Exact: [1024, 256, 1, 3328] + - Exact: [512, 256, 1, 3200] + - Exact: [1024, 200, 1, 4608] + - Exact: [512, 256, 1, 1792] + - Exact: [1024, 200, 1, 1792] + - Exact: [512, 200, 1, 2816] + - Exact: [512, 200, 1, 3072] + - Exact: [1024, 200, 1, 128] + - Exact: [1024, 200, 1, 5120] + - Exact: [1024, 256, 1, 256] + - Exact: [512, 256, 1, 2560] + - Exact: [1024, 256, 1, 4160] + - Exact: [1024, 200, 1, 512] + - Exact: [512, 512, 1, 1536] + - Exact: [1024, 256, 1, 896] + - Exact: [1024, 200, 1, 3200] + - Exact: [1024, 200, 1, 1536] + - Exact: [1024, 256, 1, 1024] + - Exact: [128, 1024, 1, 512] + - Exact: [1024, 256, 1, 5120] + - Exact: [1024, 200, 1, 2304] + - Exact: [1024, 256, 1, 1664] + - Exact: [512, 512, 1, 1024] + - Exact: [1024, 256, 1, 2080] + - Exact: [512, 200, 1, 768] + - Exact: [1024, 256, 1, 2816] + - Exact: [1024, 200, 1, 64] + - Exact: [512, 512, 1, 2304] + - Exact: [128, 1024, 1, 2048] + - Exact: [512, 200, 1, 2560] + - Exact: [512, 256, 1, 1024] + - Exact: [1024, 256, 1, 1920] + - Exact: [512, 200, 1, 2304] + - Exact: [1024, 256, 1, 384] + - Exact: [1024, 256, 1, 32] + - Exact: [1024, 200, 1, 2816] + - Exact: [1024, 200, 1, 3072] + - Exact: [512, 256, 1, 1536] + - Exact: [1024, 256, 1, 512] + - Exact: [256, 512, 1, 512] + - Exact: [1024, 200, 1, 3840] + - Exact: [256, 1024, 1, 512] + - Exact: [1024, 256, 1, 1152] + - Exact: [512, 512, 1, 2816] + - Exact: [512, 200, 1, 1280] + - Exact: [512, 200, 1, 3200] + - Exact: [1024, 256, 1, 
2304] + - Exact: [1024, 256, 1, 6144] + - Exact: [1024, 200, 1, 2560] + - Exact: [1024, 256, 1, 5632] + - Exact: [512, 256, 1, 768] + - Exact: [1024, 256, 1, 3072] + - Exact: [256, 512, 1, 2048] + - Exact: [1024, 200, 1, 1152] + - Exact: [512, 512, 1, 3072] + - Exact: [1024, 200, 1, 1664] + - Exact: [1024, 200, 1, 32] + - Exact: [1024, 200, 1, 384] + - Exact: [512, 256, 1, 2304] + - Exact: [256, 512, 1, 1024] + - Exact: [1024, 200, 1, 3328] + - Exact: [1024, 200, 1, 2080] + - Exact: [512, 200, 1, 1792] + - Exact: [1024, 256, 1, 1792] + - Exact: [1024, 200, 1, 7168] + - Exact: [512, 256, 1, 3072] + - Exact: [1024, 200, 1, 2048] + - Exact: [512, 512, 1, 1280] + - Exact: [1024, 200, 1, 1280] + - Exact: [512, 200, 1, 512] + - Exact: [1024, 256, 1, 2560] + - Exact: [1024, 200, 1, 1024] + - Exact: [1024, 256, 1, 3200] + - Exact: [512, 512, 1, 2560] + - Exact: [1024, 256, 1, 640] + - Exact: [1024, 256, 1, 3584] + - Exact: [512, 512, 1, 3200] + - Exact: [1024, 256, 1, 7680] + - Exact: [512, 200, 1, 1536] + - Exact: [512, 256, 1, 2816] + - Exact: [1024, 200, 1, 768] + - Exact: [512, 200, 1, 2048] + - Exact: [1024, 256, 1, 128] + - Exact: [1024, 200, 1, 4096] + - Exact: [1024, 256, 1, 1280] + - Exact: [1024, 200, 1, 896] + - Exact: [1024, 256, 1, 4608] + - Exact: [128, 1024, 1, 1024] + - Exact: [1024, 256, 1, 2048] + - Exact: [512, 256, 1, 1280] + - Exact: [256, 1024, 1, 2048] + - Exact: [512, 512, 1, 2048] + - Exact: [512, 256, 1, 512] + - Exact: [1024, 200, 1, 7680] + - Exact: [1024, 200, 1, 6656] + - Exact: [512, 200, 1, 1024] + - Exact: [1024, 256, 1, 3840] + - Exact: [512, 512, 1, 768] + - Exact: [1024, 256, 1, 64] + - Exact: [1024, 200, 1, 1920] + - Exact: [1024, 256, 1, 7168] + - Exact: [512, 512, 1, 1792] + - Exact: [1024, 200, 1, 256] + - Exact: [256, 1024, 1, 1024] + - Exact: [1024, 200, 1, 640] + - Exact: [1024, 200, 1, 4160] + - Exact: [1024, 200, 1, 5632] + - Exact: [1024, 256, 1, 6656] + - Exact: [1024, 256, 1, 768] + - Exact: [512, 256, 1, 2048] + - Exact: 
[1024, 200, 1, 3584] + - Exact: [1024, 256, 1, 1408] + - Exact: [1024, 256, 1, 4096] + - Exact: [1024, 128, 1, 289] + - Exact: [768, 192, 1, 289] + - Exact: [32, 32, 1984, 64] + - Exact: [54, 54, 1184, 64] + - Exact: [35, 35, 1808, 64] + - Exact: [45, 45, 1424, 64] + - Exact: [49, 49, 1296, 64] + - Exact: [59, 59, 1088, 64] + - Exact: [41, 41, 1552, 64] + - Exact: [38, 38, 1680, 64] + - Exact: [2048, 128, 1, 4096] + - Exact: [1024, 128, 1, 1024] + - Exact: [1152, 128, 1, 784] + - Exact: [864, 96, 1, 1225] + - Exact: [896, 192, 1, 289] + - Exact: [768, 128, 1, 289] + - Exact: [1344, 192, 1, 289] + - Exact: [384, 192, 1, 1225] + - Exact: [832, 192, 1, 49] + - Exact: [1280, 192, 1, 64] + - Exact: [512, 256, 1, 196] + - Exact: [864, 96, 1, 289] + - Exact: [896, 128, 1, 289] + - Exact: [1200, 64, 1, 1225] + - Exact: [1024, 256, 1, 289] + - Exact: [1024, 256, 1, 196] + - Exact: [1120, 192, 1, 289] + - Exact: [800, 96, 1, 784] + - Exact: [864, 128, 1, 784] + - Exact: [1344, 224, 1, 289] + - Exact: [1152, 192, 1, 784] + - Exact: [800, 128, 1, 196] + - Exact: [864, 208, 1, 196] + - Exact: [720, 192, 1, 5041] + - Exact: [576, 192, 1, 3136] + - Exact: [832, 256, 1, 49] + - Exact: [1200, 128, 1, 49] + - Exact: [528, 256, 1, 196] + - Exact: [256, 512, 1, 784] + - Exact: [480, 192, 1, 196] + - Exact: [96, 64, 36, 2592] + - Exact: [96, 96, 36, 2592] + - Exact: [1024, 192, 1, 289] + - Exact: [528, 160, 1, 196] + - Exact: [512, 160, 1, 196] + - Exact: [768, 160, 1, 289] + - Exact: [64, 32, 36, 43808] + - Exact: [832, 160, 1, 49] + - Exact: [2048, 64, 1, 1001] + - Exact: [2048, 128, 1, 1001] + - Exact: [1536, 64, 1, 1001] + - Exact: [96, 96, 49, 3136] + - Exact: [64, 32, 49, 57600] + - Exact: [96, 64, 49, 6272] + - Exact: [64, 32, 49, 115200] + - Exact: [96, 96, 64, 2304] + - Exact: [96, 96, 49, 6272] + - Exact: [96, 64, 36, 5184] + - Exact: [64, 32, 64, 40000] + - Exact: [96, 64, 64, 4608] + - Exact: [96, 96, 36, 5184] + - Exact: [96, 64, 64, 2304] + - Exact: [96, 64, 49, 3136] + - 
Exact: [64, 32, 36, 87616] + - Exact: [64, 32, 64, 80000] + - Exact: [96, 96, 64, 4608] + - Exact: [64, 32, 36, 175232] + - Exact: [128, 128, 11, 3264] + - Exact: [192, 128, 11, 6528] + - Exact: [128, 128, 11, 6528] + - Exact: [160, 160, 9, 4896] + - Exact: [192, 160, 11, 6528] + - Exact: [192, 128, 9, 4896] + - Exact: [128, 128, 9, 4896] + - Exact: [192, 128, 11, 3264] + - Exact: [160, 160, 11, 3264] + - Exact: [192, 160, 9, 4896] + - Exact: [192, 160, 11, 3264] + - Exact: [160, 160, 11, 6528] + - Exact: [4096, 64, 1, 1024] + - Exact: [49, 49, 160, 64] + - Exact: [54, 54, 592, 64] + - Exact: [59, 59, 512, 64] + - Exact: [104, 104, 16, 64] + - Exact: [32, 32, 624, 64] + - Exact: [32, 32, 992, 64] + - Exact: [35, 35, 384, 64] + - Exact: [35, 35, 904, 64] + - Exact: [38, 38, 320, 64] + - Exact: [38, 38, 840, 64] + - Exact: [41, 41, 312, 64] + - Exact: [41, 41, 776, 64] + - Exact: [45, 45, 392, 64] + - Exact: [45, 45, 712, 64] + - Exact: [49, 49, 648, 64] + - Exact: [54, 54, 200, 64] + - Exact: [59, 59, 544, 64] + - Exact: [91, 91, 40, 64] + - Exact: [91, 93, 40, 64] + - Exact: [93, 93, 40, 64] + - Exact: [102, 102, 56, 64] + - Exact: [103, 103, 16, 64] + - Exact: [103, 104, 16, 64] + - Exact: [112, 112, 16, 64] + - Exact: [112, 123, 16, 64] + - Exact: [119, 119, 32, 64] + - Exact: [119, 135, 32, 64] + - Exact: [123, 123, 16, 64] + - Exact: [512, 512, 1, 512] + - Exact: [513, 512, 1, 512] + - Exact: [512, 512, 1, 513] + - Exact: [512, 512, 1, 511] + - Exact: [512, 513, 1, 512] + - Exact: [512, 511, 1, 512] + - Exact: [511, 512, 1, 512] + - Exact: [479, 512, 1, 512] + - Exact: [480, 511, 1, 512] + - Exact: [480, 512, 1, 511] + - Exact: [480, 512, 1, 513] + - Exact: [480, 513, 1, 512] + - Exact: [481, 512, 1, 512] + - Exact: [511, 480, 1, 512] + - Exact: [512, 479, 1, 512] + - Exact: [512, 480, 1, 511] + - Exact: [512, 480, 1, 513] + - Exact: [512, 481, 1, 512] + - Exact: [513, 480, 1, 512] + - Exact: [480, 512, 1, 512] + - Exact: [512, 480, 1, 512] + - Exact: [512, 
512, 1, 64] + - Exact: [2048, 114, 1, 512] + - Exact: [2048, 114, 1, 768] + - Exact: [256, 684, 1, 1024] + - Exact: [33, 33, 1600, 32] + - Exact: [256, 684, 1, 1024] + - Exact: [383, 384, 1, 384] + - Exact: [385, 384, 1, 384] + - Exact: [384, 383, 1, 384] + - Exact: [384, 385, 1, 384] + - Exact: [384, 384, 1, 383] + - Exact: [384, 384, 1, 385] + - Exact: [384, 384, 1, 384] + - Exact: [128, 64, 25, 6498] + - Exact: [128, 64, 25, 6859] + - Exact: [64, 64, 64, 3042] + - Exact: [64, 64, 64, 3211] + - Exact: [64, 64, 49, 4050] + - Exact: [64, 64, 49, 4275] + - Exact: [64, 64, 36, 6498] + - Exact: [64, 64, 36, 6859] + - Exact: [1152, 128, 1, 1444] + - Exact: [512, 256, 1, 361] + - Exact: [576, 128, 1, 1444] + - Exact: [1024, 308, 1, 1024] + - Exact: [1024, 160, 1, 1024] + - Exact: [1024, 180, 1, 1024] + - Exact: [32, 32, 4608, 64] + - Exact: [32, 35, 4608, 64] + - Exact: [34, 34, 4736, 64] + - Exact: [35, 35, 4608, 64] + - Exact: [128, 864, 1, 256] + - Exact: [256, 864, 1, 512] + - Exact: [512, 256, 1, 784] + - Exact: [1024, 96, 1, 1024] + - Exact: [1024, 256, 1, 3800] + - Exact: [1024, 256, 1, 3400] + - Exact: [256, 1024, 1, 3400] + - Exact: [1024, 256, 1, 3220] + - Exact: [256, 1024, 1, 3220] + - Exact: [1024, 256, 1, 3456] + - Exact: [256, 1024, 1, 3456] + - Exact: [256, 1024, 1, 3072] + - Exact: [1024, 256, 1, 3552] + - Exact: [256, 1024, 1, 3552] + - Exact: [256, 1024, 1, 2852] + - Exact: [1024, 256, 1, 2852] + - Exact: [256, 512, 1, 10752] + - Exact: [256, 1024, 1, 3800] + - Exact: [256, 512, 1, 10560] + - Exact: [256, 1024, 1, 2992] + - Exact: [256, 1024, 1, 2688] + - Exact: [1024, 256, 1, 2688] + - Exact: [256, 1024, 1, 2904] + - Exact: [1024, 256, 1, 2904] + - Exact: [256, 1024, 1, 2640] + - Exact: [1024, 256, 1, 2640] + - Exact: [1024, 256, 1, 4032] + - Exact: [1024, 256, 1, 2992] + - Exact: [256, 1024, 1, 3360] + - Exact: [1024, 256, 1, 3360] + - Exact: [1024, 256, 1, 3500] + - Exact: [256, 1024, 1, 3500] + - Exact: [1024, 256, 1, 3168] + - Exact: [256, 1024, 
1, 3168] + - Exact: [256, 1024, 1, 3036] + - Exact: [1024, 256, 1, 4200] + - Exact: [1024, 256, 1, 3600] + - Exact: [256, 1024, 1, 3600] + - Exact: [256, 1024, 1, 2944] + - Exact: [1024, 256, 1, 2944] + - Exact: [1024, 256, 1, 3700] + - Exact: [256, 1024, 1, 2352] + - Exact: [1024, 256, 1, 2352] + - Exact: [256, 1024, 1, 3700] + - Exact: [256, 1024, 1, 2816] + - Exact: [256, 512, 1, 11408] + - Exact: [1024, 256, 1, 3036] + - Exact: [1024, 256, 1, 3264] + - Exact: [256, 1024, 1, 3264] + - Exact: [1024, 256, 1, 3864] + - Exact: [256, 1024, 1, 4032] + - Exact: [1024, 256, 1, 3128] + - Exact: [256, 1024, 1, 3128] + - Exact: [256, 1024, 1, 3200] + - Exact: [256, 512, 1, 11616] + - Exact: [1024, 256, 1, 4000] + - Exact: [256, 1024, 1, 2520] + - Exact: [1024, 256, 1, 2520] + - Exact: [256, 1024, 1, 2976] + - Exact: [256, 1024, 1, 2400] + - Exact: [1024, 256, 1, 2400] + - Exact: [1024, 256, 1, 3696] + - Exact: [1024, 256, 1, 3900] + - Exact: [1024, 256, 1, 3772] + - Exact: [256, 1024, 1, 3696] + - Exact: [256, 1024, 1, 2728] + - Exact: [1024, 256, 1, 2728] + - Exact: [1024, 256, 1, 2480] + - Exact: [256, 1024, 1, 2480] + - Exact: [1024, 256, 1, 2880] + - Exact: [512, 256, 1, 3220] + - Exact: [256, 1024, 1, 2880] + - Exact: [256, 1024, 1, 4200] + - Exact: [1024, 256, 1, 3648] + - Exact: [1024, 256, 1, 3312] + - Exact: [256, 1024, 1, 3648] + - Exact: [1024, 256, 1, 3300] + - Exact: [1024, 256, 1, 3528] + - Exact: [256, 1024, 1, 2604] + - Exact: [1024, 256, 1, 2604] + - Exact: [512, 256, 1, 11408] + - Exact: [256, 1024, 1, 3312] + - Exact: [256, 1024, 1, 3300] + - Exact: [256, 1024, 1, 3528] + - Exact: [1024, 256, 1, 2976] + - Exact: [1024, 256, 1, 2760] + - Exact: [512, 256, 1, 3800] + - Exact: [256, 1024, 1, 2760] + - Exact: [1024, 256, 1, 2160] + - Exact: [256, 1024, 1, 2160] + - Exact: [512, 256, 1, 11616] + - Exact: [512, 256, 1, 2852] + - Exact: [256, 1024, 1, 3864] + - Exact: [512, 256, 1, 2640] + - Exact: [256, 1024, 1, 4000] + - Exact: [512, 256, 1, 2904] + - Exact: 
[256, 1024, 1, 3900] + - Exact: [512, 256, 1, 2688] + - Exact: [256, 1024, 1, 3772] + - Exact: [512, 256, 1, 3400] + - Exact: [512, 256, 1, 3456] + - Exact: [512, 256, 1, 3552] + - Exact: [29000, 35, 1, 2560] + - Exact: [29000, 36, 1, 2560] + - Exact: [29000, 39, 1, 2560] + - Exact: [29000, 40, 1, 2560] + - Exact: [29000, 42, 1, 2560] + - Exact: [29000, 43, 1, 2560] + - Exact: [29000, 44, 1, 2560] + - Exact: [29000, 46, 1, 2560] + - Exact: [29000, 48, 1, 2560] + - Exact: [29000, 49, 1, 2560] + - Exact: [29000, 50, 1, 2560] + - Exact: [29000, 51, 1, 2560] + - Exact: [29000, 53, 1, 2560] + - Exact: [29000, 54, 1, 2560] + - Exact: [29000, 55, 1, 2560] + - Exact: [29000, 56, 1, 2560] + - Exact: [29000, 57, 1, 2560] + - Exact: [29000, 58, 1, 2560] + - Exact: [29000, 59, 1, 2560] + - Exact: [29000, 61, 1, 2560] + - Exact: [29000, 63, 1, 2560] + +# bodys smaSizeGSU + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [288, 64, 1, 21609] + - Exact: [32, 32, 36, 43808] + - Exact: [32, 32, 64, 40000] + - Exact: [32, 32, 49, 115200] + - Exact: [32, 32, 36, 175232] + - Exact: [32, 32, 49, 57600] + - Exact: [32, 32, 36, 87616] + - Exact: [32, 32, 64, 80000] + - Exact: [256, 128, 1, 13600] + - Exact: [256, 128, 1, 12880] + - Exact: [128, 512, 1, 15200] + - Exact: [512, 128, 1, 15200] + - Exact: [128, 512, 1, 
11408] + - Exact: [256, 128, 1, 13824] + - Exact: [128, 512, 1, 11616] + - Exact: [256, 128, 1, 14208] + - Exact: [128, 512, 1, 14208] + - Exact: [256, 128, 1, 15200] + - Exact: [512, 128, 1, 11408] + - Exact: [512, 128, 1, 16800] + - Exact: [128, 512, 1, 11264] + - Exact: [512, 128, 1, 11616] + - Exact: [512, 128, 1, 16128] + - Exact: [512, 128, 1, 11968] + - Exact: [128, 512, 1, 11968] + - Exact: [512, 128, 1, 12288] + - Exact: [128, 512, 1, 12288] + - Exact: [128, 512, 1, 12672] + - Exact: [512, 128, 1, 11776] + - Exact: [512, 128, 1, 12144] + - Exact: [512, 128, 1, 11264] + - Exact: [128, 512, 1, 12144] + - Exact: [512, 128, 1, 12672] + - Exact: [128, 512, 1, 12512] + - Exact: [128, 512, 1, 11776] + - Exact: [256, 128, 1, 12288] + - Exact: [40, 40, 1, 1909283] + - Exact: [40, 40, 1, 3818566] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [30522, 20, 1, 1024] + - Exact: [1760, 32, 1, 1760] + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + - Exact: [1760, 16, 1, 1760] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [8448, 16, 1, 2816] + - Exact: [2368, 4, 1, 256] + - Exact: [5888, 4, 1, 128] + - Exact: [4288, 4, 
1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [2048, 16, 1, 2048] + - Exact: [1408, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2368, 4, 1, 3328] + - Exact: [5056, 4, 1, 1280] + - Exact: [3072, 16, 1, 1024] + - Exact: [1408, 4, 1, 3328] + - Exact: [6144, 16, 1, 2560] + - Exact: [4096, 16, 1, 4096] + - Exact: [1856, 4, 1, 256] + - Exact: [6784, 4, 1, 128] + - Exact: [4288, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [2560, 16, 1, 2560] + - Exact: [7680, 16, 1, 2560] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [6784, 4, 1, 3328] + - Exact: [2048, 32, 1, 2048] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 1280] + - Exact: [4608, 16, 1, 1536] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [2048, 32, 1, 1001] + - Exact: [1536, 32, 1, 1001] + - Exact: [1600, 1, 1, 1024] + - Exact: [32768, 1, 1, 256] + - Exact: [2048, 2, 1, 2048] + - Exact: [2560, 4, 1, 2560] + - Exact: [3456, 1, 1, 256] + - Exact: [4096, 1, 1, 256] + - Exact: [6912, 1, 1, 256] + - Exact: [2048, 8, 1, 2048] + - Exact: [2560, 2, 1, 2560] + - Exact: [29000, 27, 1, 2560] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + 
BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] + - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + - Exact: [32, 1600, 1, 512] + - Exact: [2, 2048, 1, 1024] + - Exact: [1, 4096, 1, 256] + - Exact: [1, 6912, 1, 256] + - Exact: [2, 2048, 1, 768] + - Exact: [2, 4608, 1, 768] + - Exact: [2, 4608, 1, 1024] + +# bodys bigK + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1,4] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + 
BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 16, 1, 500000] + - Exact: [1024, 8, 1, 500000] + - Exact: [512, 16, 1, 500000] + - Exact: [512, 8, 1, 500000] + - Exact: [64, 80, 1, 5329] + - Exact: [576, 96, 1, 5329] + - Exact: [288, 32, 1, 21609] + - Exact: [576, 96, 1, 5041] + - Exact: [27, 32, 1, 22201] + - Exact: [160, 64, 1, 5329] + - Exact: [448, 64, 1, 5329] + - Exact: [147, 64, 1, 12544] + - Exact: [147, 64, 1, 22500] + - Exact: [576, 64, 1, 5625] + - Exact: [256, 128, 1, 10752] + - Exact: [256, 128, 1, 10560] + - Exact: [256, 128, 1, 11408] + - Exact: [256, 12, 1, 11408] + - Exact: [256, 128, 1, 11616] + - Exact: [256, 12, 1, 11616] + - Exact: [256, 12, 1, 12288] + - Exact: [11, 11, 1, 1909283] + - Exact: [11, 11, 1, 3818566] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [768, 32, 1, 768] + - Exact: [768, 64, 1, 768] + - Exact: [1024, 80, 1, 1024] + - Exact: [1024, 20, 1, 1024] + - Exact: [768, 16, 1, 768] + - Exact: [1024, 4, 1, 1024] + - Exact: [1024, 6, 1, 1024] + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 
1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [448, 128, 1, 128] + - Exact: [256, 256, 1, 3328] + - Exact: [4, 64, 1, 1280] + - Exact: [64, 64, 1, 3328] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [256, 64, 1, 1280] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 1024, 1, 1280] + - Exact: [4, 448, 1, 1280] + - Exact: [448, 4, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [64, 128, 1, 128] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [64, 448, 1, 256] + - Exact: [4, 448, 1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [64, 128, 1, 1280] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [4, 64, 1, 128] + - Exact: [64, 1024, 1, 
128] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [64, 128, 1, 256] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] + - Exact: [4, 4, 1, 3328] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [64, 1024, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [4, 128, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + - Exact: [256, 64, 1, 3136] + - Exact: [64, 200, 1, 1024] + - Exact: [32, 512, 1, 1024] + - Exact: [1, 512, 1, 1024] + - Exact: [128, 512, 1, 2048] + - Exact: [64, 256, 1, 1024] + - Exact: [1, 200, 1, 1024] + - Exact: [128, 512, 1, 1024] + - Exact: [32, 256, 1, 2048] + - Exact: [32, 256, 1, 512] + - Exact: [256, 200, 1, 1024] + - Exact: [1, 256, 1, 2048] + - Exact: [32, 200, 1, 2048] + - Exact: [128, 200, 1, 1024] + - Exact: [128, 256, 1, 2048] + - Exact: [64, 1024, 1, 1024] + - Exact: [1, 512, 1, 2048] + - Exact: [128, 256, 1, 512] + - Exact: [128, 200, 1, 2048] + - Exact: [64, 200, 1, 512] + - Exact: [1, 256, 1, 1024] + - Exact: [1, 1024, 1, 1024] + - Exact: [256, 256, 1, 2048] + - Exact: [128, 
256, 1, 1024] + - Exact: [1, 256, 1, 4096] + - Exact: [32, 512, 1, 512] + - Exact: [64, 200, 1, 2048] + - Exact: [1, 200, 1, 2048] + - Exact: [1, 512, 1, 4096] + - Exact: [256, 256, 1, 1024] + - Exact: [64, 256, 1, 2048] + - Exact: [1, 200, 1, 4096] + - Exact: [32, 256, 1, 1024] + - Exact: [32, 200, 1, 1024] + - Exact: [32, 512, 1, 2048] + - Exact: [128, 200, 1, 512] + - Exact: [64, 1024, 1, 2048] + - Exact: [1, 1024, 1, 2048] + - Exact: [32, 1024, 1, 512] + - Exact: [64, 1024, 1, 512] + - Exact: [1, 1024, 1, 4096] + - Exact: [64, 256, 1, 512] + - Exact: [256, 200, 1, 512] + - Exact: [32, 1024, 1, 1024] + - Exact: [32, 200, 1, 512] + - Exact: [256, 256, 1, 512] + - Exact: [128, 512, 1, 512] + - Exact: [256, 200, 1, 2048] + - Exact: [64, 512, 1, 2048] + - Exact: [32, 1024, 1, 2048] + - Exact: [256, 64, 1, 1225] + - Exact: [384, 64, 1, 1225] + - Exact: [288, 64, 1, 1225] + - Exact: [384, 96, 1, 1225] + - Exact: [11, 11, 5456, 64] + - Exact: [14, 14, 4368, 64] + - Exact: [23, 23, 2720, 64] + - Exact: [13, 13, 4672, 64] + - Exact: [29, 29, 2176, 64] + - Exact: [12, 12, 5040, 64] + - Exact: [27, 27, 2336, 64] + - Exact: [10, 10, 5952, 64] + - Exact: [7, 7, 8192, 64] + - Exact: [16, 16, 3840, 64] + - Exact: [17, 17, 3632, 64] + - Exact: [9, 9, 6544, 64] + - Exact: [8, 8, 7280, 64] + - Exact: [21, 21, 2976, 64] + - Exact: [19, 19, 3264, 64] + - Exact: [25, 25, 2512, 64] + - Exact: [18, 18, 3440, 64] + - Exact: [15, 15, 4096, 64] + - Exact: [2, 16, 1, 768] + - Exact: [2, 8, 1, 768] + - Exact: [2, 64, 1, 768] + - Exact: [256, 128, 1, 784] + - Exact: [192, 48, 1, 1225] + - Exact: [64, 256, 1, 3136] + - Exact: [512, 144, 1, 196] + - Exact: [400, 32, 1, 784] + - Exact: [832, 48, 1, 49] + - Exact: [192, 32, 1, 784] + - Exact: [288, 48, 1, 1225] + - Exact: [512, 112, 1, 196] + - Exact: [528, 32, 1, 196] + - Exact: [576, 64, 1, 3136] + - Exact: [480, 64, 1, 196] + - Exact: [192, 64, 1, 784] + - Exact: [192, 32, 1, 1225] + - Exact: [400, 48, 1, 196] + - Exact: [480, 16, 1, 196] + 
- Exact: [512, 64, 1, 196] + - Exact: [800, 64, 1, 196] + - Exact: [512, 128, 1, 784] + - Exact: [256, 64, 1, 784] + - Exact: [256, 48, 1, 1225] + - Exact: [192, 16, 1, 784] + - Exact: [576, 96, 1, 1225] + - Exact: [512, 128, 1, 196] + - Exact: [192, 96, 1, 784] + - Exact: [192, 64, 1, 1225] + - Exact: [512, 32, 1, 196] + - Exact: [528, 128, 1, 196] + - Exact: [128, 512, 1, 784] + - Exact: [64, 64, 1, 3136] + - Exact: [256, 32, 1, 784] + - Exact: [480, 96, 1, 196] + - Exact: [1024, 32, 1, 1001] + - Exact: [18, 18, 648, 64] + - Exact: [7, 7, 736, 64] + - Exact: [8, 8, 264, 64] + - Exact: [9, 9, 416, 64] + - Exact: [10, 10, 448, 64] + - Exact: [11, 11, 568, 64] + - Exact: [12, 12, 480, 64] + - Exact: [12, 12, 2520, 64] + - Exact: [13, 13, 576, 64] + - Exact: [13, 13, 2336, 64] + - Exact: [14, 14, 704, 64] + - Exact: [14, 14, 2184, 64] + - Exact: [15, 15, 688, 64] + - Exact: [15, 15, 2048, 64] + - Exact: [16, 16, 712, 64] + - Exact: [16, 16, 1920, 64] + - Exact: [17, 17, 688, 64] + - Exact: [17, 17, 1816, 64] + - Exact: [18, 18, 1720, 64] + - Exact: [19, 19, 680, 64] + - Exact: [19, 19, 1632, 64] + - Exact: [21, 21, 1472, 64] + - Exact: [21, 21, 1488, 64] + - Exact: [23, 23, 64, 64] + - Exact: [23, 23, 1360, 64] + - Exact: [25, 25, 176, 64] + - Exact: [25, 25, 1256, 64] + - Exact: [26, 26, 56, 64] + - Exact: [26, 27, 56, 64] + - Exact: [27, 27, 56, 64] + - Exact: [27, 27, 1168, 64] + - Exact: [29, 29, 136, 64] + - Exact: [29, 29, 1088, 64] + - Exact: [256, 1, 1, 4] + - Exact: [2, 1, 1, 1024] + - Exact: [1024, 1, 1, 1024] + - Exact: [2, 6, 1, 1024] + - Exact: [2, 8, 1, 1024] + - Exact: [14, 14, 1, 64] + - Exact: [15, 14, 1, 64] + - Exact: [15, 15, 1, 64] + - Exact: [17, 15, 1, 64] + - Exact: [17, 17, 1, 64] + - Exact: [30, 30, 1, 64] + - Exact: [30, 31, 1, 64] + - Exact: [31, 31, 1, 64] + - Exact: [1024, 32, 1, 1024] + - Exact: [2, 32, 1, 1024] + - Exact: [2, 4, 1, 1024] + - Exact: [64, 512, 1, 512] + - Exact: [64, 960, 1, 1024] + - Exact: [200, 1, 1, 1024] + - Exact: 
[512, 1, 1, 2048] + - Exact: [64, 512, 1, 1024] + - Exact: [3, 3, 512, 64] + - Exact: [5, 5, 512, 64] + - Exact: [9, 9, 512, 64] + - Exact: [128, 256, 1, 1444] + - Exact: [256, 128, 1, 25] + - Exact: [256, 128, 1, 9] + - Exact: [256, 256, 1, 1444] + - Exact: [512, 128, 1, 100] + - Exact: [64, 128, 1, 1444] + - Exact: [1024, 77, 1, 1024] + - Exact: [2, 10, 1, 1024] + - Exact: [1024, 10, 1, 1024] + - Exact: [2, 39, 1, 1024] + - Exact: [1024, 39, 1, 1024] + - Exact: [2, 40, 1, 1024] + - Exact: [1024, 40, 1, 1024] + - Exact: [2, 41, 1, 1024] + - Exact: [1024, 41, 1, 1024] + - Exact: [2, 5, 1, 1024] + - Exact: [1024, 5, 1, 1024] + - Exact: [1024, 8, 1, 1024] + - Exact: [2, 9, 1, 1024] + - Exact: [1024, 9, 1, 1024] + - Exact: [4, 4, 32768, 64] + - Exact: [4, 4, 38400, 64] + - Exact: [14, 14, 10880, 64] + - Exact: [15, 14, 10880, 64] + - Exact: [15, 15, 7680, 64] + - Exact: [15, 15, 10880, 64] + - Exact: [17, 15, 7680, 64] + - Exact: [17, 17, 6144, 64] + - Exact: [17, 17, 7680, 64] + - Exact: [21, 17, 6144, 64] + - Exact: [21, 21, 6144, 64] + - Exact: [24, 24, 4736, 64] + - Exact: [30, 30, 2048, 64] + - Exact: [30, 31, 2048, 64] + - Exact: [31, 31, 2048, 64] + - Exact: [34, 24, 4736, 64] + - Exact: [128, 128, 1, 64] + - Exact: [2, 1024, 1, 1024] + - Exact: [5, 5, 1, 64] + - Exact: [33, 33, 1, 32] + - Exact: [5, 5, 960, 64] + - Exact: [27, 27, 32768, 128] + - Exact: [960, 1, 1, 2048] + - Exact: [2, 2, 1, 2048] + - Exact: [1024, 16, 1, 1024] + - Exact: [2, 16, 1, 1024] + - Exact: [2, 4, 1, 2560] + - Exact: [1024, 64, 1, 1024] + - Exact: [2, 64, 1, 1024] + - Exact: [864, 1, 1, 256] + - Exact: [2, 80, 1, 1024] + - Exact: [1024, 82, 1, 1024] + - Exact: [2, 82, 1, 1024] + - Exact: [1024, 12, 1, 1024] + - Exact: [2, 12, 1, 1024] + - Exact: [24, 24, 6816, 64] + - Exact: [26, 26, 6272, 64] + - Exact: [256, 128, 1, 3136] + - Exact: [2, 128, 1, 1024] + - Exact: [2, 96, 1, 1024] + - Exact: [768, 12, 1, 768] + - Exact: [768, 4, 1, 768] + - Exact: [256, 80, 1, 784] + - Exact: [256, 12, 
1, 3800] + - Exact: [256, 3, 1, 3800] + - Exact: [256, 12, 1, 950] + - Exact: [256, 3, 1, 950] + - Exact: [256, 12, 1, 3220] + - Exact: [256, 3, 1, 3220] + - Exact: [256, 12, 1, 3072] + - Exact: [256, 3, 1, 3072] + - Exact: [256, 12, 1, 850] + - Exact: [256, 3, 1, 850] + - Exact: [256, 12, 1, 2852] + - Exact: [256, 3, 1, 2852] + - Exact: [256, 12, 1, 805] + - Exact: [256, 3, 1, 805] + - Exact: [256, 3, 1, 864] + - Exact: [256, 3, 1, 768] + - Exact: [256, 12, 1, 864] + - Exact: [256, 12, 1, 768] + - Exact: [256, 12, 1, 2904] + - Exact: [256, 3, 1, 2904] + - Exact: [256, 3, 1, 713] + - Exact: [256, 12, 1, 888] + - Exact: [256, 3, 1, 888] + - Exact: [256, 12, 1, 713] + - Exact: [256, 3, 1, 660] + - Exact: [256, 3, 1, 672] + - Exact: [256, 12, 1, 660] + - Exact: [256, 3, 1, 726] + - Exact: [256, 12, 1, 672] + - Exact: [256, 3, 1, 247] + - Exact: [256, 12, 1, 726] + - Exact: [256, 3, 1, 216] + - Exact: [256, 3, 1, 3400] + - Exact: [256, 3, 1, 221] + - Exact: [256, 12, 1, 3552] + - Exact: [256, 3, 1, 3456] + - Exact: [256, 3, 1, 204] + - Exact: [256, 12, 1, 3400] + - Exact: [256, 12, 1, 3456] + - Exact: [256, 12, 1, 221] + - Exact: [256, 3, 1, 3552] + - Exact: [256, 3, 1, 228] + - Exact: [256, 3, 1, 234] + - Exact: [256, 12, 1, 234] + - Exact: [81, 1024, 1, 1024] + - Exact: [81, 1000, 1, 1024] + - Exact: [256, 12, 1, 228] + - Exact: [256, 3, 1, 252] + - Exact: [256, 12, 1, 252] + - Exact: [256, 12, 1, 247] + - Exact: [1024, 6, 1, 2] + - Exact: [2, 8, 1, 2048] + - Exact: [2, 20, 1, 1024] + - Exact: [2, 2, 1, 2560] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml b/Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml new file mode 100644 index 000000000..80da4363c --- /dev/null +++ b/Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml @@ -0,0 +1,1292 @@ +# headers +GlobalParameters: + MinimumRequiredVersion: 4.9.0 + 
PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + NumBenchmarks: 1 + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + KernelTime: True + SleepPercent: 500 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 +# PrintCodeCommands: True + PrintSolutionRejectionReason: True + PrintWinnersOnly: True +# PinClocks: True + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + +# bodys bigSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 8, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [2944, 4288, 1, 1280] + - Exact: [2368, 5888, 1, 256] + - Exact: [5888, 1024, 1, 1280] + - Exact: [5888, 1856, 1, 3328] + - Exact: [5056, 704, 1, 256] + - Exact: [5888, 2944, 1, 3328] + - Exact: [1856, 4288, 1, 256] + - Exact: [1024, 5056, 1, 128] + - Exact: [5056, 5056, 1, 3328] + - Exact: [1408, 5888, 1, 1280] + - Exact: [1024, 3584, 1, 3328] + - Exact: [5888, 1408, 1, 1280] + - Exact: [1024, 2368, 1, 256] + - Exact: [1408, 1856, 1, 1280] + - Exact: [5056, 5056, 1, 1280] + - Exact: [448, 5056, 1, 256] + - Exact: [1856, 1408, 1, 128] + - Exact: 
[6784, 256, 1, 3328] + - Exact: [6784, 4288, 1, 3328] + - Exact: [4288, 448, 1, 256] + - Exact: [1856, 2368, 1, 3328] + - Exact: [4288, 2944, 1, 1280] + - Exact: [704, 5056, 1, 1280] + - Exact: [2368, 704, 1, 3328] + - Exact: [256, 5888, 1, 256] + - Exact: [1856, 4288, 1, 3328] + - Exact: [5888, 1024, 1, 256] + - Exact: [448, 5056, 1, 3328] + - Exact: [1408, 2944, 1, 256] + - Exact: [6784, 5056, 1, 3328] + - Exact: [5056, 5056, 1, 256] + - Exact: [1408, 6784, 1, 128] + - Exact: [704, 5056, 1, 128] + - Exact: [2368, 2944, 1, 1280] + - Exact: [6784, 6784, 1, 1280] + - Exact: [1408, 4288, 1, 1280] + - Exact: [3584, 4288, 1, 1280] + - Exact: [2368, 704, 1, 1280] + - Exact: [5056, 4288, 1, 3328] + - Exact: [3584, 2368, 1, 3328] + - Exact: [6784, 448, 1, 1280] + - Exact: [1408, 2944, 1, 128] + - Exact: [4288, 2944, 1, 256] + - Exact: [5888, 704, 1, 1280] + - Exact: [448, 5888, 1, 128] + - Exact: [5056, 2368, 1, 1280] + - Exact: [448, 3584, 1, 1280] + - Exact: [6784, 5888, 1, 256] + - Exact: [5888, 2944, 1, 128] + - Exact: [1024, 1408, 1, 256] + - Exact: [2368, 2368, 1, 3328] + - Exact: [1856, 6784, 1, 128] + - Exact: [5056, 704, 1, 3328] + - Exact: [1408, 1856, 1, 256] + - Exact: [2368, 5056, 1, 256] + - Exact: [5888, 1856, 1, 256] + - Exact: [704, 5888, 1, 256] + - Exact: [2944, 6784, 1, 3328] + - Exact: [3584, 704, 1, 3328] + - Exact: [448, 4288, 1, 256] + - Exact: [704, 2368, 1, 1280] + - Exact: [1856, 2368, 1, 1280] + - Exact: [1856, 4288, 1, 1280] + - Exact: [704, 2944, 1, 128] + - Exact: [1408, 1024, 1, 1280] + - Exact: [704, 6784, 1, 256] + - Exact: [6784, 704, 1, 256] + - Exact: [5056, 1408, 1, 128] + - Exact: [3584, 4288, 1, 3328] + - Exact: [5888, 1856, 1, 1280] + - Exact: [5056, 1024, 1, 3328] + - Exact: [1024, 4288, 1, 128] + - Exact: [2368, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 1280] + - Exact: [2944, 3584, 1, 3328] + - Exact: [6784, 2944, 1, 256] + - Exact: [4288, 2368, 1, 3328] + - Exact: [1856, 2368, 1, 256] + - Exact: [3584, 6784, 1, 3328] + - Exact: 
[1024, 5888, 1, 3328] + - Exact: [5056, 4288, 1, 1280] + - Exact: [1408, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 128] + - Exact: [704, 5888, 1, 1280] + - Exact: [2368, 3584, 1, 128] + - Exact: [6784, 5888, 1, 3328] + - Exact: [1024, 5056, 1, 1280] + - Exact: [4288, 1024, 1, 256] + - Exact: [2944, 2368, 1, 128] + - Exact: [5888, 448, 1, 1280] + - Exact: [704, 5888, 1, 3328] + - Exact: [3584, 2944, 1, 256] + - Exact: [2368, 1024, 1, 3328] + - Exact: [1408, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 3328] + - Exact: [2368, 2368, 1, 256] + - Exact: [4288, 4288, 1, 1280] + - Exact: [1408, 4288, 1, 256] + - Exact: [5888, 448, 1, 128] + - Exact: [704, 6784, 1, 3328] + - Exact: [5888, 5888, 1, 1280] + - Exact: [5056, 1024, 1, 1280] + - Exact: [448, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 1280] + - Exact: [5056, 5888, 1, 1280] + - Exact: [4288, 5888, 1, 128] + - Exact: [1408, 3584, 1, 128] + - Exact: [448, 3584, 1, 128] + - Exact: [5888, 2944, 1, 1280] + - Exact: [2368, 5888, 1, 128] + - Exact: [3584, 5888, 1, 256] + - Exact: [2368, 704, 1, 128] + - Exact: [3584, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 128] + - Exact: [5056, 704, 1, 128] + - Exact: [5056, 1408, 1, 3328] + - Exact: [6784, 1024, 1, 3328] + - Exact: [6784, 2944, 1, 3328] + - Exact: [2944, 5056, 1, 3328] + - Exact: [1856, 1856, 1, 256] + - Exact: [1024, 5888, 1, 128] + - Exact: [6784, 2368, 1, 1280] + - Exact: [4288, 5888, 1, 1280] + - Exact: [4288, 4288, 1, 256] + - Exact: [4288, 1856, 1, 1280] + - Exact: [1856, 2944, 1, 3328] + - Exact: [256, 6784, 1, 3328] + - Exact: [256, 5056, 1, 128] + - Exact: [5056, 1024, 1, 256] + - Exact: [5056, 1856, 1, 3328] + - Exact: [1856, 1408, 1, 256] + - Exact: [4288, 1408, 1, 128] + - Exact: [4288, 5056, 1, 256] + - Exact: [5056, 256, 1, 3328] + - Exact: [1024, 5888, 1, 1280] + - Exact: [6784, 2368, 1, 128] + - Exact: [5056, 3584, 1, 256] + - Exact: [1856, 1024, 1, 1280] + - Exact: [6784, 4288, 1, 1280] + - Exact: [1856, 1856, 1, 1280] + - Exact: [6784, 2944, 1, 128] + - 
Exact: [5888, 1856, 1, 128] + - Exact: [2368, 1024, 1, 128] + - Exact: [5056, 3584, 1, 128] + - Exact: [5888, 5888, 1, 3328] + - Exact: [6784, 1024, 1, 256] + - Exact: [2944, 2368, 1, 256] + - Exact: [5056, 5888, 1, 3328] + - Exact: [1856, 1024, 1, 256] + - Exact: [3584, 448, 1, 1280] + - Exact: [448, 5888, 1, 256] + - Exact: [1408, 6784, 1, 3328] + - Exact: [4288, 704, 1, 128] + - Exact: [5056, 2944, 1, 256] + - Exact: [6784, 5888, 1, 128] + - Exact: [2368, 1856, 1, 256] + - Exact: [1408, 3584, 1, 3328] + - Exact: [2368, 6784, 1, 256] + - Exact: [5056, 1408, 1, 1280] + - Exact: [5056, 4288, 1, 128] + - Exact: [1408, 1856, 1, 128] + - Exact: [1408, 5888, 1, 3328] + - Exact: [6784, 6784, 1, 256] + - Exact: [4288, 2368, 1, 128] + - Exact: [1856, 4288, 1, 128] + - Exact: [2368, 2944, 1, 256] + - Exact: [3584, 1856, 1, 1280] + - Exact: [6784, 6784, 1, 128] + - Exact: [5888, 5056, 1, 256] + - Exact: [3584, 448, 1, 256] + - Exact: [448, 4288, 1, 128] + - Exact: [2944, 4288, 1, 3328] + - Exact: [256, 6784, 1, 256] + - Exact: [1408, 4288, 1, 128] + - Exact: [2944, 704, 1, 3328] + - Exact: [3584, 3584, 1, 256] + - Exact: [3584, 5056, 1, 256] + - Exact: [2944, 2368, 1, 1280] + - Exact: [1408, 3584, 1, 256] + - Exact: [6784, 3584, 1, 256] + - Exact: [5056, 2368, 1, 128] + - Exact: [2944, 2944, 1, 3328] + - Exact: [5056, 6784, 1, 256] + - Exact: [1856, 3584, 1, 128] + - Exact: [6784, 448, 1, 256] + - Exact: [3584, 6784, 1, 128] + - Exact: [5056, 1856, 1, 256] + - Exact: [1024, 1856, 1, 256] + - Exact: [1408, 6784, 1, 1280] + - Exact: [3584, 3584, 1, 1280] + - Exact: [5888, 5888, 1, 128] + - Exact: [5056, 5888, 1, 128] + - Exact: [5056, 2368, 1, 3328] + - Exact: [2944, 4288, 1, 256] + - Exact: [1408, 3584, 1, 1280] + - Exact: [2368, 6784, 1, 3328] + - Exact: [1856, 1408, 1, 1280] + - Exact: [6784, 704, 1, 128] + - Exact: [1408, 5888, 1, 256] + - Exact: [704, 2944, 1, 1280] + - Exact: [1856, 2368, 1, 128] + - Exact: [3584, 704, 1, 1280] + - Exact: [2944, 6784, 1, 128] + - Exact: 
[3584, 448, 1, 3328] + - Exact: [704, 2368, 1, 3328] + - Exact: [256, 5888, 1, 128] + - Exact: [2944, 2944, 1, 1280] + - Exact: [5888, 2368, 1, 256] + - Exact: [6784, 704, 1, 3328] + - Exact: [5888, 4288, 1, 128] + - Exact: [1408, 2944, 1, 3328] + - Exact: [3584, 704, 1, 128] + - Exact: [5056, 5056, 1, 128] + - Exact: [448, 5056, 1, 128] + - Exact: [1408, 5056, 1, 128] + - Exact: [2944, 3584, 1, 128] + - Exact: [3584, 2368, 1, 256] + - Exact: [5888, 5056, 1, 1280] + - Exact: [2368, 5056, 1, 128] + - Exact: [3584, 3584, 1, 3328] + - Exact: [5888, 6784, 1, 256] + - Exact: [4288, 2944, 1, 3328] + - Exact: [4288, 704, 1, 1280] + - Exact: [256, 5056, 1, 1280] + - Exact: [2944, 5888, 1, 3328] + - Exact: [6784, 5888, 1, 1280] + - Exact: [5888, 4288, 1, 1280] + - Exact: [5888, 3584, 1, 128] + - Exact: [1856, 1856, 1, 128] + - Exact: [3584, 1024, 1, 3328] + - Exact: [704, 3584, 1, 128] + - Exact: [5888, 448, 1, 3328] + - Exact: [2368, 4288, 1, 1280] + - Exact: [4288, 2944, 1, 128] + - Exact: [1024, 6784, 1, 3328] + - Exact: [5056, 2944, 1, 3328] + - Exact: [2944, 3584, 1, 256] + - Exact: [1408, 1408, 1, 3328] + - Exact: [3584, 3584, 1, 128] + - Exact: [3584, 704, 1, 256] + - Exact: [3584, 1408, 1, 3328] + - Exact: [704, 3584, 1, 1280] + - Exact: [2944, 6784, 1, 1280] + - Exact: [1856, 6784, 1, 256] + - Exact: [4288, 448, 1, 3328] + - Exact: [6784, 4288, 1, 128] + - Exact: [6784, 704, 1, 1280] + - Exact: [5888, 1024, 1, 3328] + - Exact: [704, 6784, 1, 1280] + - Exact: [1856, 5056, 1, 3328] + - Exact: [1024, 3584, 1, 128] + - Exact: [1024, 1408, 1, 128] + - Exact: [2368, 2944, 1, 128] + - Exact: [5056, 2944, 1, 128] + - Exact: [5888, 5056, 1, 3328] + - Exact: [1408, 2368, 1, 128] + - Exact: [5888, 2368, 1, 128] + - Exact: [3584, 6784, 1, 1280] + - Exact: [1856, 5888, 1, 256] + - Exact: [4288, 4288, 1, 3328] + - Exact: [4288, 1408, 1, 1280] + - Exact: [3584, 5056, 1, 128] + - Exact: [4288, 2368, 1, 256] + - Exact: [2944, 5056, 1, 1280] + - Exact: [448, 6784, 1, 256] + - Exact: 
[6784, 2368, 1, 3328] + - Exact: [4288, 1856, 1, 3328] + - Exact: [3584, 448, 1, 128] + - Exact: [3584, 1024, 1, 1280] + - Exact: [1856, 5056, 1, 256] + - Exact: [1024, 4288, 1, 256] + - Exact: [5888, 3584, 1, 3328] + - Exact: [5056, 3584, 1, 3328] + - Exact: [2368, 1408, 1, 1280] + - Exact: [5056, 2944, 1, 1280] + - Exact: [1024, 6784, 1, 256] + - Exact: [2944, 1408, 1, 128] + - Exact: [5056, 6784, 1, 3328] + - Exact: [3584, 4288, 1, 256] + - Exact: [1856, 6784, 1, 3328] + - Exact: [5888, 4288, 1, 256] + - Exact: [5056, 1408, 1, 256] + - Exact: [3584, 1024, 1, 256] + - Exact: [5888, 5888, 1, 256] + - Exact: [4288, 1024, 1, 1280] + - Exact: [448, 6784, 1, 3328] + - Exact: [2944, 1408, 1, 1280] + - Exact: [2944, 1856, 1, 3328] + - Exact: [2944, 2944, 1, 128] + - Exact: [3584, 5888, 1, 1280] + - Exact: [6784, 1856, 1, 1280] + - Exact: [2944, 5056, 1, 256] + - Exact: [5888, 256, 1, 3328] + - Exact: [1856, 5888, 1, 3328] + - Exact: [3584, 1408, 1, 256] + - Exact: [704, 3584, 1, 3328] + - Exact: [5056, 448, 1, 1280] + - Exact: [3584, 1856, 1, 3328] + - Exact: [2944, 1024, 1, 256] + - Exact: [1024, 2368, 1, 128] + - Exact: [2368, 4288, 1, 3328] + - Exact: [1024, 1408, 1, 1280] + - Exact: [6784, 5056, 1, 256] + - Exact: [448, 6784, 1, 128] + - Exact: [2944, 6784, 1, 256] + - Exact: [2368, 2368, 1, 1280] + - Exact: [1856, 3584, 1, 1280] + - Exact: [3584, 1408, 1, 1280] + - Exact: [4288, 448, 1, 128] + - Exact: [5056, 256, 1, 1280] + - Exact: [1856, 1408, 1, 3328] + - Exact: [1024, 4288, 1, 3328] + - Exact: [5056, 448, 1, 256] + - Exact: [2944, 2368, 1, 3328] + - Exact: [1024, 1856, 1, 1280] + - Exact: [6784, 1856, 1, 256] + - Exact: [1024, 5888, 1, 256] + - Exact: [1408, 2368, 1, 256] + - Exact: [1408, 1408, 1, 256] + - Exact: [2368, 2368, 1, 128] + - Exact: [6784, 1408, 1, 128] + - Exact: [4288, 5888, 1, 256] + - Exact: [1408, 5056, 1, 256] + - Exact: [4288, 3584, 1, 128] + - Exact: [3584, 5056, 1, 1280] + - Exact: [1856, 1024, 1, 128] + - Exact: [704, 4288, 1, 256] + - 
Exact: [5888, 2368, 1, 1280] + - Exact: [2368, 5888, 1, 1280] + - Exact: [5888, 256, 1, 1280] + - Exact: [2368, 1856, 1, 3328] + - Exact: [2944, 704, 1, 256] + - Exact: [704, 3584, 1, 256] + - Exact: [704, 2944, 1, 3328] + - Exact: [6784, 1024, 1, 128] + - Exact: [2944, 1024, 1, 3328] + - Exact: [2944, 5056, 1, 128] + - Exact: [1408, 6784, 1, 256] + - Exact: [6784, 1408, 1, 3328] + - Exact: [4288, 6784, 1, 128] + - Exact: [6784, 2944, 1, 1280] + - Exact: [4288, 1856, 1, 128] + - Exact: [1856, 2944, 1, 128] + - Exact: [6784, 448, 1, 128] + - Exact: [448, 5056, 1, 1280] + - Exact: [2368, 1856, 1, 128] + - Exact: [4288, 704, 1, 256] + - Exact: [5888, 704, 1, 256] + - Exact: [3584, 1024, 1, 128] + - Exact: [256, 5888, 1, 3328] + - Exact: [1408, 4288, 1, 3328] + - Exact: [6784, 4288, 1, 256] + - Exact: [5888, 256, 1, 256] + - Exact: [6784, 1024, 1, 1280] + - Exact: [5888, 1024, 1, 128] + - Exact: [2944, 704, 1, 1280] + - Exact: [6784, 3584, 1, 1280] + - Exact: [1024, 6784, 1, 1280] + - Exact: [1408, 2944, 1, 1280] + - Exact: [1408, 2368, 1, 3328] + - Exact: [2944, 1856, 1, 128] + - Exact: [256, 6784, 1, 128] + - Exact: [5056, 6784, 1, 128] + - Exact: [4288, 5056, 1, 128] + - Exact: [1856, 5888, 1, 128] + - Exact: [2944, 5888, 1, 256] + - Exact: [3584, 1856, 1, 256] + - Exact: [4288, 3584, 1, 1280] + - Exact: [704, 4288, 1, 3328] + - Exact: [704, 5888, 1, 128] + - Exact: [6784, 3584, 1, 128] + - Exact: [4288, 5056, 1, 3328] + - Exact: [1408, 1408, 1, 128] + - Exact: [5056, 2368, 1, 256] + - Exact: [4288, 704, 1, 3328] + - Exact: [448, 3584, 1, 256] + - Exact: [2368, 1024, 1, 1280] + - Exact: [2944, 1408, 1, 3328] + - Exact: [1024, 1408, 1, 3328] + - Exact: [2944, 5888, 1, 1280] + - Exact: [5888, 3584, 1, 256] + - Exact: [1408, 1856, 1, 3328] + - Exact: [6784, 1408, 1, 1280] + - Exact: [704, 2944, 1, 256] + - Exact: [704, 4288, 1, 128] + - Exact: [2368, 4288, 1, 128] + - Exact: [1024, 6784, 1, 128] + - Exact: [1408, 1408, 1, 1280] + - Exact: [448, 4288, 1, 3328] + - 
Exact: [2368, 1408, 1, 256] + - Exact: [5888, 5056, 1, 128] + - Exact: [704, 2368, 1, 256] + - Exact: [5888, 2368, 1, 3328] + - Exact: [4288, 448, 1, 1280] + - Exact: [5888, 704, 1, 3328] + - Exact: [5056, 256, 1, 128] + - Exact: [1408, 5888, 1, 128] + - Exact: [1408, 1024, 1, 256] + - Exact: [1024, 1856, 1, 128] + - Exact: [5056, 6784, 1, 1280] + - Exact: [704, 5056, 1, 3328] + - Exact: [3584, 5056, 1, 3328] + - Exact: [2368, 2944, 1, 3328] + - Exact: [2368, 3584, 1, 256] + - Exact: [5056, 3584, 1, 1280] + - Exact: [1856, 2944, 1, 1280] + - Exact: [3584, 2368, 1, 1280] + - Exact: [2944, 1408, 1, 256] + - Exact: [4288, 1408, 1, 3328] + - Exact: [2944, 1024, 1, 128] + - Exact: [4288, 5056, 1, 1280] + - Exact: [5888, 6784, 1, 1280] + - Exact: [6784, 5056, 1, 128] + - Exact: [5888, 1408, 1, 3328] + - Exact: [256, 5056, 1, 256] + - Exact: [448, 3584, 1, 3328] + - Exact: [704, 2368, 1, 128] + - Exact: [5888, 256, 1, 128] + - Exact: [3584, 1856, 1, 128] + - Exact: [4288, 4288, 1, 128] + - Exact: [1856, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 256] + - Exact: [2368, 1408, 1, 3328] + - Exact: [5888, 448, 1, 256] + - Exact: [5888, 6784, 1, 128] + - Exact: [6784, 5056, 1, 1280] + - Exact: [5056, 704, 1, 1280] + - Exact: [4288, 6784, 1, 1280] + - Exact: [6784, 1408, 1, 256] + - Exact: [3584, 5888, 1, 128] + - Exact: [5056, 5888, 1, 256] + - Exact: [2368, 1024, 1, 256] + - Exact: [2944, 1856, 1, 256] + - Exact: [1856, 6784, 1, 1280] + - Exact: [4288, 3584, 1, 256] + - Exact: [5056, 1856, 1, 1280] + - Exact: [1408, 1024, 1, 3328] + - Exact: [5888, 3584, 1, 1280] + - Exact: [1856, 3584, 1, 3328] + - Exact: [1024, 2944, 1, 256] + - Exact: [448, 6784, 1, 1280] + - Exact: [704, 5056, 1, 256] + - Exact: [2944, 1856, 1, 1280] + - Exact: [5056, 256, 1, 256] + - Exact: [2368, 3584, 1, 3328] + - Exact: [3584, 5888, 1, 3328] + - Exact: [2944, 3584, 1, 1280] + - Exact: [1856, 5888, 1, 1280] + - Exact: [5056, 448, 1, 3328] + - Exact: [4288, 1408, 1, 256] + - Exact: [5888, 1408, 1, 128] + 
- Exact: [4288, 2368, 1, 1280] + - Exact: [6784, 2368, 1, 256] + - Exact: [4288, 1856, 1, 256] + - Exact: [1856, 2944, 1, 256] + - Exact: [5056, 1024, 1, 128] + - Exact: [6784, 256, 1, 128] + - Exact: [5888, 704, 1, 128] + - Exact: [1024, 4288, 1, 1280] + - Exact: [2368, 5056, 1, 3328] + - Exact: [4288, 1024, 1, 3328] + - Exact: [1024, 5056, 1, 3328] + - Exact: [1024, 1856, 1, 3328] + - Exact: [704, 6784, 1, 128] + - Exact: [4288, 6784, 1, 256] + - Exact: [3584, 2944, 1, 3328] + - Exact: [5888, 2944, 1, 256] + - Exact: [2368, 6784, 1, 128] + - Exact: [448, 4288, 1, 1280] + - Exact: [5056, 4288, 1, 256] + - Exact: [1024, 3584, 1, 256] + - Exact: [1856, 5056, 1, 128] + - Exact: [6784, 6784, 1, 3328] + - Exact: [448, 5888, 1, 1280] + - Exact: [5056, 448, 1, 128] + - Exact: [3584, 2944, 1, 128] + - Exact: [6784, 256, 1, 1280] + - Exact: [2368, 5888, 1, 3328] + - Exact: [2368, 1856, 1, 1280] + - Exact: [3584, 4288, 1, 128] + - Exact: [5888, 4288, 1, 3328] + - Exact: [2368, 704, 1, 256] + - Exact: [3584, 1408, 1, 128] + - Exact: [1856, 5056, 1, 1280] + - Exact: [2944, 1024, 1, 1280] + - Exact: [2368, 4288, 1, 256] + - Exact: [1024, 2368, 1, 3328] + - Exact: [6784, 1856, 1, 3328] + - Exact: [1024, 2944, 1, 128] + - Exact: [1024, 3584, 1, 1280] + - Exact: [4288, 5888, 1, 3328] + - Exact: [1024, 2944, 1, 3328] + - Exact: [3584, 6784, 1, 256] + - Exact: [256, 6784, 1, 1280] + - Exact: [1856, 3584, 1, 256] + - Exact: [6784, 1856, 1, 128] + - Exact: [2944, 704, 1, 128] + - Exact: [256, 5888, 1, 1280] + - Exact: [4288, 6784, 1, 3328] + - Exact: [2368, 1408, 1, 128] + - Exact: [1408, 1024, 1, 128] + - Exact: [6784, 3584, 1, 3328] + - Exact: [2368, 5056, 1, 1280] + - Exact: [1408, 2368, 1, 1280] + - Exact: [2944, 4288, 1, 128] + - Exact: [2944, 2944, 1, 256] + - Exact: [6784, 256, 1, 256] + - Exact: [256, 5056, 1, 3328] + - Exact: [5056, 1856, 1, 128] + - Exact: [5888, 1408, 1, 256] + - Exact: [4288, 3584, 1, 3328] + - Exact: [1024, 2368, 1, 1280] + - Exact: [5888, 6784, 1, 3328] 
+ - Exact: [704, 4288, 1, 1280] + - Exact: [6784, 448, 1, 3328] + - Exact: [4288, 1024, 1, 128] + - Exact: [1920, 2048, 1, 2048] + - Exact: [2880, 3072, 1, 3072] + - Exact: [3840, 4096, 1, 4096] + - Exact: [7680, 8192, 1, 8192] + - Exact: [2048, 2048, 1, 2048] + - Exact: [3072, 3072, 1, 3072] + - Exact: [4096, 4096, 1, 4096] + - Exact: [8192, 8192, 1, 8192] + - Exact: [1152, 1152, 1, 1152] + - Exact: [1536, 1536, 1, 1536] + - Exact: [1920, 1920, 1, 1920] + - Exact: [2304, 2304, 1, 2304] + - Exact: [2688, 2688, 1, 2688] + - Exact: [3456, 3456, 1, 3456] + - Exact: [3840, 3840, 1, 3840] + - Exact: [4224, 4224, 1, 4224] + - Exact: [4608, 4608, 1, 4608] + - Exact: [4992, 4992, 1, 4992] + - Exact: [5376, 5376, 1, 5376] + - Exact: [5760, 5760, 1, 5760] + - Exact: [6144, 6144, 1, 6144] + - Exact: [6528, 6528, 1, 6528] + - Exact: [6912, 6912, 1, 6912] + - Exact: [7296, 7296, 1, 7296] + - Exact: [7680, 7680, 1, 7680] + +# bodys midSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [4] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1856, 448, 1, 3328] + - Exact: [128, 6784, 1, 3328] + - Exact: [2368, 448, 1, 128] + - Exact: [256, 4288, 1, 3328] + - Exact: [704, 1856, 1, 3328] + - Exact: [448, 1024, 1, 1280] + - Exact: [256, 1408, 1, 3328] + - Exact: [704, 1856, 1, 1280] + - Exact: [128, 5056, 1, 128] + - Exact: [2368, 128, 1, 256] + - 
Exact: [64, 5056, 1, 256] + - Exact: [256, 2944, 1, 256] + - Exact: [256, 1856, 1, 1280] + - Exact: [128, 3584, 1, 1280] + - Exact: [4288, 256, 1, 256] + - Exact: [2944, 128, 1, 128] + - Exact: [5888, 64, 1, 3328] + - Exact: [2944, 256, 1, 3328] + - Exact: [1408, 448, 1, 1280] + - Exact: [1408, 704, 1, 3328] + - Exact: [6784, 64, 1, 256] + - Exact: [2944, 256, 1, 256] + - Exact: [704, 1408, 1, 3328] + - Exact: [2944, 256, 1, 128] + - Exact: [448, 2944, 1, 128] + - Exact: [2368, 128, 1, 3328] + - Exact: [2944, 128, 1, 256] + - Exact: [448, 1408, 1, 256] + - Exact: [64, 5056, 1, 3328] + - Exact: [1024, 448, 1, 128] + - Exact: [256, 3584, 1, 3328] + - Exact: [5056, 64, 1, 1280] + - Exact: [1024, 704, 1, 256] + - Exact: [128, 4288, 1, 128] + - Exact: [3584, 256, 1, 128] + - Exact: [4288, 128, 1, 1280] + - Exact: [5888, 64, 1, 256] + - Exact: [1856, 256, 1, 1280] + - Exact: [64, 5888, 1, 3328] + - Exact: [704, 1024, 1, 1280] + - Exact: [448, 1856, 1, 128] + - Exact: [1024, 704, 1, 1280] + - Exact: [128, 5888, 1, 256] + - Exact: [704, 704, 1, 3328] + - Exact: [704, 1408, 1, 1280] + - Exact: [3584, 256, 1, 3328] + - Exact: [704, 1856, 1, 128] + - Exact: [128, 3584, 1, 3328] + - Exact: [128, 2944, 1, 1280] + - Exact: [3584, 128, 1, 256] + - Exact: [448, 1408, 1, 3328] + - Exact: [256, 3584, 1, 256] + - Exact: [256, 2944, 1, 3328] + - Exact: [448, 2368, 1, 128] + - Exact: [1408, 704, 1, 256] + - Exact: [448, 2944, 1, 3328] + - Exact: [64, 5888, 1, 256] + - Exact: [6784, 128, 1, 3328] + - Exact: [704, 704, 1, 256] + - Exact: [128, 4288, 1, 3328] + - Exact: [448, 704, 1, 1280] + - Exact: [1024, 448, 1, 3328] + - Exact: [1856, 704, 1, 1280] + - Exact: [448, 1408, 1, 1280] + - Exact: [1024, 1024, 1, 1280] + - Exact: [448, 1024, 1, 128] + - Exact: [448, 2368, 1, 3328] + - Exact: [5056, 64, 1, 128] + - Exact: [704, 1024, 1, 256] + - Exact: [128, 6784, 1, 1280] + - Exact: [1856, 256, 1, 256] + - Exact: [256, 4288, 1, 1280] + - Exact: [256, 1856, 1, 128] + - Exact: [448, 1408, 1, 
128] + - Exact: [6784, 128, 1, 256] + - Exact: [704, 448, 1, 256] + - Exact: [704, 1408, 1, 128] + - Exact: [2944, 448, 1, 128] + - Exact: [128, 2944, 1, 128] + - Exact: [1024, 704, 1, 3328] + - Exact: [128, 4288, 1, 256] + - Exact: [704, 448, 1, 3328] + - Exact: [1024, 1024, 1, 3328] + - Exact: [448, 2368, 1, 1280] + - Exact: [64, 6784, 1, 3328] + - Exact: [2944, 256, 1, 1280] + - Exact: [256, 2368, 1, 128] + - Exact: [1856, 704, 1, 256] + - Exact: [1408, 448, 1, 3328] + - Exact: [2368, 256, 1, 256] + - Exact: [1856, 448, 1, 1280] + - Exact: [128, 5888, 1, 128] + - Exact: [1024, 1024, 1, 256] + - Exact: [704, 1856, 1, 256] + - Exact: [64, 6784, 1, 256] + - Exact: [256, 2368, 1, 1280] + - Exact: [2944, 448, 1, 256] + - Exact: [1856, 448, 1, 128] + - Exact: [2368, 128, 1, 1280] + - Exact: [2368, 256, 1, 128] + - Exact: [64, 5056, 1, 1280] + - Exact: [2368, 256, 1, 1280] + - Exact: [2368, 448, 1, 1280] + - Exact: [128, 3584, 1, 256] + - Exact: [704, 448, 1, 1280] + - Exact: [128, 5056, 1, 256] + - Exact: [4288, 256, 1, 1280] + - Exact: [4288, 128, 1, 3328] + - Exact: [1408, 256, 1, 128] + - Exact: [256, 1408, 1, 1280] + - Exact: [128, 2368, 1, 256] + - Exact: [6784, 64, 1, 3328] + - Exact: [128, 2944, 1, 3328] + - Exact: [2944, 448, 1, 3328] + - Exact: [256, 4288, 1, 256] + - Exact: [5888, 128, 1, 256] + - Exact: [2368, 448, 1, 3328] + - Exact: [5056, 64, 1, 256] + - Exact: [1024, 704, 1, 128] + - Exact: [128, 5056, 1, 3328] + - Exact: [704, 1024, 1, 128] + - Exact: [4288, 128, 1, 256] + - Exact: [1408, 448, 1, 128] + - Exact: [128, 5888, 1, 1280] + - Exact: [704, 448, 1, 128] + - Exact: [3584, 256, 1, 256] + - Exact: [128, 2944, 1, 256] + - Exact: [128, 6784, 1, 128] + - Exact: [448, 1856, 1, 256] + - Exact: [3584, 128, 1, 3328] + - Exact: [1024, 448, 1, 1280] + - Exact: [5888, 128, 1, 3328] + - Exact: [1408, 704, 1, 1280] + - Exact: [448, 2944, 1, 256] + - Exact: [448, 2368, 1, 256] + - Exact: [128, 2368, 1, 3328] + - Exact: [5056, 128, 1, 1280] + - Exact: [5056, 
64, 1, 3328] + - Exact: [64, 5888, 1, 128] + - Exact: [5056, 128, 1, 3328] + - Exact: [448, 704, 1, 256] + - Exact: [2944, 128, 1, 3328] + - Exact: [128, 5056, 1, 1280] + - Exact: [704, 704, 1, 128] + - Exact: [64, 6784, 1, 1280] + - Exact: [2368, 128, 1, 128] + - Exact: [5056, 128, 1, 128] + - Exact: [1024, 1024, 1, 1024] + - Exact: [448, 1024, 1, 3328] + - Exact: [256, 2368, 1, 3328] + - Exact: [256, 3584, 1, 128] + - Exact: [4288, 256, 1, 128] + - Exact: [256, 1856, 1, 256] + - Exact: [256, 2944, 1, 128] + - Exact: [1408, 256, 1, 3328] + - Exact: [2368, 448, 1, 256] + - Exact: [4288, 256, 1, 3328] + - Exact: [1856, 704, 1, 128] + - Exact: [4288, 128, 1, 128] + - Exact: [1408, 448, 1, 256] + - Exact: [6784, 64, 1, 1280] + - Exact: [3584, 128, 1, 128] + - Exact: [256, 2368, 1, 256] + - Exact: [2944, 448, 1, 1280] + - Exact: [448, 1856, 1, 1280] + - Exact: [1856, 256, 1, 128] + - Exact: [5056, 128, 1, 256] + - Exact: [448, 1024, 1, 256] + - Exact: [64, 6784, 1, 128] + - Exact: [5888, 64, 1, 1280] + - Exact: [128, 3584, 1, 128] + - Exact: [1408, 256, 1, 256] + - Exact: [128, 5888, 1, 3328] + - Exact: [1408, 256, 1, 1280] + - Exact: [1024, 1024, 1, 128] + - Exact: [64, 5056, 1, 128] + - Exact: [5888, 64, 1, 128] + - Exact: [448, 704, 1, 128] + - Exact: [1408, 704, 1, 128] + - Exact: [2368, 256, 1, 3328] + - Exact: [5888, 128, 1, 1280] + - Exact: [256, 3584, 1, 1280] + - Exact: [256, 1408, 1, 128] + - Exact: [256, 4288, 1, 128] + - Exact: [5888, 128, 1, 128] + - Exact: [1856, 256, 1, 3328] + - Exact: [64, 5888, 1, 1280] + - Exact: [6784, 64, 1, 128] + - Exact: [704, 704, 1, 1280] + - Exact: [128, 2368, 1, 1280] + - Exact: [3584, 256, 1, 1280] + - Exact: [3584, 128, 1, 1280] + - Exact: [448, 1856, 1, 3328] + - Exact: [1024, 448, 1, 256] + - Exact: [2944, 128, 1, 1280] + - Exact: [128, 2368, 1, 128] + - Exact: [256, 2944, 1, 1280] + - Exact: [704, 1024, 1, 3328] + - Exact: [128, 6784, 1, 256] + - Exact: [256, 1856, 1, 3328] + - Exact: [6784, 128, 1, 128] + - Exact: 
[704, 1408, 1, 256] + - Exact: [256, 1408, 1, 256] + - Exact: [448, 2944, 1, 1280] + - Exact: [6784, 128, 1, 1280] + - Exact: [1856, 448, 1, 256] + - Exact: [128, 4288, 1, 1280] + - Exact: [448, 704, 1, 3328] + - Exact: [1856, 704, 1, 3328] + - Exact: [960, 1024, 1, 1024] + - Exact: [768, 768, 1, 768] + +# bodys smaSize + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 128, 1, 128] + - Exact: [2368, 64, 1, 3328] + - Exact: [1408, 64, 1, 128] + - Exact: [1408, 64, 1, 1280] + - Exact: [2944, 64, 1, 256] + - Exact: [1024, 256, 1, 3328] + - Exact: [1856, 64, 1, 1280] + - Exact: [704, 128, 1, 1280] + - Exact: [4288, 64, 1, 3328] + - Exact: [4288, 64, 1, 256] + - Exact: [64, 3584, 1, 3328] + - Exact: [704, 256, 1, 128] + - Exact: [128, 1408, 1, 128] + - Exact: [4288, 64, 1, 1280] + - Exact: [1024, 256, 1, 256] + - Exact: [448, 448, 1, 256] + - Exact: [128, 1024, 1, 3328] + - Exact: [64, 1856, 1, 1280] + - Exact: [256, 1024, 1, 256] + - Exact: [1024, 128, 1, 1280] + - Exact: [448, 256, 1, 3328] + - Exact: [128, 1024, 1, 128] + - Exact: [128, 704, 1, 1280] + - Exact: [1856, 128, 1, 3328] + - Exact: [64, 2944, 1, 128] + - Exact: [448, 448, 1, 3328] + - Exact: [1408, 128, 1, 1280] + - Exact: [128, 1856, 1, 1280] + - Exact: [256, 448, 1, 256] + - Exact: [128, 1856, 1, 128] + - Exact: [64, 1408, 1, 
3328] + - Exact: [128, 1408, 1, 256] + - Exact: [4288, 64, 1, 128] + - Exact: [256, 448, 1, 3328] + - Exact: [64, 2368, 1, 1280] + - Exact: [2368, 64, 1, 256] + - Exact: [1408, 128, 1, 128] + - Exact: [1024, 256, 1, 128] + - Exact: [2944, 64, 1, 128] + - Exact: [1856, 64, 1, 256] + - Exact: [704, 128, 1, 256] + - Exact: [448, 256, 1, 1280] + - Exact: [1856, 128, 1, 1280] + - Exact: [64, 3584, 1, 256] + - Exact: [3584, 64, 1, 128] + - Exact: [256, 1024, 1, 1280] + - Exact: [3584, 64, 1, 1280] + - Exact: [128, 1856, 1, 3328] + - Exact: [64, 2944, 1, 3328] + - Exact: [64, 4288, 1, 3328] + - Exact: [64, 1856, 1, 256] + - Exact: [256, 704, 1, 256] + - Exact: [2368, 64, 1, 128] + - Exact: [64, 1408, 1, 128] + - Exact: [704, 256, 1, 3328] + - Exact: [64, 2944, 1, 256] + - Exact: [448, 256, 1, 128] + - Exact: [704, 128, 1, 3328] + - Exact: [128, 704, 1, 128] + - Exact: [256, 448, 1, 1280] + - Exact: [704, 256, 1, 1280] + - Exact: [64, 2368, 1, 3328] + - Exact: [1856, 64, 1, 128] + - Exact: [704, 128, 1, 128] + - Exact: [256, 704, 1, 3328] + - Exact: [256, 448, 1, 128] + - Exact: [64, 3584, 1, 128] + - Exact: [1024, 128, 1, 256] + - Exact: [2944, 64, 1, 1280] + - Exact: [128, 1408, 1, 3328] + - Exact: [1408, 64, 1, 256] + - Exact: [64, 1856, 1, 128] + - Exact: [64, 2368, 1, 256] + - Exact: [1024, 128, 1, 3328] + - Exact: [1856, 128, 1, 128] + - Exact: [2368, 64, 1, 1280] + - Exact: [128, 1024, 1, 1280] + - Exact: [64, 4288, 1, 1280] + - Exact: [1408, 64, 1, 3328] + - Exact: [64, 2944, 1, 1280] + - Exact: [256, 704, 1, 128] + - Exact: [256, 1024, 1, 128] + - Exact: [64, 1408, 1, 1280] + - Exact: [448, 448, 1, 1280] + - Exact: [1024, 256, 1, 1280] + - Exact: [128, 1024, 1, 256] + - Exact: [3584, 64, 1, 3328] + - Exact: [1408, 128, 1, 256] + - Exact: [256, 1024, 1, 3328] + - Exact: [1856, 64, 1, 3328] + - Exact: [448, 256, 1, 256] + - Exact: [128, 704, 1, 256] + - Exact: [64, 3584, 1, 1280] + - Exact: [3584, 64, 1, 256] + - Exact: [64, 1856, 1, 3328] + - Exact: [1408, 128, 1, 
3328] + - Exact: [128, 704, 1, 3328] + - Exact: [128, 1856, 1, 256] + - Exact: [64, 4288, 1, 256] + - Exact: [256, 704, 1, 1280] + - Exact: [64, 2368, 1, 128] + - Exact: [64, 4288, 1, 128] + - Exact: [1856, 128, 1, 256] + - Exact: [64, 1408, 1, 256] + - Exact: [2944, 64, 1, 3328] + - Exact: [128, 1408, 1, 1280] + - Exact: [448, 448, 1, 128] + - Exact: [704, 256, 1, 256] + - Exact: [512, 512, 1, 512] + - Exact: [384, 384, 1, 384] + +# bodys bigM + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 1 ] + - [ 4, 2 ] + - WorkGroup: + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 32, 4, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [3584, 4, 1, 1280] + - Exact: [2944, 4, 1, 256] + - Exact: [2368, 4, 1, 1280] + - Exact: [6784, 4, 1, 1280] + - Exact: [1856, 4, 1, 1280] + - Exact: [2944, 4, 1, 128] + - Exact: [3584, 4, 1, 128] + - Exact: [4288, 4, 1, 256] + - Exact: [3584, 4, 1, 3328] + - Exact: [5888, 4, 1, 128] + - Exact: [2368, 4, 1, 256] + - Exact: [1408, 4, 1, 256] + - Exact: [5056, 4, 1, 1280] + - Exact: [1408, 4, 1, 3328] + - Exact: [6784, 4, 1, 128] + - Exact: [5888, 4, 1, 3328] + - Exact: [5056, 4, 1, 128] + - Exact: [5888, 4, 1, 1280] + - Exact: [2944, 4, 1, 3328] + - Exact: [2368, 4, 1, 128] + - Exact: [1856, 4, 1, 128] + - Exact: [1408, 4, 1, 1280] + - Exact: [6784, 4, 1, 256] + - Exact: [4288, 4, 1, 128] + - Exact: [1856, 4, 1, 3328] + - Exact: [3584, 4, 1, 256] + - Exact: [2368, 4, 1, 3328] + - Exact: [6784, 4, 
1, 3328] + - Exact: [4288, 4, 1, 1280] + - Exact: [1856, 4, 1, 256] + - Exact: [1408, 4, 1, 128] + - Exact: [5056, 4, 1, 256] + - Exact: [4288, 4, 1, 3328] + - Exact: [2944, 4, 1, 1280] + - Exact: [5888, 4, 1, 256] + - Exact: [5056, 4, 1, 3328] + +# bodys bigN + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 1, 4 ] + - [ 2, 2 ] + - [ 2, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 1856, 1, 3328] + - Exact: [4, 2944, 1, 1280] + - Exact: [4, 1408, 1, 128] + - Exact: [4, 2368, 1, 1280] + - Exact: [4, 3584, 1, 128] + - Exact: [4, 5888, 1, 3328] + - Exact: [4, 1408, 1, 3328] + - Exact: [4, 6784, 1, 3328] + - Exact: [4, 4288, 1, 128] + - Exact: [4, 5056, 1, 3328] + - Exact: [4, 6784, 1, 1280] + - Exact: [4, 2944, 1, 3328] + - Exact: [4, 5056, 1, 256] + - Exact: [4, 5056, 1, 1280] + - Exact: [4, 2368, 1, 3328] + - Exact: [4, 1856, 1, 256] + - Exact: [4, 2368, 1, 256] + - Exact: [4, 2944, 1, 256] + - Exact: [4, 4288, 1, 1280] + - Exact: [4, 6784, 1, 128] + - Exact: [4, 3584, 1, 1280] + - Exact: [4, 5888, 1, 256] + - Exact: [4, 6784, 1, 256] + - Exact: [4, 1408, 1, 1280] + - Exact: [4, 3584, 1, 256] + - Exact: [4, 1408, 1, 256] + - Exact: [4, 4288, 1, 3328] + - Exact: [4, 5888, 1, 1280] + - Exact: [4, 1856, 1, 1280] + - Exact: [4, 1856, 1, 128] + - Exact: [4, 2944, 1, 128] + - Exact: [4, 5056, 1, 128] + - Exact: [4, 4288, 1, 256] + - Exact: [4, 3584, 1, 3328] 
+ - Exact: [4, 5888, 1, 128] + - Exact: [4, 2368, 1, 128] + +# bodys other + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - WavefrontSize: [32] # , 64] + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [ 8, 16, 32 ] + - VectorWidth: [1] + - GlobalSplitU: [1] + - StaggerUMapping: [3] + - StaggerUStride: [128] + - StaggerU: [0, 32] + - WorkGroupMapping: [1,4,8] + - ExpandPointerSwap: [True] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [4, 704, 1, 1280] + - Exact: [128, 64, 1, 256] + - Exact: [64, 4, 1, 256] + - Exact: [64, 704, 1, 128] + - Exact: [448, 64, 1, 1280] + - Exact: [128, 4, 1, 1280] + - Exact: [64, 1024, 1, 1280] + - Exact: [64, 704, 1, 1280] + - Exact: [1024, 64, 1, 128] + - Exact: [64, 1024, 1, 3328] + - Exact: [1024, 64, 1, 1280] + - Exact: [4, 704, 1, 256] + - Exact: [704, 4, 1, 1280] + - Exact: [64, 448, 1, 256] + - Exact: [64, 1024, 1, 128] + - Exact: [4, 64, 1, 1280] + - Exact: [128, 256, 1, 3328] + - Exact: [64, 448, 1, 1280] + - Exact: [448, 4, 1, 256] + - Exact: [448, 4, 1, 1280] + - Exact: [128, 4, 1, 128] + - Exact: [256, 4, 1, 128] + - Exact: [704, 64, 1, 3328] + - Exact: [64, 128, 1, 256] + - Exact: [704, 64, 1, 128] + - Exact: [1024, 4, 1, 256] + - Exact: [256, 256, 1, 128] + - Exact: [64, 256, 1, 128] + - Exact: [704, 64, 1, 1280] + - Exact: [128, 448, 1, 256] + - Exact: [128, 256, 1, 1280] + - Exact: [448, 64, 1, 3328] + - Exact: [256, 128, 1, 128] + - Exact: [64, 128, 1, 3328] + - Exact: [128, 128, 1, 3328] + - Exact: [256, 128, 1, 256] + - Exact: [64, 448, 1, 3328] + - Exact: [256, 256, 1, 3328] + - Exact: [1024, 4, 1, 3328] + - Exact: [4, 4, 1, 256] + - Exact: [256, 64, 1, 
256] + - Exact: [256, 128, 1, 1280] + - Exact: [128, 64, 1, 1280] + - Exact: [4, 448, 1, 3328] + - Exact: [64, 1024, 1, 256] + - Exact: [256, 4, 1, 1280] + - Exact: [64, 704, 1, 256] + - Exact: [4, 704, 1, 128] + - Exact: [448, 128, 1, 256] + - Exact: [448, 64, 1, 128] + - Exact: [4, 448, 1, 1280] + - Exact: [256, 256, 1, 256] + - Exact: [256, 64, 1, 128] + - Exact: [4, 1024, 1, 3328] + - Exact: [704, 4, 1, 128] + - Exact: [256, 4, 1, 256] + - Exact: [256, 4, 1, 3328] + - Exact: [4, 256, 1, 256] + - Exact: [4, 4, 1, 128] + - Exact: [4, 128, 1, 256] + - Exact: [64, 64, 1, 1280] + - Exact: [448, 128, 1, 3328] + - Exact: [4, 448, 1, 128] + - Exact: [64, 256, 1, 1280] + - Exact: [4, 128, 1, 3328] + - Exact: [64, 4, 1, 128] + - Exact: [64, 64, 1, 256] + - Exact: [4, 704, 1, 3328] + - Exact: [4, 4, 1, 1280] + - Exact: [128, 128, 1, 128] + - Exact: [1024, 4, 1, 128] + - Exact: [64, 64, 1, 3328] + - Exact: [4, 64, 1, 128] + - Exact: [64, 128, 1, 1280] + - Exact: [128, 128, 1, 1280] + - Exact: [128, 256, 1, 256] + - Exact: [256, 64, 1, 1280] + - Exact: [1024, 4, 1, 1280] + - Exact: [704, 64, 1, 256] + - Exact: [128, 448, 1, 1280] + - Exact: [128, 64, 1, 3328] + - Exact: [448, 64, 1, 256] + - Exact: [4, 256, 1, 128] + - Exact: [1024, 64, 1, 256] + - Exact: [64, 128, 1, 128] + - Exact: [4, 4, 1, 3328] + - Exact: [4, 1024, 1, 1280] + - Exact: [704, 4, 1, 256] + - Exact: [128, 4, 1, 3328] + - Exact: [448, 4, 1, 3328] + - Exact: [704, 4, 1, 3328] + - Exact: [448, 128, 1, 1280] + - Exact: [1024, 64, 1, 3328] + - Exact: [4, 1024, 1, 128] + - Exact: [64, 256, 1, 3328] + - Exact: [448, 128, 1, 128] + - Exact: [128, 256, 1, 128] + - Exact: [128, 4, 1, 256] + - Exact: [256, 256, 1, 1280] + - Exact: [256, 128, 1, 3328] + - Exact: [448, 4, 1, 128] + - Exact: [4, 256, 1, 3328] + - Exact: [4, 128, 1, 128] + - Exact: [4, 256, 1, 1280] + - Exact: [64, 4, 1, 3328] + - Exact: [4, 64, 1, 3328] + - Exact: [4, 1024, 1, 256] + - Exact: [64, 256, 1, 256] + - Exact: [4, 64, 1, 256] + - Exact: [128, 
448, 1, 128] + - Exact: [64, 448, 1, 128] + - Exact: [64, 704, 1, 3328] + - Exact: [128, 448, 1, 3328] + - Exact: [4, 448, 1, 256] + - Exact: [4, 128, 1, 1280] + - Exact: [128, 64, 1, 128] + - Exact: [64, 64, 1, 128] + - Exact: [64, 4, 1, 1280] + - Exact: [256, 64, 1, 3328] + - Exact: [128, 128, 1, 256] + +# tail +LibraryLogic: + ScheduleName: "navi21" + DeviceNames: ["Device 73a2"] + ArchitectureName: "gfx1030" + +LibraryClient: diff --git a/Tensile/Contractions.py b/Tensile/Contractions.py index 9ce04bd30..2efe3dc2d 100644 --- a/Tensile/Contractions.py +++ b/Tensile/Contractions.py @@ -286,10 +286,10 @@ def FromOriginalKeyPair(cls, pair): return None if key == "AssertFree0ElementMultiple": - tag = "FreeSizeAMultiple" + tag = "Free0SizeMultiple" index = 0 elif key == "AssertFree1ElementMultiple": - tag = "FreeSizeBMultiple" + tag = "Free1SizeMultiple" index = 0 elif key == "AssertSummationElementMultiple": tag = "BoundSizeMultiple" diff --git a/Tensile/GenerateSummations.py b/Tensile/GenerateSummations.py index 41ff361cb..79ec169fb 100644 --- a/Tensile/GenerateSummations.py +++ b/Tensile/GenerateSummations.py @@ -1,6 +1,6 @@ ################################################################################ -# Copyright 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -94,14 +94,14 @@ def GenerateSummations(userArgs): finalPath = ensurePath(os.path.join(currentPath, "final")) localLogicPath = ensurePath(os.path.join(currentPath, "logic")) localLogicFilePath = os.path.join(localLogicPath, logicFileBaseName) - + # Here we read in two version of the logic the first one fills the solutions with - # defaults and modifies some of the parameters. The final logic file should be the + # defaults and modifies some of the parameters. 
The final logic file should be the # same as the initial logic with the summation model added. To preseve the original # logic we also read in the raw unaltered version of the logic and stage the content # to write the final logic. - logic = LibraryIO.readLibraryLogicForSchedule(logicFileName) - rawLogic = LibraryIO.readRawLibraryLogic(logicFileName) + logic = LibraryIO.parseLibraryLogicFile(logicFileName) + rawLogic = LibraryIO.rawLibraryLogic(logicFileName) # If we cannot read the logic file then skip it if rawLogic == None or logic == None: @@ -126,7 +126,7 @@ def GenerateSummations(userArgs): exactList.append(e) libraryPath = libPath - clientBuildDir = os.path.join(outputPath, "client") + clientBuildDir = os.path.join(outputPath, "client") problemTypeObj = problemType.state problemSizes = ProblemSizes(problemTypeObj, exactList) @@ -162,7 +162,7 @@ def GenerateSummations(userArgs): solutionIndex = 0 for s_stateR, kernelName in zip(solutionStatesR, libSolutionNames): solutionIndex += 1 - perf_raw = working_data[kernelName] + perf_raw = working_data[kernelName] perf = (1000*index_keys) / perf_raw model = np.polyfit(x=index_keys, y=perf, deg=1) slope = model[0].item() @@ -182,7 +182,7 @@ def GenerateSummations(userArgs): rawLogicData.append(deepcopy(rangeLogicR)) for idx in range(0, len(otherFieldsR)): rawLogicData.append(deepcopy(otherFieldsR[idx])) - + localFinalLogic = os.path.join(finalPath, logicFileBaseName) yamlWriter = LibraryIO.YAMLWriter() diff --git a/Tensile/KernelWriter.py b/Tensile/KernelWriter.py index 924200788..cefb1a62d 100644 --- a/Tensile/KernelWriter.py +++ b/Tensile/KernelWriter.py @@ -617,8 +617,8 @@ def makeSubIterSchedule(self, kernel, localReadCode, iteration, pointerLWCode, p # calculate the data index of this mfma used for A and B # if i // kernel["MIWaveTile"][0]==0, mfma will use new A (need to take iu into account) # if i % kernel["MIWaveTile"][0]==0, mfma will use new B - packAIdx += instPerPack if 
i//(kernel["MIWaveTile"][0]+kernel["MIWaveTile"][0]*kernel["MIWaveTile"][1]*(i//(kernel["MIWaveTile"][0]*kernel["MIWaveTile"][1]))) == 0 else 0 - packBIdx += instPerPack if i % kernel["MIWaveTile"][0] == 0 else 0 + packAIdx += instPerPack if i//(kernel["MIWaveTileA"]+kernel["MIWaveTileA"]*kernel["MIWaveTileB"]*(i//(kernel["MIWaveTileA"]*kernel["MIWaveTileB"]))) == 0 else 0 + packBIdx += instPerPack if i % kernel["MIWaveTileA"] == 0 else 0 # blockWidth < 1, means 0.5 or 0.25 (BF,H,Int8) packAIdx = packAIdx if self.tPA["localReadInstruction"].blockWidth < 1 else 0 packBIdx = packBIdx if self.tPB["localReadInstruction"].blockWidth < 1 else 0 @@ -880,8 +880,8 @@ def makeSubIterSchedule(self, kernel, localReadCode, iteration, pointerLWCode, p # calculate the data index of this mfma used for A and B # if i // kernel["MIWaveTile"][0]==0, mfma will use new A (need to take iu into account) # if i % kernel["MIWaveTile"][0]==0, mfma will use new B - packAIdx += instPerPack if i//(kernel["MIWaveTile"][0]+kernel["MIWaveTile"][0]*kernel["MIWaveTile"][1]*(i//(kernel["MIWaveTile"][0]*kernel["MIWaveTile"][1]))) == 0 else 0 - packBIdx += instPerPack if i % kernel["MIWaveTile"][0] == 0 else 0 + packAIdx += instPerPack if i//(kernel["MIWaveTileA"]+kernel["MIWaveTileA"]*kernel["MIWaveTileB"]*(i//(kernel["MIWaveTileA"]*kernel["MIWaveTileB"]))) == 0 else 0 + packBIdx += instPerPack if i % kernel["MIWaveTileA"] == 0 else 0 # blockWidth < 1, means 0.5 or 0.25 (BF,H,Int8) packAIdx = packAIdx if self.tPA["localReadInstruction"].blockWidth < 1 else 0 packBIdx = packBIdx if self.tPB["localReadInstruction"].blockWidth < 1 else 0 @@ -978,8 +978,12 @@ def makeSubIterSchedule(self, kernel, localReadCode, iteration, pointerLWCode, p localReads = item.countType(Code.LocalReadInst) localWrites = item.countType(Code.LocalWriteInst) if self.numVgprBuffer: - # here the reads are prefetches so can skip them in the waitcnt - lgkmcnt += localReads + # SQ: If PrefetchLocalRead = 1 and DepthU == LocalSplitU, 
then there is no double + # buffering and we must wait for all localReads but not localWrites. + # In that case, LoopIters == 1: + if kernel["LoopIters"] > 1: + # here the reads are prefetches so can skip them in the waitcnt + lgkmcnt += localReads # and the writes are targetting another section of LDS and are # synchronized through a different waitnct than this one # (which is always just before the macs) @@ -1206,7 +1210,7 @@ def setupNewTile(self, kernel, tensorParametersA, tensorParametersB, isPap, isOp if kernel["PrefetchGlobalRead"]: pfi = 1 kl.append(self.comment("prefetch: global -> local")) - kl.append(self.openSumAtLeastUnroll(kernel, prefetch=True, isPap=isPap, isOptNLL=isOptNLL)) + kl.append(self.openSumAtLeastUnroll(kernel, prefetch=True, isOptNLL=isOptNLL, isPap=isPap)) if isPap and isOptNLL: if self.enable["GlobalRead"]: self.dtlsM0UpdateACode = self.directToLdsM0Update(kernel, 0, tensorParametersA) @@ -1245,7 +1249,7 @@ def setupNewTile(self, kernel, tensorParametersA, tensorParametersB, isPap, isOp # # isOptNLL : the NLL is to be optimized for the alpha=1 and non-edge case ############################################################################## - def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, isNGLL, pack ): + def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, isPap, isNGLL, pack ): kl = [] pflr = self.numItersPLR localWriteEndIter = kernel["LoopIters"] - self.numItersPLR - 1 @@ -1255,7 +1259,10 @@ def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, is self.perIterLocalWriteCode = self.perIterLocalWriteCodeNGLL self.perIterLocalWriteCanSkip = [ 0 for i in range (kernel["LoopIters"]) ] else: - kl.append(self.comment3("%s NoLoadLoop - Begin") % ("Opt." if isOptNLL else "Ord.")) + if not isOptNLL: + kl.append(self.comment3("Ord. NoLoadLoop - Begin")) + else: + kl.append(self.comment3("Opt. 
NoLoadLoop %s PAP - Begin") % ("With" if isPap else "Without")) self.dtlsM0UpdateACode = Code.StructuredModule() self.globalReadACode = Code.StructuredModule() # empty self.dtlsM0UpdateBCode = Code.StructuredModule() @@ -1267,7 +1274,7 @@ def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, is self.localWriteBCode = Code.Module() # the scheduled GlobalRead,Inc code of PAP is inside openSumAtLeastUnroll (if PAP=on) - kl.append(self.openSumAtLeastUnroll(kernel, prefetch=False, isPap=False, isOptNLL=isOptNLL)) + kl.append(self.openSumAtLeastUnroll(kernel, prefetch=False, isOptNLL=isOptNLL, isPap=isPap)) if not self.numItersPLR: if self.enable["Wait"]: @@ -1433,7 +1440,7 @@ def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, is item.tempVgpr = None pack[luIdx] = Code.Module() - kl.append(self.closeSumAtLeastUnroll(kernel, prefetch=False, isOptNLL=isOptNLL, isNGLL=isNGLL)) + kl.append(self.closeSumAtLeastUnroll(kernel, prefetch=False, isOptNLL=isOptNLL, isPap=isPap, isNGLL=isNGLL)) return kl @@ -1474,11 +1481,8 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): kl.append(self.comment3("Local Read Addresses")) # tile assignments - kl.append(self.comment("local read addresses: tile assignments a")) - kl.append(self.lraTileAssignment(kernel, tensorParametersA)) - kl.append(self.comment("local read addresses: tile assignments b")) - kl.append(self.lraTileAssignment(kernel, tensorParametersB)) - + kl.append(self.comment("local read addresses: tile assignments a/b")) + kl.append(self.lraTileAssignment(kernel, tensorParametersA, tensorParametersB)) # final offsets kl.append(self.comment("local read addresses: final offsets a")) @@ -1591,7 +1595,7 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): if iui*self.numReadsIterCoalescedB < kernel["InnerUnroll"]: kl.append(self.comment("local read inc b")) kl.append(self.localReadInc(kernel, iui, tensorParametersB)) - 
kl.append(self.closeSumAtLeastUnroll(kernel, prefetch=True, isOptNLL=False, isNGLL=False)) + kl.append(self.closeSumAtLeastUnroll(kernel, prefetch=True, isOptNLL=False, isPap=False, isNGLL=False)) # open unrolled summation loop kl.append(self.comment3("Unrolled Loop(s) - Begin")) @@ -2094,7 +2098,7 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): self.tmpCheckedOutLWVgprs = [] if kernel["PrefetchGlobalRead"] == 2: - kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=False, isNGLL=True, pack=pack) + kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=False, isPap=False, isNGLL=True, pack=pack) # This "NoLoad" loop is a copy of the unroll loop but with global loads + LDS writes removed # doShadowInit is required since this pushes up the store SRD initialization before the NLL @@ -2107,14 +2111,25 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): kernel["BufferLoad"] and kernel["BufferStore"] and self.doShadowInit and \ kernel["LocalSplitU"]==1 and kernel["GlobalSplitU"] == 1 and \ self.actualSummationLoops==1: - self.saveLocalPointers(kernel) + # three different noLoadLoops: + # 1. OptNLL & PAP global-read interleaved (only for PAP=ON) + # 2. OptNLL : No PAP global-read (For PAP=OFF, or PAP=ON but the last tile) + # 3. OrdinaryNLL (Not Opt.) 
+ if self.prefetchAcrossPersistent: + self.saveLocalPointers(kernel) + # deepCopy packCode for OptNLL noLoadLoop + deepCopyPack = copy.deepcopy(pack) + kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=True, isPap=True, isNGLL=False, pack=deepCopyPack) + self.restoreLocalPointers(kernel) + + self.saveLocalPointers(kernel) # deepCopy packCode for OptNLL noLoadLoop deepCopyPack = copy.deepcopy(pack) - kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=True, isNGLL=False, pack=deepCopyPack) + kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=True, isPap=False, isNGLL=False, pack=deepCopyPack) self.restoreLocalPointers(kernel) - kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=False, isNGLL=False, pack=pack) + kl += self.noLoadLoop(kernel, tensorParametersA, tensorParametersB, isOptNLL=False, isPap=False, isNGLL=False, pack=pack) # if PGR, last few iterations will have PLR, # and those PLR will not be used(register not checkIn) if without NoLoadLoop else: @@ -2232,10 +2247,15 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): KinInnerUnroll = kernel["InnerUnroll"] if kernel["EnableMatrixInstruction"]: KinInnerUnroll *= kernel["MatrixInstK"] - tailLoopInnerUnroll = kernel["InnerUnroll"] if (kernel["AssertSummationElementMultiple"] % KinInnerUnroll == 0) else 1 + + tailLoopInnerUnroll = 1 + if (kernel["AssertSummationElementMultiple"] % KinInnerUnroll == 0): + tailLoopInnerUnroll = kernel["InnerUnroll"] + elif (kernel["LocalDotLayout"] == 2) and (kernel["InnerUnroll"] == 2): + tailLoopInnerUnroll = kernel["InnerUnroll"] pack[0] = Code.Module() - for iui in range(0,tailLoopInnerUnroll): + for iui in range(0, tailLoopInnerUnroll): if self.enable["LocalRead"]: # Reading 16-bit data from LDS requires packing when ECC enabled kl.append(self.comment("local read a")) @@ -2262,11 +2282,12 @@ def kernelBody( self, kernel, tensorParametersA, 
tensorParametersB ): self.vgprPool.checkIn(item.tempVgpr) item.tempVgpr = None pack[0] = Code.Module() + if self.enable["MAC"]: if kernel["EnableMatrixInstruction"]: kl.append(self.mfmaIter(kernel, 0, tailLoopInnerUnroll, True)) else: - kl.append(self.macIter(kernel, 0, tailLoopInnerUnroll, True)) + kl.append(self.macIter(kernel, 0, tailLoopInnerUnroll, True, True)) kl.append(self.closeLoop(kernel, -1, True, uDu if kernel["DepthULdsDivisor"]>1 else None)) # always emit the skip-tail-loop label @@ -2859,8 +2880,8 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB ): tensorParametersA["PackBatchDims"] = kernel["PackBatchDims"] if kernel["PackBatchDims"] & 0x1 else 0 tensorParametersB["PackBatchDims"] = kernel["PackBatchDims"] if kernel["PackBatchDims"] & 0x2 else 0 - tensorParametersA["PackedIndices"] = kernel["PackedC0IndicesX"] - tensorParametersB["PackedIndices"] = kernel["PackedC1IndicesX"] + tensorParametersA["PackedIndices"] = kernel["PackedC%uIndicesX"%self.tPA["tile01Idx"]] + tensorParametersB["PackedIndices"] = kernel["PackedC%uIndicesX"%self.tPB["tile01Idx"]] @staticmethod def zpForSumIdx(sumIdx, zeroPad): @@ -2954,6 +2975,7 @@ def getTensorParameters(self, tP, kernel, tA): tP["tensorIdx"] = 0 # tensor index A=0, B=1 tP["tileChar"] = self.tileCharA # tile char I0 or J1 tP["tileIdx"] = kernel["ProblemType"]["Index01A"] # is the tile dimension of A the 0th or 1th index, i.e. 
Aki, tileIdx=0 + tP["tile01Idx"] = 1 if tP["tileIdx"] else 0 tP["lsc"] = "LSCA" # load size coalesced A, number of elements that get loaded along coalesced dimension with each load tP["lsp"] = "LSPA" # load size perpendicular A, number of elements that get loaded along non-coalesced dimension with each load tP["lvc"] = "LVCA" # "load size" in terms of number of short-vectors and not elements @@ -2963,11 +2985,11 @@ def getTensorParameters(self, tP, kernel, tA): #tP["ruv"] = self.readUnrollDimVectorA #tP["nlvc"] = self.numReadVectorComponentsA #tP["nwvc"] = self.numWriteVectorComponentsA - tP["wg"] = "WorkGroup0" # these are storing the actual strong to lookup the number from kernel dictionary - tP["prevWg"] = "PrevWorkGroup0" # used for prefetch-across-persistent - tP["sg"] = "SubGroup0" - tP["tt"] = "ThreadTile0" - tP["mt"] = "MacroTile0" + tP["wg"] = "WorkGroup%u" % (tP["tile01Idx"])# these are storing the actual strong to lookup the number from kernel dictionary + tP["prevWg"] = "PrevWorkGroup0" # used for prefetch-across-persistent #NHWC TO-do + tP["sg"] = "SubGroup%u" % (tP["tile01Idx"]) + tP["tt"] = "ThreadTile%u" % (tP["tile01Idx"]) + tP["mt"] = "MacroTile%u" % (tP["tile01Idx"]) tP["grcg"] = self.globalReadCoalesceGroupA # global reads are coalesced along threads tP["grcv"] = kernel["GlobalReadCoalesceVectorA"] # global reads are vector reads, and lds writes will be components if transposing tP["tlu"] = kernel["ProblemType"]["TLUA"] # thread stride is less than unroll stride, i.e., not transposing matrix @@ -3009,6 +3031,7 @@ def getTensorParameters(self, tP, kernel, tA): tP["tensorIdx"] = 1 tP["tileChar"] = self.tileCharB tP["tileIdx"] = kernel["ProblemType"]["Index01B"] + tP["tile01Idx"] = 1 if tP["tileIdx"] else 0 tP["lsc"] = "LSCB" tP["lsp"] = "LSPB" tP["lvc"] = "LVCB" @@ -3018,11 +3041,11 @@ def getTensorParameters(self, tP, kernel, tA): #tP["ruv"] = self.readUnrollDimVectorB #tP["nlvc"] = self.numReadVectorComponentsB #tP["nwvc"] = 
self.numWriteVectorComponentsB - tP["wg"] = "WorkGroup1" + tP["wg"] = "WorkGroup%u" % (tP["tile01Idx"]) tP["prevWg"] = "PrevWorkGroup1" - tP["sg"] = "SubGroup1" - tP["tt"] = "ThreadTile1" - tP["mt"] = "MacroTile1" + tP["sg"] = "SubGroup%u" % (tP["tile01Idx"]) + tP["tt"] = "ThreadTile%u" % (tP["tile01Idx"]) + tP["mt"] = "MacroTile%u" % (tP["tile01Idx"]) tP["grcg"] = self.globalReadCoalesceGroupB tP["grcv"] = kernel["GlobalReadCoalesceVectorB"] tP["tlu"] = kernel["ProblemType"]["TLUB"] @@ -3170,7 +3193,7 @@ def lwaDeclareAddresses(self, kernel, tP): # Local Read Addresses: Tile Assignment ############################################################################## @abc.abstractmethod - def lraTileAssignment(self, kernel, tP): + def lraTileAssignment(self, kernel, tPA, tPB): return "" ############################################################################## @@ -3302,18 +3325,18 @@ def checkAlphaBetaForHPA(self, kernel): # useMacro : if true, call the MAC* macro. If False, inline the MACs ############################################################################## @abc.abstractmethod - def macIter(self, kernel, bufferIdx, iuiCount, useMacro): + def macIter(self, kernel, bufferIdx, iuiCount, useMacro, isTail=False): return "" ############################################################################## # At Least 1 Unroll ############################################################################## @abc.abstractmethod - def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): + def openSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap): return "" @abc.abstractmethod - def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isNGLL): + def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap, isNGLL): return "" ############################################################################## diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index b20f58686..71b027fee 100644 --- 
a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -427,7 +427,7 @@ class PreLoopVmcntCase(Enum): Undefined = 0 Basic_Load = 1 OptNLL_Store = 2 - OrdNLL_B0_Store = 3 + OrdNLL_E1_Store = 3 OrdNLL_B1_Store = 4 ################################################################################ @@ -568,8 +568,34 @@ def __init__( self, kernelMinNaming, kernelSerialNaming ): self.overlapVgprC = False self.serializedStore = False - def getCompileArgs(self, sourceFileName, objectFileName, *moreArgs, useGlobalISA=False): - isa = self.version if not useGlobalISA else globalParameters["CurrentISA"] + @property + def vcc(self) -> str: + if self.kernel["WavefrontSize"] == 64: + return "vcc" + else: + return "vcc_lo" + + @property + def exec(self) -> str: + if self.kernel["WavefrontSize"] == 64: + return "exec" + else: + return "exec_lo" + + @property + def laneSGPRCount(self) -> int: + """ How many SGPRs does it take to have one bit per lane? """ + if self.kernel["WavefrontSize"] == 64: + return 2 + else: + return 1 + + def getCompileArgs(self, sourceFileName, objectFileName, *moreArgs, isa=None, wavefrontSize=None): + if isa is None: + isa = self.version + if wavefrontSize is None: + wavefrontSize = self.kernel["WavefrontSize"] + archHasV3 = globalParameters["AsmCaps"][isa]["HasCodeObjectV3"] rv = [globalParameters['AssemblerPath'], @@ -581,8 +607,10 @@ def getCompileArgs(self, sourceFileName, objectFileName, *moreArgs, useGlobalISA rv += ['-mcpu=' + gfxName(isa)] - if isa[0] == 10: + if wavefrontSize == 64: rv += ['-mwavefrontsize64'] + else: + rv += ['-mno-wavefrontsize64'] rv += moreArgs @@ -812,7 +840,7 @@ def selectMemoryInstruction(self, strides ): #instructions = self.memoryArchitecture[operation] - instructions = self.memoryInstructions[self.version][operation] + instructions = self.memoryInstructions[operation] # try to combine if (write2 == "Coalesced" and para2) \ or (write2 == "Perpendicular" and perp2): @@ -873,7 +901,7 @@ def dumpSgpr(self, 
sgprStore): kStr += inst("v_mov_b32", vgpr(tmp), sgprStore, "Debug") kStr += inst("flat_store_dword", vgpr("AddressDbg", 2), \ vgpr(tmp), "debug dump sgpr store" ) - kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), "vcc", vgpr("AddressDbg"), \ + kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), self.vcc, vgpr("AddressDbg"), \ hex(4), "debug dump inc" ) self.vgprPool.checkIn(tmp) @@ -1116,25 +1144,25 @@ def initKernel(self, kernel, tPA, tPB ): # # a dictionary storing the vmcnt numbers for each case: # case 1: first PK-Loop (no previous store), cnt = #-basic-globalload - # case 2: after Opt.NLL (no Beta), cnt = #-prev-store (no beta,edge) + #-basic-globalload - # case 3: after Ord.NLL (no Beta), cnt = #-prev-store (no beta) + #-basic-globalload + # case 2: after Opt.NLL (no Beta), cnt = #-prev-store (no beta,no edge) + #-basic-globalload + # case 3: after Ord.NLL (with Edge but No Beta), cnt = #-prev-store (edge store) + #-basic-globalload # case 4: after Ord.NLL (with Beta), cnt = no needed for vmcnt self.preLoopVmcntDict = { \ PreLoopVmcntCase.Basic_Load:0, \ PreLoopVmcntCase.OptNLL_Store:0, \ - PreLoopVmcntCase.OrdNLL_B0_Store:0 } + PreLoopVmcntCase.OrdNLL_E1_Store:0 } # Case4: No need to count store vmcnt for next PreLoop since OrdNLL_B1_Store already has vmcnts waiting for loading beta # PreLoopVmcntCase.OrdNLL_B1_Store:0 } # a dictionary storing the keywords to be replaced for each case: # case 1: replace the vmcnt("Basic_Load") with vmcnt(N) # case 2: replace the vmcnt("OptNLL_Store" + "Basic_Load") with vmcnt(M1+N) - # case 3: replace the vmcnt("OrdNLL_B0_Store" + "Basic_Load") with vmcnt(M2+N) + # case 3: replace the vmcnt("OrdNLL_E1_Store" + "Basic_Load") with vmcnt(M2+N) # case 4: s_waitcnt vmcnt will be removed, no need to replace self.preLoopCaseToReplaceKWList = { \ PreLoopVmcntCase.Basic_Load :[PreLoopVmcntCase.Basic_Load], \ PreLoopVmcntCase.OptNLL_Store :[PreLoopVmcntCase.Basic_Load, PreLoopVmcntCase.OptNLL_Store], \ - 
PreLoopVmcntCase.OrdNLL_B0_Store:[PreLoopVmcntCase.Basic_Load, PreLoopVmcntCase.OrdNLL_B0_Store] } + PreLoopVmcntCase.OrdNLL_E1_Store:[PreLoopVmcntCase.Basic_Load, PreLoopVmcntCase.OrdNLL_E1_Store] } # PreLoopVmcntCase.OrdNLL_B1_Store:[PreLoopVmcntCase.Basic_Load, PreLoopVmcntCase.OrdNLL_B1_Store] } self.useManualVmcnt = False @@ -1236,7 +1264,6 @@ def initKernel(self, kernel, tPA, tPB ): chosen_store_dword = flat_store_dword self.memoryInstructions = { - (9,0,0): { "GlobalRead": [ chosen_load_dwordx4, chosen_load_dwordx2, chosen_load_dword, chosen_load_short, chosen_load_byte ], "GlobalWrite": [ chosen_store_dwordx4, chosen_store_dwordx2, @@ -1245,14 +1272,7 @@ def initKernel(self, kernel, tPA, tPB ): ds_read_b64, ds_read2_b32, ds_read_b32, ds_read_u16, ds_read_u8 ], "LocalWrite": [ ds_write_b128, ds_write2_b64, ds_write_b64, ds_write2_b32, ds_write_b32, ds_write_b16, ds_write_b8 ] - }, # 900 } - self.memoryInstructions[(8,0,3)] = self.memoryInstructions[(9,0,0)] - self.memoryInstructions[(9,0,6)] = self.memoryInstructions[(9,0,0)] - self.memoryInstructions[(9,0,8)] = self.memoryInstructions[(9,0,0)] - self.memoryInstructions[(9,0,10)] = self.memoryInstructions[(9,0,0)] - self.memoryInstructions[(10,1,0)] = self.memoryInstructions[(9,0,0)] - self.memoryInstructions[(10,1,1)] = self.memoryInstructions[(9,0,0)] if self.asmCaps["v_fma_mix_f32"]: self.mixinst = "v_fma_mix_f32" @@ -1489,7 +1509,7 @@ def initKernel(self, kernel, tPA, tPB ): self.localRead2CoalescedB, localRead2Perpendicular, [self.localReadStrideCoalescedB] ) - instructions = self.memoryInstructions[self.version] + instructions = self.memoryInstructions self.globalReadInstructionA = instructions["GlobalRead"][ \ self.globalReadInstructionIdxA] self.globalReadInstructionB = instructions["GlobalRead"][ \ @@ -1519,8 +1539,8 @@ def initKernel(self, kernel, tPA, tPB ): PLR = kernel["PrefetchLocalRead"] if kernel["PrefetchLocalRead"] < kernel["LoopIters"] else kernel["LoopIters"] - 1 valuBlocks = (1+PLR) * 
kernel["InnerUnroll"] if kernel["EnableMatrixInstruction"]: - self.numVgprValuAPerBlock = kernel["MIWaveTile"][0] * kernel["MIInputPerThread"] * tPA["bpe"] // self.bpr - self.numVgprValuBPerBlock = kernel["MIWaveTile"][1] * kernel["MIInputPerThread"] * tPA["bpe"] // self.bpr + self.numVgprValuAPerBlock = kernel["MIWaveTileA"] * kernel["MIInputPerThread"] * tPA["bpe"] // self.bpr + self.numVgprValuBPerBlock = kernel["MIWaveTileB"] * kernel["MIInputPerThread"] * tPA["bpe"] // self.bpr else: self.numVgprValuAPerBlock = kernel["ThreadTileA"]*tPA["bpe"]//self.bpr self.numVgprValuBPerBlock = kernel["ThreadTileB"]*tPB["bpe"]//self.bpr @@ -1630,6 +1650,9 @@ def initKernel(self, kernel, tPA, tPB ): # TODO: alignment hack, figure out a better solution vgprIdx = ((vgprIdx+1)//2)*2 + # Avoid bank conflict between VgprA and VgprC + if (self.version[0] == 10) and ((vgprIdx % 4) == (self.startVgprValuC % 4)): + vgprIdx += 1 self.startVgprValuA = vgprIdx; vgprIdx += numVgprValuA self.startVgprG2LA = None if not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]: @@ -2024,8 +2047,7 @@ def initKernel(self, kernel, tPA, tPB ): # complex multiplication is emulated by 4 matrix instructions operating on real and imaginary numbers # multiplier 2 indicates complex mul requires equal share of extra vgprs to store the imaginary part self.agprMultiplier = 2 if kernel["ProblemType"]["DataType"].isComplex() else 1 - - self.destAgprs = kernel["MatrixInstM"] * kernel["MatrixInstN"] * kernel["MatrixInstB"] // globalParameters["WavefrontWidth"] * kernel["MIRegPerOut"] + self.destAgprs = kernel["MatrixInstM"] * kernel["MatrixInstN"] * kernel["MatrixInstB"] // kernel["WavefrontSize"] * kernel["MIRegPerOut"] self.totalAgprs = self.destAgprs * kernel["MIWaveTile"][0] * kernel["MIWaveTile"][1] * self.agprMultiplier ######################################## @@ -2172,50 +2194,55 @@ def defineMACs(self, kernel, m, innerUnroll): beAggressive = kernel["AggressivePerfMode"] doOnce = False - macIdx = 0 
# half precision is entirely in component system. # bfloat16 if kernel["ProblemType"]["DataType"].isBFloat16(): if (self.version == (9,0,8) or self.version == (9,0,10)) and kernel["ProblemType"]["HighPrecisionAccumulate"]: for iui in range(0, innerUnroll): - for blockA in range(kernel["ThreadTile0"]//2-1, -1, -1): + for blockA in range(kernel["ThreadTileA"]//2-1, -1, -1): kStr += "v_and_b32 v[vgprValuA_X%u_I%u+%u], 0xffff0000, v[vgprValuA_X%u_I%u+%u]%s" % (m, iui, blockA*2+1, m, iui, blockA, self.endLine) kStr += "v_lshlrev_b32 v[vgprValuA_X%u_I%u+%u], 16, v[vgprValuA_X%u_I%u+%u]%s" % (m, iui, blockA*2, m, iui, blockA, self.endLine) - for blockB in range(kernel["ThreadTile1"]//2-1, -1, -1): + for blockB in range(kernel["ThreadTileB"]//2-1, -1, -1): kStr += "v_and_b32 v[vgprValuB_X%u_I%u+%u], 0xffff0000, v[vgprValuB_X%u_I%u+%u]%s" % (m, iui, blockB*2+1, m, iui, blockB, self.endLine) kStr += "v_lshlrev_b32 v[vgprValuB_X%u_I%u+%u], 16, v[vgprValuB_X%u_I%u+%u]%s" % (m, iui, blockB*2, m, iui, blockB, self.endLine) - for blockB in range(0, kernel["ThreadTile1"]//2): - for blockA in range(0, kernel["ThreadTile0"]//2): + for block1 in range(0, kernel["ThreadTile1"]//2): + for block0 in range(0, kernel["ThreadTile0"]//2): if kernel["ProblemType"]["HighPrecisionAccumulate"]: # we treat HighPrecisionAccumulate as expanded packed math - b = blockB*2 - a = blockA*2 for iui in range(0, innerUnroll): + + blockA = block0 if self.tPB["tile01Idx"] else block1 + blockB = block1 if self.tPB["tile01Idx"] else block0 + aStr0 = "v[%s+%u]" % ("vgprValuA_X%u_I%u"%(m,iui), blockA*2+0) aStr1 = "v[%s+%u]" % ("vgprValuA_X%u_I%u"%(m,iui), blockA*2+1) bStr0 = "v[%s+%u]" % ("vgprValuB_X%u_I%u"%(m,iui), blockB*2+0) bStr1 = "v[%s+%u]" % ("vgprValuB_X%u_I%u"%(m,iui), blockB*2+1) - cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + 0 - cStr = "v[%s+%u*2+%u*%u*2+0*2+0]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"]) # *2 b/c of fp32 + cidx = block0*2 + block1*kernel["ThreadTile0"]*2 + 0 + cStr 
= "v[%s+%u*2+%u*%u*2+0*2+0]" % ("vgprValuC", block0, block1, kernel["ThreadTile0"]) # *2 b/c of fp32 kStr += "v_fma_f32 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr0, bStr0, cStr, cidx, self.endLine) if beAggressive and not doOnce: kStr += "s_setprio 1 // Raise priority while processing macs%s" % self.endLine doOnce = True - cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + 1 - cStr = "v[%s+%u*2+%u*%u*2+0*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"]) # *2 b/c of fp32 - kStr += "v_fma_f32 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr1, bStr0, cStr, cidx, self.endLine) + aStr = aStr1 if self.tPB["tile01Idx"] else aStr0 + bStr = bStr0 if self.tPB["tile01Idx"] else bStr1 + cidx = block0*2 + block1*kernel["ThreadTile0"]*2 + 1 + cStr = "v[%s+%u*2+%u*%u*2+0*2+1]" % ("vgprValuC", block0, block1, kernel["ThreadTile0"]) # *2 b/c of fp32 + kStr += "v_fma_f32 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) - cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 0 - cStr = "v[%s+%u*2+%u*%u*2+%u*2+0]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]//2) - kStr += "v_fma_f32 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr0, bStr1, cStr, cidx, self.endLine) + aStr = aStr0 if self.tPB["tile01Idx"] else aStr1 + bStr = bStr1 if self.tPB["tile01Idx"] else bStr0 + cidx = block0*2 + block1*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 0 + cStr = "v[%s+%u*2+%u*%u*2+%u*2+0]" % ("vgprValuC", block0, block1, kernel["ThreadTile0"], kernel["ThreadTile0"]//2) + kStr += "v_fma_f32 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) - cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 1 - cStr = "v[%s+%u*2+%u*%u*2+%u*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]//2) + cidx = block0*2 + block1*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 1 + cStr = "v[%s+%u*2+%u*%u*2+%u*2+1]" % ("vgprValuC", block0, block1, 
kernel["ThreadTile0"], kernel["ThreadTile0"]//2) kStr += "v_fma_f32 %s, %s, %s, %s //valuC[%u]%s" % (cStr, aStr1, bStr1, cStr, cidx, self.endLine) """ ignore this, not quite correct for mixed precision @@ -2228,16 +2255,11 @@ def defineMACs(self, kernel, m, innerUnroll): else: printExit("Bfloat16 not supported for arch=%s" % str(self.version) ) - # integer i8x4 elif kernel["ProblemType"]["DataType"].isInt8x4(): - for b in range(0, kernel["ThreadTile1"]): - for a in range(0, kernel["ThreadTile0"]): - if self.version == (8,0,3): - kStr += self.comment3("int8 not implemented yet for gfx803:") - elif self.version == (9,0,0): - kStr += self.comment3("int8 not implemented yet for gfx900:") - elif self.version == (9,0,6) or self.version == (9,0,8) or self.version == (9,0,10): + if self.version == (9,0,6) or self.version == (9,0,8) or self.version == (9,0,10) or self.version == (10,3,0): + for b in range(0, kernel["ThreadTile1"]): + for a in range(0, kernel["ThreadTile0"]): for iui in range(0, innerUnroll): cidx = a + b*kernel["ThreadTile0"] + 0 cStr = "v[%s+%u+%u*%u]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) @@ -2247,34 +2269,11 @@ def defineMACs(self, kernel, m, innerUnroll): if beAggressive and not doOnce: kStr += "s_setprio 1 // Raise priority while processing macs%s" % self.endLine doOnce = True - if beAggressive: - kStr += "s_setprio 0 // Reset priority after macs %s" % self.endLine - - # single precision - elif kernel["ProblemType"]["DataType"].isSingle(): - for b in range(0, kernel["ThreadTile1"]): - for a in range(0, kernel["ThreadTile0"]): - for iui in range(0, innerUnroll): - cStr = "v[%s+%u+%u*%u]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) - aStr = "v[%s+%u]" \ - % ("vgprValuA_X%u_I%u"%(m,iui), a) - bStr = "v[%s+%u]" \ - % ("vgprValuB_X%u_I%u"%(m,iui), b) - #if a==0 and b==0: - # kStr += dump(aStr) - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) - if beAggressive and not doOnce: - kStr += "s_setprio 1 // Raise priority while 
processing macs%s" % self.endLine - doOnce = True - if macIdx == kernel["PerformanceWaitLocation"]: - kStr += "s_waitcnt lgkmcnt(%u) // extra wait for performance%s" \ - % (kernel["PerformanceWaitCount"], self.endLine) - if macIdx == kernel["PerformanceSyncLocation"]: - kStr += "s_barrier // extra barrier for performance%s" \ - % (self.endLine) - macIdx += 1 - if beAggressive: - kStr += "s_setprio 0 // Reset priority after macs %s" % self.endLine + if beAggressive: + kStr += "s_setprio 0 // Reset priority after macs %s" % self.endLine + else: + version = "gfx{}{}{}".format(self.version[0], self.version[1], self.version[2]) + kStr += self.comment3("int8x4 not implemented yet for {}:".format(version)) # double precision elif kernel["ProblemType"]["DataType"].isDouble(): @@ -2301,32 +2300,32 @@ def defineMACs(self, kernel, m, innerUnroll): cStr = "v[%s+(%u+%u*%u)*2]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) aStr = "v[%s+%u*2]" % ("vgprValuA_X%u_I%u"%(m,iui) , a) bStr = "v[%s+%u*2]" % ("vgprValuB_X%u_I%u"%(m,iui) , b) - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) cStr = "v[%s+(%u+%u*%u)*2]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) aStr = "v[%s+%u*2+1]" % ("vgprValuA_X%u_I%u"%(m,iui) , a) bStr = "v[%s+%u*2+1]" % ("vgprValuB_X%u_I%u"%(m,iui) , b) if (not kernel["ProblemType"]["ComplexConjugateA"] and not kernel["ProblemType"]["ComplexConjugateB"]) or \ (kernel["ProblemType"]["ComplexConjugateA"] and kernel["ProblemType"]["ComplexConjugateB"]): - kStr += "v_mac_f32 %s, -%s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, -%s, %s%s" % (cStr, aStr, bStr, self.endLine) else: - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) cStr = "v[%s+(%u+%u*%u)*2+1]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) aStr = "v[%s+%u*2]" % ("vgprValuA_X%u_I%u"%(m,iui) , a) bStr = 
"v[%s+%u*2+1]" % ("vgprValuB_X%u_I%u"%(m,iui) , b) if kernel["ProblemType"]["ComplexConjugateB"]: - kStr += "v_mac_f32 %s, %s, -%s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, -%s%s" % (cStr, aStr, bStr, self.endLine) else: - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) cStr = "v[%s+(%u+%u*%u)*2+1]" % ("vgprValuC", a, b, kernel["ThreadTile0"]) aStr = "v[%s+%u*2+1]" % ("vgprValuA_X%u_I%u"%(m,iui) , a) bStr = "v[%s+%u*2]" % ("vgprValuB_X%u_I%u"%(m,iui) , b) if kernel["ProblemType"]["ComplexConjugateA"]: - kStr += "v_mac_f32 %s, -%s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, -%s, %s%s" % (cStr, aStr, bStr, self.endLine) else: - kStr += "v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) + kStr += "_v_mac_f32 %s, %s, %s%s" % (cStr, aStr, bStr, self.endLine) if beAggressive and not doOnce: kStr += "s_setprio 1 // Raise priority while processing macs%s" % self.endLine @@ -2385,6 +2384,10 @@ def defineMACs(self, kernel, m, innerUnroll): def defineMACMacro(self, kernel, innerUnroll, useMacro): + """ + Defines a macro that performs one set of multiply-accumulate operations. + """ + kStr = "" # Create a macro version that processes just one U iter @@ -2410,10 +2413,9 @@ def defineMACMacro(self, kernel, innerUnroll, useMacro): if useMacro: kStr += ".endm%s" % self.endLine - return kStr - def defineCMPXMacros(self, kernel): + def defineCMPXMacros(self): """ Navi's cmpx instruction writes only to EXEC, not to SGPRs or to VCC. For now, replicate old behaviour with two instructions. 
@@ -2425,7 +2427,10 @@ def macro(op, dtype): mStr += r" v_cmpx_{op}_{dtype} \dst, \src0, \src1 ".format(**dict) + self.endLine else: mStr += r" v_cmp_{op}_{dtype} \dst, \src0, \src1".format(**dict) + self.endLine - mStr += r" s_mov_b64 exec \dst" + self.endLine + if self.kernel["WavefrontSize"] == 64: + mStr += r" s_mov_b64 exec \dst" + self.endLine + else: + mStr += r" s_mov_b32 exec_lo \dst" + self.endLine mStr += ".endm" + self.endLine return mStr @@ -2439,18 +2444,12 @@ def macro(op, dtype): for op in ops for dtype in dtypes]) - - ############################################################################## - # Function Signature - # called after rest of code - ############################################################################## - def functionSignature(self, kernel ): + def defineFeatureMacros(self): + """ + Defines cross-architecture compatibility macros. + """ kStr = "" - component = Component.Signature.find(self) - if component: - kStr += component(self) - kStr += self.comment3("Asm syntax workarounds") kStr += ".macro _v_add_co_u32 dst:req, cc:req, src0:req, src1:req, dpp=" + self.endLine if self.AsmBugs["ExplicitCO"]: @@ -2460,7 +2459,7 @@ def functionSignature(self, kernel ): kStr += ".endm" + self.endLine # add w/o carry-out. On older arch, vcc is still written - kStr += "\n" + kStr += self.endLine kStr += ".macro _v_add_u32 dst:req, src0:req, src1:req, dpp=" + self.endLine if self.AsmBugs["ExplicitNC"]: kStr += r" v_add_nc_u32 \dst, \src0 \src1 \dpp" + self.endLine @@ -2470,7 +2469,28 @@ def functionSignature(self, kernel ): kStr += r" v_add_u32 \dst, vcc, \src0, \src1 \dpp" + self.endLine kStr += ".endm" + self.endLine - kStr += "\n" + # add w/o carry-out. 
On older arch, vcc is still written + kStr += self.endLine + kStr += ".macro _v_add_i32 dst:req, src0:req, src1:req, dpp=" + self.endLine + if self.AsmBugs["ExplicitNC"]: + kStr += r" v_add_nc_i32 \dst, \src0 \src1 \dpp" + self.endLine + elif self.AsmBugs["ExplicitCO"]: + kStr += r" v_add_i32 \dst, \src0, \src1 \dpp" + self.endLine + else: + kStr += r" v_add_i32 \dst, vcc, \src0, \src1 \dpp" + self.endLine + kStr += ".endm" + self.endLine + + kStr += self.endLine + kStr += ".macro _v_addc_co_u32 dst:req, ccOut:req, src0:req, ccIn:req, src1:req, dpp=" + self.endLine + if self.AsmBugs["ExplicitNC"]: + kStr += r" v_add_co_ci_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + elif self.AsmBugs["ExplicitCO"]: + kStr += r" v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + else: + kStr += r" v_addc_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + kStr += ".endm" + self.endLine + + kStr += self.endLine kStr += ".macro _v_sub_co_u32 dst:req, cc:req, src0:req, src1:req, dpp=" + self.endLine if self.AsmBugs["ExplicitCO"]: kStr += r" v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp" + self.endLine @@ -2478,27 +2498,30 @@ def functionSignature(self, kernel ): kStr += r" v_sub_u32 \dst, \cc, \src0, \src1 \dpp" + self.endLine kStr += ".endm" + self.endLine - kStr += "\n" + kStr += self.endLine # sub w/o carry-out. On older arch, vcc is still written. kStr += ".macro _v_sub_u32 dst:req, src0:req, src1:req, dpp=" + self.endLine - if self.AsmBugs["ExplicitCO"]: + if self.AsmBugs["ExplicitNC"]: + kStr += r" v_sub_nc_u32 \dst, \src0, \src1 \dpp" + self.endLine + elif self.AsmBugs["ExplicitCO"]: kStr += r" v_sub_u32 \dst, \src0, \src1 \dpp" + self.endLine else: kStr += r" v_sub_u32 \dst, vcc, \src0, \src1 \dpp" + self.endLine kStr += ".endm" + self.endLine - kStr += "\n" - kStr += ".macro _v_addc_co_u32 dst:req, ccOut:req, src0:req, ccIn:req, src1:req, dpp=" + self.endLine + kStr += self.endLine + # sub w/o carry-out. 
On older arch, vcc is still written. + kStr += ".macro _v_sub_i32 dst:req, src0:req, src1:req, dpp=" + self.endLine if self.AsmBugs["ExplicitNC"]: - kStr += r" v_add_co_ci_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + kStr += r" v_sub_nc_i32 \dst, \src0, \src1 \dpp" + self.endLine elif self.AsmBugs["ExplicitCO"]: - kStr += r" v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + kStr += r" v_sub_i32 \dst, \src0, \src1 \dpp" + self.endLine else: - kStr += r" v_addc_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp" + self.endLine + kStr += r" v_sub_i32 \dst, vcc, \src0, \src1 \dpp" + self.endLine kStr += ".endm" + self.endLine # Use combined add+shift, where available: - kStr += "\n" + kStr += self.endLine kStr += ".macro _v_add_lshl_u32 dst:req, src0:req, src1:req, shiftCnt:req" + self.endLine if globalParameters["AsmCaps"][self.version]["HasAddLshl"]: kStr += r" v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt" + self.endLine @@ -2512,7 +2535,7 @@ def functionSignature(self, kernel ): # Use combined shift+add, where available: - kStr += "\n" + kStr += self.endLine kStr += ".macro _v_lshl_add_u32 dst:req, src0:req, src1:req, shiftCnt:req" + self.endLine if globalParameters["AsmCaps"][self.version]["HasAddLshl"]: kStr += r" v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt" + self.endLine @@ -2534,8 +2557,42 @@ def functionSignature(self, kernel ): kStr += r" v_or_b32 \dst, \dst, \src1" + self.endLine kStr += ".endm" + self.endLine - kStr += self.defineCMPXMacros(kernel) - #kStr += self.defineF16PackedMathMacros(kernel) + kStr += self.defineCMPXMacros() + kStr += self.defineMACInstructionMacros() + + return kStr + + def defineMACInstructionMacros(self): + kStr = "" + + kStr += ".macro _v_mac_f32 c:req, a:req, b:req" + self.endLine + if self.kernel["MACInstruction"] == "FMA": + if self.asmCaps["v_fmac_f32"]: + kStr += r" v_fmac_f32 \c, \a, \b" + self.endLine + elif self.asmCaps["v_fma_f32"]: + kStr += r" v_fma_f32 \c, \a, \b, \c" + self.endLine + 
else: + raise RuntimeError("FMA instruction specified but not supported on {}".format(self.kernel["ISA"])) + elif self.asmCaps["v_mac_f32"]: + kStr += r" v_mac_f32 \c, \a, \b" + self.endLine + else: + raise RuntimeError("MAC instruction specified but not supported on {}".format(self.kernel["ISA"])) + kStr += ".endmacro" + self.endLine + + return kStr + + ############################################################################## + def functionSignature(self, kernel ): + """ + Function Signature + called after rest of code + """ + kStr = "" + + signature = Component.Signature.find(self) + kStr += signature(self) + + kStr += self.defineFeatureMacros() # Performs a division using 'magic number' computed on host # Argument requirements: @@ -2553,7 +2610,7 @@ def functionSignature(self, kernel ): kStr += ".macro V_MAGIC_DIV dstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req" + self.endLine kStr += r" v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber" + self.endLine kStr += r" v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicA" + self.endLine - kStr += r" v_add_u32 v[\dstIdx+0], v[\dstIdx+0], v[\dstIdx+1]" + self.endLine + kStr += r" _v_add_u32 v[\dstIdx+0], v[\dstIdx+0], v[\dstIdx+1]" + self.endLine kStr += r" v_lshrrev_b32 v[\dstIdx+0], \magicShift, v[\dstIdx+0]" + self.endLine kStr += ".endm" + self.endLine @@ -2794,7 +2851,7 @@ def functionSignature(self, kernel ): else: dest = "v[\\vgprTmp+0]" needAdd = 1 - kStr += inst("v_sub_u32", \ + kStr += inst("_v_sub_u32", \ dest, sgpr("Size%s"%globalParameters["IndexChars"][indices[i]]), \ "1", \ @@ -2814,7 +2871,7 @@ def functionSignature(self, kernel ): srcHi = 0 if pendingOffset else destHi kStr += inst("_v_add_co_u32", \ destLo, \ - "vcc", \ + self.vcc, \ srcLo, \ "v[\\vgprTmp+0]", \ "accumulate %s lower"%idxChar) @@ -2866,12 +2923,12 @@ def functionSignature(self, kernel ): destHi = "v[\\vgprTmp+1]" needAdd = 1 if isMirrorIdx: - kStr += inst("v_sub_i32", \ + kStr += inst("_v_sub_i32", \ 
"v[\\vgprTmp+0]", sgpr("Size%s"%globalParameters["IndexChars"][idx]), \ offset, \ "mirror %s%s 1"%(tc, globalParameters["IndexChars"][indices[i]])) - kStr += inst("v_sub_i32", \ + kStr += inst("_v_sub_i32", \ "v[\\vgprTmp+0]", "v[\\vgprTmp+0]", \ "1", \ @@ -2921,7 +2978,7 @@ def functionSignature(self, kernel ): srcHi = 0 if pendingOffset else destHi kStr += inst("_v_add_co_u32", \ destLo, \ - "vcc", \ + self.vcc, \ srcLo, \ "v[\\vgprTmp+0]", \ "accumulate %s lower"%idxChar) @@ -2930,10 +2987,10 @@ def functionSignature(self, kernel ): if not justOffset32: kStr += inst("_v_addc_co_u32", \ "v[\\vgprAddr+1]", \ - "vcc", \ + self.vcc, \ "v[\\vgprTmp+1]", \ srcHi, \ - "vcc", \ + self.vcc, \ "accumulate %s upper"%idxChar) pendingOffset = None @@ -2949,7 +3006,7 @@ def functionSignature(self, kernel ): else: kStr += inst("_v_add_co_u32", \ destLo, \ - "vcc", \ + self.vcc, \ destLo, \ pendingOffset, \ "accumulate final pendingOffset") @@ -2978,33 +3035,34 @@ def functionSignature(self, kernel ): # Dynamic Scalar Divide kStr += self.comment3("Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor;") kStr += ".macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp%s" % self.endLine + sTmpStr = "s[\\sTmp]" if (self.kernel["WavefrontSize"] == 32) else "s[\\sTmp:\\sTmp+1]" kStr += inst("v_cvt_f32_u32", "v[\\vQuotient]", "v[\\vDivisor]", "" ) kStr += inst("v_rcp_f32", "v[\\vQuotient]", "v[\\vQuotient]", "" ) kStr += inst("v_mul_f32", "v[\\vQuotient]", "0x4f800000", "v[\\vQuotient]", "" ) kStr += inst("v_cvt_u32_f32", "v[\\vQuotient]", "v[\\vQuotient]", "" ) kStr += inst("v_mul_lo_u32", "v[\\vRemainder]", "v[\\vDivisor]", "v[\\vQuotient]", "" ) kStr += inst("v_mul_hi_u32", "v[\\vTmp0]", "v[\\vDivisor]", "v[\\vQuotient]", "" ) - kStr += inst("_v_sub_co_u32", "v[\\vTmp1]", "vcc", hex(0), "v[\\vRemainder]", "" ) - kStr += inst("v_cmp_ne_i32", "s[\\sTmp:\\sTmp+1]", hex(0), "v[\\vTmp0]", "" ) - kStr += inst("v_cndmask_b32", 
"v[\\vRemainder]", "v[\\vTmp1]", "v[\\vRemainder]", "s[\\sTmp:\\sTmp+1]", "" ) + kStr += inst("_v_sub_co_u32", "v[\\vTmp1]", self.vcc, hex(0), "v[\\vRemainder]", "" ) + kStr += inst("v_cmp_ne_i32", sTmpStr, hex(0), "v[\\vTmp0]", "" ) + kStr += inst("v_cndmask_b32", "v[\\vRemainder]", "v[\\vTmp1]", "v[\\vRemainder]", sTmpStr, "" ) kStr += inst("v_mul_hi_u32", "v[\\vRemainder]", "v[\\vRemainder]", "v[\\vQuotient]", "" ) - kStr += inst("_v_sub_co_u32", "v[\\vTmp0]", "vcc", "v[\\vQuotient]", "v[\\vRemainder]", "" ) - kStr += inst("_v_add_co_u32", "v[\\vQuotient]", "vcc", "v[\\vQuotient]", "v[\\vRemainder]", "" ) - kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vQuotient]", "v[\\vTmp0]", "s[\\sTmp:\\sTmp+1]", "" ) + kStr += inst("_v_sub_co_u32", "v[\\vTmp0]", self.vcc, "v[\\vQuotient]", "v[\\vRemainder]", "" ) + kStr += inst("_v_add_co_u32", "v[\\vQuotient]", self.vcc, "v[\\vQuotient]", "v[\\vRemainder]", "" ) + kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vQuotient]", "v[\\vTmp0]", sTmpStr, "" ) kStr += inst("v_mul_hi_u32", "v[\\vQuotient]", "v[\\vQuotient]", "v[\\vDividend]", "" ) kStr += inst("v_mul_lo_u32", "v[\\vRemainder]", "v[\\vQuotient]", "v[\\vDivisor]", "" ) - kStr += inst("_v_sub_co_u32", "v[\\vTmp0]", "vcc", "v[\\vDividend]", "v[\\vRemainder]", "" ) - kStr += inst("v_cmp_ge_u32", "s[\\sTmp:\\sTmp+1]", "v[\\vDividend]", "v[\\vRemainder]", "" ) - kStr += inst("_v_add_co_u32", "v[\\vRemainder]", "vcc", hex(1), "v[\\vQuotient]", "" ) - kStr += inst("_v_add_co_u32", "v[\\vTmp1]", "vcc", -1, "v[\\vQuotient]", "" ) - kStr += inst("v_cmp_le_u32", "vcc", "v[\\vDivisor]", "v[\\vTmp0]", "" ) - kStr += inst("s_and_b64", "vcc", "s[\\sTmp:\\sTmp+1]", "vcc", "" ) - kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vQuotient]", "v[\\vRemainder]", "vcc", "" ) - kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vTmp1]", "v[\\vQuotient]", "s[\\sTmp:\\sTmp+1]", "" ) - kStr += inst("v_cmp_ne_i32", "vcc", hex(0), "v[\\vDivisor]", "" ) - kStr += 
inst("v_cndmask_b32", "v[\\vQuotient]", -1, "v[\\vQuotient]", "vcc", "final result" ) + kStr += inst("_v_sub_co_u32", "v[\\vTmp0]", self.vcc, "v[\\vDividend]", "v[\\vRemainder]", "" ) + kStr += inst("v_cmp_ge_u32", sTmpStr, "v[\\vDividend]", "v[\\vRemainder]", "" ) + kStr += inst("_v_add_co_u32", "v[\\vRemainder]", self.vcc, hex(1), "v[\\vQuotient]", "" ) + kStr += inst("_v_add_co_u32", "v[\\vTmp1]", self.vcc, -1, "v[\\vQuotient]", "" ) + kStr += inst("v_cmp_le_u32", self.vcc, "v[\\vDivisor]", "v[\\vTmp0]", "" ) + kStr += inst("s_and_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sTmpStr, self.vcc, "" ) + kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vQuotient]", "v[\\vRemainder]", self.vcc, "" ) + kStr += inst("v_cndmask_b32", "v[\\vQuotient]", "v[\\vTmp1]", "v[\\vQuotient]", sTmpStr, "" ) + kStr += inst("v_cmp_ne_i32", self.vcc, hex(0), "v[\\vDivisor]", "" ) + kStr += inst("v_cndmask_b32", "v[\\vQuotient]", -1, "v[\\vQuotient]", self.vcc, "final result" ) kStr += inst("v_mul_lo_u32", "v[\\vRemainder]", "v[\\vQuotient]", "v[\\vDivisor]", "" ) - kStr += inst("_v_sub_co_u32", "v[\\vRemainder]", "vcc", "v[\\vDividend]", "v[\\vRemainder]", "final result" ) + kStr += inst("_v_sub_co_u32", "v[\\vRemainder]", self.vcc, "v[\\vDividend]", "v[\\vRemainder]", "final result" ) kStr += ".endm%s" % self.endLine if not kernel["EnableMatrixInstruction"]: @@ -3204,6 +3262,9 @@ def allocateResources(self, kernel): # set Serial id vpgr kStr += inst("v_mov_b32", vgpr("Serial"), vgpr(0), "thread serial id") + if self.kernel["WavefrontSize"] == 32: + kStr += inst("s_mov_b32", "vcc_hi", "0", "Ensure hi bits are zero") + ######################################## # load kernel args kStr += self.comment("Load Kernel Args") @@ -3298,26 +3359,26 @@ def allocateResources(self, kernel): kStr += self.comment("Short circuit condition if Alpha == 0, then sumDims=0") endCheckLabel = "label_AlphaNonZero" if kernel["ProblemType"]["ComputeDataType"].isDoubleComplex(): - kStr += 
inst("v_cmp_eq_f64", "vcc", sgpr("Alpha", 2), 0.0, "Alpha.real == 0.0 ?") + kStr += inst("v_cmp_eq_f64", self.vcc, sgpr("Alpha", 2), 0.0, "Alpha.real == 0.0 ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if Alpha.real != 0") - kStr += inst("v_cmp_eq_f64", "vcc", sgpr("Alpha+2", 2), 0.0, "Alpha.imag == 0.0 ?") + kStr += inst("v_cmp_eq_f64", self.vcc, sgpr("Alpha+2", 2), 0.0, "Alpha.imag == 0.0 ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if Alpha.imag != 0") elif kernel["ProblemType"]["ComputeDataType"].isDouble(): - kStr += inst("v_cmp_eq_f64", "vcc", sgpr("Alpha", 2), 0.0, "Alpha == 0.0 ?") + kStr += inst("v_cmp_eq_f64", self.vcc, sgpr("Alpha", 2), 0.0, "Alpha == 0.0 ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if Alpha != 0") elif kernel["ProblemType"]["ComputeDataType"].isSingleComplex(): - kStr += inst("v_cmp_eq_f32", "vcc", sgpr("Alpha"), 0.0, "Alpha.real == 0.0f ?") + kStr += inst("v_cmp_eq_f32", self.vcc, sgpr("Alpha"), 0.0, "Alpha.real == 0.0f ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if Alpha.real != 0") - kStr += inst("v_cmp_eq_f32", "vcc", sgpr("Alpha+1"), 0.0, "Alpha.imag == 0.0f ?") + kStr += inst("v_cmp_eq_f32", self.vcc, sgpr("Alpha+1"), 0.0, "Alpha.imag == 0.0f ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if Alpha.imag != 0") # AlphaType is f32 or two-concated-f16, or two-concated-bf16(not support) elif kernel["ProblemType"]["ComputeDataType"].isSingle() or \ kernel["ProblemType"]["ComputeDataType"].isHalf() or \ kernel["ProblemType"]["ComputeDataType"].isBFloat16(): - kStr += inst("v_cmp_eq_f32", "vcc", sgpr("Alpha"), 0.0, "Alpha == 0.0f ?") + kStr += inst("v_cmp_eq_f32", self.vcc, sgpr("Alpha"), 0.0, "Alpha == 0.0f ?") kStr += inst("s_cbranch_vccz %s" % (endCheckLabel), "branch if alpha != 0") # AlphaType is int32 @@ -3376,7 +3437,7 @@ def allocateResources(self, kernel): % (self.tileChar0, self.tileChar0, self.tileChar0, self.endLine) kStr += 
inst("s_mov_b32", sgpr(tmpSgpr), hex(kernel["MacroTile0"]-1), "MT0-1") kStr += inst("v_mov_b32", vgpr(tmpVgpr), sgpr(tmpSgpr), "MT0-1") - kStr += inst("_v_add_co_u32", vgpr(nwg0), "vcc", sgpr("SizesFree+0"), \ + kStr += inst("_v_add_co_u32", vgpr(nwg0), self.vcc, sgpr("SizesFree+0"), \ vgpr(tmpVgpr), "%s = size0+MT0-1"%vgpr(nwg0)) kStr += vectorStaticDivide(nwg0, nwg0, kernel["MacroTile0"], tmpVgpr, tmpSgpr) self.vgprPool.checkIn(tmpVgpr) @@ -3386,19 +3447,19 @@ def allocateResources(self, kernel): kStr += inst("v_mov_b32", vgpr(v+1), sgpr("WorkGroup1"), "%s=wg1"%vgpr(v+1) ) kStr += inst("v_mul_lo_u32", vgpr(v+1), vgpr(v+1), vgpr(nwg0), \ "%s=wg1*nwg0"%vgpr(v+1) ) - kStr += inst("_v_add_co_u32", vgpr(v), "vcc", vgpr(v), vgpr(v+1), \ + kStr += inst("_v_add_co_u32", vgpr(v), self.vcc, vgpr(v), vgpr(v+1), \ "%s=wg1*nwg0+wg0"%vgpr(v) ) kStr += staticMultiply(vgpr(v), vgpr(v), kernel["NumThreads"], sgpr(tmpSgpr)) - kStr += inst("_v_add_co_u32", vgpr(v), "vcc", vgpr(v), vgpr("Serial"), \ + kStr += inst("_v_add_co_u32", vgpr(v), self.vcc, vgpr(v), vgpr("Serial"), \ "%s=tid+NT*(wg1*nwg0+wg0)=serial"%vgpr(v) ) kStr += inst("v_mul_lo_u32", vgpr(v), hex(self.nipt*4), vgpr(v), \ "%s=serial*nipt*4"%vgpr(v) ) kStr += inst("v_mov_b32", vgpr(v+1), 0, "") - kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), "vcc", sgpr("AddressDbg"), \ + kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), self.vcc, sgpr("AddressDbg"), \ vgpr(v), "%s=AddrD* + serial*nipt*4"%vgpr("AddressDbg") ) kStr += inst("v_mov_b32", vgpr(v+2), sgpr("AddressDbg+1"), "%s=AddressD1"%vgpr(v+2) ) - kStr += inst("_v_addc_co_u32", vgpr("AddressDbg+1"), "vcc", vgpr(v+2), \ - vgpr(v+1), "vcc", "%s=AddrD* + serial*nipt*4"%vgpr("AddressDbg") ) + kStr += inst("_v_addc_co_u32", vgpr("AddressDbg+1"), self.vcc, vgpr(v+2), \ + vgpr(v+1), self.vcc, "%s=AddrD* + serial*nipt*4"%vgpr("AddressDbg") ) kStr += inst("s_mov_b32", sgpr("DebugKernelItems"), 0, "") self.vgprPool.checkIn(v) self.vgprPool.checkIn(nwg0) @@ -3450,6 +3511,51 @@ 
def sMagicDivAlg2(self, kernel, dest, dividend, magicNumber, magicShiftAbit): kStr += inst("s_lshr_b32", sgpr(dest), sgpr(dest), sgpr(tmpS), " sMagicDiv Alg 2") return kStr + def extractPackedCoord1ToRowStart(self, kernel, packedC1, packedCoordVgpr, storeChar): + # calculate packed rowStart vgpr + # vgprTmp assignments: + # - tmp+0 is the incoming packed coordinate 1, used on replay too + # - tmp+1 is DIV output + # - tmp+2 is scratch + # - tmp+3 holds thread rowStart free1 offset + kStr = "" + tmpV0 = self.vgprPool.checkOut(4) + tmpV1 = tmpV0 + 1 + tmpV2 = tmpV0 + 2 + tmpV3 = tmpV0 + 3 + + #assert(kernel["LdcEqualsLdd"]) + kStr += inst("v_mov_b32", vgpr(tmpV0), vgpr(packedCoordVgpr), "copy coord1 then unpack") + for i,idx in enumerate(packedC1[:-1]): + idxChar= globalParameters["IndexChars"][idx] + kStr += self.comment1("extract %s"%self.sizeRef(idx)) + kStr += "V_MAGIC_DIV %s, %s, %s, %s, %s\n" % \ + (tmpV1, vgpr(tmpV0), sgpr("MagicNumberSize%s"%idxChar), \ + sgpr("MagicShiftSize%s"%idxChar), sgpr("MagicAbitSize%s"%idxChar) if kernel["MagicDivAlg"]==2 else "0") + kStr += inst("v_mul_lo_u32", vgpr(tmpV2), vgpr(tmpV1), self.sizeRef(idx), "remainder part 1") + kStr += inst("_v_sub_u32", vgpr(tmpV2), vgpr(tmpV0), vgpr(tmpV2), "remainder part 2") + if i==0: + kStr += inst("v_mul_lo_u32", vgpr(tmpV3), vgpr(tmpV2), \ + self.strideRef(storeChar, idx), "addrCalc <- scaled extracted dim") + else: + kStr += inst("v_mul_lo_u32", vgpr(tmpV2), vgpr(tmpV2), \ + self.strideRef(storeChar, idx), "scale extracted dim") + kStr += inst("_v_add_u32", vgpr(tmpV3), vgpr(tmpV3), \ + vgpr(tmpV2), "addrCalc += scaled extracted dim ") + + if i < len(packedC1)-2: + kStr += inst("v_mov_b32", vgpr(tmpV0), vgpr(tmpV1), \ + "Copy remaining bits for next divide") + + kStr += self.comment1("extract final %s"%self.sizeRef(packedC1[-1])) + kStr += inst("v_mul_lo_u32", vgpr(tmpV2), vgpr(tmpV1), \ + self.strideRef(storeChar, packedC1[-1]), "scale final extracted dim") + kStr += inst("_v_add_u32", 
vgpr(self.coutRowPtr), vgpr(tmpV3), \ + vgpr(tmpV2), "rowStart += scaled extracted dim ") + + self.vgprPool.checkIn(tmpV0) + return kStr + ############################################################################## # Open Persistent Loop # init iteration counter, define loop target @@ -3546,7 +3652,7 @@ def graWorkGroup(self, kernel, isPap): % (self.tileChar1, self.tileChar1, self.tileChar1, self.endLine) kStr += inst("v_mov_b32", vgpr(nwg1), sgpr("SizesFree+1"), "") kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(kernel["MacroTile1"]-1), "") - kStr += inst("_v_add_co_u32", vgpr(nwg1), "vcc", sgpr(tmpSgpr), vgpr(nwg1), \ + kStr += inst("_v_add_co_u32", vgpr(nwg1), self.vcc, sgpr(tmpSgpr), vgpr(nwg1), \ "%s = size1+MT1-1"%vgpr(nwg1)) kStr += vectorStaticDivide(quotient, nwg1, kernel["MacroTile1"], tmpVgpr, tmpSgpr) self.vgprPool.checkIn(nwg1) @@ -3718,13 +3824,13 @@ def graTileAssignment(self, kernel, tP): if kernel["WaveSeparateGlobalRead%s"%tc]: dividendReg = self.vgprPool.checkOut(1, "idInWave", self.preventVgprOverflowDuringNewTile) dummy = self.vgprPool.checkOut(1, "dummy", self.preventVgprOverflowDuringNewTile) - kStr += vectorStaticRemainder(dummy, dividendReg, "Serial", globalParameters["WavefrontWidth"], tmpVgpr, tmpSgpr) + kStr += vectorStaticRemainder(dummy, dividendReg, "Serial", self.kernel["WavefrontSize"], tmpVgpr, tmpSgpr) kStr += vectorStaticDivideAndRemainder(qReg, rReg, dividendReg, divisor, tmpVgpr, tmpSgpr) if kernel["WaveSeparateGlobalRead%s"%tc]: kStr += inst("v_readfirstlane_b32", sgpr(tmpSgpr), vgpr("Serial"), "WaveIdxWavefrontWidth") - kStr += inst("s_lshr_b32", sgpr(tmpSgpr), sgpr(tmpSgpr), hex(log2(globalParameters["WavefrontWidth"])), "WaveId") + kStr += inst("s_lshr_b32", sgpr(tmpSgpr), sgpr(tmpSgpr), hex(log2(self.kernel["WavefrontSize"])), "WaveId") kStr += inst("s_mul_i32", sgpr(tmpSgpr), sgpr(tmpSgpr), kernel[tP["lsp"]] * tP["nrp"], \ "Global Read Wave: each wave loads continuous lsp(%u)*nrp(%u) columns" % (kernel[tP["lsp"]], 
tP["nrp"])) kStr += inst("_v_add_u32", vgpr(qReg), sgpr(tmpSgpr), vgpr(qReg), \ @@ -3744,7 +3850,7 @@ def graTileAssignment(self, kernel, tP): # Buffer Load will set the SRD to start of the MacroTile # So don't add the static wg-related component here - save for later. kStr += staticMultiply(vgpr(tmpVgpr), sgpr(tP["wg"]), kernel[tP["mt"]]) # workgroup - kStr += inst("_v_add_co_u32", vgpr(tReg2), "vcc", vgpr(tmpVgpr), \ + kStr += inst("_v_add_co_u32", vgpr(tReg2), self.vcc, vgpr(tmpVgpr), \ vgpr(tReg), "gro%s-tile = serial%s%s*VW + (wg%s*MT%s)" \ % (tc, tOpStr, divisorName, tc, tc) ) @@ -3790,7 +3896,7 @@ def graUnrollAssignment(self, kernel, tP): vgpr(gsuOffset), "gsuOffset=gsuSumIdx*(SizeU/GSU)") self.vgprPool.checkIn(quotient) - kStr += inst("_v_add_co_u32", vgpr(tP["gpr"]["uReg"]), "vcc", \ + kStr += inst("_v_add_co_u32", vgpr(tP["gpr"]["uReg"]), self.vcc, \ vgpr(gsuOffset), vgpr(tP["gpr"]["uReg"]), \ "graUnrollAssignment += gsuOffset") self.vgprPool.checkIn(gsuOffset) @@ -3852,16 +3958,16 @@ def graTileOffsets(self, kernel, tP): vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, 0) ) # l=0, s>0 for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ + kStr += inst("_v_add_co_u32", vgpr(v+s), self.vcc, 1, \ vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, s) ) for l in range(1, tP["nrt"]): # l>0, s=0 - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), self.vcc, stride, \ vgpr(v+(l-1)*tP["glvw"]), \ "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], tP["tileChar"], l, 0, strideIdx) ) # l>0, s>0 for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), self.vcc, \ 1, vgpr(v+l*tP["glvw"]+(s-1)), \ "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], l, s) ) @@ -3869,7 +3975,7 @@ def graTileOffsets(self, kernel, tP): kStr += 
inst("v_mov_b32", vgpr(v), \ vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u"%(tP["tensorChar"], tP["tileChar"], 0) ) for l in range(1, tP["nrt"]): - kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ + kStr += inst("_v_add_co_u32", vgpr(v+l), self.vcc, stride, \ vgpr(v+l-1), "gro%s%s_%u += %s"%(tP["tensorChar"], tP["tileChar"], l, strideIdx) ) if numExtraPackedOffsetsPerTile: tmpV = self.vgprPool.checkOutAligned(2,2,"packTmp", self.preventVgprOverflowDuringNewTile) @@ -3888,7 +3994,7 @@ def graTileOffsets(self, kernel, tP): sgpr("MagicShiftSize%s"%pChar), sgpr("MagicAbitSize%s"%pChar) if kernel["MagicDivAlg"]==2 else "0") kStr += inst("v_mov_b32", groVgpr, vgpr(tmpV), "extract gro%s%s_%u (%s)"%(tc,groChar,l,groVgpr)) kStr += inst("v_mul_lo_u32", vgpr(tmpV), groVgpr, sgpr("SizesFree+%u"%lastGroIdx), "remainder part 1") - kStr += inst("v_sub_u32", lastGroVgpr, lastGroVgpr, vgpr(tmpV), \ + kStr += inst("_v_sub_u32", lastGroVgpr, lastGroVgpr, vgpr(tmpV), \ "remove extracted bits from gro%s%s_%u (%s)"%(tc, globalParameters["IndexChars"][lastGroIdx], l, lastGroVgpr)) lastGroVgpr = groVgpr lastGroIdx = groIdx @@ -3922,23 +4028,23 @@ def graUnrollOffsets(self, kernel, tP): vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, 0) ) # l=0, s>0 for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ + kStr += inst("_v_add_co_u32", vgpr(v+s), self.vcc, 1, \ vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) for l in range(1, tP["nru"]): # l>0, s=0 - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), self.vcc, stride, \ vgpr(v+(l-1)*tP["glvw"]), \ "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], self.unrollChar, l, 0, strideIdx) ) # l>0, s>0 for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), self.vcc, \ 1, vgpr(v+l*tP["glvw"]+(s-1)), \ 
"gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) else: kStr += inst("v_mov_b32", vgpr(v), \ vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u"%(tP["tensorChar"], self.unrollChar, 0) ) for l in range(1, tP["nru"]): - kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ + kStr += inst("_v_add_co_u32", vgpr(v+l), self.vcc, stride, \ vgpr(v+l-1), "gro%s%s_%u + %s"%(tP["tensorChar"], self.unrollChar, l, strideIdx) ) #self.vgprPool.checkIn(tP["gpr"]["uReg"]) return "" if self.dontAppendCode else kStr @@ -3986,8 +4092,8 @@ def graShift(self, kernel, tP): kStr += inst("v_mov_b32", vgpr(edge), sgpr(tmpSgpr), \ "edge vgpr = Size%s- WG*MT - margin(%u)"%(tP["tileChar"], margin) ) shiftedEdge = self.vgprPool.checkOut(1, "shiftedEdge", self.preventVgprOverflowDuringNewTile) - kStr += inst("_v_add_co_u32", vgpr(shiftedEdge), "vcc", vgpr(edge), self.srdShiftLeft[tc], \ - "shiftedEdge = edge + srdShiftLeft(%u)"%(self.srdShiftLeft[tc])) + kStr += inst("_v_add_co_u32", vgpr(shiftedEdge), self.vcc, vgpr(edge), self.srdShiftLeft[tc], + "shiftedEdge = edge + srdShiftLeft({})".format(self.srdShiftLeft[tc])) else: tmpSgpr = self.getTmpSgpr(1).idx() kStr += inst("s_sub_u32", sgpr(tmpSgpr), self.sizeRef(tP["idx"]), margin, \ @@ -4003,23 +4109,23 @@ def graShift(self, kernel, tP): # shift offsets v = tP["vgprTileOffsets"] - tmpSgpr = self.getTmpSgpr(2).idx() + tmpSgpr = self.getTmpSgpr(self.laneSGPRCount).idx() for l in range(0, tP["nrt"]): # compare cmpCommentText = "offset < edge" if self.groOffsetInMacroTile: shiftedOffset = self.vgprPool.checkOut(1, "shiftedOffset", self.preventVgprOverflowDuringNewTile) - kStr += inst("_v_add_co_u32", vgpr(shiftedOffset), "vcc", vgpr(v+l), self.srdShiftLeft[tc], \ - "shiftedOffset = offset + srdShiftLeft(%u)"%(self.srdShiftLeft[tc])) + kStr += inst("_v_add_co_u32", vgpr(shiftedOffset), self.vcc, vgpr(v+l), self.srdShiftLeft[tc], "shiftedOffset = offset + srdShiftLeft(%u)"%(self.srdShiftLeft[tc])) # int cmp since if we are near the front of the tile 
this may go negative: - cmpCommentText = "shiftedOffset < shiftedEdge" - kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,2), vgpr(shiftedOffset), vgpr(shiftedEdge), cmpCommentText ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,self.laneSGPRCount), vgpr(shiftedOffset), vgpr(shiftedEdge), + "shiftedOffset < shiftedEdge") self.vgprPool.checkIn(shiftedOffset) else: - kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,2), vgpr(v+l), vgpr(edge), cmpCommentText ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,self.laneSGPRCount), vgpr(v+l), vgpr(edge), + "shiftedOffset < shiftedEdge") # shift - kStr += inst("v_cndmask_b32", vgpr(v+l), vgpr(edge), vgpr(v+l), sgpr(tmpSgpr,2), \ - "offset = (%s) ? offset(v%u) : edge(v%u)"%(cmpCommentText, v+l, edge) ) + kStr += inst("v_cndmask_b32", vgpr(v+l), vgpr(edge), vgpr(v+l), sgpr(tmpSgpr,self.laneSGPRCount), + "offset = (%s) ? offset(v%u) : edge(v%u)"%(cmpCommentText, v+l, edge)) self.vgprPool.checkIn(edge) if self.groOffsetInMacroTile: self.vgprPool.checkIn(shiftedEdge) @@ -4101,13 +4207,13 @@ def graFinalOffsets(self, kernel, tP): # add room for instruction offset groVgpr = "GlobalReadOffset%s+%u" % (tP["tensorChar"], graIdx) kStr += inst("s_mov_b32", sgpr(tmpSgpr), self.buff_load_inst_offset_max, "" ) - kStr += inst("v_add_u32", vgpr(groVgpr), vgpr(groVgpr), sgpr(tmpSgpr), "shift for UseInstOffsetForGRO") + kStr += inst("_v_add_u32", vgpr(groVgpr), vgpr(groVgpr), sgpr(tmpSgpr), "shift for UseInstOffsetForGRO") - ldsInc = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + ldsInc = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr if kernel["LdsBlockSizePerPad%s"%tc] != 0: ldsInc += (ldsInc // kernel["LdsBlockSizePerPad%s"%tc]) * kernel["LdsPad%s"%tc] * tP["bpe"] else: - padInterval = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + padInterval = 
(self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr ldsInc += (ldsInc // padInterval) * kernel["LdsPad%s"%tc] * tP["bpe"] # buffer_load only support 12 bit instruction offset @@ -4117,7 +4223,7 @@ def graFinalOffsets(self, kernel, tP): ldsInc = (ldsInc * graIdx) % self.buff_load_inst_offset_max if (ldsInc != 0): kStr += inst("s_mov_b32", sgpr(tmpSgpr), ldsInc, "" ) - kStr += inst("v_sub_u32", vgpr(groVgpr), vgpr(groVgpr), sgpr(tmpSgpr), "sub offset for buffer_load instoffset") + kStr += inst("_v_sub_u32", vgpr(groVgpr), vgpr(groVgpr), sgpr(tmpSgpr), "sub offset for buffer_load instoffset") for zpr in [zpr for zpr in self.zeroPadRegs[tc].values() if zpr.isMatch(perp, sPerp, para, sPara)]: assert(zpr.state == ZeroPadReg.State.Allocated) # only calc address once @@ -4134,12 +4240,12 @@ def graFinalOffsets(self, kernel, tP): "zp.freeDim * strideFree") vgprOffset = vgpr(iaToGpr[sumDim]) if vgpr(iaToGpr[sumDim]) else 0 if sumDim in kernel["ProblemType"]["MirrorDims%s"%tc]: - kStr += inst("v_sub_u32", \ + kStr += inst("_v_sub_u32", \ vgpr(tmp), \ sgpr("Size%s"%sumDimChar), \ vgprOffset, \ "zp.sumDim mirror 1") - kStr += inst("v_sub_u32", \ + kStr += inst("_v_sub_u32", \ vgpr(tmp), \ vgpr(tmp), \ "1", \ @@ -4161,7 +4267,7 @@ def graFinalOffsets(self, kernel, tP): "Bpe%sLog2"%tc, \ vgpr(zpr.regName), \ "scale to bpe") - kStr += inst("v_sub_u32", + kStr += inst("_v_sub_u32", vgpr(zpr.regName), \ vgpr(zpr.regName), \ sgpr("PadStart%s%s%s"%(tc, freeDimChar, sumDimChar)), \ @@ -4174,7 +4280,7 @@ def graFinalOffsets(self, kernel, tP): % (lastValidThread, kernel["NumThreads"]) kStr += inst("s_mov_b32", sgpr(tmpSgpr), lastValidThread, "" ) kStr += inst("v_cmp_lt_u32", \ - "vcc", \ + self.vcc, \ vgpr("Serial"), \ sgpr(tmpSgpr), \ "tid < valid-tid") @@ -4185,7 +4291,7 @@ def graFinalOffsets(self, kernel, tP): vgpr("GlobalReadOffset%s+%u"%(tP["tensorChar"], graIdx)), \ vgpr(boundsVgpr), \ 
vgpr("GlobalReadOffset%s+%u"%(tP["tensorChar"], graIdx)), \ - "vcc", + self.vcc, "Mask load so OOB will return 0") self.vgprPool.checkIn(boundsVgpr) @@ -4234,11 +4340,11 @@ def graFinalOffsets(self, kernel, tP): # add room for instruction offset kStr += inst("s_add_u32", sgpr(scalarGro), sgpr(scalarGro), self.buff_load_inst_offset_max, "shift for UseInstOffsetForGRO") - ldsInc = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + ldsInc = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr if kernel["LdsBlockSizePerPad%s"%tc] != 0: ldsInc += (ldsInc // kernel["LdsBlockSizePerPad%s"%tc]) * kernel["LdsPad%s"%tc] * tP["bpe"] else: - padInterval = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + padInterval = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr ldsInc += (ldsInc // padInterval) * kernel["LdsPad%s"%tc] * tP["bpe"] # buffer_load only support 12 bit instruction offset @@ -4530,16 +4636,16 @@ def graAddresses(self, kernel, tP): para, sPara, perp, sPerp ) kStr += inst("_v_add_co_u32", \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ vgpr(tmp+0), \ comment+" (lower)") kStr += inst("_v_addc_co_u32", \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ vgpr(tmp+1), \ - "vcc", \ + self.vcc, \ comment+" (upper)") #kStr += dump(vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx))) #kStr += dump(vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx))) @@ -4785,7 +4891,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): "padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) kStr += 
staticMultiply(vgpr(uReg), vgpr(uReg), kernel["LdsPad%s"%tc] * tP["bpe"], sgpr(tmpSgpr), \ "padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) - kStr += inst("v_add_u32", vgpr(destVgpr), vgpr(uReg), vgpr(destVgpr), \ + kStr += inst("_v_add_u32", vgpr(destVgpr), vgpr(uReg), vgpr(destVgpr), \ "add padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) self.vgprPool.checkIn(tmpVgpr) else: @@ -4828,7 +4934,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): "lw%s%s**(MT%s + PAD)"%(tP["tensorChar"], self.unrollChar, tP["tensorChar"])) kStr += inst("_v_add_co_u32", \ vgpr(uReg), \ - "vcc", \ + self.vcc, \ vgpr(uRegScrap), \ vgpr(uReg), \ "add scraps from LDL masking") @@ -4844,7 +4950,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): if tP["isB"]: kStr += inst("_v_add_co_u32", \ vgpr(destVgpr), \ - "vcc", \ + self.vcc, \ hex(kernel["LdsOffsetB"]*tP["bpe"]), \ vgpr(destVgpr), \ "lwFOB = lwB%s + lwB%s*MT%s + LDS_OFFSET_B=%u*%u" % (tP["tileChar"], \ @@ -4862,7 +4968,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): maxBytesPerLoad = kernel["NumThreads"] * tP["glvw"] * numBytesPerElement if kernel["WaveSeparateGlobalRead%s"%tc]: - validBytesPerLoad *= (kernel["NumThreads"] // globalParameters["WavefrontWidth"]) + validBytesPerLoad *= (kernel["NumThreads"] // self.kernel["WavefrontSize"]) assert (validBytesPerLoad <= maxBytesPerLoad) assert (kernel[tP["lsc"]] * kernel[tP["lsp"]] % tP["glvw"] == 0) @@ -4872,7 +4978,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): kStr += inst("s_mov_b32", sgpr(tmpSgpr), validWIPerLoad, \ "lsc*lsp=%u*%u"%(kernel[tP["lsc"]],kernel[tP["lsp"]] )) kStr += inst("v_cmp_lt_u32", \ - "vcc", \ + self.vcc, \ vgpr("Serial"), \ sgpr(tmpSgpr), \ "fractional: ensure tid < global read tile elements") @@ -4882,7 +4988,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): vgpr(destVgpr), \ vgpr(tmpVgpr), \ vgpr(destVgpr), \ - "vcc", \ + self.vcc, \ "Mask load so out-of-gr-tile bounds returns 0") 
self.vgprPool.checkIn(tmpVgpr) @@ -4891,7 +4997,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): # only for TN tensor + TN lds layout assert tP["tlu"] == 0 - kStr += inst("v_cmp_eq_u32","vcc", vgpr(tP["gpr"]["subIterReg"]), uDu, "if sub_g2l_idx == %u ?"%uDu) + kStr += inst("v_cmp_eq_u32",self.vcc, vgpr(tP["gpr"]["subIterReg"]), uDu, "if sub_g2l_idx == %u ?"%uDu) ldsOOB = self.vgprPool.checkOut(1, "lds OOB addr", self.preventVgprOverflowDuringNewTile) kStr += inst("v_mov_b32", vgpr(ldsOOB), hex(self.LdsOOB), "lds OOB address") @@ -4899,7 +5005,7 @@ def lwaFirstOffset(self, kernel, tP, uDu=0): vgpr(destVgpr), \ vgpr(ldsOOB), \ vgpr(destVgpr), \ - "vcc", \ + self.vcc, \ "Mask threads not belonging to current sub_g2l_idx by assigning OOB") self.vgprPool.checkIn(ldsOOB) @@ -4963,12 +5069,17 @@ def lwaDeclareAddresses(self, kernel, tP): ############################################################################## # Local Read Addresses: Tile Assignment ############################################################################## - def lraTileAssignment(self, kernel, tP): + def lraTileAssignment(self, kernel, tPA, tPB): kStr = "" component = Component.LraTileAssignment.find(self) + + tP0 = tPA if tPB["tile01Idx"] else tPB + tP1 = tPB if tPB["tile01Idx"] else tPA + if component: - kStr += component(self, kernel, tP) + kStr += component(self, kernel, tP0) + kStr += component(self, kernel, tP1) return kStr @@ -4986,25 +5097,25 @@ def lraFinalOffset(self, kernel, tP): # constant tc = tP["tensorChar"] - tIdx = tP["tensorIdx"] + tile01 = tP["tile01Idx"] LdsPad = kernel["LdsPad%s" % tc] if kernel["LdsBlockSizePerPad%s" % tc] == 0 else 0 divisor = kernel["SubGroup0"] * kernel["SubGroup1"] - mtAddPad = kernel["MacroTile%u" % tIdx] + LdsPad + mtAddPad = kernel["MacroTile%u" % tile01] + LdsPad # generate instruction kStr += vectorStaticDivide(sgid, "Serial", divisor, tmpVgpr, tmpSgpr, \ "LSU offset: sgid = Serial / subGroup(%u)" % divisor) kStr += inst("s_mov_b32", sgpr(tmpSgpr), 
mtAddPad, \ - "LSU offset: stirde = MT%u(%u) + PAD%u(%u)" % (tIdx, kernel["MacroTile%u" % tIdx], tIdx, LdsPad)) + "LSU offset: stirde = MT%u(%u) + PAD%u(%u)" % (tile01, kernel["MacroTile%u" % tile01], tile01, LdsPad)) kStr += inst("v_mul_lo_u32", vgpr(sgid), sgpr(tmpSgpr), vgpr(sgid), \ - "LSU offset: lsuoffset = sgid*(MT%u+PAD)"%tIdx) + "LSU offset: lsuoffset = sgid*(MT%u+PAD)"%tile01) if not kernel["EnableMatrixInstruction"] and kernel["VectorWidth"] > 1: kStr += staticMultiply(vgpr(tP["gpr"]["lro"]), vgpr(tP["gpr"]["lro"]), kernel["VectorWidth"], sgpr(tmpSgpr), \ "Final Offset: lr%sOffset * VW" % tc) # final offset kStr += inst("_v_add_lshl_u32", vgpr("LocalReadAddr%s"%tc), vgpr(sgid), vgpr(tP["gpr"]["lro"]), hex(log2(tP["bpe"])), \ - "Final Offset: offset = (lro%s*VW+lsuoffset)*bpe" % tIdx ) + "Final Offset: offset = (lro%s*VW+lsuoffset)*bpe" % tile01 ) # LdsBlockSizePerPad: add padding if kernel["LdsBlockSizePerPad%s"%tc] != 0 and kernel["LdsPad%s"%tc] !=0: @@ -5012,7 +5123,7 @@ def lraFinalOffset(self, kernel, tP): "Final Offset: padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) kStr += staticMultiply(vgpr(rReg), vgpr(rReg), kernel["LdsPad%s"%tc] * tP["bpe"], sgpr(tmpSgpr), \ "Final Offset: padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) - kStr += inst("v_add_u32", vgpr("LocalReadAddr%s"%tc), vgpr(rReg), vgpr("LocalReadAddr%s"%tc), \ + kStr += inst("_v_add_u32", vgpr("LocalReadAddr%s"%tc), vgpr(rReg), vgpr("LocalReadAddr%s"%tc), \ "Final Offset: add padding %u per block %u" % (kernel["LdsPad%s"%tc], kernel["LdsBlockSizePerPad%s"%tc])) # release resources @@ -5033,7 +5144,7 @@ def lraDeclareAddresses(self, kernel, tP): else: return inst("_v_add_co_u32", \ vgpr("LocalReadAddr%s+0"%tP["tensorChar"]), \ - "vcc", \ + self.vcc, \ hex(kernel["LdsOffset%s"%tP["tensorChar"]]*tP["bpe"]), \ vgpr("LocalReadAddr%s+0"%tP["tensorChar"]), \ " += LdsOffset%s (lower)"%tP["tensorChar"]) @@ -5601,7 
+5712,7 @@ def openLoop(self, kernel, loopIdx, uDu=None): self.vgprPool.checkIn(dummy) #kStr += dump(vgpr(sgId)) #kStr += dump(vgpr(numIter)) - kStr += inst("_v_cmpx_lt_u32", "vcc", \ + kStr += inst("_v_cmpx_lt_u32", self.vcc, \ vgpr(sgId), vgpr(numIter), "sgId < numIter") self.vgprPool.checkIn(tmpVgpr) #self.tailNumIter = numIter @@ -5613,9 +5724,9 @@ def openLoop(self, kernel, loopIdx, uDu=None): # LSU mask for this iteration if kernel["LocalSplitU"] > 1: - kStr += inst("_v_cmpx_lt_u32", "vcc", \ + kStr += inst("_v_cmpx_lt_u32", self.vcc, \ vgpr(sgId), vgpr(numIter), "sgId < numIter") - kStr += inst("_v_add_co_u32", vgpr(sgId), "vcc", hex(kernel["LocalSplitU"]), \ + kStr += inst("_v_add_co_u32", vgpr(sgId), self.vcc, hex(kernel["LocalSplitU"]), \ vgpr(sgId), "sgId+=LSU") self.vgprPool.checkIn(sgId) self.vgprPool.checkIn(numIter) @@ -5625,14 +5736,26 @@ def openLoop(self, kernel, loopIdx, uDu=None): if loopIdx == self.unrollIdx: if kernel["PrefetchGlobalRead"] == 2: - kStr += inst("s_cmp_eq_u32", \ - loopCounter, \ - hex(endCounter-1), \ - "LoopCounter%s < EndCounter"%(loopChar) ) + if not self.unrollIncIsDepthU: + kStr += inst("s_cmp_eq_u32", \ + loopCounter, \ + hex(endCounter-1), \ + "LoopCounter%s < EndCounter"%(loopChar) ) + else: + kStr += inst("s_cmp_ge_u32", \ + loopCounter, \ + sgpr("UnrollLoopLastIter"), \ + "LoopCounter%s > EndCounter"%(loopChar) ) toPGR1 = self.getLabelNum("toPGR1") kStr += inst("s_cbranch_scc1 label_%04u"%toPGR1, "PGR=2 but only 1 loop, toPGR1") if self.unrollIncIsDepthU: + if kernel["PrefetchGlobalRead"] == 2: + tmpSgpr = self.getTmpSgpr(1).idx() + kStr += inst("s_add_u32", sgpr(tmpSgpr),\ + loopCounter, \ + "DepthU", "") + loopCounter = sgpr(tmpSgpr) kStr += inst("s_cmp_ge_u32", \ loopCounter, \ sgpr("UnrollLoopLastIter"), \ @@ -5682,8 +5805,7 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False tailLoop = loopIdx < 0 if tailLoop: loopIdx = self.unrollIdx - loopChar = self.indexChars[ \ - 
kernel["ProblemType"]["IndicesSummation"][loopIdx]] + loopChar = self.indexChars[kernel["ProblemType"]["IndicesSummation"][loopIdx]] loopLabelBegin = self.getNamedLabel("TailLoopBegin%s%s"%(loopChar, "_G2L%s"%uDu if uDu is not None else "") ) loopLabelEnd = self.getNamedLabel("TailLoopEnd%s%s"%(loopChar, "_G2L%s"%uDu if uDu is not None else "") ) loopLabelEndOddExit = self.getNamedLabel("TailLoopEnd%s_oddexit"%(loopChar) ) @@ -5699,6 +5821,8 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False KinInnerUnroll *= kernel["MatrixInstK"] if kernel["AssertSummationElementMultiple"] % KinInnerUnroll == 0: unrollInc *= kernel["InnerUnroll"] + elif (kernel["LocalDotLayout"] == 2) and (kernel["InnerUnroll"] == 2): + unrollInc *= kernel["InnerUnroll"] kStr += self.comment("closeLoop loop%s finalLoop=%d tailLoop=%d" % (loopChar, finalLoop, tailLoop)) @@ -5733,10 +5857,20 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False if self.unrollIncIsDepthU and loopIdx==self.unrollIdx: assert (not kernel["SuppressNoLoadLoop"]) # not accounting for end-of-loop iteration change here in deprecated mode - kStr += inst("s_cmp_ge_u32", \ - loopCounter, \ - sgpr("UnrollLoopLastIter"), \ - "counter%s==0"%(loopChar) ) + if kernel["PrefetchGlobalRead"] == 2: + tmpSgpr = self.getTmpSgpr(1).idx() + kStr += inst("s_add_u32", sgpr(tmpSgpr),\ + loopCounter, \ + "DepthU", "") + kStr += inst("s_cmp_ge_u32", \ + sgpr(tmpSgpr), \ + sgpr("UnrollLoopLastIter"), \ + "LoopCounter%s + DU < EndCounter. 
Go to PGR1"%(loopChar) ) + else: + kStr += inst("s_cmp_ge_u32", \ + loopCounter, \ + sgpr("UnrollLoopLastIter"), \ + "counter%s==0"%(loopChar) ) else: kStr += inst("s_sub_u32", \ loopCounter, loopCounter, \ @@ -5814,7 +5948,7 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False for tP in [self.tPA, self.tPB]: tc = tP["tensorChar"] LdsPad = kernel["LdsPad%s" % tc] if kernel["LdsBlockSizePerPad%s"%tc] == 0 else 0 - inc = kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]]+LdsPad)*tP["bpe"] + inc = kernel["LocalSplitU"]*(kernel["MacroTile%s"%tc]+LdsPad)*tP["bpe"] # aligned with localReadInc if kernel["EnableMatrixInstruction"]: @@ -5825,7 +5959,7 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False kStr += inst("s_mov_b32", sgpr(stmp), inc, "tailloop lds offset") kStr += inst("s_mul_i32", sgpr(stmp), sgpr("OrigLoopCounter"), sgpr(stmp), "scale by mul") - kStr += inst("v_sub_u32", vgpr("LocalReadAddr%s"%tc), vgpr("LocalReadAddr%s"%tc), sgpr(stmp), "remove lro damage") + kStr += inst("_v_sub_u32", vgpr("LocalReadAddr%s"%tc), vgpr("LocalReadAddr%s"%tc), sgpr(stmp), "remove lro damage") # if LWA is backed-up before, we simply restore the addr if self.oriLwaA != None: kStr += inst("v_mov_b32", vgpr("LocalWriteAddrA"), vgpr(self.oriLwaA), "restore LWA") @@ -5837,11 +5971,13 @@ def closeLoop(self, kernel, loopIdx, finalLoop, uDu=None, emitEndLabelOnly=False # restore all threads if tailLoop and kernel["LocalSplitU"] > 1: + sgprCnt = self.laneSGPRCount + waveSize = kernel["WavefrontSize"] kStr += self.comment("restore full exec mask") - fullExec = self.getTmpSgpr(2).idx() - kStr += inst("s_mov_b64", sgpr(fullExec,2), \ - "0xFFFFFFFFFFFFFFFF", "restore all threads active") - kStr += inst("s_or_saveexec_b64", sgpr(fullExec,2), sgpr(fullExec,2), "full mask -> exec" ) + fullExec = self.getTmpSgpr(sgprCnt).idx() + activeMask = "0xFFFFFFFF" if (waveSize == 32) else "0xFFFFFFFFFFFFFFFF" + kStr += 
inst("s_mov_b{}".format(waveSize), sgpr(fullExec,sgprCnt), activeMask, "restore all threads active") + kStr += inst("s_or_saveexec_b{}".format(waveSize), sgpr(fullExec,sgprCnt), sgpr(fullExec,sgprCnt), "full mask -> exec" ) return kStr ############################################################################## @@ -5857,10 +5993,10 @@ def openLoopCopy(self, kernel, lc): ############################################################################## # End Summation ############################################################################## - def endSummation(self, kernel): + def endSummation(self, kernel, label = None): kStr = "" - kStr += "%s:\n" % self.getNamedLabelUnique("Summation_End") + kStr += "%s:\n" % (self.getNamedLabelUnique("Summation_End") if label is None else label) if self.overlapVgprC: # After summation loop, valuC is due for Acc->Arch read and is thus locked out. @@ -5956,7 +6092,7 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): numRegistersOut = 2 if kernel["ProblemType"]["DataType"].isDouble() else 1 loopCounterName = self.loopCounterName(kernel, self.unrollIdx) accs_per_wave = kernel["MatrixInstM"] * kernel["MatrixInstN"] * kernel["MatrixInstB"] \ - / globalParameters["WavefrontWidth"] * numRegistersOut + / self.kernel["WavefrontSize"] * numRegistersOut dividerFortidInK = kernel["MatrixInstN"] * kernel["MatrixInstB"] numMIInput = kernel["MIInputPerThread"] miInTypeName = kernel["ProblemType"]["DataType"].toNameAbbrev() # v_mfma_[...xK] @@ -5995,18 +6131,18 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): if tail and kernel["MatrixInstK"] > 1: kReg = self.vgprPool.checkOut(1,"kReg") # remainder tmpSgpr = self.getTmpSgpr(3).idx() - shiftK.addCode(vectorStaticRemainder(dummy, kReg, "Serial", globalParameters["WavefrontWidth"], tmpVgpr, tmpSgpr)) + shiftK.addCode(vectorStaticRemainder(dummy, kReg, "Serial", self.kernel["WavefrontSize"], tmpVgpr, tmpSgpr)) shiftK.addCode(vectorStaticDivide(kReg, kReg, dividerFortidInK, 
tmpVgpr, tmpSgpr)) shiftK.addCode(staticMultiply(vgpr(kReg), vgpr(kReg), numMIInput, sgpr(tmpSgpr))) # replace 0 for differnet thread shiftK.addCode(inst("v_cmp_ge_i32", sgpr(tmpSgpr, 2), vgpr(kReg), sgpr(loopCounterName), "check K index >= Size L")) for bk in range(0, vgprPerInput): - for a in range(0, kernel["MIWaveTile"][0]): + for a in range(0, kernel["MIWaveTileA"]): for iui in range(0, innerUnroll): aStr = vgpr("ValuA_X%u_I%u+%u+%u" % (m, iui, a*vgprPerInput, bk), 1) shiftK.addCode(inst("v_cndmask_b32", aStr, aStr, hex(0), sgpr(tmpSgpr, 2), "set 0 if K_idx >= sizeL")) - for b in range(0, kernel["MIWaveTile"][1]): + for b in range(0, kernel["MIWaveTileB"]): for iui in range(0, innerUnroll): bStr = vgpr("ValuB_X%u_I%u+%u+%u" % (m, iui, b*vgprPerInput, bk), 1) shiftK.addCode(inst("v_cndmask_b32", bStr, bStr, hex(0), sgpr(tmpSgpr, 2), "set 0 if K_idx >= sizeL")) @@ -6016,12 +6152,12 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): abReg = self.vgprPool.checkOutAligned(vgprPerInput, 2 if vgprPerInput>1 else 1, "abReg") tmpVgpr = self.vgprPool.checkOutAligned(2,2,"tmpVgpr") dummy = self.vgprPool.checkOut(1,"dummy") - shiftK.addCode(inst("v_sub_u32", vgpr(kReg), sgpr(loopCounterName), vgpr(kReg), "get distance between size and k index")) + shiftK.addCode(inst("_v_sub_u32", vgpr(kReg), sgpr(loopCounterName), vgpr(kReg), "get distance between size and k index")) shiftK.addCode(inst("v_cmp_lt_i32", sgpr(tmpSgpr,2), vgpr(kReg), numMIInput, "set partial 0 if distance less than input per thread")) shiftK.addCode(inst("s_and_b32", sgpr(tmpSgpr+2), sgpr(loopCounterName), numMIInput-1, "get inputs for edge thread")) shiftK.addCode(inst("s_sub_u32", sgpr(tmpSgpr+2), numMIInput, sgpr(tmpSgpr+2), "use shift to fill 0 for outside element")) shiftK.addCode(inst("s_lshl_b32", sgpr(tmpSgpr+2), sgpr(tmpSgpr+2), log2(shiftPerElement), "use shift to fill 0 for outside element")) - for a in range(0, kernel["MIWaveTile"][0]): + for a in range(0, kernel["MIWaveTileA"]): for iui 
in range(0, innerUnroll): iuiA_new = (iui//self.numReadsIterCoalescedA)*self.numReadsIterCoalescedA iuiA_new_offset = iui%self.numReadsIterCoalescedA*vgprPerInput @@ -6031,7 +6167,7 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): for bk in range(0, vgprPerInput): aStr = vgpr("ValuA_X%u_I%u+%u+%u+%u+%u" % (vgprBufferA_new, iuiA_new, a_new, vgprBufferA_new_offset, iuiA_new_offset, bk), 1) shiftK.addCode(inst("v_cndmask_b32", aStr, aStr, vgpr(abReg+bk), sgpr(tmpSgpr, 2), "")) - for b in range(0, kernel["MIWaveTile"][1]): + for b in range(0, kernel["MIWaveTileB"]): for iui in range(0, innerUnroll): iuiB_new = (iui//self.numReadsIterCoalescedB)*self.numReadsIterCoalescedB iuiB_new_offset = iui%self.numReadsIterCoalescedB*vgprPerInput @@ -6054,15 +6190,19 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): iuiA_new_offset = iui%self.numReadsIterCoalescedA*vgprPerInput iuiB_new = (iui//self.numReadsIterCoalescedB)*self.numReadsIterCoalescedB iuiB_new_offset = iui%self.numReadsIterCoalescedB*vgprPerInput - for b in range(0, kernel["MIWaveTile"][1]): - for a in range(0, kernel["MIWaveTile"][0]): - accIdx = b * kernel["MIWaveTile"][0] + a + for idx1 in range(0, kernel["MIWaveTile"][1]): + for idx0 in range(0, kernel["MIWaveTile"][0]): + accIdx = idx1 * kernel["MIWaveTile"][0] + idx0 accStart = accIdx * accs_per_wave accEnd = accStart + accs_per_wave - 1 - a_new = a*vgprPerInput*self.numReadsIterCoalescedA - b_new = b*vgprPerInput*self.numReadsIterCoalescedB + idxA = idx0 if self.tPB["tile01Idx"] else idx1 + idxB = idx1 if self.tPB["tile01Idx"] else idx0 + a_new = idxA*vgprPerInput*self.numReadsIterCoalescedA + b_new = idxB*vgprPerInput*self.numReadsIterCoalescedB aStr = vgpr("ValuA_X%u_I%u+%u+%u+%u" % (vgprBufferA_new, iuiA_new, a_new, vgprBufferA_new_offset, iuiA_new_offset), vgprPerInput) bStr = vgpr("ValuB_X%u_I%u+%u+%u+%u" % (vgprBufferB_new, iuiB_new, b_new, vgprBufferB_new_offset, iuiB_new_offset), vgprPerInput) + Str0 = aStr if self.tPB["tile01Idx"] 
else bStr + Str1 = bStr if self.tPB["tile01Idx"] else aStr if kernel["ProblemType"]["DataType"].isSingleComplex(): # override because complex mul is emulated by 4 mfma insts # TODO: adopt component system @@ -6095,9 +6235,14 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): for v in ccVgprs: if v is not None: self.vgprPool.checkIn(v) else: - imod.addCode("v_mfma_%s_%ux%ux%u%s%s %s[%u:%u], %s, %s, %s[%u:%u]%s" \ - % (miOutTypeName, kernel["MatrixInstM"], kernel["MatrixInstN"], kernel["MatrixInstK"], miInTypeName, - mfma_1k, accumRegType, accStart, accEnd, aStr, bStr, accumRegType, accStart, accEnd, self.endLine)) + if kernel["SourceSwap"]: + imod.addCode("v_mfma_%s_%ux%ux%u%s%s %s[%u:%u], %s, %s, %s[%u:%u]%s" \ + % (miOutTypeName, kernel["MatrixInstM"], kernel["MatrixInstN"], kernel["MatrixInstK"], miInTypeName, + mfma_1k, accumRegType, accStart, accEnd, Str1, Str0, accumRegType, accStart, accEnd, self.endLine)) + else: + imod.addCode("v_mfma_%s_%ux%ux%u%s%s %s[%u:%u], %s, %s, %s[%u:%u]%s" \ + % (miOutTypeName, kernel["MatrixInstM"], kernel["MatrixInstN"], kernel["MatrixInstK"], miInTypeName, + mfma_1k, accumRegType, accStart, accEnd, Str0, Str1, accumRegType, accStart, accEnd, self.endLine)) # release register if kReg is not None: self.vgprPool.checkIn(kReg) @@ -6111,13 +6256,40 @@ def mfmaIter(self, kernel, m, innerUnroll, tail=False): return mfmaMod + + def removeExtraUnroll(self, kernel): + kStr = "" + loopCounterName = self.loopCounterName(kernel, self.unrollIdx) + tmpSgpr = self.getTmpSgpr(1).idx() + + kStr += inst("s_cmp_eq_u32", sgpr(loopCounterName), hex(kernel["LocalDotLayout"]-1), f'leftover L == {kernel["LocalDotLayout"]-1}?') + kStr += inst("s_lshl_b32", sgpr(tmpSgpr), "scc", hex(log2(self.bpeAB*8)), "shift lenghth for remove unused unroll") + + for blockA in range(0, kernel["ThreadTile0"]//2): + for iui in range(0, kernel["InnerUnroll"]): + aStr = f'ValuA_X0_I{iui}+{blockA}' + kStr += inst("v_lshlrev_b32", vgpr(aStr), sgpr(tmpSgpr), 
vgpr(aStr), "remove unused unroll") + + for blockB in range(0, kernel["ThreadTile1"]//2): + for iui in range(0, kernel["InnerUnroll"]): + bStr = f'ValuB_X0_I{iui}+{blockB}' + kStr += inst("v_lshlrev_b32", vgpr(bStr), sgpr(tmpSgpr), vgpr(bStr), "remove unused unroll") + + return kStr + + ############################################################################## # MAC Iteration ############################################################################## - def macIter(self, kernel, bufferIdx, iuiCount, useMacro): - if not self.do["MAC"]: return "" + def macIter(self, kernel, bufferIdx, iuiCount, useMacro, isTail=False): imod = Code.Module("macIter_X%u_I%u"%(bufferIdx, iuiCount)) + if not self.do["MAC"]: return imod + + if isTail and (kernel["InnerUnroll"] == 2) and (kernel["LocalDotLayout"] == 2) \ + and ((kernel["AssertSummationElementMultiple"] % kernel["LocalDotLayout"]) != 0): + imod.addText(self.removeExtraUnroll(kernel)) + if kernel["ProblemType"]["DataType"].isHalf(): imod.addInst(".align32 8, 0xbf800001", "align v_pk_fma") # Align v_pk_fma instructions used in MAC_ blocks @@ -6230,7 +6402,7 @@ def macCode(self, kernel, bufferIdx, iuiCount): # isPap means this is the PAP iteration, need to adjust the loop exit # isOptNLL : this is for the store-interleaved NLL optimization ############################################################################## - def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): + def openSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap): kStr = "" if prefetch: kStr += self.checkLastIter(kernel) @@ -6251,6 +6423,15 @@ def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): % self.getNamedLabel(labelName), \ "skip prefetch loads since numIter==0") elif isOptNLL: + + # When OptNLL + PAP enabled, but is the last tile so isPap=False (brief: T,T,F), + # We don't need to append the code here (checking Alpha,Beta,Tail) since it is shared with (T,T,T) + # Somehow we still need to do the register-pool 
backup... + if self.prefetchAcrossPersistent and not isPap: + self.savedVgprPool = deepcopy(self.vgprPool) + self.savedSgprPool = deepcopy(self.sgprPool) + return "" + skipOptNLL = self.getNamedLabel("OptNLL_End") tmpSgpr = self.getTmpSgpr(2).idx() @@ -6323,7 +6504,7 @@ def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): "skip if tail loop required") # The prefetch across persistent for OptNLL case - if self.prefetchAcrossPersistent: # can we use isPap input arg? + if self.prefetchAcrossPersistent and isPap: kStr += str(self.openPrefetchAcrossPersistent(kernel, isOptNLL=True)) newTileCodes = self.setupNewTile(kernel, self.tPA, self.tPB, isPap=True, isOptNLL=True) codes = '\n'.join([str(x) for x in newTileCodes]) @@ -6370,7 +6551,7 @@ def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): ############################################################################## ############################################################################## - def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isNGLL): + def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap, isNGLL): kStr = "" if not prefetch: if isNGLL: @@ -6378,8 +6559,9 @@ def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isNGLL): kStr += "label_%04u:%s" % (toPGR1, self.endLine) else: if isOptNLL: + endSumLabel = self.getNamedLabel("Summation_End_OptNLL") # If is PAP inside OptNLL: Swap the LRO (if EPS, depends on if BreakAtEvenIter) - if self.prefetchAcrossPersistent: + if self.prefetchAcrossPersistent and isPap: if kernel["ExpandPointerSwap"]: kStr += inst("s_cmp_eq_u32", sgpr("BreakAtEvenIter"), 1, "test if BreakAtEvenIter==1 ?") kStr += inst("s_cbranch_scc1", self.getLabelTarget("SkipLroSwap"), "Skip LROSwap if BreakAtEvenIter==1") @@ -6392,24 +6574,32 @@ def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isNGLL): if kernel["ExpandPointerSwap"]: kStr += self.getLabelDef("SkipLroSwap", "Skip LRO Swap\n") - kStr += self.comment1("Stores for 
OptNLL") - kStr += self.endSummation(kernel) + # Jump to Summation + kStr += inst("s_branch", "%s"%endSumLabel, "jump to Summation End") + kStr += "\n" + # Append label for pure OptNLL (no PAP interleaved) + kStr += "%s:\n" % self.getNamedLabel("SkipTo_PureOptNLL_LastTile") - # perhaps could work with LSU>1 by adding other indices here, but not tested - assert (kernel["LocalSplitU"] == 1) - kStr += self.notLocalSplitUGlobalWriteIndices(kernel) + else: + kStr += self.comment1("Stores for OptNLL") + kStr += self.endSummation(kernel, endSumLabel) - # add stores for opt NLL - (fullVw, elements) = self.notLocalFullTileElements(kernel, False) - kStr += self.globalWriteElements(kernel, [fullVw], [elements], applyAlpha=False, betas=[False], edges=[False]) + # perhaps could work with LSU>1 by adding other indices here, but not tested + assert (kernel["LocalSplitU"] == 1) + kStr += self.notLocalSplitUGlobalWriteIndices(kernel) - self.cleanupGlobalWrite(kernel) - kStr += "\n" - kStr += str(self.functionEnd(kernel, False)) - #kStr += inst("s_branch %s"%summationEnd, "skip the OptNLL") + # add stores for opt NLL + (fullVw, elements) = self.notLocalFullTileElements(kernel, False) + kStr += self.globalWriteElements(kernel, [fullVw], [elements], applyAlpha=False, betas=[False], edges=[False]) + + self.cleanupGlobalWrite(kernel) + kStr += "\n" + kStr += str(self.functionEnd(kernel, False)) + #kStr += inst("s_branch %s"%summationEnd, "skip the OptNLL") + + label = self.getNamedLabel("OptNLL_End") + kStr += "%s:%s" % (label, self.endLine) - label = self.getNamedLabel("OptNLL_End") - kStr += "%s:%s" % (label, self.endLine) else: label = self.getNamedLabel("PrefetchGlobalLastIterEnd") kStr += "%s:%s" % (label, self.endLine) @@ -6584,30 +6774,30 @@ def globalReadIncrement(self, kernel, imod, loopIdx, tP, prefetchIndex, incs=1): if self.globalReadIncsUseVgpr: imod.addInst("_v_add_co_u32 ", \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ 
vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ vgpr("GlobalReadIncs%s+%u+0"%(tP["tensorChar"], 2*loopIdx)), \ "gra += inc%s%s (lower)"%(tP["tensorChar"], loopChar)) imod.addInst("_v_addc_co_u32", \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ vgpr("GlobalReadIncs%s+%u+1"%(tP["tensorChar"], 2*loopIdx)), \ - "vcc", \ + self.vcc, \ "gra += inc%s%s (upper)"%(tP["tensorChar"], loopChar)) else: imod.addInst("_v_add_co_u32 ", \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ sgpr("GlobalReadIncs%s+%u"%(tP["tensorChar"], loopIdx)), \ "gra += inc%s%s (lower)"%(tP["tensorChar"], loopChar)) imod.addInst("_v_addc_co_u32", \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ 0, - "vcc", \ + self.vcc, \ "gra += inc%s%s (upper)"%(tP["tensorChar"], loopChar)) graIdx += self.rpga #kStr += dump(vgpr("GlobalReadAddrA+0")) @@ -6715,7 +6905,7 @@ def globalReadIncrementAB(self, kernel, loopIdx, prefetchIndex, incs=1): assert(kernel["BufferLoad"]) - incCodeA.addText("\n"); + incCodeA.addText("\n") incCodeA.addComment1("Reset and increment SRDs") for tc in ('A','B'): incCodeA.addInst("s_mov_b32", sgpr("Srd%s+0"%tc), sgpr("InitialSrd%sBase+0"%tc), "restore base") @@ -6730,7 +6920,7 @@ def globalReadIncrementAB(self, kernel, loopIdx, prefetchIndex, incs=1): # TODO - this skips over the stagger-u wrap codes def incrementSrdPsd(tc, tp): - incCodeA.addText("\n"); + incCodeA.addText("\n") incUpperA = sgpr(inc[tc]+1) if self.use64bPackSumOffset else 0 if bool(set(kernel["ProblemType"]["IndicesSummation"]).intersection(set(kernel["ProblemType"]["MirrorDims%s"%tc]))) and not self.use64bPackSumOffset: incUpperA = sgpr(self.getTmpSgpr(1).idx()) @@ -6797,12 +6987,13 @@ def globalReadGuardK(self, 
kernel, tP): maxAddrVgpr = self.vgprPool.checkOut(2, "maxAddrVgpr") kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+0), sgpr(maxAddrSgpr+0), "sgpr->vgpr") kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+1), sgpr(maxAddrSgpr+1), "sgpr->vgpr") - del maxAddrSgpr # full exec mask fullExec = tmpSgpr - kStr += inst("s_mov_b64", sgpr(fullExec,2), \ - "0xFFFFFFFFFFFFFFFF", "to restore all threads active") + sgprCnt = self.laneSGPRCount + waveSize = kernel["WavefrontSize"] + activeMask = "0xFFFFFFFF" if (waveSize == 32) else "0xFFFFFFFFFFFFFFFF" + kStr += inst("s_mov_b{}".format(waveSize), sgpr(fullExec,sgprCnt), activeMask, "to restore all threads active") bpeVgpr = self.vgprPool.checkOut(1, "bpeVgpr") kStr += inst("v_mov_b32", vgpr(bpeVgpr), hex(tP["bpe"]), "bpe") @@ -6914,17 +7105,17 @@ def globalReadGuardK(self, kernel, tP): # However, buffer_load uses soffset as uint value, so GRO - SGRO, SGRO = 0 if unrollMirrorWithSoffset: codeMod = Code.Module("mirrorIdx%u"%loopCnt) - codeMod.addInst("v_sub_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset, "mirror unroll: GRO=GRO-SGRO, soffset=0") + codeMod.addInst("_v_sub_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset, "mirror unroll: GRO=GRO-SGRO, soffset=0") kStr += str(codeMod) soffset_prev = soffset soffset = "0" if kernel["DirectToLds%s"%tc]: - ldsInc = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + ldsInc = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr if kernel["LdsBlockSizePerPad%s"%tc] != 0: ldsInc += (ldsInc // kernel["LdsBlockSizePerPad%s"%tc]) * kernel["LdsPad%s"%tc] * tP["bpe"] else: - padInterval = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + padInterval = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr ldsInc += (ldsInc // padInterval) * kernel["LdsPad%s"%tc] * 
tP["bpe"] if kernel["UseInstOffsetForGRO"]: @@ -6980,7 +7171,7 @@ def globalReadGuardK(self, kernel, tP): if unrollMirrorWithSoffset: codeMod = Code.Module("mirrorIdx%u"%loopCnt) - codeMod.addInst("v_add_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset_prev, "mirror unroll: restore GRO=GRO+SGRO") + codeMod.addInst("_v_add_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset_prev, "mirror unroll: restore GRO=GRO+SGRO") kStr += str(codeMod) if kernel["DirectToLds%s"%tc] and kernel["UseInstOffsetForGRO"]: @@ -6989,7 +7180,7 @@ def globalReadGuardK(self, kernel, tP): else: # Not buffer load, ie 'flat' load # mask if current address if in bounds - kStr += inst("_v_cmpx_lt_u64", "vcc", \ + kStr += inst("_v_cmpx_lt_u64", self.vcc, \ vgpr("GlobalReadAddr%s+%u"%(tP["tensorChar"], graIdx),2), \ vgpr(maxAddrVgpr,2), \ "addr < maxAddr") @@ -7005,21 +7196,21 @@ def globalReadGuardK(self, kernel, tP): comment="load one flat value").toStr() # restore full exec mask - kStr += inst("s_or_saveexec_b64", "vcc", sgpr(fullExec,2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr(fullExec,self.laneSGPRCount), \ "all threads active") # increment address by 1 element (BPE) kStr += inst("_v_add_co_u32", \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)), \ vgpr(bpeVgpr), "gra += 1 (lower)") kStr += inst("_v_addc_co_u32", \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ - "vcc", \ + self.vcc, \ vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)), \ vgpr(zeroVgpr), \ - "vcc", \ + self.vcc, \ "gra += 1 (upper)") # int8 byte: @@ -7121,22 +7312,22 @@ def guardZeroPad(self, kernel, tP, codeMod, offsetVgpr, soffset, tmpSgpr, addrV, codeMod.addInst("s_add_u32", sgpr(tmpSgpr), sgpr(tmpSgpr), soffset, "add soffset ") if sumDim in kernel["ProblemType"]["MirrorDims%s"%tc]: - codeMod.addInst("v_sub_u32", vgpr(addrV), vgpr(zpr.regName), sgpr(tmpSgpr), \ + 
codeMod.addInst("_v_sub_u32", vgpr(addrV), vgpr(zpr.regName), sgpr(tmpSgpr), \ "<- GRO - scaled elementCounter") else: codeMod.addInst("_v_add_u32", vgpr(addrV), vgpr(zpr.regName), sgpr(tmpSgpr), \ "<- GRO + scaled elementCounter") - cmpDest = "vcc" if i==0 else sgpr(tmpSgpr,2) # first one writes vcc + cmpDest = self.vcc if i==0 else sgpr(tmpSgpr,self.laneSGPRCount) # first one writes vcc codeMod.addInst("v_cmp_ge_u32", cmpDest, vgpr(addrV), \ sgpr("ElementEdge%s%s"%(tc,sumChar)), \ "loopCounter*strideSum >= ElementEdge ?") if i>0: - codeMod.addInst("s_or_b64", "vcc", "vcc", sgpr(tmpSgpr,2),"combine elementEdge masks") + codeMod.addInst("s_or_b{}".format(self.kernel["WavefrontSize"]), self.vcc, self.vcc, sgpr(tmpSgpr,self.laneSGPRCount),"combine elementEdge masks") if i==len(zps)-1: - codeMod.addInst("v_cndmask_b32", vgpr(addrV), vgpr(offsetVgpr), -1, "vcc", \ + codeMod.addInst("v_cndmask_b32", vgpr(addrV), vgpr(offsetVgpr), -1, self.vcc, \ "Set addresses in pad to large OOB value") #if soffset != "0": @@ -7288,17 +7479,17 @@ def globalReadDo(self, kernel, mode, tP): # However, buffer_load uses soffset as uint value, so GRO - SGRO, SGRO = 0 if unrollMirrorWithSoffset: codeMod = Code.Module("mirrorIdx%u"%loopCnt) - codeMod.addInst("v_sub_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset, "mirror unroll: GRO=GRO-SGRO, soffset=0") + codeMod.addInst("_v_sub_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset, "mirror unroll: GRO=GRO-SGRO, soffset=0") loadModule.addCode(codeMod) soffset_prev = soffset soffset = "0" if kernel["DirectToLds%s"%tc]: - ldsInc = (globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + ldsInc = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr if kernel["LdsBlockSizePerPad%s"%tc] != 0: ldsInc += (ldsInc // kernel["LdsBlockSizePerPad%s"%tc]) * kernel["LdsPad%s"%tc] * tP["bpe"] else: - padInterval = 
(globalParameters["WavefrontWidth"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr + padInterval = (self.kernel["WavefrontSize"] if kernel["WaveSeparateGlobalRead%c"%tc] else kernel["NumThreads"]) * self.bpr ldsInc += (ldsInc // padInterval) * kernel["LdsPad%s"%tc] * tP["bpe"] if kernel["UseInstOffsetForGRO"]: @@ -7326,7 +7517,7 @@ def globalReadDo(self, kernel, mode, tP): if unrollMirrorWithSoffset: codeMod = Code.Module("mirrorIdx%u"%loopCnt) - codeMod.addInst("v_add_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset_prev, "mirror unroll: restore GRO=GRO+SGRO") + codeMod.addInst("_v_add_u32", vgpr(offsetVgpr), vgpr(offsetVgpr), soffset_prev, "mirror unroll: restore GRO=GRO+SGRO") loadModule.addCode(codeMod) if kernel["DirectToLds%s"%tc] and kernel["UseInstOffsetForGRO"]: @@ -7561,7 +7752,7 @@ def recalcLocalWriteAddresses(self, kernel, tP, uDu): kernel["LocalWrite2A"], \ self.localWrite2CoalescedA, self.localWrite2PerpendicularA, [self.localWriteStrideTileA, self.localWriteStrideUnrollA] ) - tP["localWriteInstruction"] = self.memoryInstructions[self.version]["LocalWrite"][newInstIdx] + tP["localWriteInstruction"] = self.memoryInstructions["LocalWrite"][newInstIdx] if kernel["PersistentKernel"]: if getattr(self, "oriLwa%s"%tc) is None: @@ -7618,15 +7809,14 @@ def recalcLocalReadAddressesAB(self, kernel): self.oriLraB = self.vgprPool.checkOut(1, "OriLocalReadAddrB") kStr += inst("v_mov_b32", vgpr(self.oriLraB), vgpr("LocalReadAddrB"), "back up LRA for persistent kernel + wider local read") - kStr += (self.lraTileAssignment(kernel, self.tPA)) + kStr += (self.lraTileAssignment(kernel, self.tPA, self.tPB)) kStr += (self.lraFinalOffset(kernel, self.tPA)) kStr += (self.lraDeclareAddresses(kernel, self.tPA)) - kStr += (self.lraTileAssignment(kernel, self.tPB)) kStr += (self.lraFinalOffset(kernel, self.tPB)) kStr += (self.lraDeclareAddresses(kernel, self.tPB)) imod.addCode(kStr) localRead2Perpendicular = False - instructions = 
self.memoryInstructions[self.version] + instructions = self.memoryInstructions localReadWidth = self.tPA["bpe"] / self.bpr if kernel["UnrollMajorLDSA"]: @@ -7672,10 +7862,10 @@ def preLoopLocalWriteDo(self, kernel, tPA, tPB): return imod, tmpCheckedOutVgprA, tmpCheckedOutVgprB # Opt for PAP waitcnt, 4 cases: - # one for the first PK-loop, one for Opt-NLL, one for Ord-NLL, No beta / one for Beta + # one for the first PK-loop, one for Opt-NLL, one for Edge, one for Beta basic_gl_Label = self.getNamedLabel("Basic_GL_Label") optNLL_lw_Label = self.getNamedLabel("OptNLL_LW_Label") - ordNLL_B0_lw_Label = self.getNamedLabel("OrdNLL_B0_LW_Label") + ordNLL_E1_lw_Label = self.getNamedLabel("OrdNLL_E1_LW_Label") ordNLL_B1_lw_Label = self.getNamedLabel("OrdNLL_B1_LW_Label") lwEnd_Label = self.getNamedLabel("PreLoopLWEnd") @@ -7704,10 +7894,11 @@ def preLoopLocalWriteDo(self, kernel, tPA, tPB): BranchMod.addInst("s_cmp_eq_u32", sgpr("PreLoopLWVmcntCase"), hex(2), "Case 2: Prev PK-Loop is Opt-NLL?") BranchMod.addInst("s_cbranch_scc1", optNLL_lw_Label, "jump to Case 2") - BranchMod.addInst("s_cmp_eq_u32", sgpr("PreLoopLWVmcntCase"), hex(3), "Case 3: Prev PK-Loop is Ord-NLL with no beta?") - BranchMod.addInst("s_cbranch_scc1", ordNLL_B0_lw_Label, "jump to Case 3") - BranchMod.addInst("s_cmp_eq_u32", sgpr("PreLoopLWVmcntCase"), hex(4), "Case 4: Prev PK-Loop is Ord-NLL with beta?") - BranchMod.addInst("s_cbranch_scc1", ordNLL_B1_lw_Label, "jump to Case 4") + BranchMod.addInst("s_cmp_eq_u32", sgpr("PreLoopLWVmcntCase"), hex(3), "Case 3: Prev PK-Loop is Ord-NLL with edge?") + BranchMod.addInst("s_cbranch_scc1", ordNLL_E1_lw_Label, "jump to Case 3") + if kernel["ProblemType"]["UseBeta"]: + BranchMod.addInst("s_cmp_eq_u32", sgpr("PreLoopLWVmcntCase"), hex(4), "Case 4: Prev PK-Loop is Ord-NLL with beta?") + BranchMod.addInst("s_cbranch_scc1", ordNLL_B1_lw_Label, "jump to Case 4") # Fast duplicate LWDoCodeTemplate four times to different placeholder keywords for later replacement (after 
global write) # can avoid calling localWriteDo() for several times @@ -7735,26 +7926,27 @@ def preLoopLocalWriteDo(self, kernel, tPA, tPB): LWDoCase2Mod.addInst("s_branch", lwEnd_Label, "finish case, jump to end of LW") # CASE 3: - # replace vmcnt("__placeholder__ + Basic_Load - Decrement") to vmcnt("OrdNLL_B0_Store + Basic_Load - Decrement") - currCaseKW = PreLoopVmcntCase( PreLoopVmcntCase.OrdNLL_B0_Store ).name + # replace vmcnt("__placeholder__ + Basic_Load - Decrement") to vmcnt("OrdNLL_E1_Store + Basic_Load - Decrement") + currCaseKW = PreLoopVmcntCase( PreLoopVmcntCase.OrdNLL_E1_Store ).name LWDoCase3Mod = imod.addCode(Code.Module(currCaseKW)) - LWDoCase3Mod.addText("\n%s:" % ordNLL_B0_lw_Label) + LWDoCase3Mod.addText("\n%s:" % ordNLL_E1_lw_Label) LWDoCase3Mod.addComment1("prev-global-store-cnt = %s, global-load-cnt = %s"%(currCaseKW, basicVmcntKW)) for item in codeTemplateStrList: LWDoCase3Mod.addText(str(item).replace("__placeholder__",currCaseKW)) LWDoCase3Mod.addInst("s_branch", lwEnd_Label, "finish case, jump to end of LW") - # CASE 4: - # replace vmcnt("__placeholder__ + Basic_Load - Decrement") to vmcnt("OrdNLL_B1_Store + Basic_Load - Decrement") - currCaseKW = PreLoopVmcntCase( PreLoopVmcntCase.OrdNLL_B1_Store ).name - LWDoCase4Mod = imod.addCode(Code.Module(currCaseKW)) - LWDoCase4Mod.addText("\n%s:" % ordNLL_B1_lw_Label) - # special for case 4, prev store already did vmcnt(n) for loading beta, we don't need any vmcnt here - # so only keep the lines without s_waitcnt vmcnt( __placeholder__ ), otherwise, discard them - # LWDoCase4Mod.addComment1("prev-global-store-cnt = %s, global-load-cnt = %s"%(currCaseKW, basicVmcntKW)) - for item in codeTemplateStrList: - if (str(item).find("__placeholder__") == -1): - LWDoCase4Mod.addText(str(item)) + if kernel["ProblemType"]["UseBeta"]: + # CASE 4: + # replace vmcnt("__placeholder__ + Basic_Load - Decrement") to vmcnt("OrdNLL_B1_Store + Basic_Load - Decrement") + currCaseKW = PreLoopVmcntCase( 
PreLoopVmcntCase.OrdNLL_B1_Store ).name + LWDoCase4Mod = imod.addCode(Code.Module(currCaseKW)) + LWDoCase4Mod.addText("\n%s:" % ordNLL_B1_lw_Label) + # special for case 4, prev store already did vmcnt(n) for loading beta, we don't need any vmcnt here + # so only keep the lines without s_waitcnt vmcnt( __placeholder__ ), otherwise, discard them + # LWDoCase4Mod.addComment1("prev-global-store-cnt = %s, global-load-cnt = %s"%(currCaseKW, basicVmcntKW)) + for item in codeTemplateStrList: + if (str(item).find("__placeholder__") == -1): + LWDoCase4Mod.addText(str(item)) # End imod.addText("\n%s:" % lwEnd_Label) @@ -7765,7 +7957,7 @@ def preLoopLocalWriteDo(self, kernel, tPA, tPB): ############################################################################## def replacePreLoopLWVmcnt(self, kernel): # This replaces the vmcnt keywords with the actual number - # ("Basic_Load"/"OptNLL_Store"/"OrdNLL_B0_Store"/"OrdNLL_B1_Store") + # ("Basic_Load"/"OptNLL_Store"/"OrdNLL_E1_Store"/"OrdNLL_B1_Store") maxVmcnt = globalParameters["AsmCaps"][self.version]["MaxVmcnt"] @@ -8061,7 +8253,7 @@ def localReadInc(self, kernel, iui, tP): LdsPad = kernel["LdsPad%s"%tc] if kernel["LdsBlockSizePerPad%s"%tc] == 0 else 0 if self.inTailLoop: - inc = kernel["LocalSplitU"] * (kernel["MacroTile%u" % tP["tensorIdx"]] + LdsPad) * tP["bpe"] + inc = kernel["LocalSplitU"] * (kernel["MacroTile%s" % tP["tensorChar"]] + LdsPad) * tP["bpe"] if kernel["EnableMatrixInstruction"]: if kernel["UnrollMajorLDS%s" % tP["tensorChar"]]: inc = kernel["LocalSplitU"] * tP["bpe"] @@ -8070,7 +8262,7 @@ def localReadInc(self, kernel, iui, tP): kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(inc), "inc") kStr += inst("_v_add_co_u32", \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ - "vcc", \ + self.vcc, \ sgpr(tmpSgpr), \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ "lr%s += %u (LSU*(MT+PAD)*bpe)"%(tP["tensorChar"], inc) ) @@ -8082,29 +8274,29 @@ def localReadInc(self, kernel, iui, tP): else: if tc == "A": if kernel["MatrixInstB"] 
!= 1 or self.lrvwA == self.lrvwB: - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * kernel["MatrixInstK"] * self.numReadsIterCoalescedA + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * kernel["MatrixInstK"] * self.numReadsIterCoalescedA else: if (self.localReadDoCntA)%(kernel["LocalReadVectorWidth"]//self.lrvwA): - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * self.lrvwA + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * self.lrvwA else: - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * (kernel["MatrixInstK"]*kernel["LocalReadVectorWidth"]//self.lrvwA-self.lrvwA*(kernel["LocalReadVectorWidth"]//self.lrvwA-1)) + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * (kernel["MatrixInstK"]*kernel["LocalReadVectorWidth"]//self.lrvwA-self.lrvwA*(kernel["LocalReadVectorWidth"]//self.lrvwA-1)) else: if kernel["MatrixInstB"] != 1 or self.lrvwA == self.lrvwB: - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * kernel["MatrixInstK"] * self.numReadsIterCoalescedB + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * kernel["MatrixInstK"] * self.numReadsIterCoalescedB else: if (self.localReadDoCntB)%(kernel["LocalReadVectorWidth"]//self.lrvwB): - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * self.lrvwB + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * self.lrvwB else: - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) * 
(kernel["MatrixInstK"]*kernel["LocalReadVectorWidth"]//self.lrvwB-self.lrvwB*(kernel["LocalReadVectorWidth"]//self.lrvwB-1)) + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) * (kernel["MatrixInstK"]*kernel["LocalReadVectorWidth"]//self.lrvwB-self.lrvwB*(kernel["LocalReadVectorWidth"]//self.lrvwB-1)) else: - tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%u"%tP["tensorIdx"]] + LdsPad) + tP["localReadOffset"] += kernel["LocalSplitU"] * (kernel["MacroTile%s"%tP["tensorChar"]] + LdsPad) kStr += self.comment1("N/A, lro->%d" % tP["localReadOffset"]) kStr += self.comment1("self.localReadDoCntA %d self.localReadDoCntB %d" % (self.localReadDoCntA,self.localReadDoCntB)) else: - inc = kernel["LocalSplitU"] * (kernel["MacroTile%u" % tP["tensorIdx"]] + LdsPad) + inc = kernel["LocalSplitU"] * (kernel["MacroTile%s" % tP["tensorChar"]] + LdsPad) kStr += inst("_v_add_co_u32", \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ - "vcc", \ + self.vcc, \ hex(inc), \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ "lr%s += %u (LSU+(MT+Pad)*bpe"%(tP["tensorChar"], inc) ) @@ -8250,8 +8442,8 @@ def localSplitULocalWrite(self, kernel): # thread offset addr = lr0 - kStr += inst("_v_add_co_u32", vgpr(addr), "vcc", vgpr(lr1), vgpr(addr), "") - kStr += inst("_v_add_co_u32", vgpr(addr), "vcc", vgpr(sg), vgpr(addr), "threadOffset") + kStr += inst("_v_add_co_u32", vgpr(addr), self.vcc, vgpr(lr1), vgpr(addr), "") + kStr += inst("_v_add_co_u32", vgpr(addr), self.vcc, vgpr(sg), vgpr(addr), "threadOffset") self.vgprPool.checkIn(lr0) self.vgprPool.checkIn(lr1) self.vgprPool.checkIn(sg) @@ -8385,7 +8577,7 @@ def localSplitUReduction(self, kernel): regIdx //= elementStep # assume v_add_i32 can be used in place of v_add_f32 # may need to add saturation directive to v_add_i32 instruction to clamp integer arithmetic - kStr += inst("v_add_i32", vgpr("ValuC+%u"%cIdx), \ + kStr += inst("_v_add_i32", vgpr("ValuC+%u"%cIdx), \ vgpr("ValuC+%u" % 
regIdx), vgpr("ValuC+%u"%cIdx), "c[%u] += c[%u]"%(cIdx, regIdx) ) elif kernel["ProblemType"]["DataType"].isSingle(): cIdx //= elementStep @@ -8460,10 +8652,10 @@ def computeStoreSrdStart(self, kernel): if i == kernel["ProblemType"]["Index0"]: # Used if the output is transposed? addToSrd = False - elif i == kernel["ProblemType"]["Index1"]: + elif i == kernel["ProblemType"]["Index1"] and len(kernel["PackedC1IndicesX"]) == 1: coord = sgpr(wgMT1) addToSrd = True - elif not isPackedIndex(kernel, i): + elif i != kernel["ProblemType"]["Index0"] and i != kernel["ProblemType"]["Index1"] and not isPackedIndex(kernel, i): # group index, this is higher-order Tensor dimension, just add to SRD base: isStridedBuffer = kernel["ProblemType"]["StridedBatched"] or kernel["_GlobalAccumulation"] coord = sgpr("WorkGroup2") if isStridedBuffer else None @@ -8742,7 +8934,7 @@ def storeRemapAddStore(self, kernel, ss, addrCalc, tmpVgpr, tmpS01, edge): kStr += "\n" gwvw = kernel["StoreRemapVectorWidth"] - nElements = kernel["MacroTile0"]*kernel["MatrixInstN"]//kernel["MIWaveGroup"][0]//globalParameters["WavefrontWidth"] + nElements = kernel["MacroTile0"]*kernel["MatrixInstN"]//kernel["MIWaveGroup"][0]//self.kernel["WavefrontSize"] bpe = self.bpeCexternal bps = bpe * gwvw @@ -8788,7 +8980,7 @@ def storeRemapAddStore(self, kernel, ss, addrCalc, tmpVgpr, tmpS01, edge): kStr += inst("v_mov_b32", addr0, vgpr(self.storeRemapOffsetCoord1), "coord1") else: currentStep = i//gwvw - kStr += inst("v_add_u32", addr0, vgpr(self.storeRemapOffsetCoord1), self.storeRemapNCPL * currentStep , "coord1 += nColPerLoad") + kStr += inst("_v_add_u32", addr0, vgpr(self.storeRemapOffsetCoord1), self.storeRemapNCPL * currentStep , "coord1 += nColPerLoad") kStr += inst("v_mul_lo_u32", addr0, addr0, sgpr(strideD1), "coord1 offset = coord1 * StrideD") kStr += inst("_v_add_lshl_u32", addr0, addr0, vgpr(self.storeRemapCoord0), hex(log2(bpe)), "global write D address") @@ -8799,7 +8991,7 @@ def storeRemapAddStore(self, 
kernel, ss, addrCalc, tmpVgpr, tmpS01, edge): numStoreInst += 1 kStr += self.chooseGlobalWrite(True, bps, storeRegs[rIdx], rpv, addr0, addr1, 0, ntStr) else: - tmpS23 = tmpS01+2 + tmpS23 = tmpS01+self.laneSGPRCount coord0 = tmpVgpr coord1 = coord0+1 lrVw = kernel["StoreRemapVectorWidth"] @@ -8824,18 +9016,21 @@ def storeRemapAddStore(self, kernel, ss, addrCalc, tmpVgpr, tmpS01, edge): currentStep = i//lrVw # calculate global coordination - kStr += inst("v_add_u32", vgpr(coord1), vgpr(self.storeRemapCoord1), self.storeRemapNCPL * currentStep , "coord1 += nColPerLoad") - kStr += inst("v_add_u32",vgpr(coord0), vgpr(self.storeRemapCoord0), vi , "coord0 += element index of load vector") - kStr += inst("v_add_u32", addr0, vgpr(self.storeRemapOffsetCoord1), self.storeRemapNCPL * currentStep , \ + kStr += inst("_v_add_u32", vgpr(coord1), vgpr(self.storeRemapCoord1), self.storeRemapNCPL * currentStep , "coord1 += nColPerLoad") + kStr += inst("_v_add_u32",vgpr(coord0), vgpr(self.storeRemapCoord0), vi , "coord0 += element index of load vector") + kStr += inst("_v_add_u32", addr0, vgpr(self.storeRemapOffsetCoord1), self.storeRemapNCPL * currentStep , \ "offset coord1 += nColPerLoad") - kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,2), vgpr(coord0), sizeBoundary[0], "coord0 < size0" ) - kStr += inst("v_cmp_lt_u32", sgpr(tmpS23,2), vgpr(coord1), sizeBoundary[1], "coord1 < size1" ) - kStr += inst("s_and_b64", sgpr(tmpS23,2), sgpr(tmpS01,2), sgpr(tmpS23,2), "in0 && in1" ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,self.laneSGPRCount), vgpr(coord0), sizeBoundary[0], "coord0 < size0" ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpS23,self.laneSGPRCount), vgpr(coord1), sizeBoundary[1], "coord1 < size1" ) + kStr += inst("s_and_b{}".format(self.kernel["WavefrontSize"]), + sgpr(tmpS23,self.laneSGPRCount), + sgpr(tmpS01,self.laneSGPRCount), + sgpr(tmpS23,self.laneSGPRCount), "in0 && in1" ) kStr += inst("v_mul_lo_u32", addr0, addr0, sgpr(strideD1), "coord1 element offset = coord1 * StrideD") kStr += 
inst("_v_add_lshl_u32", addr0, addr0, vgpr(coord0), hex(log2(bpe)), "scale to BPE") - kStr += inst("v_cndmask_b32", addr0, -1, addr0, sgpr(tmpS23,2), "clip if OOB. offset" ) + kStr += inst("v_cndmask_b32", addr0, -1, addr0, sgpr(tmpS23,self.laneSGPRCount), "clip if OOB. offset" ) sumIdx = storeRegs[rIdx] + int(vi*rpe) numStoreInst += 1 @@ -8889,20 +9084,20 @@ def storeRemapComputeStoreVgprs(self, kernel): ldsPad = max(kernel["StoreRemapVectorWidth"],kernel["MIOutputVectorWidth"]) #calculate local write Address: v[vgprLocalWriteAddrC] - kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", globalParameters["WavefrontWidth"]*kernel["MIWaveGroup"][0], \ + kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", self.kernel["WavefrontSize"]*kernel["MIWaveGroup"][0], \ tmpV0, tmpS0) kStr += inst("v_mul_lo_u32", vgpr(waveCoord1), hex(kernel["MatrixInstN"]), vgpr(tid1), "coord1 offset of LDS for each Wave") kStr += inst("v_and_b32", vgpr(tid1), hex(kernel["MatrixInstN"]-1), vgpr("Serial"), "coord1 offset of LDS for each thread") - kStr += inst("v_add_u32", vgpr(tid1), vgpr(waveCoord1),vgpr(tid1),"coord1 offset in MacroTile") + kStr += inst("_v_add_u32", vgpr(tid1), vgpr(waveCoord1),vgpr(tid1),"coord1 offset in MacroTile") kStr += inst("v_mov_b32", vgpr(ldsStride), hex(kernel["MacroTile0"]+ldsPad), \ "lds stride = MT0 + PAD") kStr += inst("v_mul_lo_u32", vgpr(tmpV0), vgpr(tid1), vgpr(ldsStride), \ "lds coord1 offset = Col-id* lds stride") - kStr += vectorStaticDivideAndRemainder(waveCoord0, tid0, tid0, globalParameters["WavefrontWidth"],tmpV0, tmpS0) + kStr += vectorStaticDivideAndRemainder(waveCoord0, tid0, tid0, self.kernel["WavefrontSize"],tmpV0, tmpS0) kStr += inst("v_lshrrev_b32", vgpr(coord0), hex(log2(kernel["MatrixInstN"])), vgpr(tid0), \ "tid / matrixInstN") @@ -8925,20 +9120,20 @@ def storeRemapComputeStoreVgprs(self, kernel): kStr += self.comment1("Store Remap Local Read address") - kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", 
globalParameters["WavefrontWidth"], \ + kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", self.kernel["WavefrontSize"], \ tmpV0, tmpS0) kStr += inst("v_mul_lo_u32", vgpr(waveCoord1), hex(kernel["MatrixInstN"]//kernel["MIWaveGroup"][0]), vgpr(tid1), "coord1 offset of LDS for each Wave") nThreadPerCol = kernel["MacroTile0"] // gwvw - nColPerLoad = globalParameters["WavefrontWidth"] // nThreadPerCol + nColPerLoad = self.kernel["WavefrontSize"] // nThreadPerCol self.storeRemapLrOffset = (kernel["MacroTile0"]+ldsPad) * nColPerLoad self.storeRemapNCPL = nColPerLoad kStr += inst("v_lshrrev_b32", vgpr(tmpV1),\ hex(log2(nThreadPerCol)), vgpr(tid0), \ "tid / nThreadPerCol") - kStr += inst("v_add_u32", vgpr(coord1Offset), vgpr(waveCoord1),vgpr(tmpV1),"coord1 offset in MacroTile") + kStr += inst("_v_add_u32", vgpr(coord1Offset), vgpr(waveCoord1),vgpr(tmpV1),"coord1 offset in MacroTile") kStr += inst("v_mul_lo_u32", vgpr(tmpV0), vgpr(coord1Offset), vgpr(ldsStride), \ "lds coord1 offset = Col-id* lds stride") @@ -8957,7 +9152,7 @@ def storeRemapComputeStoreVgprs(self, kernel): # calculate global write coord0 and coord1 kStr += self.comment1("Store Remap global write coord0 and coord1") - kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", globalParameters["WavefrontWidth"]*kernel["MIWaveGroup"][0], \ + kStr += vectorStaticDivideAndRemainder(tid1, tid0, "Serial", self.kernel["WavefrontSize"]*kernel["MIWaveGroup"][0], \ tmpV0, tmpS0) ColsPerBlockShape = kernel["MatrixInstN"] * kernel["MatrixInstBN"] @@ -8965,7 +9160,7 @@ def storeRemapComputeStoreVgprs(self, kernel): kStr += inst("v_mul_lo_u32", vgpr(waveCoord1), hex(ColsPerBlockShape), vgpr(tid1), "coord1 offset of global memory for each Wave") - kStr += vectorStaticDivideAndRemainder(tid1, tid0, tid0, globalParameters["WavefrontWidth"], \ + kStr += vectorStaticDivideAndRemainder(tid1, tid0, tid0, self.kernel["WavefrontSize"], \ tmpV0, tmpS0) kStr += inst("v_mad_u32_u24", vgpr(waveCoord1), 
kernel["MatrixInstN"]//kernel["MIWaveGroup"][0], vgpr(tid1), vgpr(waveCoord1), \ "waveCoord1 += waveCoord0 * MiN / WaveGroupM") @@ -8974,7 +9169,7 @@ def storeRemapComputeStoreVgprs(self, kernel): hex(log2(nThreadPerCol)), vgpr(tid0), \ "tid / nThreadPerCol") - kStr += inst("v_add_u32", vgpr(coord1Offset), vgpr(waveCoord1),vgpr(tmpV1),"coord1 offset in MacroTile") + kStr += inst("_v_add_u32", vgpr(coord1Offset), vgpr(waveCoord1),vgpr(tmpV1),"coord1 offset in MacroTile") kStr += inst("s_mul_i32", \ sgpr(tmpS0), \ @@ -8982,7 +9177,7 @@ def storeRemapComputeStoreVgprs(self, kernel): sgpr(wg0), \ "%s = wg0*MT0"%sgpr(tmpS0)) - kStr += inst("v_add_co_u32", vgpr(tid0), "vcc", sgpr(tmpS0), vgpr(coord0), "coord0 = coord0 + wg0 * MT0") + kStr += inst("_v_add_co_u32", vgpr(tid0), self.vcc, sgpr(tmpS0), vgpr(coord0), "coord0 = coord0 + wg0 * MT0") kStr += inst("s_mul_i32", \ sgpr(wgMT1), \ @@ -8991,7 +9186,7 @@ def storeRemapComputeStoreVgprs(self, kernel): "<- wg1*MT1") kStr += inst("_v_add_co_u32", \ vgpr(tid1), \ - "vcc", \ + self.vcc, \ sgpr(wgMT1), \ vgpr(coord1Offset), \ "coord1 = tid1*VW + wg1*MT1") @@ -9104,10 +9299,10 @@ def __init__(self, kernelWriter, kernel, ss, gwvw, edge, beta, atomic): if ss.optSharedMask: self.numSgprsPerElement = 0 - self.fixedSgprsPerBatch = 2 + self.fixedSgprsPerBatch = kernelWriter.laneSGPRCount else: - self.numSgprsPerElement = 2 - self.fixedSgprsPerBatch = 6 + self.numSgprsPerElement = kernelWriter.laneSGPRCount + self.fixedSgprsPerBatch = 3*kernelWriter.laneSGPRCount if self.numSgprsPerElement: numSgprAvailable = kernelWriter.maxSgprs - kernelWriter.sgprPool.size() + kernelWriter.sgprPool.availableBlockAtEnd() @@ -9189,6 +9384,9 @@ def __init__(self, kernelWriter, kernel, gwvw, edge, beta, atomic, elements): if len(kernel["PackedC0IndicesX"]) > 1: # packed mode needs a unique VGPR address calc for each column. 
self.optSharedColVgpr = 1 + elif len(kernel["PackedC1IndicesX"]) > 1: + self.optSharedColVgpr = 0 + self.optSingleColVgpr = 0 else: self.optSingleColVgpr = 1 @@ -9208,9 +9406,6 @@ def __init__(self, kernelWriter, kernel, gwvw, edge, beta, atomic, elements): # can't have both of these enabled: assert (not (self.optSingleColVgpr and self.optSharedColVgpr)) - # packed1 not yet supported. Would need to: - # - extract packed dimensions from coord1 into - assert( len(kernel["PackedC1IndicesX"]) == 1) self.cfg = self.StoreConstConfig(kernelWriter, kernel, self, gwvw, edge, beta, atomic) @@ -9280,7 +9475,10 @@ def setupStoreElementsForBatch(self, kernel, gwvw, batchElements, batchElementSg coordOffset1 = bIdx1 * kernel["MatrixInstN"] coordOffset1 += wtIdex * kernel["MatrixInstN"] * kernel["MatrixInstBN"] * kernel["MIWaveGroup"][1] - coordOffset1 += vc1 + if kernel["SourceSwap"]: + coordOffset1 += vc0 * 4 + else: + coordOffset1 += vc1 else: if kernel["LocalSplitU"] > 1: strideD1 = (kernel["NumThreads"]*kernel["VectorWidth"]//kernel["MacroTile0"]) @@ -9297,7 +9495,7 @@ def setupStoreElementsForBatch(self, kernel, gwvw, batchElements, batchElementSg coordOffset0 = d0 * kernel["MatrixInstM"] * kernel["MatrixInstBM"] * kernel["MIWaveGroup"][0] + vc0 else: MFMAContinuousOutputs = kernel["MIOutputVectorWidth"] - OutputsPerMIMN = kernel["MatrixInstM"] * kernel["MatrixInstN"] // globalParameters["WavefrontWidth"] + OutputsPerMIMN = kernel["MatrixInstM"] * kernel["MatrixInstN"] // self.kernel["WavefrontSize"] eIdx0 = d0 % (OutputsPerMIMN // MFMAContinuousOutputs) remain_d0 = d0 // (OutputsPerMIMN // MFMAContinuousOutputs) @@ -9305,10 +9503,13 @@ def setupStoreElementsForBatch(self, kernel, gwvw, batchElements, batchElementSg remain_d0 = remain_d0 // kernel["MatrixInstBM"] wtIdex = remain_d0 % kernel["MIWaveTile"][0] - coordOffset0 = eIdx0 * (globalParameters["WavefrontWidth"] // kernel["MatrixInstN"]) * MFMAContinuousOutputs + coordOffset0 = eIdx0 * (self.kernel["WavefrontSize"] // 
kernel["MatrixInstN"]) * MFMAContinuousOutputs coordOffset0 += bIdx0 * kernel["MatrixInstM"] coordOffset0 += wtIdex * kernel["MatrixInstM"] * kernel["MatrixInstBM"] * kernel["MIWaveGroup"][0] - coordOffset0 += vc0 * (4 if kernel["ProblemType"]["DataType"].isDouble() else 1) + if kernel["SourceSwap"]: + coordOffset0 += vc1 + else: + coordOffset0 += vc0 * (4 if kernel["ProblemType"]["DataType"].isDouble() else 1) else: coordOffset0 = d0 * kernel["SubGroup0"]*kernel["VectorWidth"] + vc0 @@ -9382,7 +9583,7 @@ def setupStoreElementsForBatch(self, kernel, gwvw, batchElements, batchElementSg elif kernel["MatrixInstM"] == 4: sumIdx = kw.startVgprValuC + vc0 + (d0 * kernel["MIOutputVectorWidth"]) + d1 * (kernel["MIOutputVectorWidth"] * kernel["MIWaveTile"][0]) else: - d1_stride = ((kernel["MatrixInstM"] * kernel["MatrixInstN"]) // globalParameters["WavefrontWidth"]) * kernel["MatrixInstBM"] * kernel["MIWaveTile"][0] + d1_stride = ((kernel["MatrixInstM"] * kernel["MatrixInstN"]) // self.kernel["WavefrontSize"]) * kernel["MatrixInstBM"] * kernel["MIWaveTile"][0] sumIdx = kw.startVgprValuC + vc0 + (d0 * kernel["MIOutputVectorWidth"]) + (d1 * d1_stride) else: sumIdx = kw.startVgprValuC + vc0 + d0*kernel["VectorWidth"] + vc1*kernel["ThreadTile0"] + d1*kernel["VectorWidth"]*kernel["ThreadTile0"] @@ -9453,7 +9654,7 @@ def addScaled(self, destV, src0, src1, scale1, tmpS01, comment=""): return kStr - def emitAddressCoordIncrement(self, kernel, ss, tmpVgpr, tmpS01, edge): + def emitAddressCoordIncrement(self, kernel, ss, tmpVgpr, tmpS01, updateCoord1): """ Emit code that computes the coord0 and coord1 for this element sets self.coord0Vgpr with the address that holds the coord0 value for this element. 
@@ -9479,26 +9680,26 @@ def emitAddressCoordIncrement(self, kernel, ss, tmpVgpr, tmpS01, edge): self.coord0Vgpr = kw.coord0 elif self.coordOffset0 <= 64: self.coord0Vgpr = tmpVgpr - kStr += inst("_v_add_co_u32", vgpr(self.coord0Vgpr), "vcc", vgpr(kw.coord0), self.coordOffset0, \ + kStr += inst("_v_add_co_u32", vgpr(self.coord0Vgpr), self.kernelWriter.vcc, vgpr(kw.coord0), self.coordOffset0, \ "coord0.1: coord0 += d0*sg0*VW + vc0") else: self.coord0Vgpr = tmpVgpr kStr += inst("s_mov_b32", sgpr(tmpS01), self.coordOffset0, "coordOffset0 d0=%u vc0=%u"%(d0, vc0)) - kStr += inst("_v_add_co_u32", vgpr(self.coord0Vgpr), "vcc", vgpr(kw.coord0), sgpr(tmpS01), \ + kStr += inst("_v_add_co_u32", vgpr(self.coord0Vgpr), self.kernelWriter.vcc, vgpr(kw.coord0), sgpr(tmpS01), \ "coord0.2: coord0 += d0*sg0*VW + vc0") if self.newCoord1: - if not kernel["BufferStore"] or edge: #TODO, do we need edge? + if not kernel["BufferStore"] or updateCoord1: if self.rowInc== 0: None elif self.rowInc <= 64: # rowInc fits in instruction: - kStr += inst("_v_add_co_u32", vgpr(self.coord1Vgpr), "vcc", \ + kStr += inst("_v_add_co_u32", vgpr(self.coord1Vgpr), self.kernelWriter.vcc, \ vgpr(self.kernelWriter.coord1), self.rowInc, \ "coord1.1: coord1Vgpr += d1*sg1*VW + vc1") else: kStr += inst("s_mov_b32", sgpr(tmpS01), self.rowInc, "rowInc d1=%u vc1=%u"%(d0, vc0)) - kStr += inst("_v_add_co_u32", vgpr(self.coord1Vgpr), "vcc", \ + kStr += inst("_v_add_co_u32", vgpr(self.coord1Vgpr), self.kernelWriter.vcc, \ vgpr(self.kernelWriter.coord1), sgpr(tmpS01), \ "coord1.2: coord1 += d1*sg1*VW + vc1") return kStr @@ -9512,9 +9713,6 @@ def emitExtractAndScalePackedDims(self, kernel, ss, tmpVgpr, storeChar): packedBits = self.coord0Vgpr # start with coord0, will move to temp below rowPtr = kw.cinRowPtr if (storeChar == 'C') else kw.coutRowPtr - if (len(packedIndices)>1) and (storeChar =='C'): - return kStr - for i,idx in enumerate(packedIndices[:-1]): # vgprTmp assignments: # - tmp+0 may be the incoming packed 
coordinate 0, used on replay too @@ -9661,7 +9859,10 @@ def edgeProtectCode(self, kernel, edge, beta, atomic, mask, tmpSgpr): kStr = "" kw = self.kernelWriter tmpS01 = tmpSgpr - tmpS23 = tmpSgpr+2 + tmpS23 = tmpSgpr+self.kernelWriter.laneSGPRCount + + laneSGPRCount = self.kernelWriter.laneSGPRCount + wavefrontSize = kernel["WavefrontSize"] # Now do the edge check and compute the address in bytes: if kernel["BufferStore"]: @@ -9676,16 +9877,16 @@ def edgeProtectCode(self, kernel, edge, beta, atomic, mask, tmpSgpr): sgpr("PackedSize1") if len(kernel["PackedC1IndicesX"]) > 1 \ else kw.sizeRef(kernel["ProblemType"]["Index1"]) - kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,2), vgpr(self.coord0Vgpr), sizeBoundary[0], "coord0 < size0" ) - kStr += inst("v_cmp_lt_u32", sgpr(mask,2), vgpr(self.coord1Vgpr), sizeBoundary[1], "coord1 < size1" ) - kStr += inst("s_and_b64", sgpr(mask,2), sgpr(tmpS01,2), sgpr(mask,2), "in0 && in1" ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,laneSGPRCount), vgpr(self.coord0Vgpr), sizeBoundary[0], "coord0 < size0" ) + kStr += inst("v_cmp_lt_u32", sgpr(mask,laneSGPRCount), vgpr(self.coord1Vgpr), sizeBoundary[1], "coord1 < size1" ) + kStr += inst("s_and_b{}".format(wavefrontSize), sgpr(mask,laneSGPRCount), sgpr(tmpS01,laneSGPRCount), sgpr(mask,laneSGPRCount), "in0 && in1" ) else: - kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,2), vgpr(self.coord0Vgpr), sgpr("SizesFree+0"), "coord0 < size0" ) - kStr += inst("v_cmp_lt_u32", sgpr(tmpS23,2), vgpr(self.coord1Vgpr), sgpr("SizesFree+1"), "coord1 < size1" ) - kStr += inst("s_and_b64", sgpr(mask,2), sgpr(tmpS01,2), sgpr(tmpS23,2), "in0 && in1" ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,laneSGPRCount), vgpr(self.coord0Vgpr), sgpr("SizesFree+0"), "coord0 < size0" ) + kStr += inst("v_cmp_lt_u32", sgpr(tmpS23,laneSGPRCount), vgpr(self.coord1Vgpr), sgpr("SizesFree+1"), "coord1 < size1" ) + kStr += inst("s_and_b{}".format(wavefrontSize), sgpr(mask,laneSGPRCount), sgpr(tmpS01,laneSGPRCount), sgpr(tmpS23,laneSGPRCount), 
"in0 && in1" ) if (beta or atomic): - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "sgprs -> exec" ) + kStr += inst("s_mov_b{}".format(wavefrontSize), self.kernelWriter.exec, sgpr(mask,laneSGPRCount), "sgprs -> exec" ) return kStr @@ -9705,7 +9906,8 @@ def emitAddressSetupCode(self, kernel, ss, tmpVgpr, tmpS01, edge, beta, atomic, kStr = "" kw = self.kernelWriter - kStr += self.emitAddressCoordIncrement(kernel, ss, tmpVgpr, tmpS01, edge) + updateCoord1 = (edge or len(kernel["PackedC1IndicesX"]) > 1) + kStr += self.emitAddressCoordIncrement(kernel, ss, tmpVgpr, tmpS01, updateCoord1) # calculate flat load offset if not kernel["BufferStore"]: @@ -9731,13 +9933,15 @@ def emitAddressSetupCode(self, kernel, ss, tmpVgpr, tmpS01, edge, beta, atomic, self.rowIncDirtyRowPtr = 1 #assert (not kernel["ProblemType"]["UseInitialStridesCD"]) kStr += kw.comment("Fix for UseInitialStridesCD, emitAddressSetupCode") - assert (len(kernel["PackedC1IndicesX"])==1) - strideChar = self.kernelWriter.indexChars[kernel["PackedC1IndicesX"][0]] - kStr += self.addScaled(vgpr(kw.cinRowPtr), vgpr(kw.cinRowPtr), \ - sgpr("StrideC%s"%strideChar), self.rowInc, tmpS01, "ROWINC- Move cinRowPtr to next row") - kStr += self.addScaled(vgpr(kw.coutRowPtr), vgpr(kw.coutRowPtr), \ - sgpr("StrideD%s"%strideChar), self.rowInc, tmpS01, "Move coutRowPtr to next row") + if len(kernel["PackedC1IndicesX"]) == 1: + strideChar = self.kernelWriter.indexChars[kernel["PackedC1IndicesX"][0]] + kStr += self.addScaled(vgpr(kw.cinRowPtr), vgpr(kw.cinRowPtr), \ + sgpr("StrideC%s"%strideChar), self.rowInc, tmpS01, "ROWINC- Move cinRowPtr to next row") + kStr += self.addScaled(vgpr(kw.coutRowPtr), vgpr(kw.coutRowPtr), \ + sgpr("StrideD%s"%strideChar), self.rowInc, tmpS01, "Move coutRowPtr to next row") + elif len(kernel["PackedC1IndicesX"]) > 1: + kStr += self.kernelWriter.extractPackedCoord1ToRowStart(kernel, kernel["PackedC1IndicesX"] , self.coord1Vgpr, 'D') # Shift Pointer for MFMA: # For MFMA shift pointer, correct data 
is stored in another thread. @@ -9746,6 +9950,8 @@ def emitAddressSetupCode(self, kernel, ss, tmpVgpr, tmpS01, edge, beta, atomic, if not kernel["GuaranteeNoPartialB"] and kw.readTileDimVectorB and kernel["EnableMatrixInstruction"] and edge: (d1,d0,vc1,vc0) = self.element if (d1 == vc1 == d0 == vc0 == 0) or self.newCoord1: + sgprCnt = self.kernelWriter.laneSGPRCount + waveSize = kernel["WavefrontSize"] packedC1 = kernel["PackedC1IndicesX"] strideC1 = "StrideC%s" % (kw.indexChars[packedC1[0]]) strideD1 = "StrideD%s" % (kw.indexChars[packedC1[0]]) @@ -9755,29 +9961,28 @@ def emitAddressSetupCode(self, kernel, ss, tmpVgpr, tmpS01, edge, beta, atomic, vTmp1 = tmpVgpr vTmp2 = tmpVgpr+1 sTmp1 = tmpS01 - sTmp2 = tmpS01+2 + sTmp2 = tmpS01+sgprCnt # check conditions kStr += inst("v_bfi_b32", vgpr(vTmp1), vw-1, 0, vgpr(self.coord1Vgpr), "coord1 & ~(vw-1)") kStr += inst("v_bfi_b32", vgpr(vTmp2), vw-1, 0, sgpr("SizesFree+%u"%kw.tPB["idx"]), "sizeFree & ~(vw-1)") - kStr += inst("v_cmp_eq_u32", sgpr(sTmp1,2), vgpr(vTmp1), vgpr(vTmp2), "if coord1 is in edge glvw") + kStr += inst("v_cmp_eq_u32", sgpr(sTmp1,sgprCnt), vgpr(vTmp1), vgpr(vTmp2), "if coord1 is in edge glvw") kStr += inst("v_and_b32", vgpr(vTmp2), sgpr("SizesFree+%u"%kw.tPB["idx"]), vw-1, "sizeFree mod VW") - kStr += inst("v_cmp_gt_u32", sgpr(sTmp2,2), vgpr(vTmp2), 0, "this problem is not multiple size of glvw") - kStr += inst("s_and_b64", sgpr(sTmp1,2), sgpr(sTmp1,2), sgpr(sTmp2,2), "AND both conditions") + kStr += inst("v_cmp_gt_u32", sgpr(sTmp2,sgprCnt), vgpr(vTmp2), 0, "this problem is not multiple size of glvw") + kStr += inst("s_and_b{}".format(waveSize), sgpr(sTmp1,sgprCnt), sgpr(sTmp1,sgprCnt), sgpr(sTmp2,sgprCnt), "AND both conditions") # calculate new coord - kStr += inst("v_add_u32", vgpr(vTmp1), vgpr(self.coord1Vgpr), vgpr(vTmp2), "shift coord1") - kStr += inst("v_bfi_b32", vgpr(vTmp1), vw-1, vgpr(vTmp1), sgpr("SizesFree+%u"%kw.tPB["idx"]), \ - "new coord1 = (shift coord1 & (vw-1)) | (sizeFree & ~(vw-1))") - 
kStr += inst("v_sub_i32", vgpr(vTmp2), vgpr(vTmp1), vgpr(self.coord1Vgpr), "shift how many column") + kStr += inst("_v_add_u32", vgpr(vTmp1), vgpr(self.coord1Vgpr), vgpr(vTmp2), "shift coord1") + kStr += inst("v_bfi_b32", vgpr(vTmp1), vw-1, vgpr(vTmp1), sgpr("SizesFree+%u"%kw.tPB["idx"]), "new coord1 = (shift coord1 & (vw-1)) | (sizeFree & ~(vw-1))") + kStr += inst("_v_sub_i32", vgpr(vTmp2), vgpr(vTmp1), vgpr(self.coord1Vgpr), "shift how many column") kStr += inst("v_cndmask_b32", vgpr(self.coord1Vgpr), vgpr(self.coord1Vgpr), vgpr(vTmp1), \ - sgpr(sTmp1,2), "set new coord1 if meet conditions" ) + sgpr(sTmp1,sgprCnt), "set new coord1 if meet conditions" ) kStr += inst("v_mad_i32_i24", vgpr(vTmp1), sgpr(strideC1), vgpr(vTmp2), vgpr(kw.cinRowPtr), \ "new rowStart address += shift column * StridesC") - kStr += inst("v_cndmask_b32", vgpr(kw.cinRowPtr), vgpr(kw.cinRowPtr), vgpr(vTmp1), sgpr(sTmp1,2), \ + kStr += inst("v_cndmask_b32", vgpr(kw.cinRowPtr), vgpr(kw.cinRowPtr), vgpr(vTmp1), sgpr(sTmp1,sgprCnt), \ "set new rowStart if meet conditions" ) kStr += inst("v_mad_i32_i24", vgpr(vTmp1), sgpr(strideD1), vgpr(vTmp2), vgpr(kw.coutRowPtr), \ "new rowStart address += shift column * StridesD") - kStr += inst("v_cndmask_b32", vgpr(kw.coutRowPtr), vgpr(kw.coutRowPtr), vgpr(vTmp1), sgpr(sTmp1,2), \ + kStr += inst("v_cndmask_b32", vgpr(kw.coutRowPtr), vgpr(kw.coutRowPtr), vgpr(vTmp1), sgpr(sTmp1,sgprCnt), \ "set new rowStart if meet conditions" ) if kernel["StoreRemapVectorWidth"]: @@ -9787,7 +9992,7 @@ def emitAddressSetupCode(self, kernel, ss, tmpVgpr, tmpS01, edge, beta, atomic, kStr += inst("v_mad_i32_i24", vgpr(vTmp1), vgpr(vTmp1), vgpr(vTmp2), vgpr(kw.storeRemapLW), \ "new lds write address += shift column * Lds byte Stride") kStr += inst("v_cndmask_b32", vgpr(kw.storeRemapLW), vgpr(kw.storeRemapLW), vgpr(vTmp1), \ - sgpr(sTmp1,2), "set new rowStart if meet conditions" ) + sgpr(sTmp1,sgprCnt), "set new rowStart if meet conditions" ) kStr += "\n" return kStr @@ -9798,18 
+10003,20 @@ def emitLdChange(self, kernel, ss, tc, edge, beta, mask, singleUpdate, tmpVgpr, Generate code for final C read/D write address """ + laneSGPRCount = self.kernelWriter.laneSGPRCount + kStr = "" if kernel["BufferStore"]: kStr += self.emitScaleToBpe(kernel, ss, tmpVgpr, singleUpdate, tc) if edge and (not kernel["StoreRemapVectorWidth"] or (kernel["StoreRemapVectorWidth"] and beta)): kStr += inst("v_cndmask_b32", vgpr(self.addrVgpr), -1, vgpr(self.addrVgpr), \ - sgpr(mask,2), "LD%s clip if OOB. offset" % tc ) + sgpr(mask,laneSGPRCount), "LD%s clip if OOB. offset" % tc ) else: # store a copy of the offset in 2 of the tmpVgpr for D - kStr += inst("_v_add_co_u32", vgpr(addr+0), "vcc", vgpr(BufAddr+0), vgpr(tmpVgpr+2), \ + kStr += inst("_v_add_co_u32", vgpr(addr+0), self.kernelWriter.vcc, vgpr(BufAddr+0), vgpr(tmpVgpr+2), \ "addr = C(D) + index*bytes (lo)" ) - kStr += inst("_v_addc_co_u32", vgpr(addr+1), "vcc", vgpr(BufAddr+1), vgpr(tmpVgpr+3), \ - "vcc", "addr = C(D) + index*bytes (hi)") + kStr += inst("_v_addc_co_u32", vgpr(addr+1), self.kernelWriter.vcc, vgpr(BufAddr+1), vgpr(tmpVgpr+3), \ + self.kernelWriter.vcc, "addr = C(D) + index*bytes (hi)") return kStr @@ -10047,9 +10254,9 @@ def globalWriteElements(self, kernel, vectorWidths, elements, kStr += inst("v_readfirstlane_b32", sgpr("Alpha"), vgpr(alphaVgprTmp), "restore alpha sgpr") self.vgprPool.checkIn(alphaVgprTmp) - #jgolds look at moving these converted values back to scalar regs and free up the VGPRs - # ethan if using (h,h,h,h,s,s) + HPA then the host should pass in an F32 alpha, we don't have to do the cvt if beta: + #jgolds look at moving these converted values back to scalar regs and free up the VGPRs + # TODO - for hpa the host should pass in an F32 alpha so we don't have to do it here self.betaVgpr = self.vgprPool.checkOut(1, "beta") kStr += inst("v_mov_b32", vgpr(self.betaVgpr), sgpr("Beta"), "sgpr -> vgpr b/c op_sel") kStr += inst("v_cvt_f32_f16", vgpr(self.betaVgpr), vgpr(self.betaVgpr), 
"convert beta to fp32") @@ -10085,22 +10292,6 @@ def globalWriteElements(self, kernel, vectorWidths, elements, if beta: kStr += "%s:\n"%(betaLabel) - # if len(betas) == 1, then is for OptNLL (case 2), else is OrdNLL (case 3,4) - if self.canOptimizePreLoopLWVmcnt: - if len(betas) > 1: # betas = [False,True], OrdNLL - if beta: # case 3 = no beta / case 4 = beta - case = 4 - self.currPreLoopVmcntCase = PreLoopVmcntCase.OrdNLL_B1_Store - else: - case = 3 - self.currPreLoopVmcntCase = PreLoopVmcntCase.OrdNLL_B0_Store - kStr += inst("s_mov_b32", sgpr("PreLoopLWVmcntCase"), hex(case), \ - "for optimizing next PreLoop LWVmcnt, set to Case%u: OrdNLL and %sbeta"%(case, "" if beta else "no ")) - else: # betas = [False], OptNLL - self.currPreLoopVmcntCase = PreLoopVmcntCase.OptNLL_Store - kStr += inst("s_mov_b32", sgpr("PreLoopLWVmcntCase"), hex(2), \ - "for optimizing next PreLoop LW vmcnt, set to Case2: OptNLL") - ######################################## # branch if Edge0 or Edge1 if False in edges and True in edges: @@ -10110,6 +10301,20 @@ def globalWriteElements(self, kernel, vectorWidths, elements, for edge in edges: kStr += "%s:%s"%(writeLabels[beta][edge], self.endLine) + if self.canOptimizePreLoopLWVmcnt: + if beta: + self.currPreLoopVmcntCase = PreLoopVmcntCase.OrdNLL_B1_Store + elif edge: + self.currPreLoopVmcntCase = PreLoopVmcntCase.OrdNLL_E1_Store + else: + self.currPreLoopVmcntCase = PreLoopVmcntCase.OptNLL_Store + kStr += inst("s_mov_b32", sgpr("PreLoopLWVmcntCase"), hex(self.currPreLoopVmcntCase.value), \ + "for optimizing next PreLoop LW vmcnt, set to Case%u"%self.currPreLoopVmcntCase.value) + # reset vmcnt if the dict has this key (OptNLL_Store, OrdNLL_E1_Store), + # OrdNLL_B1_Store is excluded + if self.currPreLoopVmcntCase in self.preLoopVmcntDict: + self.preLoopVmcntDict[self.currPreLoopVmcntCase] = 0 + # for storeRemap edge case, non-beta still can enable vector stores if kernel["StoreRemapVectorWidth"] and not beta: edgeI = False @@ -10321,34 +10526,49 
@@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ # rpv = regs per vector rpv = bpl/4.0 - if self.version[0] == 10: - extraFields += " glc, slc, dlc" if useBuffer: + rv = Code.Module("Global Read") tailFields = "offen offset:%u"%offset + # buffer_load offset field is 12-bit. + # if offset >= 4096, use soffset instead + if offset >= 4096: + if soffset == 0 or soffset == "0": + tailFields = "offen offset:0" + soffset = sgpr(self.getTmpSgpr(1).idx()) + rv.addCode(inst("s_mov_b32", soffset, offset, "large offset")) + else: + assert 0, "offset too large and soffset set" if extraFields != "": tailFields += ", %s"% extraFields if bpl==1 and hi16: - return Code.GlobalReadInst("buffer_load_ubyte_d16_hi", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_ubyte_d16_hi", vgpr(destVgpr, rpv*4), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==1 and not hi16: - return Code.GlobalReadInst("buffer_load_ubyte_d16", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_ubyte_d16", vgpr(destVgpr, rpv*4), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==2 and hi16: - return Code.GlobalReadInst("buffer_load_short_d16_hi", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_short_d16_hi", vgpr(destVgpr, rpv*2), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==2 and not hi16: - return Code.GlobalReadInst("buffer_load_short_d16", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_short_d16", vgpr(destVgpr, rpv*2), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==4: - return Code.GlobalReadInst("buffer_load_dword", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_dword", 
vgpr(destVgpr, rpv), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==8: - return Code.GlobalReadInst("buffer_load_dwordx2", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_dwordx2", vgpr(destVgpr, rpv), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==16: - return Code.GlobalReadInst("buffer_load_dwordx4", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment) + rv.addCode(Code.GlobalReadInst("buffer_load_dwordx4", vgpr(destVgpr, rpv), addr0, \ + addr1, soffset, tailFields, comment)) + return rv elif bpl==32: # split into two dwordx4 loads. Second load offset is +0.5 bpl tailFields1 = "offen offset:%u"%(offset + bpl/2) @@ -10360,11 +10580,11 @@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ addr1, soffset, tailFields, comment)) rv.addCode(Code.GlobalReadInst("buffer_load_dwordx4", vgpr(int(destVgpr + rpv/2), rpv/2), addr0, \ addr1, soffset, tailFields1, comment)) - return rv - else: assert 0, "chooseGlobalRead: bad bpl" + return rv + else: if bpl==2 and hi16: return Code.GlobalReadInst("flat_load_short_d16_hi", vgpr(destVgpr, rpv*2), addr0, extraFields, comment ) @@ -10391,28 +10611,36 @@ def chooseGlobalWrite(self, useBuffer, bps, srcVgpr, rpv, \ kStr = "" if useBuffer: + tmpSgpr = 0 + # buffer_load offset field is 12-bit. 
+ # if offset >= 4096, use soffset instead + if offset >= 4096: + tmpSgpr = sgpr(self.getTmpSgpr(1).idx()) + kStr += inst("s_mov_b32", tmpSgpr, offset, "large offset") + offset = 0 + if bps==2 and hi16: kStr += inst("buffer_store_short_d16_hi", vgpr(srcVgpr, rpv*2), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") elif bps==2 and not hi16: kStr += inst("buffer_store_short", vgpr(srcVgpr, rpv*2), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") elif bps==4: kStr += inst("buffer_store_dword", vgpr(srcVgpr, rpv), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") elif bps==8: kStr += inst("buffer_store_dwordx2", vgpr(srcVgpr, rpv), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") elif bps==16: kStr += inst("buffer_store_dwordx4", vgpr(srcVgpr, rpv), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") elif bps == 32: # split into two dwordx4 loads. 
Offset the second by +0.5 bps kStr += inst("buffer_store_dwordx4", vgpr(srcVgpr, rpv/2), addr0, \ - addr1, 0, "offen", "offset:%u"%offset, extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%offset, extraFields, "store D") kStr += inst("buffer_store_dwordx4", vgpr(int(srcVgpr +rpv/2), rpv/2), addr0, \ - addr1, 0, "offen", "offset:%u"%(int(offset+bps/2)), extraFields, "store D") + addr1, tmpSgpr, "offen", "offset:%u"%(int(offset+bps/2)), extraFields, "store D") else: assert 0, "bad bps" else: @@ -10511,7 +10739,7 @@ def chooseAddForAtomic(self, kernel, dst, src0, src1, comment): elif kernel["ProblemType"]["DataType"].isInt8x4() or kernel["ProblemType"]["DataType"].isInt8(): # assume v_add_i32 can be used in place of v_add_f32 # need to add saturation directive to v_add_i32 instruction to clamp integer arithmetic - kStr += inst("v_add_i32", \ + kStr += inst("_v_add_i32", \ dst, src0, src1, \ comment) elif kernel["ProblemType"]["DataType"].isSingle(): @@ -10584,15 +10812,15 @@ def applyAlpha(self, kernel, gwvw, elementSumIdx, elementIdx, tmpS01): tmpVgpr = self.vgprPool.checkOut(1) kStr += inst("v_mov_b32", vgpr(tmpVgpr), vgpr("ValuC+%u"%(sumIdxV*2)), "store Cr") kStr += inst("v_mul_f32", vgpr("ValuC+%u"%(sumIdxV*2)), sgpr("Alpha"), vgpr("ValuC+%u"%(sumIdxV*2)), "*= alpha ( Cr = Ar * Cr)") - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), "-" + sgpr("Alpha+1"), vgpr("ValuC+%u"%(sumIdxV*2+1)), "*= alpha ( Cr += -Ai * Ci )") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), "-" + sgpr("Alpha+1"), vgpr("ValuC+%u"%(sumIdxV*2+1)), "*= alpha ( Cr += -Ai * Ci )") kStr += inst("v_mul_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), sgpr("Alpha"), vgpr("ValuC+%u"%(sumIdxV*2+1)), "*= alpha ( Ci = Ar * Ci)") - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), sgpr("Alpha+1"), vgpr(tmpVgpr), "*= alpha ( Ci += Ai * Cr_backup )") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), sgpr("Alpha+1"), vgpr(tmpVgpr), "*= alpha ( Ci += Ai * Cr_backup )") 
self.vgprPool.checkIn(tmpVgpr) # double precision complex elif kernel["ProblemType"]["ComputeDataType"].isDoubleComplex(): - vtmp1 = self.vgprPool.checkOut(2) - vtmp2 = self.vgprPool.checkOut(2) + vtmp1 = self.vgprPool.checkOutAligned(2, 2) + vtmp2 = self.vgprPool.checkOutAligned(2, 2) # tmp1 = a.real * b.real kStr += inst("v_mul_f64", vgpr(vtmp1,2), sgpr("Alpha+0",2), vgpr("ValuC+%u"%(sumIdxV*4+0),2), "") # tmp2 = a.imag * b.real @@ -10681,7 +10909,10 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, loadsIssued = 0 storesIssued = 0 tmpS01 = tmpSgpr # scratch sgprs - tmpS23 = tmpS01+2 + tmpS23 = tmpS01+self.laneSGPRCount + + wavelen = self.kernel["WavefrontSize"] + laneSGPRC = self.laneSGPRCount ######################################## # calculate addr and masks @@ -10753,9 +10984,16 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, kStr += self.applyAlpha(kernel, gwvw, ss.elementSumIdx, elementIdx, tmpS01) if not kernel["BufferStore"]: + offsetSrc = (tmpVgpr+2) if beta else addr + + kStr += inst("_v_add_co_u32", vgpr(addr+0), self.vcc, vgpr(addrD+0), \ + vgpr(offsetSrc+0), "addr = D + index*bytes (lo)" ) + kStr += inst("_v_addc_co_u32", vgpr(addr+1), self.vcc, vgpr(addrD+1), \ + vgpr(offsetSrc+1), self.vcc, "addr = D + index*bytes (hi)") + # restore full exec mask for calculating addr of next element if edge and (beta or atomic): - kStr += inst("s_mov_b64", "exec", -1, "full mask -1 -> exec" ) + kStr += inst("s_mov_b{}".format(kernel["WavefrontSize"]), self.exec, -1, "full mask -1 -> exec" ) ######################################## # AccVgpr read @@ -10807,7 +11045,6 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # - Use vi to compute addresses, sumIdx. # - Need a solution for the mask. Can move to all buffer or can fix? 
- # atomic loop label element = batchElements[0] d1 = element[0] d0 = element[1] @@ -10834,7 +11071,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # apply in-bounds exec mask if edge: - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "sgprs -> exec (before atomic)" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "sgprs -> exec (before atomic)" ) for avi in range(0, gwvw//atomicW): dataV = ss.elementData[elementIdx] + int(avi*ss.cfg.numVgprsPerDataPerVI) @@ -10851,7 +11088,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, pass # TODO: if edge: - kStr += inst("s_mov_b64", "exec", -1, "full mask -> exec" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, -1, "full mask -> exec" ) else: ######################################## # wait for batched load @@ -10874,7 +11111,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # apply in-bounds exec mask if edge: - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "sgprs -> exec (before atomic)" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "sgprs -> exec (before atomic)" ) for avi in range(0, gwvw//atomicW): dataV = ss.elementData[elementIdx] + int(avi*ss.cfg.numVgprsPerDataPerVI) @@ -10940,7 +11177,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # calculate new masks if edge: - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "sgprs -> exec" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "sgprs -> exec" ) for avi in range(0, gwvw//atomicW): dataV = ss.elementData[elementIdx] + int(avi*ss.cfg.numVgprsPerDataPerVI) atomicDestVgpr = dataV if kernel["BufferStore"] else dataV+2 @@ -10949,46 +11186,47 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, if avi == 0: # use u64 for DGEMM if kernel["ProblemType"]["DestDataType"].isDouble(): - kStr += 
inst("v_cmp_ne_u64", sgpr(tmpS01,2), vgpr(atomicDestVgpr,2), \ + kStr += inst("v_cmp_ne_u64", sgpr(tmpS01,laneSGPRC), vgpr(atomicDestVgpr,2), \ vgpr(dataV+2,2), "c read during atomic == c read during prior load (avi=%u, first)"%avi ) else: - kStr += inst("v_cmp_ne_u32", sgpr(tmpS01,2), vgpr(atomicDestVgpr), \ + kStr += inst("v_cmp_ne_u32", sgpr(tmpS01,laneSGPRC), vgpr(atomicDestVgpr), \ vgpr(dataV+1), "c read during atomic == c read during prior load (avi=%u, first)"%avi ) else: if kernel["ProblemType"]["DestDataType"].isDouble(): - kStr += inst("v_cmp_ne_u64", sgpr(tmpS23,2), vgpr(atomicDestVgpr,2), \ + kStr += inst("v_cmp_ne_u64", sgpr(tmpS23,laneSGPRC), vgpr(atomicDestVgpr,2), \ vgpr(dataV+2,2), "c read during atomic != c read during prior load" ) else: - kStr += inst("v_cmp_ne_u32", sgpr(tmpS23,2), vgpr(atomicDestVgpr), \ + kStr += inst("v_cmp_ne_u32", sgpr(tmpS23,laneSGPRC), vgpr(atomicDestVgpr), \ vgpr(dataV+1), "c read during atomic == c read during prior load (avi=%u)"%avi ) - kStr += inst("s_or_b64", sgpr(tmpS01,2), sgpr(tmpS01,2), sgpr(tmpS23,2), "combine with tmp mask") + kStr += inst("s_or_b{}".format(wavelen), sgpr(tmpS01,laneSGPRC), \ + sgpr(tmpS01,laneSGPRC), sgpr(tmpS23,laneSGPRC), "combine with tmp mask") if kernel["DisableAtomicFail"]: - kStr += inst("s_mov_b64", sgpr(mask,2), 0, "DisableAtomicFail, force 0" ) + kStr += inst("s_mov_b{}".format(wavelen), sgpr(mask,laneSGPRC), 0, "DisableAtomicFail, force 0" ) else: - kStr += inst("s_and_b64", sgpr(mask,2), sgpr(tmpS01,2), sgpr(mask,2), "inBounds & must try again" ) + kStr += inst("s_and_b{}".format(wavelen), sgpr(mask,laneSGPRC), sgpr(tmpS01,laneSGPRC), sgpr(mask,laneSGPRC), "inBounds & must try again" ) else: for avi in range(0, gwvw//atomicW): dataV = ss.elementData[elementIdx] + int(avi*ss.cfg.numVgprsPerDataPerVI) atomicDestVgpr = dataV if kernel["BufferStore"] else dataV+2 if kernel["DisableAtomicFail"]: - kStr += inst("s_mov_b64", sgpr(mask,2), 0, "DisableAtomicFail, force 0" ) + kStr += 
inst("s_mov_b{}".format(wavelen), sgpr(mask,laneSGPRC), 0, "DisableAtomicFail, force 0" ) else: if kernel["ProblemType"]["DestDataType"].isDouble(): - kStr += inst("v_cmp_ne_u64", sgpr(mask,2), vgpr(atomicDestVgpr,2), \ + kStr += inst("v_cmp_ne_u64", sgpr(mask,laneSGPRC), vgpr(atomicDestVgpr,2), \ vgpr(dataV+2,2), "c read during atomic != c read during prior load" ) else: - kStr += inst("v_cmp_ne_u32", sgpr(mask,2), vgpr(atomicDestVgpr), \ + kStr += inst("v_cmp_ne_u32", sgpr(mask,laneSGPRC), vgpr(atomicDestVgpr), \ vgpr(dataV+1), "c read during atomic != c read during prior load" ) # or masks together to check early exit kStr += self.comment("or masks to check for exit") - kStr += inst("s_mov_b64", sgpr(tmpS01,2), hex(0), "empty mask" ) + kStr += inst("s_mov_b{}".format(wavelen), sgpr(tmpS01,laneSGPRC), hex(0), "empty mask" ) for elementIdx in range(0, len(batchElements)): mask = ss.elementMask[elementIdx] - kStr += inst("s_or_b64", sgpr(tmpS01,2), sgpr(mask,2), sgpr(tmpS01,2), "or to add threads" ) - kStr += inst("s_or_saveexec_b64", sgpr(tmpS23,2), sgpr(tmpS01,2), "apply combined mask" ) + kStr += inst("s_or_b{}".format(wavelen), sgpr(tmpS01,laneSGPRC), sgpr(mask,laneSGPRC), sgpr(tmpS01,laneSGPRC), "or to add threads" ) + kStr += inst("s_or_saveexec_b{}".format(wavelen), sgpr(tmpS23,laneSGPRC), sgpr(tmpS01,laneSGPRC), "apply combined mask" ) kStr += inst("s_cbranch_execz", "label_%04u" % labelAfterAtomicLoop, "if exec is zero skip loop" ) # begin atomic loop @@ -11014,7 +11252,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, if kernel["ProblemType"]["DestDataType"].isDouble(): sumIdxV = sumIdxV * 2 # apply mask for element - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "must try again" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "must try again" ) if kernel["ProblemType"]["DestDataType"].isDouble(): #64-bit C val move by 2 32-bit instructions kStr += inst("v_mov_b32", vgpr(dataV+2), 
vgpr(atomicDestVgpr), "dataV+2 = tmp (new original C)" ) @@ -11062,36 +11300,30 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, atomicDestVgpr = dataV if kernel["BufferStore"] else dataV+2 # apply mask for element - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "must try again" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "must try again" ) # compare success if kernel["ProblemType"]["DestDataType"].isDouble(): - kStr += inst("v_cmp_ne_u64", sgpr(tmpS01,2), vgpr(data+2,2), vgpr(atomicDestVgpr,2), \ + kStr += inst("v_cmp_ne_u64", sgpr(tmpS01,laneSGPRC), vgpr(data+2,2), vgpr(atomicDestVgpr,2), \ "c read during atomic != c read during prior load" ) else: - kStr += inst("v_cmp_ne_u32", sgpr(tmpS01,2), vgpr(data+1), vgpr(atomicDestVgpr), \ + kStr += inst("v_cmp_ne_u32", sgpr(tmpS01,laneSGPRC), vgpr(data+1), vgpr(atomicDestVgpr), \ "c read during atomic == c read during prior load" ) # update element mask - kStr += inst("s_and_b64", sgpr(mask,2), sgpr(tmpS01,2), sgpr(mask,2), "inBounds & must try again" ) + kStr += inst("s_and_b{}".format(wavelen), sgpr(mask,laneSGPRC), sgpr(tmpS01,laneSGPRC), sgpr(mask,laneSGPRC), "inBounds & must try again" ) # or masks together kStr += self.comment("or masks to check for exit") - kStr += inst("s_mov_b64", sgpr(tmpS01,2), hex(0), "empty mask" ) + kStr += inst("s_mov_b{}".format(wavelen), sgpr(tmpS01,laneSGPRC), hex(0), "empty mask" ) for elementIdx in range(0, len(batchElements)): mask = ss.elementMask[elementIdx] - kStr += inst("s_or_b64", sgpr(tmpS01,2), sgpr(mask,2), sgpr(tmpS01,2), "or to add threads" ) + kStr += inst("s_or_b{}".format(wavelen), sgpr(tmpS01,laneSGPRC), sgpr(mask,laneSGPRC), sgpr(tmpS01,laneSGPRC), "or to add threads" ) # apply combined masks and exit - kStr += inst("s_or_saveexec_b64", sgpr(tmpS23,2), sgpr(tmpS01,2), "apply combined mask" ) + kStr += inst("s_or_saveexec_b{}".format(wavelen), sgpr(tmpS23,laneSGPRC), sgpr(tmpS01,laneSGPRC), "apply 
combined mask" ) kStr += inst("s_cbranch_execnz", "label_%04u" % label, "try again if not complete" ) kStr += "label_%04u:%s" % (labelAfterAtomicLoop, self.endLine) - kStr += inst("s_mov_b64", "exec", -1, "full mask -> exec" ) - - # kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "apply new mask" ) - # #kStr += inst("s_and_saveexec_b64", sgpr(tmpS45,2), "vcc", "apply new mask" ) - # kStr += inst("s_cbranch_execnz", "label_%04u" % labelIdx, "try again if not complete" ) - # kStr += inst("s_mov_b64", "exec", sgpr(fullExecMaskSgpr,2), "full mask -> exec" ) - + kStr += inst("s_mov_b{}".format(wavelen), self.exec, -1, "full mask -> exec" ) ######################################## # Not Atomic @@ -11159,11 +11391,11 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, kStr += self.comment("StoreRemap: shift coord1 address") kStr += addrCalc.incrementToNextRow(kernel, "D", ss, tmpS01) kStr += inst("v_mov_b32", vgpr(tmpVgpr), addrCalc.rowInc, "set shift rows") - kStr += inst("v_add_u32", vgpr(self.storeRemapCoord1), vgpr(self.storeRemapCoord1), vgpr(tmpVgpr), "shift storeRemap coord1") + kStr += inst("_v_add_u32", vgpr(self.storeRemapCoord1), vgpr(self.storeRemapCoord1), vgpr(tmpVgpr), "shift storeRemap coord1") # apply in-bounds exec mask if edge and not kernel["BufferStore"]: - kStr += inst("s_mov_b64", "exec", sgpr(mask,2), "sgprs -> exec" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, sgpr(mask,laneSGPRC), "sgprs -> exec" ) if beta: # if GWVW=1 the half path still assumes we have @@ -11227,11 +11459,11 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, kStr += inst("v_and_b32", vgpr(tmpVgpr), vgpr(dataCExternal), vgpr(vgprBf16Mask), "convert bf16 to fp32") else: kStr += inst("v_lshlrev_b32", vgpr(tmpVgpr), "16", vgpr(dataCExternal), "convert bf16 to fp32" ) - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%sumIdxV), vgpr(tmpVgpr), sgpr("Beta"), \ + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%sumIdxV), 
vgpr(tmpVgpr), sgpr("Beta"), \ "finalSum = sum*alpha + C*beta") elif kernel["ProblemType"]["DestDataType"].isSingle(): - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%sumIdxV), vgpr(dataV+0), sgpr("Beta"), \ + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%sumIdxV), vgpr(dataV+0), sgpr("Beta"), \ "finalSum = sum*alpha + C*beta") elif kernel["ProblemType"]["DestDataType"].isInt32(): @@ -11239,11 +11471,10 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # v_mad_i32_i24 # kStr += inst("v_mad_i32_i24", vgpr("ValuC+%u"%sumIdxV), vgpr(dataV+0), sgpr("Beta"), vgpr("ValuC+%u"%sumIdxV), \ # "finalSum = sum*alpha + C*beta") - kStr += inst("v_mul_lo_i32", vgpr(dataV+0), sgpr("Beta"), vgpr(dataV+0), \ + kStr += inst("v_mul_lo_u32", vgpr(dataV+0), sgpr("Beta"), vgpr(dataV+0), \ "C = C*beta") - kStr += inst("v_add_u32", vgpr("ValuC+%u"%sumIdxV), vgpr(dataV+0), vgpr("ValuC+%u"%sumIdxV), \ + kStr += inst("_v_add_u32", vgpr("ValuC+%u"%sumIdxV), vgpr(dataV+0), vgpr("ValuC+%u"%sumIdxV), \ "finalSum = sum*alpha + C*beta") - kStr += " " elif kernel["ProblemType"]["DestDataType"].isDouble(): # dataV+0 = new c = old c*beta @@ -11252,10 +11483,10 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # single precision complex elif kernel["ProblemType"]["DestDataType"].isSingleComplex(): - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), vgpr(dataV+0), sgpr("Beta"), "finalSum Cr += old Cr * Br") - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), vgpr(dataV+1), "-"+sgpr("Beta+1"), "finalSum Cr += old Ci * -Bi") - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), vgpr(dataV+1), sgpr("Beta"), "finalSum Ci += old Ci * Br") - kStr += inst("v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), vgpr(dataV+0), sgpr("Beta+1"), "finalSum Ci += old Cr * Bi") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), vgpr(dataV+0), sgpr("Beta"), "finalSum Cr += old Cr * Br") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2)), 
vgpr(dataV+1), "-"+sgpr("Beta+1"), "finalSum Cr += old Ci * -Bi") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), vgpr(dataV+1), sgpr("Beta"), "finalSum Ci += old Ci * Br") + kStr += inst("_v_mac_f32", vgpr("ValuC+%u"%(sumIdxV*2+1)), vgpr(dataV+0), sgpr("Beta+1"), "finalSum Ci += old Cr * Bi") # double precision complex elif kernel["ProblemType"]["DestDataType"].isDoubleComplex(): @@ -11280,10 +11511,10 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, kStr += inst("v_pack_b32_f16", vgpr(d), vgpr("ValuC+%u"%(sumIdxV-1)), vgpr("ValuC+%u"%sumIdxV), "Pack with neighbor" ) elif kernel["ProblemType"]["DestDataType"].isBFloat16(): - kStr += inst("v_cmp_u_f32", sgpr(tmpS01,2), vgpr("ValuC+%u"%sumIdxV), vgpr("ValuC+%u"%sumIdxV), "check Nan" ) + kStr += inst("v_cmp_u_f32", sgpr(tmpS01,laneSGPRC), vgpr("ValuC+%u"%sumIdxV), vgpr("ValuC+%u"%sumIdxV), "check Nan" ) kStr += inst("v_bfe_u32", vgpr(vgprBf16Temp), vgpr("ValuC+%u"%sumIdxV), "16", "1", "Non-Nan case: store lsb of bf16" ) kStr += inst("v_add3_u32", vgpr(vgprBf16Temp), vgpr("ValuC+%u"%sumIdxV), vgpr(vgprBf16Temp), vgpr(vgprBf16Inc), "Non-Nan case: add lsb and the increment for rounding" ) - kStr += inst("v_cndmask_b32", vgpr("ValuC+%u"%sumIdxV), vgpr(vgprBf16Temp), vgpr(vgprFp32Nan), sgpr(tmpS01,2), "" ) + kStr += inst("v_cndmask_b32", vgpr("ValuC+%u"%sumIdxV), vgpr(vgprBf16Temp), vgpr(vgprFp32Nan), sgpr(tmpS01,laneSGPRC), "" ) if vi%2 == 0: kStr += inst("v_lshrrev_b32", vgpr("ValuC+%u"%sumIdxV), "16", vgpr("ValuC+%u"%sumIdxV), "convert C to bf16" ) elif vi%2 == 1: @@ -11356,7 +11587,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, # subsequent batch must start with full exec mask # BufferStore doesn't need exec since it used buffer range checking when # possible - kStr += inst("s_mov_b64", "exec", -1, "full mask -> exec" ) + kStr += inst("s_mov_b{}".format(wavelen), self.exec, -1, "full mask -> exec" ) if self.db["ConservativeWaitCnt"] 
& 0x40: kStr += "s_barrier // debug\n" @@ -11403,7 +11634,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, ############################################################################## ############################################################################## def openPrefetchAcrossPersistent(self, kernel, isOptNLL): - label = "SkipPrefetchAcrossPersistent_OptNLL" if isOptNLL else "SkipPrefetchAcrossPersistent" + label = "SkipTo_PureOptNLL_LastTile" if isOptNLL else "SkipPrefetchAcrossPersistent" imod = Code.Module() stmp = self.getTmpSgpr(1).idx() imod.addCode(self.comment3("PrefetchAcrossPersistent - Open")) @@ -11411,7 +11642,7 @@ def openPrefetchAcrossPersistent(self, kernel, isOptNLL): if kernel["PersistentKernelAlongBatch"]: imod.addInst("s_mul_i32", sgpr(stmp), sgpr(stmp), sgpr("NumWorkGroups2"), "Total WG-0 x 1 x 2") imod.addInst("s_cmp_ge_u32", sgpr("SerialWorkGroupIter"), sgpr(stmp), "outside legal WG?") - imod.addInst("s_cbranch_scc1", self.getNamedLabel(label), "skip pf if OOB") + imod.addInst("s_cbranch_scc1", self.getNamedLabel(label), "skip pf if OOB - last tile no PAP, go to pure OptNLL") #imod.addInst("s_branch", self.getLabelTarget("SkipPrefetchAcrossPersistent"), "skip pf if OOB") return imod @@ -11581,7 +11812,7 @@ def wait(self, kernel, tPA, tPB, skipGlobalRead, skipLocalWrite, \ # SyncThreads ############################################################################## def syncThreads(self, kernel, comment=""): - if kernel["NumThreads"] > 64 and self.do["Sync"]: + if kernel["NumThreads"] > self.kernel["WavefrontSize"] and self.do["Sync"]: kStr = "" if self.archCaps["SeparateVscnt"]: kStr += inst("s_waitcnt_lgkmcnt", "null", "0", "extra navi wait") @@ -11681,16 +11912,28 @@ def AccToArchMapper(self, kernel): acc2arch[i] = i arch2acc[i] = i else: - OutputsPerMFMA1B = kernel["MatrixInstM"] * kernel["MatrixInstN"] // globalParameters["WavefrontWidth"] * kernel["MIRegPerOut"] - for wgIdx1 in range(0, 
kernel["MIWaveTile"][1]): - for wgIdx0 in range(0, kernel["MIWaveTile"][0]): - for bIdx1 in range(0, kernel["MatrixInstBN"]): - for bIdx0 in range(0, kernel["MatrixInstBM"]): - for tIdx in range(0, OutputsPerMFMA1B): - src = tIdx + OutputsPerMFMA1B * (bIdx0 + kernel["MatrixInstBM"] * (bIdx1 + kernel["MatrixInstBN"] * (wgIdx0 + kernel["MIWaveTile"][0] * wgIdx1))) - dst = tIdx + OutputsPerMFMA1B * (bIdx0 + kernel["MatrixInstBM"] * (wgIdx0 + kernel["MIWaveTile"][0] * (bIdx1 + kernel["MatrixInstBN"] * wgIdx1))) + if kernel["SourceSwap"]: + OutputsPerMFMA = kernel["MatrixInstM"] * kernel["MatrixInstN"] // self.kernel["WavefrontSize"] + for wgIdx1 in range(0, kernel["MIWaveTile"][1]): + for tIdx1 in range(0, OutputsPerMFMA): + for wgIdx0 in range(0, kernel["MIWaveTile"][0]): + for tIdx0 in range(0, kernel["MIRegPerOut"]): + # TODO MatrixInstBM and BN support + src = tIdx0 + kernel["MIRegPerOut"] * (tIdx1 + OutputsPerMFMA * (wgIdx0 + kernel["MIWaveTile"][0] * wgIdx1)) + dst = tIdx0 + kernel["MIRegPerOut"] * (wgIdx0 + kernel["MIWaveTile"][0] * (tIdx1 + OutputsPerMFMA * wgIdx1)) acc2arch[src] = dst arch2acc[dst] = src + else: + OutputsPerMFMA1B = kernel["MatrixInstM"] * kernel["MatrixInstN"] // self.kernel["WavefrontSize"] * kernel["MIRegPerOut"] + for wgIdx1 in range(0, kernel["MIWaveTile"][1]): + for wgIdx0 in range(0, kernel["MIWaveTile"][0]): + for bIdx1 in range(0, kernel["MatrixInstBN"]): + for bIdx0 in range(0, kernel["MatrixInstBM"]): + for tIdx in range(0, OutputsPerMFMA1B): + src = tIdx + OutputsPerMFMA1B * (bIdx0 + kernel["MatrixInstBM"] * (bIdx1 + kernel["MatrixInstBN"] * (wgIdx0 + kernel["MIWaveTile"][0] * wgIdx1))) + dst = tIdx + OutputsPerMFMA1B * (bIdx0 + kernel["MatrixInstBM"] * (wgIdx0 + kernel["MIWaveTile"][0] * (bIdx1 + kernel["MatrixInstBN"] * wgIdx1))) + acc2arch[src] = dst + arch2acc[dst] = src return acc2arch, arch2acc @@ -11866,14 +12109,14 @@ def assertCommon(self, cookie=-1): def assertCmpCommon(self, cond, val0, val1, cookie=-1): kStr = "" if 
self.db["EnableAsserts"]: - kStr += inst("s_or_saveexec_b64", sgpr("SaveExecMask",2), 0, \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), 0, \ "assert: saved execmask") - kStr += inst("_v_cmpx_%s"%cond, "vcc", val0, val1, "v_cmp" ) + kStr += inst("_v_cmpx_%s"%cond, self.vcc, val0, val1, "v_cmp" ) kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -11939,17 +12182,17 @@ def assert_multiple_b32(self, sval, multiple2, cookie=-1): stmp = sgpr("SaveExecMask") # repurpose to get a tmp sgpr - kStr += inst("s_and_b32", stmp, sval, multiple2-1, "mask" ) + kStr += inst("s_and_b{}".format(self.kernel["WavefrontSize"]), stmp, sval, multiple2-1, "mask" ) kStr += inst("s_cmp_eq_u32", stmp, 0, "if maskedBits==0 then SCC=1 == no fault" ) - kStr += inst("s_mov_b64", sgpr("SaveExecMask",2), -1, "") - kStr += inst("s_cmov_b64", sgpr("SaveExecMask", 2), 0, "Clear exec mask") + kStr += inst("s_mov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), -1, "") + kStr += inst("s_cmov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask", self.laneSGPRCount), 0, "Clear exec mask") - kStr += inst("s_and_saveexec_b64", sgpr("SaveExecMask",2), sgpr("SaveExecMask",2), \ + kStr += inst("s_and_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: saved execmask") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -11957,15 +12200,15 @@ def assert_multiple_b32(self, 
sval, multiple2, cookie=-1): def assert_s_eq(self, sval0, sval1, cookie=-1): kStr = "" if self.db["EnableAsserts"]: - kStr += inst("s_and_saveexec_b64", sgpr("SaveExecMask",2), sgpr("SaveExecMask",2), \ + kStr += inst("s_and_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: saved execmask") - kStr += inst("s_mov_b64", sgpr("SaveExecMask",2), -1, "") + kStr += inst("s_mov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask", self.laneSGPRCount), -1, "") kStr += inst("s_cmp_eq_u32", sval0, sval1, "cmp") - kStr += inst("s_cmov_b64", sgpr("SaveExecMask", 2), 0, "No assert if SCC=1") + kStr += inst("s_cmov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask", self.laneSGPRCount), 0, "No assert if SCC=1") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -11974,14 +12217,14 @@ def assert_s_eq(self, sval0, sval1, cookie=-1): def assert_scc_is_1(self, cookie=-1): kStr = "" if self.db["EnableAsserts"]: - kStr += inst("s_and_saveexec_b64", sgpr("SaveExecMask",2), sgpr("SaveExecMask",2), \ + kStr += inst("s_and_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: saved execmask") - kStr += inst("s_mov_b64", sgpr("SaveExecMask",2), -1, "") - kStr += inst("s_cmov_b64", sgpr("SaveExecMask", 2), 0, "No assert if SCC=1") + kStr += inst("s_mov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), -1, "") + kStr += inst("s_cmov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), 0, "No assert if SCC=1") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", 
sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -11989,15 +12232,15 @@ def assert_scc_is_1(self, cookie=-1): def assert_scc_is_0(self, cookie=-1): kStr = "" if self.db["EnableAsserts"]: - kStr += inst("s_and_saveexec_b64", sgpr("SaveExecMask",2), sgpr("SaveExecMask",2), \ + kStr += inst("s_and_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: saved execmask") - kStr += inst("s_mov_b64", sgpr("SaveExecMask",2), -1, "") - kStr += inst("s_cmov_b64", sgpr("SaveExecMask", 2), 0, "") - kStr += inst("s_not_b64", sgpr("SaveExecMask",2), sgpr("SaveExecMask", 2), "Assert if SCC==1") + kStr += inst("s_mov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), -1, "") + kStr += inst("s_cmov_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask", self.laneSGPRCount), 0, "") + kStr += inst("s_not_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), sgpr("SaveExecMask", self.laneSGPRCount), "Assert if SCC==1") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -12006,11 +12249,11 @@ def assert_scc_is_0(self, cookie=-1): def assert_vcc_all_true(self, cookie=-1): kStr = "" if self.db["EnableAsserts"]: - kStr += inst("s_or_saveexec_b64", sgpr("SaveExecMask",2), 0, \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), 0, \ "assert: saved execmask") - kStr += inst("s_mov_b64", "exec", "vcc", "Predicate based on VCC") + kStr += inst("s_mov_b{}".format(self.kernel["WavefrontSize"]), self.exec, self.vcc, 
"Predicate based on VCC") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -12018,11 +12261,11 @@ def assert_vcc_all_true(self, cookie=-1): def assert_vcc_all_false(self, cookie=-1): kStr = "" if self.db["EnableAsserts"]: - kStr += inst("s_or_saveexec_b64", sgpr("SaveExecMask",2), 0, \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), sgpr("SaveExecMask",self.laneSGPRCount), 0, \ "assert: saved execmask") - kStr += inst("s_not_b64", "exec", "vcc", "Predicate based on !VCC") + kStr += inst("s_not_b{}".format(self.kernel["WavefrontSize"]), self.exec, self.vcc, "Predicate based on !VCC") kStr += self.assertCommon(cookie) - kStr += inst("s_or_saveexec_b64", "vcc", sgpr("SaveExecMask",2), \ + kStr += inst("s_or_saveexec_b{}".format(self.kernel["WavefrontSize"]), self.vcc, sgpr("SaveExecMask",self.laneSGPRCount), \ "assert: restore execmask") return kStr @@ -12032,7 +12275,7 @@ def assert_vector_diff(self, v0, v1, expectedScalarDiff, cookie=-1): kStr = "" cmpVgpr = self.vgprPool.checkOut(1) kStr += inst("_v_add_co_u32", \ - vgpr(cmpVgpr), "vcc", \ + vgpr(cmpVgpr), self.vcc, \ expectedScalarDiff, \ v0, \ "assert_vector_diff add expectedScalarDiff") @@ -12059,7 +12302,7 @@ def dump(self, vgprStore): kStr += inst("flat_store_dword", vgpr("AddressDbg", 2), \ vgprStore, "debug dump store" ) - kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), "vcc", vgpr("AddressDbg"), \ + kStr += inst("_v_add_co_u32", vgpr("AddressDbg"), self.vcc, vgpr("AddressDbg"), \ hex(4), "debug dump inc" ) if self.db["DebugKernelMaxItems"] != -1: diff --git a/Tensile/KernelWriterSource.py b/Tensile/KernelWriterSource.py index d86d918c6..147d9b8f3 100644 --- a/Tensile/KernelWriterSource.py +++ b/Tensile/KernelWriterSource.py @@ -701,13 +701,13 @@ def 
functionPrefix(self, kernel): self.endLinePP) """ - for b in range(0, kernel["ThreadTileB"]): - for a in range(0, kernel["ThreadTileA"]): - strC = "rC[%d+%d*TT%s]" % (a, b, self.tileChar0 ) - strA = "rA[%d%s]" % (a, ("+TT%s"%self.tileCharA) if m>0 else "") - strB = "rB[%d%s]" % (b, ("+TT%s"%self.tileCharB) if m>0 else "") - if ((kernel["ThreadTileA"] % 2 == 0) and (kernel["ProblemType"]["DataType"].isHalf())): - if a % 2 == 0: + for idx1 in range(0, kernel["ThreadTile1"]): + for idx0 in range(0, kernel["ThreadTile0"]): + strC = "rC[%d+%d*TT%s]" % (idx0, idx1, self.tileChar0 ) + strA = "rA[%d%s]" % (idx0 if self.tPB["tile01Idx"] else idx1, ("+TT%s"%self.tileCharA) if m>0 else "") + strB = "rB[%d%s]" % (idx1 if self.tPB["tile01Idx"] else idx0, ("+TT%s"%self.tileCharB) if m>0 else "") + if ((kernel["ThreadTile0"] % 2 == 0) and (kernel["ProblemType"]["DataType"].isHalf())): + if idx0 % 2 == 0: kStr += " TYPE_MAC(%s,%s,%s , " % (strA, strB, strC) else: kStr += "%s,%s,%s); %s" % (strA, strB, strC, self.endLinePP) @@ -980,10 +980,10 @@ def allocateResources(self, kernel): # registers for valuAB kStr += " DATA_TYPE rA[TT%s%s];%s" \ - % (self.tileChar0, ("*2" if kernel["PrefetchLocalRead"] else ""), \ + % (self.tPA["tileChar"], ("*2" if kernel["PrefetchLocalRead"] else ""), \ self.endLine) kStr += " DATA_TYPE rB[TT%s%s];%s" \ - % (self.tileChar1, ("*2" if kernel["PrefetchLocalRead"] else ""), \ + % (self.tPB["tileChar"], ("*2" if kernel["PrefetchLocalRead"] else ""), \ self.endLine) #################################### @@ -1349,7 +1349,7 @@ def graTileOffsets(self, kernel, tP): self.endLine) # clip to edge if the flattened offset is OOB: - tP["packedSizeList"] = ["size%s"%self.indexChars[idx] for idx in kernel["PackedC%dIndicesX"%tP["tensorIdx"]]] + tP["packedSizeList"] = ["size%s"%self.indexChars[idx] for idx in kernel["PackedC%dIndicesX"%(tP["tile01Idx"])]] sizeStr = " * ".join(tP["packedSizeList"]) kStr += " %s = (%s > (%s-1)) ? 
(%s-1):%s;%s" \ @@ -1714,16 +1714,18 @@ def lwaDeclareAddresses(self, kernel, tP): return kStr ############################################################################## - # Local Read Addresses: Tile Assignment + # Local Read Addresses: Tile Assignment A/B ############################################################################## - def lraTileAssignment(self, kernel, tP): + def lraTileAssignment(self, kernel, tPA, tPB): kStr = "" - if tP["tensorChar"] == 'A': - kStr += " unsigned int lr%s = (serial %% SG%s);%s" \ - % (tP["tileChar"], self.tileChar0, self.endLine) - elif tP["tensorChar"] == 'B': - kStr += " unsigned int lr%s = (serial / SG%s) %% SG%s;%s" \ - % (tP["tileChar"], self.tileChar0, self.tileChar1, self.endLine) + + tP0 = tPA if tPB["tile01Idx"] else tPB + tP1 = tPB if tPB["tile01Idx"] else tPA + + kStr += " unsigned int lr%s = (serial %% SG%s);%s" \ + % (tP0["tileChar"], self.tileChar0, self.endLine) + kStr += " unsigned int lr%s = (serial / SG%s) %% SG%s;%s" \ + % (tP1["tileChar"], self.tileChar0, self.tileChar1, self.endLine) return kStr @@ -2136,7 +2138,7 @@ def checkAlphaBetaForHPA(self,kernel): ############################################################################## # MAC Iteration ############################################################################## - def macIter(self, kernel, black, iuiCount, useMacro): + def macIter(self, kernel, black, iuiCount, useMacro, isTail=False): kStr = "" for iui in range(0,iuiCount): kStr += "%sMAC_%ux%u" % (self.indent, \ @@ -2149,7 +2151,7 @@ def macIter(self, kernel, black, iuiCount, useMacro): ############################################################################## # At Least 1 Unroll ############################################################################## - def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): + def openSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap): kStr = "" if kernel["GlobalSplitU"] > 1: kStr += "%sif (numIterMyWg >= 1) {%s" \ @@ -2160,7 
+2162,7 @@ def openSumAtLeastUnroll(self, kernel, prefetch, isPap, isOptNLL): self.indent += " " return kStr - def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isNGLL): + def closeSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap, isNGLL): kStr = "" self.indent = self.indent[2:] kStr += "%s} // end %s%s" % \ @@ -2644,7 +2646,7 @@ def shiftVectorComponents(self, kernel, tP): for r in range(1, tP["glvw"]): kStr += " if (r%s == %u) {%s" % (tP["tileChar"], r, self.endLine) - numVectors = kernel["ThreadTile%s"%tP["tileIdx"]]//tP["glvw"] + numVectors = kernel["ThreadTile%s"%(tP["tile01Idx"])]//tP["glvw"] for vIdx in range(0, numVectors): if vIdx == 0: kStr += " " @@ -2653,7 +2655,7 @@ def shiftVectorComponents(self, kernel, tP): if vIdx < numVectors-1: kStr += "if (s%s == %u) " % (tP["tileChar"], vIdx) kStr += "{%s" % self.endLine - for tt in range(0, kernel["ThreadTile%u"%((tP["tileIdx"]+1)%2)]): + for tt in range(0, kernel["ThreadTile%u"%(((tP["tile01Idx"])+1)%2)]): for s in range(0, r): if tP["isA"]: kStr += " rC[%u + %u*GLOBAL_LOAD_VECTOR_WIDTH_A + %u*TT%s] = rC[%u + %u*GLOBAL_LOAD_VECTOR_WIDTH_A + %u*TT%s];%s" \ @@ -3046,6 +3048,9 @@ def notLocalSplitUGlobalWrite(self, kernel): kStr += self.extractGlobalCDims(kernel, base1, 1) addTensorDimCheck1 = 0 + tP0 = self.tPA if self.tPB["tile01Idx"] else self.tPB + tP1 = self.tPB if self.tPB["tile01Idx"] else self.tPA + ### Bounds checks: # if packed, check flattened against product of all packed sizes # The flattened base never changes so add all address offsets before comparison @@ -3053,7 +3058,7 @@ def notLocalSplitUGlobalWrite(self, kernel): # base contains some addressing components, so just offset here: offset0 = offsetS0 globalC0ForCheck = "flattenedGlobalC0" - size0ForCheck = " * ".join(self.tPA["packedSizeList"]) + size0ForCheck = " * ".join(tP0["packedSizeList"]) # Check 0 dimension against appropriate size limit kStr += " if (%s%s + %u*SG%s*VECTOR_WIDTH < %s) {" \ @@ -3064,7 +3069,7 @@ def 
notLocalSplitUGlobalWrite(self, kernel): if packGranularity == 2: offset1 = offsetS1 globalC1ForCheck = "flattenedGlobalC1" - size1ForCheck = " * ".join(self.tPB["packedSizeList"]) + size1ForCheck = " * ".join(tP1["packedSizeList"]) kStr += " if (%s%s + %u*SG%s*VECTOR_WIDTH < %s) {" \ % (globalC1ForCheck, diff --git a/Tensile/LibraryIO.py b/Tensile/LibraryIO.py index aec5a49de..a3b762bea 100644 --- a/Tensile/LibraryIO.py +++ b/Tensile/LibraryIO.py @@ -19,212 +19,200 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ -from .Common import print2, printExit, printWarning, versionIsCompatible +from .Common import printExit, printWarning, versionIsCompatible from .SolutionStructs import Solution, ProblemSizes, ProblemType from . import __version__ from . import Common from . import SolutionLibrary -import os try: - import yaml + import yaml except ImportError: - printExit("You must install PyYAML to use Tensile (to parse config files). See http://pyyaml.org/wiki/PyYAML for installation instructions.") + printExit("You must install PyYAML to use Tensile (to parse config files). See http://pyyaml.org/wiki/PyYAML for installation instructions.") try: - import msgpack + import msgpack except ImportError: - print("Message pack python library not detected. 
Must use YAML backend instead.") - -################################################################################ -# Read Benchmark Config from YAML Files -################################################################################ -def readConfig( filename ): - try: - stream = open(filename, "r") - except IOError: - printExit("Cannot open file: %s" % filename ) - config = yaml.load(stream, yaml.SafeLoader) - stream.close() - return config - -################################################################################ -# Write List of Solutions to YAML File -################################################################################ -def writeSolutions( filename, problemSizes, solutions ): - # convert objects to nested dictionaries - solutionStates = [] - for hardcoded in solutions: - for solution in hardcoded: - solutionState = solution.getAttributes() - solutionState["ProblemType"] = solutionState["ProblemType"].state - solutionState["ProblemType"]["DataType"] = \ - solutionState["ProblemType"]["DataType"].value - solutionState["ProblemType"]["DestDataType"] = \ - solutionState["ProblemType"]["DestDataType"].value - solutionState["ProblemType"]["ComputeDataType"] = \ - solutionState["ProblemType"]["ComputeDataType"].value - solutionStates.append(solutionState) - # write dictionaries - try: - stream = open(filename, "w") - except IOError: - printExit("Cannot open file: %s" % filename) - stream.write("- MinimumRequiredVersion: %s\n" % __version__ ) - stream.write("- ProblemSizes:\n") - if problemSizes: - for sizeRange in problemSizes.ranges: - stream.write(" - Range: %s\n" % sizeRange) - for problemExact in problemSizes.exacts: - #FIXME-problem, this ignores strides: - stream.write(" - Exact: %s\n" % str(problemExact)) - yaml.dump(solutionStates, stream, default_flow_style=None) - stream.close() - - -################################################################################ -# Read List of Solutions from YAML File 
-################################################################################ -def readSolutions( filename ): - try: - stream = open(filename, "r") - except IOError: - printExit("Cannot open file: %s" % filename ) - solutionStates = yaml.load(stream, yaml.SafeLoader) - stream.close() - - # verify - if len(solutionStates) < 2: - printExit("len(%s) %u < 2" % (filename, len(solutionStates))) - versionString = solutionStates[0]["MinimumRequiredVersion"] - if not versionIsCompatible(versionString): - printWarning("File \"%s\" version=%s does not match current Tensile version=%s" \ - % (filename, versionString, __version__) ) - - if "ProblemSizes" not in solutionStates[1]: - printExit("%s doesn't begin with ProblemSizes" % filename) - else: - problemSizesConfig = solutionStates[1]["ProblemSizes"] - - solutions = [] - for i in range(2, len(solutionStates)): - solutionState = solutionStates[i] - # force redo the deriving of parameters, make sure old version logic yamls can be validated - solutionState["AssignedProblemIndependentDerivedParameters"] = False - solutionState["AssignedDerivedParameters"] = False - solutionObject = Solution(solutionState) - solutions.append(solutionObject) - problemType = solutions[0]["ProblemType"] - problemSizes = ProblemSizes(problemType, problemSizesConfig) - return (problemSizes, solutions) - -################################################################################ -# Read Raw Library Logic from YAML -################################################################################ -def readRawLibraryLogic(filename): - """ Encode library logic """ - try: - stream = open(filename, "r") - except IOError: - print ("Cannot open file: %s" % filename ) - return None - data = yaml.load(stream, yaml.SafeLoader) - stream.close() - - versionString = data[0] - scheduleName = data[1] - architectureName = data[2] - deviceNames = data[3] - problemTypeState = data[4] - solutionStates = data[5] - indexOrder = data[6] - exactLogic = data[7] - 
rangeLogic = data[8] - otherFields = [] - - dataLength = len(data) - if dataLength > 9: - for idx in range(9, dataLength): - otherFields.append(data[idx]) - - return (versionString, scheduleName, architectureName, deviceNames,\ - problemTypeState, solutionStates, indexOrder, exactLogic, rangeLogic, otherFields) - -################################################################################ -# Read Library Logic from YAML -################################################################################ -def readLibraryLogicForSchedule( filename ): - #print1("# Reading Library Logic: %s" % ( filename )) - try: - stream = open(filename, "r") - except IOError: - printExit("Cannot open file: %s" % filename ) - data = yaml.load(stream, yaml.SafeLoader) - stream.close() - - # verify data has all the fields we need - if len(data) < 9: - printExit("Library logic file %s is missing required fields (len = %u < 9)" % (filename, len(data))) - - # parse out objects - versionString = data[0]["MinimumRequiredVersion"] - scheduleName = data[1] - architectureName = data[2] if isinstance(data[2], str) else data[2]["Architecture"] - deviceNames = data[3] - problemTypeState = data[4] - solutionStates = data[5] - indexOrder = data[6] - exactLogic = data[7] - rangeLogic = data[8] - - # does version match - if not versionIsCompatible(versionString): - printWarning("File \"%s\" version=%s does not match Tensile version=%s" \ - % (filename, versionString, __version__) ) - - # unpack problemType - problemType = ProblemType(problemTypeState) - # unpack solutions - solutions = [] - for i in range(0, len(solutionStates)): - solutionState = solutionStates[i] - if solutionState["KernelLanguage"] == "Assembly": - solutionState["ISA"] = Common.gfxArch(architectureName) + print("Message pack python library not detected. 
Must use YAML backend instead.") + + +################### +# Writing functions +################### +def write(filename_noExt, data, format="yaml"): + """Writes data to file with specified format; extension is appended based on format.""" + if format == "yaml": + writeYAML(filename_noExt + ".yaml", data) + elif format == "msgpack": + writeMsgPack(filename_noExt + ".dat", data) else: - solutionState["ISA"] = (0, 0, 0) - # force redo the deriving of parameters, make sure old version logic yamls can be validated - solutionState["AssignedProblemIndependentDerivedParameters"] = False - solutionState["AssignedDerivedParameters"] = False - solutionObject = Solution(solutionState) - if solutionObject["ProblemType"] != problemType: - printExit("ProblemType of file doesn't match solution: %s != %s" \ - % (problemType, solutionObject["ProblemType"])) - solutions.append(solutionObject) - - newLibrary = SolutionLibrary.MasterSolutionLibrary.FromOriginalState(data, solutions) - - return (scheduleName, deviceNames, problemType, solutions, indexOrder, \ - exactLogic, rangeLogic, newLibrary, architectureName) - -################################################################################ -# Data-specific writers -################################################################################ - -def configWriter(libraryFormat = "yaml"): - return YAMLWriter() if libraryFormat == "yaml" else MessagePackWriter() - -class Writer: - def write(self, filename, data): - """ Write data to a given file. 
""" - pass + printExit("Unrecognized format {}".format(format)) + +def writeYAML(filename, data, **kwargs): + """Writes data to file in YAML format.""" + # set default kwags for yaml dump + if "explicit_start" not in kwargs: + kwargs["explicit_start"] = True + if "explicit_end" not in kwargs: + kwargs["explicit_end"] = True + if "default_flow_style" not in kwargs: + kwargs["default_flow_style"] = None + + with open(filename, "w") as f: + yaml.dump(data, f, **kwargs) + +def writeMsgPack(filename, data): + """Writes data to file in Message Pack format.""" + with open(filename, "wb") as f: + msgpack.pack(data, f) - def writeLibraryLogicForSchedule(self, filePath, schedulePrefix, architectureName, \ - deviceNames, logicTuple): - """ Encode library logic """ - pass +def writeSolutions(filename, problemSizes, solutions): + """Writes solution YAML file.""" + + # convert objects to nested dictionaries + solutionStates = [] + for hardcoded in solutions: + for solution in hardcoded: + solutionState = solution.getAttributes() + solutionState["ProblemType"] = solutionState["ProblemType"].state + solutionState["ProblemType"]["DataType"] = \ + solutionState["ProblemType"]["DataType"].value + solutionState["ProblemType"]["DestDataType"] = \ + solutionState["ProblemType"]["DestDataType"].value + solutionState["ProblemType"]["ComputeDataType"] = \ + solutionState["ProblemType"]["ComputeDataType"].value + solutionStates.append(solutionState) + # write dictionaries + with open(filename, "w") as f: + f.write("- MinimumRequiredVersion: %s\n" % __version__ ) + f.write("- ProblemSizes:\n") + if problemSizes: + for sizeRange in problemSizes.ranges: + f.write(" - Range: %s\n" % sizeRange) + for problemExact in problemSizes.exacts: + #FIXME-problem, this ignores strides: + f.write(" - Exact: %s\n" % str(problemExact)) + + yaml.dump(solutionStates, f, default_flow_style=None) + + +############################### +# Reading and parsing functions +############################### +def 
readYAML(filename): + """Reads and returns YAML data from file.""" + with open(filename, "r") as f: + data = yaml.load(f, yaml.SafeLoader) + return data - def _getLibraryLogicForSchedule(self, schedulePrefix, architectureName, deviceNames, \ - logicTuple): +def parseSolutionsFile(filename): + """Wrapper function to read and parse a solutions file.""" + return parseSolutionsData(readYAML(filename), filename) + +def parseSolutionsData(data, srcFile="?"): + """Parses problem sizes and solutions from the data of a solutions file.""" + + if len(data) < 3: + printExit("Solution file {} is missing required fields (len = {} < 3".format(srcFile, len(data))) + + versionString = data[0]["MinimumRequiredVersion"] + if not versionIsCompatible(versionString): + printWarning("Version = {} in solution file {} does not match Tensile version = {}" \ + .format(srcFile, versionString, __version__) ) + + if "ProblemSizes" not in data[1]: + printExit("Solution file {} doesn't begin with ProblemSizes".format(srcFile)) + + problemSizesConfig = data[1]["ProblemSizes"] + + solutions = [] + for i in range(2, len(data)): + solutionState = data[i] + # force redo the deriving of parameters, make sure old version logic yamls can be validated + solutionState["AssignedProblemIndependentDerivedParameters"] = False + solutionState["AssignedDerivedParameters"] = False + solutionObject = Solution(solutionState) + solutions.append(solutionObject) + problemType = solutions[0]["ProblemType"] + problemSizes = ProblemSizes(problemType, problemSizesConfig) + return (problemSizes, solutions) + +def parseLibraryLogicFile(filename): + """Wrapper function to read and parse a library logic file.""" + return parseLibraryLogicData(readYAML(filename), filename) + +def parseLibraryLogicData(data, srcFile="?"): + """Parses the data of a library logic file.""" + + if len(data) < 9: + printExit("Library logic file {} is missing required fields (len = {} < 9)".format(srcFile, len(data))) + + versionString = 
data[0]["MinimumRequiredVersion"] + scheduleName = data[1] + architectureName = data[2] if isinstance(data[2], str) else data[2]["Architecture"] + deviceNames = data[3] + problemTypeState = data[4] + solutionStates = data[5] + indexOrder = data[6] + exactLogic = data[7] + rangeLogic = data[8] + + if not versionIsCompatible(versionString): + printWarning("Version = {} in library logic file {} does not match Tensile version = {}" \ + .format(srcFile, versionString, __version__) ) + + # unpack problemType + problemType = ProblemType(problemTypeState) + # unpack solutions + solutions = [] + for i in range(0, len(solutionStates)): + solutionState = solutionStates[i] + if solutionState["KernelLanguage"] == "Assembly": + solutionState["ISA"] = Common.gfxArch(architectureName) + else: + solutionState["ISA"] = (0, 0, 0) + # force redo the deriving of parameters, make sure old version logic yamls can be validated + solutionState["AssignedProblemIndependentDerivedParameters"] = False + solutionState["AssignedDerivedParameters"] = False + solutionObject = Solution(solutionState) + + if solutionObject["ProblemType"] != problemType: + printExit("ProblemType in library logic file {} doesn't match solution: {} != {}" \ + .format(srcFile, problemType, solutionObject["ProblemType"])) + solutions.append(solutionObject) + + newLibrary = SolutionLibrary.MasterSolutionLibrary.FromOriginalState(data, solutions) + + return (scheduleName, deviceNames, problemType, solutions, indexOrder, \ + exactLogic, rangeLogic, newLibrary, architectureName) + +def rawLibraryLogic(data): + """Returns a tuple of the data in a library logic file.""" + versionString = data[0] + scheduleName = data[1] + architectureName = data[2] + deviceNames = data[3] + problemTypeState = data[4] + solutionStates = data[5] + indexOrder = data[6] + exactLogic = data[7] + rangeLogic = data[8] + otherFields = [] + + dataLength = len(data) + if dataLength > 9: + for idx in range(9, dataLength): + otherFields.append(data[idx]) 
+ + return (versionString, scheduleName, architectureName, deviceNames,\ + problemTypeState, solutionStates, indexOrder, exactLogic, rangeLogic, otherFields) + + +################# +# Other functions +################# +def createLibraryLogic(schedulePrefix, architectureName, deviceNames, logicTuple): + """Creates the data for a library logic file suitable for writing to YAML.""" problemType = logicTuple[0] solutions = logicTuple[1] indexOrder = logicTuple[2] @@ -233,7 +221,7 @@ def _getLibraryLogicForSchedule(self, schedulePrefix, architectureName, deviceNa tileSelection = False if len(logicTuple) > 5 and logicTuple[5]: - tileSelection = True + tileSelection = True data = [] # Tensile version @@ -246,38 +234,38 @@ def _getLibraryLogicForSchedule(self, schedulePrefix, architectureName, deviceNa # problem type problemTypeState = problemType.state problemTypeState["DataType"] = \ - problemTypeState["DataType"].value + problemTypeState["DataType"].value problemTypeState["DestDataType"] = \ - problemTypeState["DestDataType"].value + problemTypeState["DestDataType"].value problemTypeState["ComputeDataType"] = \ - problemTypeState["ComputeDataType"].value + problemTypeState["ComputeDataType"].value data.append(problemTypeState) # solutions solutionList = [] for solution in solutions: - solutionState = solution.getAttributes() - solutionState["ProblemType"] = solutionState["ProblemType"].state - solutionState["ProblemType"]["DataType"] = \ - solutionState["ProblemType"]["DataType"].value - solutionState["ProblemType"]["DestDataType"] = \ - solutionState["ProblemType"]["DestDataType"].value - solutionState["ProblemType"]["ComputeDataType"] = \ - solutionState["ProblemType"]["ComputeDataType"].value - solutionList.append(solutionState) - - if tileSelection: - tileSolutions = logicTuple[5] - for solution in tileSolutions: solutionState = solution.getAttributes() solutionState["ProblemType"] = solutionState["ProblemType"].state solutionState["ProblemType"]["DataType"] = \ - 
solutionState["ProblemType"]["DataType"].value + solutionState["ProblemType"]["DataType"].value solutionState["ProblemType"]["DestDataType"] = \ - solutionState["ProblemType"]["DestDataType"].value + solutionState["ProblemType"]["DestDataType"].value solutionState["ProblemType"]["ComputeDataType"] = \ - solutionState["ProblemType"]["ComputeDataType"].value + solutionState["ProblemType"]["ComputeDataType"].value solutionList.append(solutionState) + if tileSelection: + tileSolutions = logicTuple[5] + for solution in tileSolutions: + solutionState = solution.getAttributes() + solutionState["ProblemType"] = solutionState["ProblemType"].state + solutionState["ProblemType"]["DataType"] = \ + solutionState["ProblemType"]["DataType"].value + solutionState["ProblemType"]["DestDataType"] = \ + solutionState["ProblemType"]["DestDataType"].value + solutionState["ProblemType"]["ComputeDataType"] = \ + solutionState["ProblemType"]["ComputeDataType"].value + solutionList.append(solutionState) + data.append(solutionList) # index order data.append(indexOrder) @@ -285,69 +273,19 @@ def _getLibraryLogicForSchedule(self, schedulePrefix, architectureName, deviceNa # exactLogic exactLogicList = [] for key in exactLogic: - exactLogicList.append([list(key), exactLogic[key]]) + exactLogicList.append([list(key), exactLogic[key]]) data.append(exactLogicList) # rangeLogic data.append(rangeLogic) if tileSelection: - tileSelectionLogic = {} - tileSelectionIndices = logicTuple[6] - tileSelectionLogic["TileSelectionIndices"] = tileSelectionIndices - data.append(tileSelectionLogic) + tileSelectionLogic = {} + tileSelectionIndices = logicTuple[6] + tileSelectionLogic["TileSelectionIndices"] = tileSelectionIndices + data.append(tileSelectionLogic) else: - data.append(None) + data.append(None) data.append(logicTuple[7]) return data - -class YAMLWriter(Writer): - def write(self, filename, data): - with open(filename, 'w') as f: - yaml.dump(data, f, explicit_start=True, explicit_end=True, 
default_flow_style=None) - - def writeLibraryLogicForSchedule(self, filePath, schedulePrefix, architectureName, \ - deviceNames, logicTuple): - problemType = logicTuple[0] - filename = os.path.join(filePath, "%s_%s.yaml" \ - % (schedulePrefix, str(problemType))) - print2("# writeLogic( %s )" % ( filename )) - - data = self._getLibraryLogicForSchedule(schedulePrefix, architectureName, \ - deviceNames, logicTuple) - - try: - stream = open(filename, "w") - yaml.dump(data, stream, default_flow_style=None) - stream.close() - except IOError: - printExit("Cannot open file: %s" % filename) - -class MessagePackWriter(Writer): - def write(self, filename, data): - with open(filename, 'wb') as f: - try: - msgpack.pack(data, f) - except NameError: - printExit("You must install MessagePack for Python to use Tensile (to parse config files). See https://github.com/msgpack/msgpack-python for installation instructions.") - - - def writeLibraryLogicForSchedule(self, filePath, schedulePrefix, architectureName, \ - deviceNames, logicTuple): - problemType = logicTuple[0] - filename = os.path.join(filePath, "%s_%s.dat" \ - % (schedulePrefix, str(problemType))) - print2("# writeLogic( %s )" % ( filename )) - - data = self._getLibraryLogicForSchedule(schedulePrefix, architectureName, \ - deviceNames, logicTuple) - - try: - stream = open(filename, "wb") - msgpack.pack(data, stream) - stream.close() - except IOError: - printExit("Cannot open file: %s" % filename) - except NameError: - printExit("You must install MessagePack for Python to use Tensile (to parse config files). 
See https://github.com/msgpack/msgpack-python for installation instructions.") diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index ebdd7d158..bd56f7b94 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -1440,7 +1440,7 @@ def generateLogic(config, benchmarkDataPath, libraryLogicPath): printExit("%s doesn't exist for %s" % (dataFileName, fileBase) ) if not os.path.exists(solutionsFileName): printExit("%s doesn't exist for %s" % (solutionsFileName, fileBase) ) - (problemSizes, solutions) = LibraryIO.readSolutions(solutionsFileName) + (problemSizes, solutions) = LibraryIO.parseSolutionsFile(solutionsFileName) if len(solutions) == 0: printExit("%s doesn't contains any solutions." % (solutionsFileName) ) problemType = solutions[0]["ProblemType"] @@ -1450,12 +1450,16 @@ def generateLogic(config, benchmarkDataPath, libraryLogicPath): dataFileName, solutionsFileName, selectionFileName, solutions) ) for problemType in problemTypes: - logicTuple = analyzeProblemType( problemType, problemTypes[problemType], \ - analysisParameters) + logicTuple = analyzeProblemType(problemType, problemTypes[problemType], analysisParameters) - LibraryIO.configWriter("yaml").writeLibraryLogicForSchedule(globalParameters["WorkingPath"], \ - analysisParameters["ScheduleName"], analysisParameters["ArchitectureName"], \ - analysisParameters["DeviceNames"], logicTuple) + filename = os.path.join(globalParameters["WorkingPath"], \ + "{}_{}".format(analysisParameters["ScheduleName"], str(problemType) + ".yaml")) + + print2("# writing library logic YAML {}".format(filename)) + data = LibraryIO.createLibraryLogic(analysisParameters["ScheduleName"], \ + analysisParameters["ArchitectureName"], analysisParameters["DeviceNames"], logicTuple) + + LibraryIO.writeYAML(filename, data) currentTime = time.time() elapsedTime = currentTime - startTime diff --git 
a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt index 6db049b58..67ebc8d47 100644 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx908" .text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl 
Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.amdhsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 108 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -69,8 +69,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: 
Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -242,7 +242,7 @@ amdhsa.kernels: .wavefront_size: 64 ... 
.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: +Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt deleted file mode 100644 index 89f226e34..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,932 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx908" -.text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 108 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_group_segment_fixed_size 28672 // lds bytes - .amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - 
.amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: alpha - .size: 4 - .offset: 
56 - .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: 
padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 28672 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ 
-/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, 
sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] 
// -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], 
v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], 
v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], 
v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] 
iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 
v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 
v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x7EC80300 -.long 0x26CA00BF -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254A030 -.long 0x92545402 -.long 0x92558830 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0xBECC00FF, 0x00000480 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254C032 -.long 0x92545403 -.long 0x92559032 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CA85 -.long 0xD2850004, 0x00020432 -.long 0x2606CA9F -.long 0x32AC0704 -.long 0x68ACAC54 -.long 0x24ACAC82 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000120 -.long 0x68AEAC4A -.long 0x68B0AE4A -.long 0x68B2B04A -.long 0x68B4B24A -.long 0x68B6B44A 
-.long 0x68B8B64A -.long 0x68BAB84A -.long 0xBECE00FF, 0x00000900 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00002400 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00001200 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00002400 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBEFC004D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004D -.long 0xBF8A0000 -.long 0xE0511000, 
0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F65 -.long 0xD285005E, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68BCBD01 -.long 0x24BCBC82 -.long 0x68BCBD02 -.long 0x68BCBC80 -.long 0x68BEBCFF, 0x00001200 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F65 -.long 0xD2850060, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C102 -.long 0x9254FF52, 0x00000900 -.long 0x68C0C054 -.long 0x68C0C0FF, 0x00002400 -.long 0x68C2C0FF, 0x00002400 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603C0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 
0x00004D04 -.long 0x2608CA8F -.long 0xD2850005, 0x00004D04 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603C0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 0x00004904 -.long 0x2608CA8F -.long 0xD2850005, 0x00004904 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0069, 0x020AD76A -.long 0xBF8A0000 -.long 0xD9FE0000, 0x2000005E -.long 0xD9FE0900, 0x2800005E -.long 0xD9FE0040, 0x2400005E -.long 0xD9FE0940, 0x2C00005E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD9FE0040, 0x48000060 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xD3C50004, 0x04128B29 -.long 0xD3C50000, 0x04028D22 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128F2B -.long 0xD9FE0040, 0x3400005F -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xD3C50004, 0x0412952E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50000, 0x04029727 -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50004, 0x0412972F -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04129D3A -.long 0xD9FE0000, 
0x2000005E -.long 0xD3C50000, 0x04029F33 -.long 0xD9FE0900, 0x2800005E -.long 0xD3C50004, 0x04129F3B -.long 0xD9FE0040, 0x2400005E -.long 0xBF8CC37F -.long 0xD3C50000, 0x0402A134 -.long 0xD9FE0940, 0x2C00005E -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50004, 0x0412A53E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD3C50000, 0x0402A737 -.long 0xD9FE0040, 0x48000060 -.long 0xD3C50004, 0x0412A73F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xE05C1000, 0x80041068 -.long 0xE05C1040, 0x80041468 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04128B29 -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028D22 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0040, 0x3400005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x04128F2B -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50004, 0x0412952E -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50000, 0x04029727 -.long 0xD3C50004, 0x0412972F -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xD3C50004, 0x04129D3A -.long 0xD3C50000, 0x04029F33 -.long 0xD3C50004, 0x04129F3B -.long 0xBF8CC07F -.long 0xD3C50000, 0x0402A134 -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50000, 0x0402A737 -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50004, 0x0412A53E -.long 0xD3C50004, 0x0412A73F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 0x0A020228 -.long 0xD3D84002, 0x18000102 -.long 
0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005310 -.long 0xD1CB0001, 0x04045311 -.long 0xD1CB0002, 0x04085312 -.long 0xD1CB0003, 0x040C5313 -.long 0xE07C1000, 0x80050069 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x04105314 -.long 0xD1CB0005, 0x04145315 -.long 0xD1CB0006, 0x04185316 -.long 0xD1CB0007, 0x041C5317 -.long 0xE07C1040, 0x80050469 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt index f908b4075..e7277850e 100644 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx90a" .text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 
+.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.amdhsa_kernel 
Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 116 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -70,8 +70,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -243,7 +243,7 @@ amdhsa.kernels: .wavefront_size: 64 ... 
.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: +Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt deleted file mode 100644 index e221c5cf5..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,933 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx90a" -.text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 116 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_accum_offset 108 // accumulate vgpr offset - .amdhsa_group_segment_fixed_size 28672 // lds bytes - 
.amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - 
.value_type: f32 - .address_space: generic - - .name: alpha - .size: 4 - .offset: 56 - .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: 
MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 28672 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - 
-/******************************************/ -/* VGPR Assignments */ -/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, 
sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend 
vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] // -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs 
-v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 
v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 
v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], 
v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] 
op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] 
op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x7EC80300 -.long 0x26CA00BF -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254A030 -.long 0x92545402 -.long 0x92558830 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0xBECC00FF, 0x00000480 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254C032 -.long 0x92545403 -.long 0x92559032 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CA85 -.long 0xD2850004, 0x00020432 -.long 0x2606CA9F -.long 0x32AC0704 -.long 0x68ACAC54 -.long 0x24ACAC82 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000120 -.long 0x68AEAC4A -.long 0x68B0AE4A -.long 
0x68B2B04A -.long 0x68B4B24A -.long 0x68B6B44A -.long 0x68B8B64A -.long 0x68BAB84A -.long 0xBECE00FF, 0x00000900 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00002400 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00001200 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00002400 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBEFC004D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 
0xBEFC004D -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F65 -.long 0xD285005E, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68BCBD01 -.long 0x24BCBC82 -.long 0x68BCBD02 -.long 0x68BCBC80 -.long 0x68BEBCFF, 0x00001200 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F65 -.long 0xD2850060, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C102 -.long 0x9254FF52, 0x00000900 -.long 0x68C0C054 -.long 0x68C0C0FF, 0x00002400 -.long 0x68C2C0FF, 0x00002400 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603C0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 
-.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 0x00004D04 -.long 0x2608CA8F -.long 0xD2850005, 0x00004D04 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603C0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 0x00004904 -.long 0x2608CA8F -.long 0xD2850005, 0x00004904 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0069, 0x020AD76A -.long 0xBF8A0000 -.long 0xD9FE0000, 0x2000005E -.long 0xD9FE0900, 0x2800005E -.long 0xD9FE0040, 0x2400005E -.long 0xD9FE0940, 0x2C00005E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD9FE0040, 0x48000060 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xD3C50004, 0x04128B29 -.long 0xD3C50000, 0x04028D22 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128F2B -.long 0xD9FE0040, 0x3400005F -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xD3C50004, 0x0412952E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50000, 0x04029727 -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50004, 0x0412972F -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xBF8A0000 -.long 
0xD3C50004, 0x04129D3A -.long 0xD9FE0000, 0x2000005E -.long 0xD3C50000, 0x04029F33 -.long 0xD9FE0900, 0x2800005E -.long 0xD3C50004, 0x04129F3B -.long 0xD9FE0040, 0x2400005E -.long 0xBF8CC37F -.long 0xD3C50000, 0x0402A134 -.long 0xD9FE0940, 0x2C00005E -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50004, 0x0412A53E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD3C50000, 0x0402A737 -.long 0xD9FE0040, 0x48000060 -.long 0xD3C50004, 0x0412A73F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xE05C1000, 0x80041068 -.long 0xE05C1040, 0x80041468 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04128B29 -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028D22 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0040, 0x3400005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x04128F2B -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50004, 0x0412952E -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50000, 0x04029727 -.long 0xD3C50004, 0x0412972F -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xD3C50004, 0x04129D3A -.long 0xD3C50000, 0x04029F33 -.long 0xD3C50004, 0x04129F3B -.long 0xBF8CC07F -.long 0xD3C50000, 0x0402A134 -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50000, 0x0402A737 -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50004, 0x0412A53E -.long 0xD3C50004, 0x0412A73F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 0x0A020228 
-.long 0xD3D84002, 0x18000102 -.long 0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005310 -.long 0xD1CB0001, 0x04045311 -.long 0xD1CB0002, 0x04085312 -.long 0xD1CB0003, 0x040C5313 -.long 0xE07C1000, 0x80050069 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x04105314 -.long 0xD1CB0005, 0x04145315 -.long 0xD1CB0006, 0x04185316 -.long 0xD1CB0007, 0x041C5317 -.long 0xE07C1040, 0x80050469 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt index ee979dc83..628b32788 100644 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx908" .text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 108 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -69,8 +69,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + .symbol: 
'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -242,7 +242,7 @@ amdhsa.kernels: .wavefront_size: 64 ... .end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: +Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt deleted file mode 100644 index 6ac072b7a..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,1580 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx908" -.text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 108 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_group_segment_fixed_size 60000 // lds bytes - .amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - 
.amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: alpha - .size: 4 - .offset: 56 
- .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: 
padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 60000 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ 
-/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 36 -.set vgprG2LA, 40 -.set vgprValuB_X0_I0, 48 -.set vgprValuB_X1_I0, 56 -.set vgprG2LB, 64 -.set vgprLocalWriteAddrA, 80 -.set vgprLocalWriteAddrB, 81 -.set vgprGlobalReadOffsetA, 82 -.set vgprGlobalReadOffsetB, 83 -.set vgprLocalReadAddrA, 84 -.set vgprLocalReadAddrB, 85 -.set vgprSerial, 86 -/* Num VGPR=87 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, 
sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - - - - - -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC00A0600, 0x00000008 -.long 0xC0020B40, 0x0000006C -.long 0xBEFC00FF, 0x00006000 -.long 0x7EC80300 -.long 0x26CA00BF -.long 0x2004C886 -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF840212 -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8B00FF, 0x00020000 -.long 0x80B85418 -.long 0x80B95518 -.long 0x8EB88238 -.long 0x80388438 -.long 0x82398039 -.long 0xBF068039 -.long 0x850AFF38, 0x80000000 -.long 0xBE8A00FF, 0x80000000 -.long 0x9254C030 -.long 0x92545402 -.long 0x8E558452 -.long 0x92533055 -.long 0x92553104 -.long 0x81545354 -.long 0x80545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000108 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0x68ACAA47 -.long 0x68AEAC47 -.long 0x68B0AE47 -.long 0x68B2B047 -.long 0xBECC00FF, 0x00000840 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8F00FF, 0x00020000 
-.long 0x80BA541A -.long 0x80BB551A -.long 0x8EBA823A -.long 0x803A843A -.long 0x823B803B -.long 0xBF06803B -.long 0x850EFF3A, 0x80000000 -.long 0xBE8E00FF, 0x80000000 -.long 0x9254FF32, 0x00000080 -.long 0x92545403 -.long 0x925532A0 -.long 0x92555552 -.long 0x81545554 -.long 0x92553304 -.long 0x80545554 -.long 0x2004CA85 -.long 0x2606CA9F -.long 0xD2850004, 0x00020432 -.long 0x32400704 -.long 0x68404054 -.long 0x24404082 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000108 -.long 0x6842404A -.long 0x6844424A -.long 0x6846444A -.long 0x6848464A -.long 0x684A484A -.long 0x684C4A4A -.long 0x684E4C4A -.long 0x68504E4A -.long 0x6852504A -.long 0x6854524A -.long 0x6856544A -.long 0x6858564A -.long 0x685A584A -.long 0x685C5A4A -.long 0x685E5C4A -.long 0xBECE00FF, 0x00001080 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004200 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002100 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00004200 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBEFC004D -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 
0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBEFC004C -.long 0xBF8C8F78 -.long 0xBF8A0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0x8F2E852D -.long 
0x80AE2E80 -.long 0xBF06802E -.long 0xBF8500DC -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004D -.long 0x802E812E -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F 
-.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004C -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FF24 -.long 0xBF8C4F70 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD3D94008, 0x18000080 -.long 0xD3D94009, 0x18000080 -.long 0xD3D9400A, 0x18000080 -.long 0xD3D9400B, 0x18000080 -.long 0xD3D9400C, 0x18000080 -.long 0xD3D9400D, 0x18000080 -.long 0xD3D9400E, 0x18000080 -.long 0xD3D9400F, 
0x18000080 -.long 0xD3D94010, 0x18000080 -.long 0xD3D94011, 0x18000080 -.long 0xD3D94012, 0x18000080 -.long 0xD3D94013, 0x18000080 -.long 0xD3D94014, 0x18000080 -.long 0xD3D94015, 0x18000080 -.long 0xD3D94016, 0x18000080 -.long 0xD3D94017, 0x18000080 -.long 0xD3D94018, 0x18000080 -.long 0xD3D94019, 0x18000080 -.long 0xD3D9401A, 0x18000080 -.long 0xD3D9401B, 0x18000080 -.long 0xD3D9401C, 0x18000080 -.long 0xD3D9401D, 0x18000080 -.long 0xD3D9401E, 0x18000080 -.long 0xD3D9401F, 0x18000080 -.long 0xC0060700, 0x00000000 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xC00A0800, 0x00000018 -.long 0xD1130001, 0x00013F65 -.long 0xD2850060, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C0C101 -.long 0x2002CA85 -.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C080 -.long 0x68C2C0FF, 0x00002100 -.long 0xBF8A0000 -.long 0xD1130001, 0x00013F65 -.long 0xD2850062, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C4C501 -.long 0x2002CA85 -.long 0x68C4C501 -.long 0x24C4C482 -.long 0x9254FF52, 0x00001080 -.long 0x68C4C454 -.long 0x68C4C4FF, 0x00004200 -.long 0x68C6C4FF, 0x00004200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603FF, 0x00000080 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x24C8CC86 -.long 0x68C8C965 -.long 0xD2850004, 0x0002CCA0 -.long 0xD2850003, 0x00004D04 -.long 0x2608C89F -.long 0xD2850005, 0x00004D04 -.long 0x2608C8BF -.long 0x200C0885 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 
-.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF06802E -.long 0xBF85053B -.long 0xBF8CC47F -.long 0xD3C40000, 0x04020120 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD3C40000, 0x04020322 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xD86C0060, 0x0C000062 -.long 0xD3C40010, 0x04420323 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xBF8CCB7F -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xBF8CC97F -.long 0xD3C40000, 0x04020D2C -.long 
0xD3C40010, 0x04420D2D -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40000, 0x04021938 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40010, 0x04421D3D -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 
0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C40010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xBF8CCB7F -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xBF8CC97F -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40000, 0x04023352 -.long 0xD3C40010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C40000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C40010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C40000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C40010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C40000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C40010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 
0x3B000060 -.long 0xD3C40000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C40010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD3C40000, 0x04023F5E -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C40010, 0x04423F5F -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020120 -.long 0xD86C0060, 0x0C000062 -.long 0xD86C0068, 0x0D000062 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C40000, 0x04020322 -.long 0xD3C40010, 0x04420323 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xD3C40000, 0x04020D2C -.long 0xD3C40010, 0x04420D2D -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C40000, 
0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C40010, 0x04421D3D -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C40010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40000, 0x04023352 
-.long 0xD3C40010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C40000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C40010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C40000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C40010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C40000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C40010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD3C40000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C40010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD3C40000, 0x04023F5E -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD3C40010, 0x04423F5F -.long 0xD86C0038, 0x07000062 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FEAC -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020120 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0060, 0x0C000062 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 
0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C40000, 0x04020322 -.long 0xD3C40010, 0x04420323 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020D2C -.long 0xD3C40010, 0x04420D2D -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1000, 0x80042068 -.long 0xE05C1020, 0x80042468 -.long 0xE05C1040, 0x80042868 -.long 0xE05C1060, 0x80042C68 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C40000, 0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 
-.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C40010, 0x04421D3D -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1080, 0x80043068 -.long 0xE05C10A0, 0x80043468 -.long 0xE05C10C0, 0x80043868 -.long 0xE05C10E0, 0x80043C68 -.long 0xD3C40010, 0x04422141 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40000, 0x04023352 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40000, 0x04023756 -.long 0xD3C40000, 0x04023958 -.long 0xD3C40000, 0x04023B5A -.long 0xD3C40000, 0x04023D5C -.long 0xD3C40000, 0x04023F5E -.long 0xBF068029 -.long 0xBF8400A3 -.long 0xD3C40010, 0x04423151 -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3C40010, 0x04423353 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 0xD3D84007, 0x18000107 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 
0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1000, 0x80050068 -.long 0xD3C40010, 0x04423555 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C1020, 0x80050468 -.long 0xD3C40010, 0x04423757 -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D -.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C1040, 0x80050868 -.long 0xD3C40010, 0x04423959 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C1060, 0x80050C68 -.long 0xD3C40010, 0x04423B5B -.long 0xD3C40010, 0x04423D5D -.long 0xD3C40010, 0x04423F5F -.long 0xBF800003 -.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1080, 0x80050068 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C10A0, 0x80050468 -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C10C0, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 
0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40010, 0x04423353 -.long 0xD3C40010, 0x04423555 -.long 0xD3C40010, 0x04423757 -.long 0xD3C40010, 0x04423959 -.long 0xD3C40010, 0x04423B5B -.long 0xD3C40010, 0x04423D5D -.long 0xD3C40010, 0x04423F5F -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 0xD3D84007, 0x18000107 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D -.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xBF8C0F74 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005320 -.long 0xD1160001, 0x00005321 -.long 0xD1160002, 0x00005322 -.long 0xD1160003, 0x00005323 -.long 0xE07C1000, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005324 -.long 0xD1160005, 0x00005325 -.long 0xD1160006, 0x00005326 -.long 0xD1160007, 0x00005327 -.long 0xE07C1020, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005328 -.long 0xD1160009, 0x00005329 -.long 0xD116000A, 0x0000532A -.long 0xD116000B, 0x0000532B -.long 0xE07C1040, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000532C -.long 0xD116000D, 0x0000532D -.long 0xD116000E, 0x0000532E -.long 0xD116000F, 0x0000532F -.long 0xE07C1060, 0x80050C68 -.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 
0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xBF8C0F70 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005330 -.long 0xD1160001, 0x00005331 -.long 0xD1160002, 0x00005332 -.long 0xD1160003, 0x00005333 -.long 0xE07C1080, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005334 -.long 0xD1160005, 0x00005335 -.long 0xD1160006, 0x00005336 -.long 0xD1160007, 0x00005337 -.long 0xE07C10A0, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005338 -.long 0xD1160009, 0x00005339 -.long 0xD116000A, 0x0000533A -.long 0xD116000B, 0x0000533B -.long 0xE07C10C0, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000533C -.long 0xD116000D, 0x0000533D -.long 0xD116000E, 0x0000533E -.long 0xD116000F, 0x0000533F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 - diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt index e530eba56..a4959c308 100644 --- 
a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx90a" .text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 140 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -70,8 +70,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 
'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + .symbol: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -243,7 +243,7 @@ amdhsa.kernels: .wavefront_size: 64 ... 
.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: +Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt deleted file mode 100644 index 191f354b0..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA90a_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,1581 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx90a" -.text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 140 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_accum_offset 108 // accumulate vgpr offset - .amdhsa_group_segment_fixed_size 60000 // lds bytes - 
.amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - 
.value_type: f32 - .address_space: generic - - .name: alpha - .size: 4 - .offset: 56 - .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: 
MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 60000 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - 
-/******************************************/ -/* VGPR Assignments */ -/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 36 -.set vgprG2LA, 40 -.set vgprValuB_X0_I0, 48 -.set vgprValuB_X1_I0, 56 -.set vgprG2LB, 64 -.set vgprLocalWriteAddrA, 80 -.set vgprLocalWriteAddrB, 81 -.set vgprGlobalReadOffsetA, 82 -.set vgprGlobalReadOffsetB, 83 -.set vgprLocalReadAddrA, 84 -.set vgprLocalReadAddrB, 85 -.set vgprSerial, 86 -/* Num VGPR=87 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, 
sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - - - - - - -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC00A0600, 0x00000008 -.long 0xC0020B40, 0x0000006C -.long 0xBEFC00FF, 0x00006000 -.long 0x7EC80300 -.long 0x26CA00BF -.long 0x2004C886 -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF840212 -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8B00FF, 0x00020000 -.long 0x80B85418 -.long 0x80B95518 -.long 0x8EB88238 -.long 0x80388438 -.long 0x82398039 -.long 0xBF068039 -.long 0x850AFF38, 0x80000000 -.long 0xBE8A00FF, 0x80000000 -.long 0x9254C030 -.long 0x92545402 -.long 0x8E558452 -.long 0x92533055 -.long 0x92553104 -.long 0x81545354 -.long 0x80545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000108 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0x68ACAA47 -.long 0x68AEAC47 -.long 0x68B0AE47 -.long 0x68B2B047 -.long 0xBECC00FF, 0x00000840 -.long 
0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8F00FF, 0x00020000 -.long 0x80BA541A -.long 0x80BB551A -.long 0x8EBA823A -.long 0x803A843A -.long 0x823B803B -.long 0xBF06803B -.long 0x850EFF3A, 0x80000000 -.long 0xBE8E00FF, 0x80000000 -.long 0x9254FF32, 0x00000080 -.long 0x92545403 -.long 0x925532A0 -.long 0x92555552 -.long 0x81545554 -.long 0x92553304 -.long 0x80545554 -.long 0x2004CA85 -.long 0x2606CA9F -.long 0xD2850004, 0x00020432 -.long 0x32400704 -.long 0x68404054 -.long 0x24404082 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000108 -.long 0x6842404A -.long 0x6844424A -.long 0x6846444A -.long 0x6848464A -.long 0x684A484A -.long 0x684C4A4A -.long 0x684E4C4A -.long 0x68504E4A -.long 0x6852504A -.long 0x6854524A -.long 0x6856544A -.long 0x6858564A -.long 0x685A584A -.long 0x685C5A4A -.long 0x685E5C4A -.long 0xBECE00FF, 0x00001080 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004200 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002100 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00004200 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBEFC004D -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 
0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBEFC004C -.long 0xBF8C8F78 -.long 
0xBF8A0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF06802E -.long 0xBF8500DC -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004D -.long 0x802E812E -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 
0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004C -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FF24 -.long 0xBF8C4F70 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD3D94008, 0x18000080 -.long 0xD3D94009, 0x18000080 -.long 0xD3D9400A, 0x18000080 -.long 0xD3D9400B, 0x18000080 -.long 0xD3D9400C, 0x18000080 -.long 
0xD3D9400D, 0x18000080 -.long 0xD3D9400E, 0x18000080 -.long 0xD3D9400F, 0x18000080 -.long 0xD3D94010, 0x18000080 -.long 0xD3D94011, 0x18000080 -.long 0xD3D94012, 0x18000080 -.long 0xD3D94013, 0x18000080 -.long 0xD3D94014, 0x18000080 -.long 0xD3D94015, 0x18000080 -.long 0xD3D94016, 0x18000080 -.long 0xD3D94017, 0x18000080 -.long 0xD3D94018, 0x18000080 -.long 0xD3D94019, 0x18000080 -.long 0xD3D9401A, 0x18000080 -.long 0xD3D9401B, 0x18000080 -.long 0xD3D9401C, 0x18000080 -.long 0xD3D9401D, 0x18000080 -.long 0xD3D9401E, 0x18000080 -.long 0xD3D9401F, 0x18000080 -.long 0xC0060700, 0x00000000 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xC00A0800, 0x00000018 -.long 0xD1130001, 0x00013F65 -.long 0xD2850060, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C0C101 -.long 0x2002CA85 -.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C080 -.long 0x68C2C0FF, 0x00002100 -.long 0xBF8A0000 -.long 0xD1130001, 0x00013F65 -.long 0xD2850062, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C4C501 -.long 0x2002CA85 -.long 0x68C4C501 -.long 0x24C4C482 -.long 0x9254FF52, 0x00001080 -.long 0x68C4C454 -.long 0x68C4C4FF, 0x00004200 -.long 0x68C6C4FF, 0x00004200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603FF, 0x00000080 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x24C8CC86 -.long 0x68C8C965 -.long 0xD2850004, 0x0002CCA0 -.long 0xD2850003, 0x00004D04 -.long 0x2608C89F -.long 0xD2850005, 0x00004D04 -.long 0x2608C8BF -.long 0x200C0885 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 
0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF06802E -.long 0xBF85053B -.long 0xBF8CC47F -.long 0xD3C48000, 0x04020120 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C48010, 0x04420121 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD3C48000, 0x04020322 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xD86C0060, 0x0C000062 -.long 0xD3C48010, 0x04420323 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C48000, 0x04020524 -.long 0xD3C48010, 0x04420525 -.long 0xBF8CCC7F -.long 0xD3C48000, 0x04020726 -.long 0xD3C48010, 0x04420727 -.long 0xBF8CCB7F -.long 0xD3C48000, 0x04020928 -.long 0xD3C48010, 0x04420929 -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04020B2A -.long 0xD3C48010, 
0x04420B2B -.long 0xBF8CC97F -.long 0xD3C48000, 0x04020D2C -.long 0xD3C48010, 0x04420D2D -.long 0xBF8CC87F -.long 0xD3C48000, 0x04020F2E -.long 0xD3C48010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C48000, 0x04021130 -.long 0xD3C48010, 0x04421131 -.long 0xD3C48000, 0x04021332 -.long 0xD3C48010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C48000, 0x04021534 -.long 0xD3C48010, 0x04421535 -.long 0xBF8A0000 -.long 0xD3C48000, 0x04021736 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C48010, 0x04421737 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C48000, 0x04021938 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C48010, 0x04421939 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C48000, 0x04021B3A -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C48010, 0x04421B3B -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C48000, 0x04021D3C -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C48010, 0x04421D3D -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C48000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C48010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 
0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C48010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C48000, 0x04022342 -.long 0xD3C48010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C48000, 0x04022544 -.long 0xD3C48010, 0x04422545 -.long 0xBF8CCC7F -.long 0xD3C48000, 0x04022746 -.long 0xD3C48010, 0x04422747 -.long 0xBF8CCB7F -.long 0xD3C48000, 0x04022948 -.long 0xD3C48010, 0x04422949 -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04022B4A -.long 0xD3C48010, 0x04422B4B -.long 0xBF8CC97F -.long 0xD3C48000, 0x04022D4C -.long 0xD3C48010, 0x04422D4D -.long 0xBF8CC87F -.long 0xD3C48000, 0x04022F4E -.long 0xD3C48010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C48000, 0x04023150 -.long 0xD3C48010, 0x04423151 -.long 0xD3C48000, 0x04023352 -.long 0xD3C48010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C48000, 0x04023554 -.long 0xD3C48010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C48000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C48010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C48000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C48010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C48000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C48010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 
0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD3C48000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C48010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD3C48000, 0x04023F5E -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C48010, 0x04423F5F -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04020120 -.long 0xD86C0060, 0x0C000062 -.long 0xD86C0068, 0x0D000062 -.long 0xD3C48010, 0x04420121 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C48000, 0x04020322 -.long 0xD3C48010, 0x04420323 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C48000, 0x04020524 -.long 0xD3C48010, 0x04420525 -.long 0xD3C48000, 0x04020726 -.long 0xD3C48010, 0x04420727 -.long 0xD3C48000, 0x04020928 -.long 0xD3C48010, 0x04420929 -.long 0xD3C48000, 0x04020B2A -.long 0xD3C48010, 0x04420B2B -.long 0xD3C48000, 0x04020D2C -.long 0xD3C48010, 0x04420D2D -.long 0xD3C48000, 0x04020F2E -.long 0xD3C48010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C48000, 0x04021130 -.long 0xD3C48010, 0x04421131 -.long 0xD3C48000, 0x04021332 -.long 0xD3C48010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C48000, 0x04021534 -.long 0xD3C48010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C48000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C48010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 
0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C48000, 0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C48010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C48000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C48010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C48000, 0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C48010, 0x04421D3D -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C48000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C48010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C48010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C48000, 0x04022342 -.long 0xD3C48010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C48000, 0x04022544 -.long 0xD3C48010, 0x04422545 -.long 0xD3C48000, 0x04022746 -.long 0xD3C48010, 0x04422747 -.long 0xD3C48000, 0x04022948 -.long 0xD3C48010, 0x04422949 -.long 0xD3C48000, 0x04022B4A -.long 0xD3C48010, 0x04422B4B -.long 0xD3C48000, 0x04022D4C -.long 0xD3C48010, 0x04422D4D -.long 0xD3C48000, 0x04022F4E -.long 0xD3C48010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C48000, 
0x04023150 -.long 0xD3C48010, 0x04423151 -.long 0xD3C48000, 0x04023352 -.long 0xD3C48010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C48000, 0x04023554 -.long 0xD3C48010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C48000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C48010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C48000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C48010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C48000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C48010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD3C48000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C48010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD3C48000, 0x04023F5E -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD3C48010, 0x04423F5F -.long 0xD86C0038, 0x07000062 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FEAC -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04020120 -.long 0xD3C48010, 0x04420121 -.long 
0xD86C0060, 0x0C000062 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C48000, 0x04020322 -.long 0xD3C48010, 0x04420323 -.long 0xBF8CCC7F -.long 0xD3C48000, 0x04020524 -.long 0xD3C48010, 0x04420525 -.long 0xD3C48000, 0x04020726 -.long 0xD3C48010, 0x04420727 -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04020928 -.long 0xD3C48010, 0x04420929 -.long 0xD3C48000, 0x04020B2A -.long 0xD3C48010, 0x04420B2B -.long 0xBF8CC87F -.long 0xD3C48000, 0x04020D2C -.long 0xD3C48010, 0x04420D2D -.long 0xD3C48000, 0x04020F2E -.long 0xD3C48010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C48000, 0x04021130 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1000, 0x80042068 -.long 0xE05C1020, 0x80042468 -.long 0xE05C1040, 0x80042868 -.long 0xE05C1060, 0x80042C68 -.long 0xD3C48010, 0x04421131 -.long 0xD3C48000, 0x04021332 -.long 0xD3C48010, 0x04421333 -.long 0xD3C48000, 0x04021534 -.long 0xD3C48010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C48000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C48010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C48000, 0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C48010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C48000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C48010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C48000, 
0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C48010, 0x04421D3D -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C48000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C48010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04022140 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1080, 0x80043068 -.long 0xE05C10A0, 0x80043468 -.long 0xE05C10C0, 0x80043868 -.long 0xE05C10E0, 0x80043C68 -.long 0xD3C48010, 0x04422141 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C48000, 0x04022342 -.long 0xD3C48010, 0x04422343 -.long 0xBF8CCC7F -.long 0xD3C48000, 0x04022544 -.long 0xD3C48010, 0x04422545 -.long 0xD3C48000, 0x04022746 -.long 0xD3C48010, 0x04422747 -.long 0xBF8CCA7F -.long 0xD3C48000, 0x04022948 -.long 0xD3C48010, 0x04422949 -.long 0xD3C48000, 0x04022B4A -.long 0xD3C48010, 0x04422B4B -.long 0xBF8CC87F -.long 0xD3C48000, 0x04022D4C -.long 0xD3C48010, 0x04422D4D -.long 0xD3C48000, 0x04022F4E -.long 0xD3C48010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C48000, 0x04023150 -.long 0xD3C48000, 0x04023352 -.long 0xD3C48000, 0x04023554 -.long 0xD3C48000, 0x04023756 -.long 0xD3C48000, 0x04023958 -.long 0xD3C48000, 0x04023B5A -.long 0xD3C48000, 0x04023D5C -.long 0xD3C48000, 0x04023F5E -.long 0xBF068029 -.long 0xBF8400A3 -.long 0xD3C48010, 0x04423151 -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3C48010, 0x04423353 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 
0xD3D84007, 0x18000107 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1000, 0x80050068 -.long 0xD3C48010, 0x04423555 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C1020, 0x80050468 -.long 0xD3C48010, 0x04423757 -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D -.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C1040, 0x80050868 -.long 0xD3C48010, 0x04423959 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C1060, 0x80050C68 -.long 0xD3C48010, 0x04423B5B -.long 0xD3C48010, 0x04423D5D -.long 0xD3C48010, 0x04423F5F -.long 0xBF800003 -.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1080, 0x80050068 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C10A0, 0x80050468 -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C10C0, 0x80050868 -.long 
0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 -.long 0xD3C48010, 0x04423151 -.long 0xD3C48010, 0x04423353 -.long 0xD3C48010, 0x04423555 -.long 0xD3C48010, 0x04423757 -.long 0xD3C48010, 0x04423959 -.long 0xD3C48010, 0x04423B5B -.long 0xD3C48010, 0x04423D5D -.long 0xD3C48010, 0x04423F5F -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 0xD3D84007, 0x18000107 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D -.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xBF8C0F74 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005320 -.long 0xD1160001, 0x00005321 -.long 0xD1160002, 0x00005322 -.long 0xD1160003, 0x00005323 -.long 0xE07C1000, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005324 -.long 0xD1160005, 0x00005325 -.long 0xD1160006, 0x00005326 -.long 0xD1160007, 0x00005327 -.long 0xE07C1020, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005328 -.long 0xD1160009, 0x00005329 -.long 0xD116000A, 0x0000532A -.long 0xD116000B, 0x0000532B -.long 0xE07C1040, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000532C -.long 0xD116000D, 0x0000532D -.long 0xD116000E, 0x0000532E -.long 0xD116000F, 0x0000532F -.long 0xE07C1060, 0x80050C68 
-.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xBF8C0F70 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005330 -.long 0xD1160001, 0x00005331 -.long 0xD1160002, 0x00005332 -.long 0xD1160003, 0x00005333 -.long 0xE07C1080, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005334 -.long 0xD1160005, 0x00005335 -.long 0xD1160006, 0x00005336 -.long 0xD1160007, 0x00005337 -.long 0xE07C10A0, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005338 -.long 0xD1160009, 0x00005339 -.long 0xD116000A, 0x0000533A -.long 0xD116000B, 0x0000533B -.long 0xE07C10C0, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000533C -.long 0xD116000D, 0x0000533D -.long 0xD116000E, 0x0000533E -.long 0xD116000F, 0x0000533F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt index cd9240586..ad4245a78 100644 --- 
a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx908" .text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 .p2align 8 -.type 
Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 +.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 108 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -69,8 +69,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .symbol: 
'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 + .symbol: 'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -242,7 +242,7 @@ amdhsa.kernels: .wavefront_size: 64 ... 
.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: +Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt deleted file mode 100644 index d0e137f31..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ /dev/null @@ -1,934 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx908" -.text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 108 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_group_segment_fixed_size 28672 // lds bytes - .amdhsa_private_segment_fixed_size 0 - .amdhsa_system_sgpr_workgroup_id_x 1 - 
.amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: alpha - .size: 4 - .offset: 
56 - .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: 
padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 28672 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ 
-/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, 
sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] 
// -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], 
v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], 
v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], 
v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] 
iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 
v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 
v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x7ECC0300 -.long 0x26CE00BF -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20D00884 -.long 0x7EA40568 -.long 0xD1130069, 0x0000A08F -.long 0x7EA20569 -.long 0xBF068151 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254C030 -.long 0x92545402 -.long 0x92559030 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CE85 -.long 0xD2850004, 0x00020030 -.long 0x2602CE9F -.long 0x32180304 -.long 0x68181854 -.long 0x24181882 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x681A1847 -.long 0x681C1A47 -.long 0x681E1C47 -.long 0x68201E47 -.long 0x68222047 -.long 0x68242247 -.long 0x68262447 -.long 0xBECC00FF, 0x00000900 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254A032 -.long 0x92545403 -.long 0x92558832 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CE85 -.long 0xD2850004, 0x00020432 -.long 0x2606CE9F -.long 0x32280704 -.long 0x68282854 -.long 0x24282882 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000120 -.long 0x682A284A 
-.long 0x682C2A4A -.long 0x682E2C4A -.long 0xBECE00FF, 0x00000480 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004800 -.long 0xBF8A0000 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00001200 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002400 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBEFC004F -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004F -.long 0xBF8A0000 -.long 0xE0511000, 
0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F67 -.long 0xD2850040, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68808101 -.long 0x24808082 -.long 0x68808102 -.long 0x9254FF52, 0x00000900 -.long 0x68808054 -.long 0x68808080 -.long 0x688280FF, 0x00002400 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F67 -.long 0xD2850042, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68848501 -.long 0x24848482 -.long 0x68848502 -.long 0x688484FF, 0x00004800 -.long 0x688684FF, 0x00001200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603A0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0xD2850003, 0x0002D090 -.long 0x2608CE8F 
-.long 0xD2850005, 0x00004D04 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0044, 0x020AD76A -.long 0x925426C0 -.long 0x688A8854 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603A0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850003, 0x0002D090 -.long 0x2608CE8F -.long 0xD2850005, 0x00004904 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0046, 0x020AD76A -.long 0x925426C0 -.long 0x688E8C54 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x20000042 -.long 0xD9FE0900, 0x28000042 -.long 0xD9FE0040, 0x24000042 -.long 0xD9FE0940, 0x2C000042 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD9FE0040, 0x14000040 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04024110 -.long 0xD3C50004, 0x04125110 -.long 0xD3C50000, 0x04024311 -.long 0xD3C50004, 0x04125311 -.long 0xD3C50000, 0x04024512 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04125512 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C50000, 0x04024713 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C50004, 0x04125713 -.long 0xD9FE0040, 0x34000043 -.long 0xBF8CC37F -.long 0xD3C50000, 0x04024914 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C50004, 0x04125914 -.long 0xD3C50000, 0x04024B15 -.long 0xD3C50004, 0x04125B15 -.long 0xD3C50000, 0x04024D16 -.long 0xD3C50004, 0x04125D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C50000, 0x04024F17 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C50004, 0x04125F17 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04026118 -.long 0xD3C50004, 0x04127118 -.long 0xD3C50000, 0x04026319 -.long 0xD3C50004, 0x04127319 -.long 0xD3C50000, 0x0402651A -.long 0xBF8A0000 -.long 0xD3C50004, 0x0412751A -.long 
0xD9FE0000, 0x20000042 -.long 0xD3C50000, 0x0402671B -.long 0xD9FE0900, 0x28000042 -.long 0xD3C50004, 0x0412771B -.long 0xD9FE0040, 0x24000042 -.long 0xBF8CC37F -.long 0xD3C50000, 0x0402691C -.long 0xD9FE0940, 0x2C000042 -.long 0xD3C50004, 0x0412791C -.long 0xD3C50000, 0x04026B1D -.long 0xD3C50004, 0x04127B1D -.long 0xD3C50000, 0x04026D1E -.long 0xD3C50004, 0x04127D1E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD3C50000, 0x04026F1F -.long 0xD9FE0040, 0x14000040 -.long 0xD3C50004, 0x04127F1F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C50000, 0x04024110 -.long 0xE05C1000, 0x80040844 -.long 0xE05C1000, 0x80040C45 -.long 0xD3C50004, 0x04125110 -.long 0xD3C50000, 0x04024311 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04125311 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C50000, 0x04024512 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C50004, 0x04125512 -.long 0xD9FE0040, 0x34000043 -.long 0xD3C50000, 0x04024713 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C50004, 0x04125713 -.long 0xBF8CC37F -.long 0xD3C50000, 0x04024914 -.long 0xD3C50004, 0x04125914 -.long 0xD3C50000, 0x04024B15 -.long 0xD3C50004, 0x04125B15 -.long 0xD3C50000, 0x04024D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C50004, 0x04125D16 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C50000, 0x04024F17 -.long 0xD3C50004, 0x04125F17 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04026118 -.long 0xD3C50004, 0x04127118 -.long 0xD3C50000, 0x04026319 -.long 0xD3C50004, 0x04127319 -.long 0xD3C50000, 0x0402651A -.long 0xD3C50004, 0x0412751A -.long 0xD3C50000, 0x0402671B -.long 0xD3C50004, 0x0412771B -.long 0xBF8CC07F -.long 0xD3C50000, 0x0402691C -.long 0xD3C50000, 0x04026B1D -.long 0xD3C50000, 0x04026D1E -.long 0xD3C50000, 0x04026F1F -.long 0xD3C50004, 0x0412791C -.long 0xD3C50004, 0x04127B1D -.long 0xD3C50004, 0x04127D1E -.long 0xD3C50004, 0x04127F1F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 0x0A020228 -.long 0xD3D84002, 0x18000102 
-.long 0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005308 -.long 0xD1CB0001, 0x04045309 -.long 0xD1CB0002, 0x0408530A -.long 0xD1CB0003, 0x040C530B -.long 0xE07C1000, 0x80050046 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x0410530C -.long 0xD1CB0005, 0x0414530D -.long 0xD1CB0006, 0x0418530E -.long 0xD1CB0007, 0x041C530F -.long 0xE07C1000, 0x80050447 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt index c3629ec79..c1c1406e5 100644 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt @@ -34,13 +34,13 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx90a" .text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl 
Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function +.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel 
Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 +.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr 116 // vgprs .amdhsa_next_free_sgpr 98 // sgprs @@ -70,8 +70,8 @@ amdhsa.version: - 1 - 0 amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8.kd' + - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 + .symbol: 
'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8.kd' .language: OpenCL C .language_version: - 2 @@ -243,7 +243,7 @@ amdhsa.kernels: .wavefront_size: 64 ... .end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: +Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8: /******************************************/ /* Asm syntax workarounds */ diff --git a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt deleted file mode 100644 index 0af954f43..000000000 --- a/Tensile/ReplacementKernels-cov3/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA90a_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ /dev/null @@ -1,937 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.amdgcn_target "amdgcn-amd-amdhsa--gfx90a" -.text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function -.section .rodata,#alloc -.p2align 6 -.amdhsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_next_free_vgpr 116 // vgprs - .amdhsa_next_free_sgpr 98 // sgprs - .amdhsa_accum_offset 108 - .amdhsa_group_segment_fixed_size 28672 // lds bytes - .amdhsa_private_segment_fixed_size 0 - 
.amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - .amdhsa_system_vgpr_workitem_id 0 -.end_amdhsa_kernel -.text - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amdgpu_metadata ---- -amdhsa.version: - - 1 - - 0 -amdhsa.kernels: - - .name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - .symbol: 'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8.kd' - .language: OpenCL C - .language_version: - - 2 - - 0 - .args: - - .name: sizeC - .size: 8 - .offset: 0 - .value_kind: by_value - .value_type: u64 - - .name: sizeA - .size: 8 - .offset: 8 - .value_kind: by_value - .value_type: u64 - - .name: sizeB - .size: 8 - .offset: 16 - .value_kind: by_value - .value_type: u64 - - .name: D - .size: 8 - .offset: 24 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: C - .size: 8 - .offset: 32 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: A - .size: 8 - .offset: 40 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - - .name: B - .size: 8 - .offset: 48 - .value_kind: global_buffer - .value_type: f32 - .address_space: generic - 
- .name: alpha - .size: 4 - .offset: 56 - .value_kind: by_value - .value_type: f32 - - .name: beta - .size: 4 - .offset: 60 - .value_kind: by_value - .value_type: f32 - - .name: strideD0 - .size: 4 - .offset: 64 - .value_kind: by_value - .value_type: u32 - - .name: strideD1 - .size: 4 - .offset: 68 - .value_kind: by_value - .value_type: u32 - - .name: strideC0 - .size: 4 - .offset: 72 - .value_kind: by_value - .value_type: u32 - - .name: strideC1 - .size: 4 - .offset: 76 - .value_kind: by_value - .value_type: u32 - - .name: strideA0 - .size: 4 - .offset: 80 - .value_kind: by_value - .value_type: u32 - - .name: strideA1 - .size: 4 - .offset: 84 - .value_kind: by_value - .value_type: u32 - - .name: strideB0 - .size: 4 - .offset: 88 - .value_kind: by_value - .value_type: u32 - - .name: strideB1 - .size: 4 - .offset: 92 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree0 - .size: 4 - .offset: 96 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree1 - .size: 4 - .offset: 100 - .value_kind: by_value - .value_type: u32 - - .name: SizesFree2 - .size: 4 - .offset: 104 - .value_kind: by_value - .value_type: u32 - - .name: SizesSum0 - .size: 4 - .offset: 108 - .value_kind: by_value - .value_type: u32 - - .name: OrigStaggerUIter - .size: 4 - .offset: 112 - .value_kind: by_value - .value_type: i32 - - .name: NumWorkGroups0 - .size: 4 - .offset: 116 - .value_kind: by_value - .value_type: u32 - - .name: NumWorkGroups1 - .size: 4 - .offset: 120 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberProblemNumGroupTiles0 - .size: 4 - .offset: 124 - .value_kind: by_value - .value_type: u32 - - .name: GridNumWorkGroups0 - .size: 4 - .offset: 128 - .value_kind: by_value - .value_type: u32 - - .name: NumFullBlocks - .size: 4 - .offset: 132 - .value_kind: by_value - .value_type: u32 - - .name: WgmRemainder1 - .size: 4 - .offset: 136 - .value_kind: by_value - .value_type: u32 - - .name: MagicNumberWgmRemainder1 - .size: 4 - .offset: 140 - .value_kind: 
by_value - .value_type: u32 - - .name: padding - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 28672 - .kernarg_segment_align: 8 - .kernarg_segment_size: 152 - .max_flat_workgroup_size: 512 - .private_segment_fixed_size: 0 - .sgpr_count: 98 - .sgpr_spill_count: 0 - .vgpr_count: 108 - .vgpr_spill_count: 0 - .wavefront_size: 64 -... -.end_amdgpu_metadata -Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA90a_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ 
-/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, 
sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] 
// -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], 
v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], 
v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], 
v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] 
iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 
v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 
v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x20040086 -.long 0x7ECC0300 -.long 0x26CE00BF -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x26D00483 -.long 0x7EA40568 -.long 0xD1130069, 0x0000A08F -.long 0x7EA20569 -.long 0x7E100502 -.long 0xBF088308 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254C030 -.long 0x92545402 -.long 0x92559030 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CE85 -.long 0xD2850004, 0x00020030 -.long 0x2602CE9F -.long 0x32180304 -.long 0x68181854 -.long 0x24181882 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x681A1847 -.long 0x681C1A47 -.long 0x681E1C47 -.long 0x68201E47 -.long 0x68222047 -.long 0x68242247 -.long 0x68262447 -.long 0xBECC00FF, 0x00000900 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254A032 -.long 0x92545403 -.long 0x92558832 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CE85 -.long 0xD2850004, 0x00020432 -.long 0x2606CE9F -.long 0x32280704 -.long 0x68282854 -.long 0x24282882 -.long 0x8E4A8332 -.long 
0x80CAFF4A, 0x00000120 -.long 0x682A284A -.long 0x682C2A4A -.long 0x682E2C4A -.long 0xBECE00FF, 0x00000480 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004800 -.long 0xBF8A0000 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00001200 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002400 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBEFC004F -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004F 
-.long 0xBF8A0000 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F67 -.long 0xD2850040, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68808101 -.long 0x24808082 -.long 0x68808102 -.long 0x9254FF52, 0x00000900 -.long 0x68808054 -.long 0x68808080 -.long 0x688280FF, 0x00002400 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F67 -.long 0xD2850042, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68848501 -.long 0x24848482 -.long 0x68848502 -.long 0x688484FF, 0x00004800 -.long 0x688684FF, 0x00001200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603A0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 
0xD2850003, 0x0002D090 -.long 0x2608CE8F -.long 0xD2850005, 0x00004D04 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0044, 0x020AD76A -.long 0x925426C0 -.long 0x688A8854 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603A0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850003, 0x0002D090 -.long 0x2608CE8F -.long 0xD2850005, 0x00004904 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0046, 0x020AD76A -.long 0x925426C0 -.long 0x688E8C54 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x20000042 -.long 0xD9FE0900, 0x28000042 -.long 0xD9FE0040, 0x24000042 -.long 0xD9FE0940, 0x2C000042 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD9FE0040, 0x14000040 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C58000, 0x04024110 -.long 0xD3C58004, 0x04125110 -.long 0xD3C58000, 0x04024311 -.long 0xD3C58004, 0x04125311 -.long 0xD3C58000, 0x04024512 -.long 0xBF8A0000 -.long 0xD3C58004, 0x04125512 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C58000, 0x04024713 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C58004, 0x04125713 -.long 0xD9FE0040, 0x34000043 -.long 0xBF8CC37F -.long 0xD3C58000, 0x04024914 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C58004, 0x04125914 -.long 0xD3C58000, 0x04024B15 -.long 0xD3C58004, 0x04125B15 -.long 0xD3C58000, 0x04024D16 -.long 0xD3C58004, 0x04125D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C58000, 0x04024F17 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C58004, 0x04125F17 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C58000, 0x04026118 -.long 0xD3C58004, 0x04127118 -.long 0xD3C58000, 0x04026319 -.long 0xD3C58004, 0x04127319 -.long 0xD3C58000, 0x0402651A -.long 0xBF8A0000 
-.long 0xD3C58004, 0x0412751A -.long 0xD9FE0000, 0x20000042 -.long 0xD3C58000, 0x0402671B -.long 0xD9FE0900, 0x28000042 -.long 0xD3C58004, 0x0412771B -.long 0xD9FE0040, 0x24000042 -.long 0xBF8CC37F -.long 0xD3C58000, 0x0402691C -.long 0xD9FE0940, 0x2C000042 -.long 0xD3C58004, 0x0412791C -.long 0xD3C58000, 0x04026B1D -.long 0xD3C58004, 0x04127B1D -.long 0xD3C58000, 0x04026D1E -.long 0xD3C58004, 0x04127D1E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD3C58000, 0x04026F1F -.long 0xD9FE0040, 0x14000040 -.long 0xD3C58004, 0x04127F1F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C58000, 0x04024110 -.long 0xE05C1000, 0x80040844 -.long 0xE05C1000, 0x80040C45 -.long 0xD3C58004, 0x04125110 -.long 0xD3C58000, 0x04024311 -.long 0xBF8A0000 -.long 0xD3C58004, 0x04125311 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C58000, 0x04024512 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C58004, 0x04125512 -.long 0xD9FE0040, 0x34000043 -.long 0xD3C58000, 0x04024713 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C58004, 0x04125713 -.long 0xBF8CC37F -.long 0xD3C58000, 0x04024914 -.long 0xD3C58004, 0x04125914 -.long 0xD3C58000, 0x04024B15 -.long 0xD3C58004, 0x04125B15 -.long 0xD3C58000, 0x04024D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C58004, 0x04125D16 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C58000, 0x04024F17 -.long 0xD3C58004, 0x04125F17 -.long 0xBF8CC17F -.long 0xD3C58000, 0x04026118 -.long 0xD3C58004, 0x04127118 -.long 0xD3C58000, 0x04026319 -.long 0xD3C58004, 0x04127319 -.long 0xD3C58000, 0x0402651A -.long 0xD3C58004, 0x0412751A -.long 0xD3C58000, 0x0402671B -.long 0xD3C58004, 0x0412771B -.long 0xBF8CC07F -.long 0xD3C58000, 0x0402691C -.long 0xD3C58000, 0x04026B1D -.long 0xD3C58000, 0x04026D1E -.long 0xD3C58000, 0x04026F1F -.long 0xD3C58004, 0x0412791C -.long 0xD3C58004, 0x04127B1D -.long 0xD3C58004, 0x04127D1E -.long 0xD3C58004, 0x04127F1F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 
0x0A020228 -.long 0xD3D84002, 0x18000102 -.long 0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005308 -.long 0xD1CB0001, 0x04045309 -.long 0xD1CB0002, 0x0408530A -.long 0xD1CB0003, 0x040C530B -.long 0xE07C1000, 0x80050046 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x0410530C -.long 0xD1CB0005, 0x0414530D -.long 0xD1CB0006, 0x0418530E -.long 0xD1CB0007, 0x041C530F -.long 0xE07C1000, 0x80050447 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt index 99e82a84c..af1af7e5e 100644 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt @@ -35,12 +35,12 @@ .hsa_code_object_version 2,0 .hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" .text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl 
Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 
-Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: +.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function +.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: .amd_kernel_code_t is_ptr64 = 1 enable_sgpr_kernarg_segment_ptr = 1 @@ -73,8 +73,8 @@ Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_AS .amd_amdgpu_hsa_metadata Version: [ 1, 0 ] Kernels: - - Name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - SymbolName: 
'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8@kd' + - Name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + SymbolName: 'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8@kd' Language: OpenCL C LanguageVersion: [ 2, 0 ] Args: diff --git a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt deleted file mode 100644 index 69b6bb006..000000000 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT32x64x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,931 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.hsa_code_object_version 2,0 -.hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" -.text -.protected Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 
-Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8: -.amd_kernel_code_t - is_ptr64 = 1 - enable_sgpr_kernarg_segment_ptr = 1 - kernarg_segment_byte_size = 148 // bytes of kern args - workitem_vgpr_count = 108 // vgprs - wavefront_sgpr_count = 98 // sgprs - compute_pgm_rsrc1_vgprs = 26 // floor((108-1)/4) - compute_pgm_rsrc1_sgprs = 12 // floor((98-1)/8) - compute_pgm_rsrc2_tidig_comp_cnt = 0 // 1D wg - compute_pgm_rsrc2_tgid_x_en = 1 // wg.x - compute_pgm_rsrc2_tgid_y_en = 1 // wg.y - compute_pgm_rsrc2_tgid_z_en = 1 // wg.z - workgroup_group_segment_byte_size = 30000// lds bytes - compute_pgm_rsrc2_user_sgpr = 2 // vcc - kernarg_segment_alignment = 4 - group_segment_alignment = 4 - private_segment_alignment = 4 -.end_amd_kernel_code_t - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amd_amdgpu_hsa_metadata -Version: [ 1, 0 ] -Kernels: - - Name: Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8 - SymbolName: 
'Cijk_Alik_Bljk_SB_MT32x64x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG16_32_1_WGM8@kd' - Language: OpenCL C - LanguageVersion: [ 2, 0 ] - Args: - - Name: sizeC - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeA - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeB - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: D - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: C - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: A - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: B - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: alpha - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: beta - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: strideD0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideD1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideB0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideB1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree2 - Size: 4 - Align: 4 - ValueKind: 
ByValue - ValueType: U32 - - Name: SizesSum0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: OrigStaggerUIter - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: I32 - - Name: NumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumWorkGroups1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberProblemNumGroupTiles0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: GridNumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumFullBlocks - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: WgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberWgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: padding - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - CodeProps: - KernargSegmentSize: 148 - GroupSegmentFixedSize: 28672 - PrivateSegmentFixedSize: 0 - KernargSegmentAlign: 8 - WavefrontSize: 64 - NumSGPRs: 98 - NumVGPRs: 108 - MaxFlatWorkGroupSize: 512 -.end_amd_amdgpu_hsa_metadata - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* 
Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ -/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set 
sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. 
Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] // -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], 
v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] 
-v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 
v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 
v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], 
v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], 
v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x7EC80300 -.long 0x26CA00BF -.long 0xB8D0F804 -.long 
0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254A030 -.long 0x92545402 -.long 0x92558830 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0xBECC00FF, 0x00000480 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254C032 -.long 0x92545403 -.long 0x92559032 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CA85 -.long 0xD2850004, 0x00020432 -.long 0x2606CA9F -.long 0x32AC0704 -.long 0x68ACAC54 -.long 0x24ACAC82 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000120 -.long 0x68AEAC4A -.long 0x68B0AE4A -.long 0x68B2B04A -.long 0x68B4B24A -.long 0x68B6B44A -.long 0x68B8B64A -.long 0x68BAB84A -.long 0xBECE00FF, 0x00000900 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00002400 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00001200 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00002400 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBEFC004D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 
0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004D -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511120, 0x80023153 -.long 0xE0511240, 0x80023254 -.long 0xE0511360, 0x80023355 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80034456 -.long 0xE0511120, 0x80034557 -.long 0xE0511240, 0x80034658 -.long 0xE0511360, 0x80034759 -.long 0xE0511480, 0x8003485A -.long 0xE05115A0, 0x8003495B -.long 0xE05116C0, 0x80034A5C -.long 0xE05117E0, 0x80034B5D -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBEFC004C -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 
0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F65 -.long 0xD285005E, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68BCBD01 -.long 0x24BCBC82 -.long 0x68BCBD02 -.long 0x68BCBC80 -.long 0x68BEBCFF, 0x00001200 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F65 -.long 0xD2850060, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CA84 -.long 0x24020282 -.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C102 -.long 0x9254FF52, 0x00000900 -.long 0x68C0C054 -.long 0x68C0C0FF, 0x00002400 -.long 0x68C2C0FF, 0x00002400 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603C0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 0x00004D04 -.long 0x2608CA8F -.long 0xD2850005, 0x00004D04 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603C0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850004, 0x0002CC90 -.long 0xD2850003, 0x00004904 -.long 0x2608CA8F -.long 0xD2850005, 0x00004904 -.long 0x200CCA84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402A0 -.long 0x32D40C54 -.long 0xD1FE0069, 0x020AD76A -.long 0xBF8A0000 -.long 0xD9FE0000, 0x2000005E -.long 
0xD9FE0900, 0x2800005E -.long 0xD9FE0040, 0x2400005E -.long 0xD9FE0940, 0x2C00005E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD9FE0040, 0x48000060 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xD3C50004, 0x04128B29 -.long 0xD3C50000, 0x04028D22 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128F2B -.long 0xD9FE0040, 0x3400005F -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xD3C50004, 0x0412952E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50000, 0x04029727 -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50004, 0x0412972F -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04129D3A -.long 0xD9FE0000, 0x2000005E -.long 0xD3C50000, 0x04029F33 -.long 0xD9FE0900, 0x2800005E -.long 0xD3C50004, 0x04129F3B -.long 0xD9FE0040, 0x2400005E -.long 0xBF8CC37F -.long 0xD3C50000, 0x0402A134 -.long 0xD9FE0940, 0x2C00005E -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50004, 0x0412A53E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x44000060 -.long 0xD3C50000, 0x0402A737 -.long 0xD9FE0040, 0x48000060 -.long 0xD3C50004, 0x0412A73F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C50000, 0x04028920 -.long 0xE05C1000, 0x80041068 -.long 0xE05C1040, 0x80041468 -.long 0xD3C50004, 0x04128928 -.long 0xD3C50000, 0x04028B21 -.long 0xBF8A0000 -.long 0xD3C50004, 
0x04128B29 -.long 0xD9FE0000, 0x3000005F -.long 0xD3C50000, 0x04028D22 -.long 0xD9FE0900, 0x3800005F -.long 0xD3C50004, 0x04128D2A -.long 0xD9FE0040, 0x3400005F -.long 0xD3C50000, 0x04028F23 -.long 0xD9FE0940, 0x3C00005F -.long 0xD3C50004, 0x04128F2B -.long 0xBF8CC37F -.long 0xD3C50000, 0x04029124 -.long 0xD3C50004, 0x0412912C -.long 0xD3C50000, 0x04029325 -.long 0xD3C50004, 0x0412932D -.long 0xD3C50000, 0x04029526 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x4C000061 -.long 0xD3C50004, 0x0412952E -.long 0xD9FE0040, 0x50000061 -.long 0xD3C50000, 0x04029727 -.long 0xD3C50004, 0x0412972F -.long 0xBF8CC17F -.long 0xD3C50000, 0x04029930 -.long 0xD3C50004, 0x04129938 -.long 0xD3C50000, 0x04029B31 -.long 0xD3C50004, 0x04129B39 -.long 0xD3C50000, 0x04029D32 -.long 0xD3C50004, 0x04129D3A -.long 0xD3C50000, 0x04029F33 -.long 0xD3C50004, 0x04129F3B -.long 0xBF8CC07F -.long 0xD3C50000, 0x0402A134 -.long 0xD3C50000, 0x0402A335 -.long 0xD3C50000, 0x0402A536 -.long 0xD3C50000, 0x0402A737 -.long 0xD3C50004, 0x0412A13C -.long 0xD3C50004, 0x0412A33D -.long 0xD3C50004, 0x0412A53E -.long 0xD3C50004, 0x0412A73F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 0x0A020228 -.long 0xD3D84002, 0x18000102 -.long 0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005310 -.long 0xD1CB0001, 0x04045311 -.long 0xD1CB0002, 0x04085312 -.long 0xD1CB0003, 0x040C5313 -.long 0xE07C1000, 0x80050069 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x04105314 -.long 0xD1CB0005, 0x04145315 -.long 0xD1CB0006, 0x04185316 -.long 0xD1CB0007, 0x041C5317 -.long 0xE07C1040, 0x80050469 -.long 0xBF8C0000 -.long 0xBF810000 diff --git 
a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt index fe188b085..b4c3e67cf 100644 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_MDA2_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt @@ -35,12 +35,12 @@ .hsa_code_object_version 2,0 .hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" .text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +.globl 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: +.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8,@function +.amdgpu_hsa_kernel 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 +Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8: .amd_kernel_code_t is_ptr64 = 1 enable_sgpr_kernarg_segment_ptr = 1 @@ -73,8 +73,8 @@ Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_A .amd_amdgpu_hsa_metadata Version: [ 1, 0 ] Kernels: - - Name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - SymbolName: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8@kd' + - Name: Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8 + SymbolName: 
'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8@kd' Language: OpenCL C LanguageVersion: [ 2, 0 ] Args: diff --git a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt deleted file mode 100644 index d04301967..000000000 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x128x32_AF0EM8_ASEM8_FL0_GRVW4_ISA908_PGR1_PLR1_SU32_TT4_4_VAW1_VW4_WG16_32_1_WGM8.s.txt +++ /dev/null @@ -1,1579 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. */ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. */ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.hsa_code_object_version 2,0 -.hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" -.text -.protected Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8,@function -.amdgpu_hsa_kernel 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 -Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8: -.amd_kernel_code_t - is_ptr64 = 1 - enable_sgpr_kernarg_segment_ptr = 1 - kernarg_segment_byte_size = 148 // bytes of kern args - workitem_vgpr_count = 108 // vgprs - wavefront_sgpr_count = 98 // sgprs - compute_pgm_rsrc1_vgprs = 26 // floor((108-1)/4) - compute_pgm_rsrc1_sgprs = 12 // floor((98-1)/8) - compute_pgm_rsrc2_tidig_comp_cnt = 0 // 1D wg - compute_pgm_rsrc2_tgid_x_en = 1 // wg.x - compute_pgm_rsrc2_tgid_y_en = 1 // wg.y - compute_pgm_rsrc2_tgid_z_en = 1 // wg.z - workgroup_group_segment_byte_size = 60000// lds bytes - compute_pgm_rsrc2_user_sgpr = 2 // vcc - kernarg_segment_alignment = 4 - group_segment_alignment = 4 - private_segment_alignment = 4 -.end_amd_kernel_code_t - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amd_amdgpu_hsa_metadata -Version: [ 1, 0 ] -Kernels: - - Name: 
Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8 - SymbolName: 'Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8@kd' - Language: OpenCL C - LanguageVersion: [ 2, 0 ] - Args: - - Name: sizeC - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeA - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeB - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: D - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: C - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: A - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: B - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: alpha - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: beta - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: strideD0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideD1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideB0 - Size: 4 - Align: 4 - 
ValueKind: ByValue - ValueType: U32 - - Name: strideB1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree2 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesSum0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: OrigStaggerUIter - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: I32 - - Name: NumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumWorkGroups1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberProblemNumGroupTiles0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: GridNumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumFullBlocks - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: WgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberWgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: padding - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - CodeProps: - KernargSegmentSize: 148 - GroupSegmentFixedSize: 60000 - PrivateSegmentFixedSize: 0 - KernargSegmentAlign: 8 - WavefrontSize: 64 - NumSGPRs: 98 - NumVGPRs: 108 - MaxFlatWorkGroupSize: 512 -.end_amd_amdgpu_hsa_metadata - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, 
dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ -/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 36 -.set vgprG2LA, 40 -.set vgprValuB_X0_I0, 48 -.set vgprValuB_X1_I0, 56 -.set vgprG2LB, 64 -.set vgprLocalWriteAddrA, 80 -.set vgprLocalWriteAddrB, 81 -.set vgprGlobalReadOffsetA, 82 -.set vgprGlobalReadOffsetB, 83 -.set vgprLocalReadAddrA, 84 -.set vgprLocalReadAddrB, 85 -.set vgprSerial, 86 -/* Num VGPR=87 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 
-.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. 
Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - - - - - -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC00A0600, 0x00000008 -.long 0xC0020B40, 0x0000006C -.long 0xBEFC00FF, 0x00006000 -.long 0x7EC80300 -.long 0x26CA00BF -.long 0x2004C886 -.long 0xB8D0F804 -.long 0xD1130004, 0x0000A0B0 -.long 0x20CC0884 -.long 0x7EA40566 -.long 0xD1130067, 0x0000A08F -.long 0x7EA20567 -.long 0xBF068151 -.long 0xBF840212 -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8B00FF, 0x00020000 -.long 0x80B85418 -.long 0x80B95518 -.long 0x8EB88238 -.long 0x80388438 -.long 0x82398039 -.long 0xBF068039 -.long 0x850AFF38, 0x80000000 -.long 0xBE8A00FF, 0x80000000 -.long 0x9254C030 -.long 0x92545402 -.long 0x8E558452 -.long 0x92533055 -.long 0x92553104 -.long 0x81545354 -.long 0x80545554 -.long 0x2000CA85 -.long 0xD2850004, 0x00020030 -.long 0x2602CA9F -.long 0x32A40304 -.long 0x68A4A454 -.long 0x24A4A482 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000108 -.long 0x68A6A447 -.long 0x68A8A647 -.long 0x68AAA847 -.long 0x68ACAA47 -.long 0x68AEAC47 -.long 0x68B0AE47 -.long 0x68B2B047 -.long 0xBECC00FF, 0x00000840 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8F00FF, 0x00020000 -.long 0x80BA541A -.long 0x80BB551A -.long 0x8EBA823A -.long 0x803A843A -.long 0x823B803B -.long 0xBF06803B -.long 0x850EFF3A, 0x80000000 -.long 0xBE8E00FF, 0x80000000 -.long 0x9254FF32, 0x00000080 -.long 0x92545403 -.long 0x925532A0 -.long 0x92555552 -.long 0x81545554 -.long 0x92553304 -.long 0x80545554 -.long 0x2004CA85 -.long 0x2606CA9F -.long 0xD2850004, 0x00020432 -.long 0x32400704 -.long 0x68404054 -.long 0x24404082 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000108 -.long 0x6842404A -.long 0x6844424A -.long 0x6846444A -.long 0x6848464A -.long 0x684A484A -.long 0x684C4A4A -.long 0x684E4C4A -.long 0x68504E4A -.long 0x6852504A -.long 0x6854524A -.long 0x6856544A -.long 0x6858564A -.long 0x685A584A -.long 0x685C5A4A -.long 0x685E5C4A 
-.long 0xBECE00FF, 0x00001080 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004200 -.long 0xBF8A0000 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002100 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00004200 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBEFC004D -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 
0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBEFC004C -.long 0xBF8C8F78 -.long 0xBF8A0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF06802E -.long 0xBF8500DC -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004E -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 
0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8F0001 -.long 0xBF8A0000 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004D -.long 0x802E812E -.long 0xBF8A0000 -.long 0xE0511000, 0x80023052 -.long 0xE0511108, 0x80023153 -.long 0xE0511210, 0x80023254 -.long 0xE0511318, 0x80023355 -.long 0xE0511420, 0x80023456 -.long 0xE0511528, 0x80023557 -.long 0xE0511630, 0x80023658 -.long 0xE0511738, 0x80023759 -.long 0xBEFC004F -.long 0xBF800000 -.long 0xE0511000, 0x80033020 -.long 0xE0511108, 0x80033121 -.long 0xE0511210, 0x80033222 -.long 0xE0511318, 0x80033323 -.long 0xE0511420, 0x80033424 -.long 0xE0511528, 0x80033525 -.long 0xE0511630, 0x80033626 -.long 0xE0511738, 0x80033727 -.long 0xE0511840, 0x80033828 -.long 0xE0511948, 0x80033929 -.long 0xE0511A50, 0x80033A2A -.long 0xE0511B58, 0x80033B2B -.long 0xE0511C60, 0x80033C2C -.long 0xE0511D68, 0x80033D2D -.long 0xE0511E70, 0x80033E2E -.long 0xE0511F78, 0x80033F2F -.long 0xBF8C8F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x68A4A4FF, 0x00000080 -.long 0x68A6A6FF, 0x00000080 -.long 0x68A8A8FF, 0x00000080 -.long 0x68AAAAFF, 0x00000080 -.long 0x68ACACFF, 0x00000080 -.long 0x68AEAEFF, 0x00000080 -.long 0x68B0B0FF, 
0x00000080 -.long 0x68B2B2FF, 0x00000080 -.long 0x684040FF, 0x00000080 -.long 0x684242FF, 0x00000080 -.long 0x684444FF, 0x00000080 -.long 0x684646FF, 0x00000080 -.long 0x684848FF, 0x00000080 -.long 0x684A4AFF, 0x00000080 -.long 0x684C4CFF, 0x00000080 -.long 0x684E4EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBF8C4F78 -.long 0xBF8A0000 -.long 0xBF8F0001 -.long 0x685050FF, 0x00000080 -.long 0x685252FF, 0x00000080 -.long 0x685454FF, 0x00000080 -.long 0x685656FF, 0x00000080 -.long 0x685858FF, 0x00000080 -.long 0x685A5AFF, 0x00000080 -.long 0x685C5CFF, 0x00000080 -.long 0x685E5EFF, 0x00000080 -.long 0xBF8F0000 -.long 0xBEFC004C -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FF24 -.long 0xBF8C4F70 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD3D94008, 0x18000080 -.long 0xD3D94009, 0x18000080 -.long 0xD3D9400A, 0x18000080 -.long 0xD3D9400B, 0x18000080 -.long 0xD3D9400C, 0x18000080 -.long 0xD3D9400D, 0x18000080 -.long 0xD3D9400E, 0x18000080 -.long 0xD3D9400F, 0x18000080 -.long 0xD3D94010, 0x18000080 -.long 0xD3D94011, 0x18000080 -.long 0xD3D94012, 0x18000080 -.long 0xD3D94013, 0x18000080 -.long 0xD3D94014, 0x18000080 -.long 0xD3D94015, 0x18000080 -.long 0xD3D94016, 0x18000080 -.long 0xD3D94017, 0x18000080 -.long 0xD3D94018, 0x18000080 -.long 0xD3D94019, 0x18000080 -.long 0xD3D9401A, 0x18000080 -.long 0xD3D9401B, 0x18000080 -.long 0xD3D9401C, 0x18000080 -.long 0xD3D9401D, 0x18000080 -.long 0xD3D9401E, 0x18000080 -.long 0xD3D9401F, 0x18000080 -.long 0xC0060700, 0x00000000 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xC00A0800, 0x00000018 -.long 0xD1130001, 0x00013F65 -.long 0xD2850060, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C0C101 -.long 0x2002CA85 
-.long 0x68C0C101 -.long 0x24C0C082 -.long 0x68C0C080 -.long 0x68C2C0FF, 0x00002100 -.long 0xBF8A0000 -.long 0xD1130001, 0x00013F65 -.long 0xD2850062, 0x000202A0 -.long 0x20020281 -.long 0xD2850001, 0x00020282 -.long 0x68C4C501 -.long 0x2002CA85 -.long 0x68C4C501 -.long 0x24C4C482 -.long 0x9254FF52, 0x00001080 -.long 0x68C4C454 -.long 0x68C4C4FF, 0x00004200 -.long 0x68C6C4FF, 0x00004200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603FF, 0x00000080 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x80145414 -.long 0x82155515 -.long 0x24C8CC86 -.long 0x68C8C965 -.long 0xD2850004, 0x0002CCA0 -.long 0xD2850003, 0x00004D04 -.long 0x2608C89F -.long 0xD2850005, 0x00004D04 -.long 0x2608C8BF -.long 0x200C0885 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0068, 0x020AD76A -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 
0xD86C10E8, 0x3B000060 -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF06802E -.long 0xBF85053B -.long 0xBF8CC47F -.long 0xD3C40000, 0x04020120 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD3C40000, 0x04020322 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xD86C0060, 0x0C000062 -.long 0xD3C40010, 0x04420323 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xBF8CCB7F -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xBF8CC97F -.long 0xD3C40000, 0x04020D2C -.long 0xD3C40010, 0x04420D2D -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40000, 0x04021938 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 
0xD86C10A8, 0x4B000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40010, 0x04421D3D -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C40010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CCD7F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xBF8CCB7F -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xBF8CC97F -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 
0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40000, 0x04023352 -.long 0xD3C40010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C40000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C40010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C40000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C40010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C40000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C40010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD3C40000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C40010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD3C40000, 0x04023F5E -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD86C0038, 0x07000062 -.long 0xD3C40010, 0x04423F5F -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020120 -.long 0xD86C0060, 0x0C000062 -.long 0xD86C0068, 
0x0D000062 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C40000, 0x04020322 -.long 0xD3C40010, 0x04420323 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xD3C40000, 0x04020D2C -.long 0xD3C40010, 0x04420D2D -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C40000, 0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C40010, 0x04421D3D -.long 
0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD3C40010, 0x04422141 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8A0000 -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40000, 0x04023352 -.long 0xD3C40010, 0x04423353 -.long 0xBF8F0000 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40010, 0x04423555 -.long 0xBF8A0000 -.long 0xD86C0000, 0x20000060 -.long 0xD86C1080, 0x21000060 -.long 0xD86C0008, 0x22000060 -.long 0xD86C1088, 0x23000060 -.long 0xD3C40000, 0x04023756 -.long 0xD86C0010, 0x24000060 -.long 0xD86C1090, 0x25000060 -.long 0xD86C0018, 0x26000060 -.long 0xD86C1098, 0x27000060 -.long 0xD3C40010, 0x04423757 -.long 0xD86C0020, 0x28000060 -.long 0xD86C10A0, 0x29000060 -.long 0xD86C0028, 0x2A000060 -.long 0xD86C10A8, 0x2B000060 -.long 0xD3C40000, 0x04023958 -.long 0xD86C0030, 0x2C000060 -.long 0xD86C10B0, 0x2D000060 -.long 0xD86C0038, 0x2E000060 -.long 0xD86C10B8, 0x2F000060 -.long 0xD3C40010, 0x04423959 -.long 0xD86C0040, 0x30000060 -.long 
0xD86C10C0, 0x31000060 -.long 0xD86C0048, 0x32000060 -.long 0xD86C10C8, 0x33000060 -.long 0xD3C40000, 0x04023B5A -.long 0xD86C0050, 0x34000060 -.long 0xD86C10D0, 0x35000060 -.long 0xD86C0058, 0x36000060 -.long 0xD86C10D8, 0x37000060 -.long 0xD3C40010, 0x04423B5B -.long 0xD86C0060, 0x38000060 -.long 0xD86C10E0, 0x39000060 -.long 0xD86C0068, 0x3A000060 -.long 0xD86C10E8, 0x3B000060 -.long 0xD3C40000, 0x04023D5C -.long 0xD86C0070, 0x3C000060 -.long 0xD86C10F0, 0x3D000060 -.long 0xD86C0078, 0x3E000060 -.long 0xD86C10F8, 0x3F000060 -.long 0xD3C40010, 0x04423D5D -.long 0xBF8A0000 -.long 0xD86C0000, 0x00000062 -.long 0xD86C0008, 0x01000062 -.long 0xD3C40000, 0x04023F5E -.long 0xD86C0010, 0x02000062 -.long 0xD86C0018, 0x03000062 -.long 0xD86C0020, 0x04000062 -.long 0xD86C0028, 0x05000062 -.long 0xD86C0030, 0x06000062 -.long 0xD3C40010, 0x04423F5F -.long 0xD86C0038, 0x07000062 -.long 0xD86C0040, 0x08000062 -.long 0xD86C0048, 0x09000062 -.long 0xD86C0050, 0x0A000062 -.long 0xD86C0058, 0x0B000062 -.long 0xBF8F0001 -.long 0x802E812E -.long 0xBF00C22E -.long 0xBF84FEAC -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020120 -.long 0xD3C40010, 0x04420121 -.long 0xD86C0060, 0x0C000062 -.long 0xD86C0068, 0x0D000062 -.long 0xD86C0070, 0x0E000062 -.long 0xD86C0078, 0x0F000062 -.long 0xD3C40000, 0x04020322 -.long 0xD3C40010, 0x04420323 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04020524 -.long 0xD3C40010, 0x04420525 -.long 0xD3C40000, 0x04020726 -.long 0xD3C40010, 0x04420727 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04020928 -.long 0xD3C40010, 0x04420929 -.long 0xD3C40000, 0x04020B2A -.long 0xD3C40010, 0x04420B2B -.long 0xBF8CC87F -.long 0xD3C40000, 0x04020D2C -.long 0xD3C40010, 0x04420D2D -.long 0xD3C40000, 0x04020F2E -.long 0xD3C40010, 0x04420F2F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04021130 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1000, 0x80042068 -.long 0xE05C1020, 0x80042468 -.long 0xE05C1040, 0x80042868 -.long 0xE05C1060, 0x80042C68 -.long 0xD3C40010, 0x04421131 -.long 0xD3C40000, 
0x04021332 -.long 0xD3C40010, 0x04421333 -.long 0xD3C40000, 0x04021534 -.long 0xD3C40010, 0x04421535 -.long 0xBF8A0000 -.long 0xD86C0000, 0x40000061 -.long 0xD86C1080, 0x41000061 -.long 0xD86C0008, 0x42000061 -.long 0xD86C1088, 0x43000061 -.long 0xD3C40000, 0x04021736 -.long 0xD86C0010, 0x44000061 -.long 0xD86C1090, 0x45000061 -.long 0xD86C0018, 0x46000061 -.long 0xD86C1098, 0x47000061 -.long 0xD3C40010, 0x04421737 -.long 0xD86C0020, 0x48000061 -.long 0xD86C10A0, 0x49000061 -.long 0xD86C0028, 0x4A000061 -.long 0xD86C10A8, 0x4B000061 -.long 0xD3C40000, 0x04021938 -.long 0xD86C0030, 0x4C000061 -.long 0xD86C10B0, 0x4D000061 -.long 0xD86C0038, 0x4E000061 -.long 0xD86C10B8, 0x4F000061 -.long 0xD3C40010, 0x04421939 -.long 0xD86C0040, 0x50000061 -.long 0xD86C10C0, 0x51000061 -.long 0xD86C0048, 0x52000061 -.long 0xD86C10C8, 0x53000061 -.long 0xD3C40000, 0x04021B3A -.long 0xD86C0050, 0x54000061 -.long 0xD86C10D0, 0x55000061 -.long 0xD86C0058, 0x56000061 -.long 0xD86C10D8, 0x57000061 -.long 0xD3C40010, 0x04421B3B -.long 0xD86C0060, 0x58000061 -.long 0xD86C10E0, 0x59000061 -.long 0xD86C0068, 0x5A000061 -.long 0xD86C10E8, 0x5B000061 -.long 0xD3C40000, 0x04021D3C -.long 0xD86C0070, 0x5C000061 -.long 0xD86C10F0, 0x5D000061 -.long 0xD86C0078, 0x5E000061 -.long 0xD86C10F8, 0x5F000061 -.long 0xD3C40010, 0x04421D3D -.long 0xBF8A0000 -.long 0xD86C0000, 0x10000063 -.long 0xD86C0008, 0x11000063 -.long 0xD3C40000, 0x04021F3E -.long 0xD86C0010, 0x12000063 -.long 0xD86C0018, 0x13000063 -.long 0xD86C0020, 0x14000063 -.long 0xD86C0028, 0x15000063 -.long 0xD86C0030, 0x16000063 -.long 0xD3C40010, 0x04421F3F -.long 0xD86C0038, 0x17000063 -.long 0xD86C0040, 0x18000063 -.long 0xD86C0048, 0x19000063 -.long 0xD86C0050, 0x1A000063 -.long 0xD86C0058, 0x1B000063 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022140 -.long 0xBF068029 -.long 0xBF850008 -.long 0xE05C1080, 0x80043068 -.long 0xE05C10A0, 0x80043468 -.long 0xE05C10C0, 0x80043868 -.long 0xE05C10E0, 0x80043C68 -.long 0xD3C40010, 0x04422141 -.long 
0xD86C0060, 0x1C000063 -.long 0xD86C0068, 0x1D000063 -.long 0xD86C0070, 0x1E000063 -.long 0xD86C0078, 0x1F000063 -.long 0xD3C40000, 0x04022342 -.long 0xD3C40010, 0x04422343 -.long 0xBF8CCC7F -.long 0xD3C40000, 0x04022544 -.long 0xD3C40010, 0x04422545 -.long 0xD3C40000, 0x04022746 -.long 0xD3C40010, 0x04422747 -.long 0xBF8CCA7F -.long 0xD3C40000, 0x04022948 -.long 0xD3C40010, 0x04422949 -.long 0xD3C40000, 0x04022B4A -.long 0xD3C40010, 0x04422B4B -.long 0xBF8CC87F -.long 0xD3C40000, 0x04022D4C -.long 0xD3C40010, 0x04422D4D -.long 0xD3C40000, 0x04022F4E -.long 0xD3C40010, 0x04422F4F -.long 0xBF8CC07F -.long 0xD3C40000, 0x04023150 -.long 0xD3C40000, 0x04023352 -.long 0xD3C40000, 0x04023554 -.long 0xD3C40000, 0x04023756 -.long 0xD3C40000, 0x04023958 -.long 0xD3C40000, 0x04023B5A -.long 0xD3C40000, 0x04023D5C -.long 0xD3C40000, 0x04023F5E -.long 0xBF068029 -.long 0xBF8400A3 -.long 0xD3C40010, 0x04423151 -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3C40010, 0x04423353 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 0xD3D84007, 0x18000107 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1000, 0x80050068 -.long 0xD3C40010, 0x04423555 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C1020, 0x80050468 -.long 0xD3C40010, 0x04423757 -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D -.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C1040, 0x80050868 -.long 0xD3C40010, 0x04423959 -.long 0xD105000C, 0x0000510C -.long 
0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C1060, 0x80050C68 -.long 0xD3C40010, 0x04423B5B -.long 0xD3C40010, 0x04423D5D -.long 0xD3C40010, 0x04423F5F -.long 0xBF800003 -.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xE07C1080, 0x80050068 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xE07C10A0, 0x80050468 -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xE07C10C0, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 -.long 0xD3C40010, 0x04423151 -.long 0xD3C40010, 0x04423353 -.long 0xD3C40010, 0x04423555 -.long 0xD3C40010, 0x04423757 -.long 0xD3C40010, 0x04423959 -.long 0xD3C40010, 0x04423B5B -.long 0xD3C40010, 0x04423D5D -.long 0xD3C40010, 0x04423F5F -.long 0xD3D84000, 0x18000100 -.long 0xD3D84001, 0x18000101 -.long 0xD3D84002, 0x18000102 -.long 0xD3D84003, 0x18000103 -.long 0xD3D84004, 0x18000104 -.long 0xD3D84005, 0x18000105 -.long 0xD3D84006, 0x18000106 -.long 0xD3D84007, 0x18000107 -.long 0xD3D84008, 0x18000108 -.long 0xD3D84009, 0x18000109 -.long 0xD3D8400A, 0x1800010A -.long 0xD3D8400B, 0x1800010B -.long 0xD3D8400C, 0x1800010C -.long 0xD3D8400D, 0x1800010D 
-.long 0xD3D8400E, 0x1800010E -.long 0xD3D8400F, 0x1800010F -.long 0xBF8C0F74 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005320 -.long 0xD1160001, 0x00005321 -.long 0xD1160002, 0x00005322 -.long 0xD1160003, 0x00005323 -.long 0xE07C1000, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005324 -.long 0xD1160005, 0x00005325 -.long 0xD1160006, 0x00005326 -.long 0xD1160007, 0x00005327 -.long 0xE07C1020, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005328 -.long 0xD1160009, 0x00005329 -.long 0xD116000A, 0x0000532A -.long 0xD116000B, 0x0000532B -.long 0xE07C1040, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000532C -.long 0xD116000D, 0x0000532D -.long 0xD116000E, 0x0000532E -.long 0xD116000F, 0x0000532F -.long 0xE07C1060, 0x80050C68 -.long 0xD3D84000, 0x18000110 -.long 0xD3D84001, 0x18000111 -.long 0xD3D84002, 0x18000112 -.long 0xD3D84003, 0x18000113 -.long 0xD3D84004, 0x18000114 -.long 0xD3D84005, 0x18000115 -.long 0xD3D84006, 0x18000116 -.long 0xD3D84007, 0x18000117 -.long 0xD3D84008, 0x18000118 -.long 0xD3D84009, 0x18000119 -.long 0xD3D8400A, 0x1800011A -.long 0xD3D8400B, 0x1800011B -.long 0xD3D8400C, 0x1800011C -.long 0xD3D8400D, 0x1800011D -.long 0xD3D8400E, 0x1800011E -.long 0xD3D8400F, 0x1800011F -.long 0xBF8C0F70 -.long 0xD1050000, 0x00005100 -.long 0xD1050001, 0x00005101 -.long 0xD1050002, 0x00005102 -.long 0xD1050003, 0x00005103 -.long 0xD1160000, 0x00005330 -.long 0xD1160001, 0x00005331 -.long 0xD1160002, 0x00005332 -.long 0xD1160003, 0x00005333 -.long 0xE07C1080, 0x80050068 -.long 0xD1050004, 0x00005104 -.long 0xD1050005, 0x00005105 -.long 
0xD1050006, 0x00005106 -.long 0xD1050007, 0x00005107 -.long 0xD1160004, 0x00005334 -.long 0xD1160005, 0x00005335 -.long 0xD1160006, 0x00005336 -.long 0xD1160007, 0x00005337 -.long 0xE07C10A0, 0x80050468 -.long 0xD1050008, 0x00005108 -.long 0xD1050009, 0x00005109 -.long 0xD105000A, 0x0000510A -.long 0xD105000B, 0x0000510B -.long 0xD1160008, 0x00005338 -.long 0xD1160009, 0x00005339 -.long 0xD116000A, 0x0000533A -.long 0xD116000B, 0x0000533B -.long 0xE07C10C0, 0x80050868 -.long 0xD105000C, 0x0000510C -.long 0xD105000D, 0x0000510D -.long 0xD105000E, 0x0000510E -.long 0xD105000F, 0x0000510F -.long 0xD116000C, 0x0000533C -.long 0xD116000D, 0x0000533D -.long 0xD116000E, 0x0000533E -.long 0xD116000F, 0x0000533F -.long 0xE07C10E0, 0x80050C68 -.long 0xBF8C0000 -.long 0xBF810000 - diff --git a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt index 9b04756c1..1d2d70e2e 100644 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_MDA2_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt @@ -35,12 +35,12 @@ .hsa_code_object_version 2,0 .hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" .text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl 
Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 +.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 +.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 .p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function -.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 
-Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: +.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8,@function +.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 +Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8: .amd_kernel_code_t is_ptr64 = 1 enable_sgpr_kernarg_segment_ptr = 1 @@ -73,8 +73,8 @@ Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_AS .amd_amdgpu_hsa_metadata Version: [ 1, 0 ] Kernels: - - Name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - SymbolName: 
'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8@kd' + - Name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8 + SymbolName: 'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG32_16_1_WGM8@kd' Language: OpenCL C LanguageVersion: [ 2, 0 ] Args: diff --git a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt b/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt deleted file mode 100644 index 1b49f2db6..000000000 --- a/Tensile/ReplacementKernels/Cijk_Alik_Bljk_SB_MT64x32x32_AF0EM8_ASEM8_FL0_GRVW2_ISA908_PGR1_PLR1_SU32_TT2_2_VAW1_VW2_WG32_16_1_WGM8.s.txt +++ /dev/null @@ -1,933 +0,0 @@ -/***********************************************************************************/ - /* */ - /* Copyright 2020-2021 Advanced Micro Devices, Inc. 
*/ - /* */ - /* Permission is hereby granted, free of charge, to any person obtaining a copy */ - /* of this software and associated documentation files (the "Software"), to deal */ - /* in the Software without restriction, including without limitation the rights */ - /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */ - /* copies of the Software, and to permit persons to whom the Software is */ - /* furnished to do so, subject to the following conditions: */ - /* */ - /* The above copyright notice and this permission notice shall be included in */ - /* all copies or substantial portions of the Software. */ - /* */ - /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */ - /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */ - /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */ - /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ - /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */ - /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */ - /* SOFTWARE. 
*/ - /* */ - /**********************************************************************************/ - -/******************************************/ -/* Function Prefix */ -/******************************************/ - - - -/******************************************/ -/* Begin Kernel */ -/******************************************/ - -.hsa_code_object_version 2,0 -.hsa_code_object_isa 9, 0, 8, "AMD", "AMDGPU" -.text -.protected Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.globl Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 -.p2align 8 -.type Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8,@function -.amdgpu_hsa_kernel Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 
-Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8: -.amd_kernel_code_t - is_ptr64 = 1 - enable_sgpr_kernarg_segment_ptr = 1 - kernarg_segment_byte_size = 148 // bytes of kern args - workitem_vgpr_count = 108 // vgprs - wavefront_sgpr_count = 98 // sgprs - compute_pgm_rsrc1_vgprs = 26 // floor((108-1)/4) - compute_pgm_rsrc1_sgprs = 12 // floor((98-1)/8) - compute_pgm_rsrc2_tidig_comp_cnt = 0 // 1D wg - compute_pgm_rsrc2_tgid_x_en = 1 // wg.x - compute_pgm_rsrc2_tgid_y_en = 1 // wg.y - compute_pgm_rsrc2_tgid_z_en = 1 // wg.z - workgroup_group_segment_byte_size = 30000// lds bytes - compute_pgm_rsrc2_user_sgpr = 2 // vcc - kernarg_segment_alignment = 4 - group_segment_alignment = 4 - private_segment_alignment = 4 -.end_amd_kernel_code_t - -/******************************************/ -/* Optimizations and Config: */ -/******************************************/ -/* ThreadTile= 4 x 4 */ -/* SubGroup= 16 x 32 */ -/* VectorWidth=4 */ -/* GlobalLoadVectorWidthA=4, GlobalLoadVectorWidthB=4 */ -/* DirectToLdsA=False */ -/* DirectToLdsB=False */ -/* UseSgprForGRO=1 */ -.amd_amdgpu_hsa_metadata -Version: [ 1, 0 ] -Kernels: - - Name: Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8 - SymbolName: 
'Cijk_Alik_Bljk_SB_MT64x32x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW2_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW1_VSn1_VW2_WSGRA0_WSGRB0_WG32_16_1_WGM8@kd' - Language: OpenCL C - LanguageVersion: [ 2, 0 ] - Args: - - Name: sizeC - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeA - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: sizeB - Size: 8 - Align: 8 - ValueKind: ByValue - ValueType: I64 - - Name: D - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: C - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: A - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: B - Size: 8 - Align: 8 - ValueKind: GlobalBuffer - ValueType: F32 - AddrSpaceQual: Generic - - Name: alpha - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: beta - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: F32 - - Name: strideD0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideD1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideC1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideA1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideB0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: strideB1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: SizesFree2 - Size: 4 - Align: 4 - ValueKind: 
ByValue - ValueType: U32 - - Name: SizesSum0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: OrigStaggerUIter - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: I32 - - Name: NumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumWorkGroups1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberProblemNumGroupTiles0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: GridNumWorkGroups0 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: NumFullBlocks - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: WgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: MagicNumberWgmRemainder1 - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - - Name: padding - Size: 4 - Align: 4 - ValueKind: ByValue - ValueType: U32 - CodeProps: - KernargSegmentSize: 148 - GroupSegmentFixedSize: 28672 - PrivateSegmentFixedSize: 0 - KernargSegmentAlign: 8 - WavefrontSize: 64 - NumSGPRs: 98 - NumVGPRs: 108 - MaxFlatWorkGroupSize: 512 -.end_amd_amdgpu_hsa_metadata - -/******************************************/ -/* Asm syntax workarounds */ -/******************************************/ -.macro _v_add_co_u32 dst, cc, src0, src1, dpp= - v_add_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_add_u32 dst, src0, src1, dpp= - v_add_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_sub_co_u32 dst, cc, src0, src1, dpp= - v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp -.endm - -.macro _v_sub_u32 dst, src0, src1, dpp= - v_sub_u32 \dst, \src0, \src1 \dpp -.endm - -.macro _v_addc_co_u32 dst, ccOut, src0, ccIn, src1, dpp= - v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp -.endm - -.macro _v_add_lshl_u32 dst, src0, src1, shiftCnt - v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt -.endm - -.macro _v_lshl_add_u32 dst, src0, src1, shiftCnt - v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt -.endm - -/******************************************/ -/* 
Magic div and mod functions */ -/******************************************/ -.macro V_MAGIC_DIV dstIdx, dividend, magicNumber, magicShift - v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber - v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicNumber - v_lshrrev_b64 v[\dstIdx:\dstIdx+1], \magicShift, v[\dstIdx:\dstIdx+1] -.endm - -/******************************************/ -/* VGPR Assignments */ -/******************************************/ -.set vgprValuC, 0 -/* ValuA/B Xn=PLR buffer idx, In=InnerUnroll idx */ -.set vgprValuA_X0_I0, 32 -.set vgprValuA_X1_I0, 34 -.set vgprG2LA, 36 -.set vgprValuB_X0_I0, 40 -.set vgprValuB_X1_I0, 44 -.set vgprG2LB, 48 -.set vgprLocalWriteAddrA, 56 -.set vgprLocalWriteAddrB, 57 -.set vgprGlobalReadOffsetA, 58 -.set vgprGlobalReadOffsetB, 59 -.set vgprLocalReadAddrA, 60 -.set vgprLocalReadAddrB, 61 -.set vgprSerial, 62 -/* Num VGPR=63 */ - -/******************************************/ -/* SGPR Assignments */ -/******************************************/ -.set sgprKernArgAddress, 0 -.set sgprWorkGroup0, 2 -.set sgprWorkGroup1, 3 -.set sgprWorkGroup2, 4 -.set sgprNumWorkGroups0, 5 -.set sgprNumWorkGroups1, 6 -.set sgprSrdA, 8 -.set sgprSrdB, 12 -.set sgprSrdD, 16 -.set sgprSrdC, 20 -.set sgprTensor2dSizeC, 24 -.set sgprTensor2dSizeA, 26 -.set sgprTensor2dSizeB, 28 -.set sgprSaveExecMask, 30 -.set sgprAddressD, 32 -.set sgprAddressC, 34 -.set sgprStridesD, 36 -.set sgprStridesC, 38 -.set sgprAlpha, 40 -.set sgprBeta, 41 -.set sgprSizesFree, 42 -.set sgprSizesSum, 45 -.set sgprLoopCounters, 46 -.set sgprOrigLoopCounter, 47 -.set sgprStridesA, 48 -.set sgprStridesB, 50 -.set sgprAddressA, 52 -.set sgprAddressB, 54 -.set sgprShadowLimitA, 56 -.set sgprShadowLimitB, 58 -.set sgprOrigStaggerUIter, 60 -.set sgprStaggerUIter, 61 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 -.set sgprNumFullBlocks, 66 -.set sgprWgmRemainder1, 67 -.set sgprMagicNumberWgmRemainder1, 68 -.set sgprGlobalReadIncsA, 69 -.set sgprGlobalReadIncsB, 70 -.set 
sgprScalarGlobalReadOffsetA, 71 -.set sgprScalarGlobalReadOffsetB, 72 -/* max SGPR=98 */ - -/* Size Assignments */ -.set sgprSizeD0I, sgprSizesFree+0 -.set sgprSizeD1J, sgprSizesFree+1 -.set sgprSizeDK, sgprSizesFree+2 -.set sgprSizeC0I, sgprSizesFree+0 -.set sgprSizeC1J, sgprSizesFree+1 -.set sgprSizeCK, sgprSizesFree+2 -.set sgprSizeAL, sgprSizesSum+0 -.set sgprSizeA0I, sgprSizesFree+0 -.set sgprSizeAK, sgprSizesFree+2 -.set sgprSizeBL, sgprSizesSum+0 -.set sgprSizeB1J, sgprSizesFree+1 -.set sgprSizeBK, sgprSizesFree+2 - -/* Stride Assignments */ -.set constStrideD0I, 1 -.set sgprStrideD1J, sgprStridesD+0 -.set sgprStrideDK, sgprStridesD+1 -.set constStrideC0I, 1 -.set sgprStrideC1J, sgprStridesC+0 -.set sgprStrideCK, sgprStridesC+1 -.set constStrideAL, 1 -.set sgprStrideA0I, sgprStridesA+0 -.set sgprStrideAK, sgprStridesA+1 -.set constStrideBL, 1 -.set sgprStrideB1J, sgprStridesB+0 -.set sgprStrideBK, sgprStridesB+1 - -.set DepthU, 32 -/* Number of elements to shift-left SRD */ -.set SrdShiftLeftA, 4 -.set SrdShiftLeftB, 4 -/* 2GB limit - set offsets to -1 to exceed this and clamp */ -.set BufferLimit, 0x80000000 -/* Bits 127:96 of SRD. 
Set DataFormat = 32 bit */ -.set Srd127_96, 0x0020000 -.set BufferOOB, 0x80000000 - -/* Global Offset A */ -.macro GLOBAL_OFFSET_A vgprAddr vgprOffsetL vgprOffset0I vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprOffset0I] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/* Global Offset B */ -.macro GLOBAL_OFFSET_B vgprAddr vgprOffsetL vgprOffset1J vgprTmp -v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower -_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate d1 lower -_v_add_u32 v[\vgprAddr+0], 0x4, v[\vgprAddr+0] // add prepad for pointer shift -v_lshlrev_b32 v[\vgprAddr+0], 0x1, v[\vgprAddr+0] // offset *= bytes/element -.endm - -/******************************************/ -/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */ -/******************************************/ -.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp -v_cvt_f32_u32 v[\vQuotient], v[\vDivisor] // -v_rcp_f32 v[\vQuotient], v[\vQuotient] // -v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // -v_cvt_u32_f32 v[\vQuotient], v[\vQuotient] // -v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // -v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder] // -v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0] // -v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // -_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // -v_mul_hi_u32 v[\vQuotient], 
v[\vQuotient], v[\vDividend] // -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // -v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // -_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // -_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient] // -v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0] // -s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc // -v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // -v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // -v_cmp_ne_i32 vcc, 0x0, v[\vDivisor] // -v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result -v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // -_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result -.endm - -/******************************************/ -/* 4x8 thread-tile */ -/******************************************/ -.macro MAC_4x8_X0 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] 
-v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 
v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+0], v[vgprValuB_X0_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 
v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X0_I0+1], v[vgprValuB_X0_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm -.macro MAC_4x8_X1 -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[0] iui=0 -s_setprio 1 // Raise priority while processing macs -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[1] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[4] -v_fma_mix_f32 v[vgprValuC+0*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+0], v[vgprValuC+0*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[5] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[2] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[3] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[6] -v_fma_mix_f32 v[vgprValuC+1*2+0*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+0], v[vgprValuC+1*2+0*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[7] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[8] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], 
v[vgprValuC+0*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[9] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[12] -v_fma_mix_f32 v[vgprValuC+0*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+1], v[vgprValuC+0*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[13] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[10] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[11] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[14] -v_fma_mix_f32 v[vgprValuC+1*2+1*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+1], v[vgprValuC+1*2+1*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[15] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[16] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[17] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[20] -v_fma_mix_f32 v[vgprValuC+0*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+2], v[vgprValuC+0*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[21] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[18] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], 
v[vgprValuC+1*2+2*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[19] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[22] -v_fma_mix_f32 v[vgprValuC+1*2+2*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+2], v[vgprValuC+1*2+2*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[23] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[24] iui=0 -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[25] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[28] -v_fma_mix_f32 v[vgprValuC+0*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+0], v[vgprValuB_X1_I0+3], v[vgprValuC+0*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[29] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+0] op_sel:[0,0,0] op_sel_hi:[1,1,0] //ValuC[26] iui=0 -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+0*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+0*2+1] op_sel:[1,0,0] op_sel_hi:[1,1,0] //ValuC[27] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+0], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+0] op_sel:[0,1,0] op_sel_hi:[1,1,0] //ValuC[30] -v_fma_mix_f32 v[vgprValuC+1*2+3*4*2+2*2+1], v[vgprValuA_X1_I0+1], v[vgprValuB_X1_I0+3], v[vgprValuC+1*2+3*4*2+2*2+1] op_sel:[1,1,0] op_sel_hi:[1,1,0] //valuC[31] -s_setprio 0 // Reset priority after macs -.endm - - - - -/***** program start from here *****/ - -.long 0xC00A0600, 0x00000008 -.long 0xC00A0D00, 0x00000028 -.long 0xC00A0C00, 0x00000050 -.long 0xC0020B40, 0x0000006C -.long 0x7ECC0300 -.long 0x26CE00BF -.long 0xB8D0F804 -.long 
0xD1130004, 0x0000A0B0 -.long 0x20D00884 -.long 0x7EA40568 -.long 0xD1130069, 0x0000A08F -.long 0x7EA20569 -.long 0xBF068151 -.long 0xBF8400EA -.long 0xBF8CC07F -.long 0xBE880034 -.long 0xBE890035 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE8B00FF, 0x00020000 -.long 0x96553104 -.long 0x92543104 -.long 0x8ED48254 -.long 0x80085408 -.long 0x82095509 -.long 0x9254C030 -.long 0x92545402 -.long 0x92559030 -.long 0x92555552 -.long 0x81545554 -.long 0x2000CE85 -.long 0xD2850004, 0x00020030 -.long 0x2602CE9F -.long 0x32180304 -.long 0x68181854 -.long 0x24181882 -.long 0x8E478330 -.long 0x80C7FF47, 0x00000120 -.long 0x681A1847 -.long 0x681C1A47 -.long 0x681E1C47 -.long 0x68201E47 -.long 0x68222047 -.long 0x68242247 -.long 0x68262447 -.long 0xBECC00FF, 0x00000900 -.long 0x924C4C52 -.long 0xBE8C0036 -.long 0xBE8D0037 -.long 0xBE8E00FF, 0x80000000 -.long 0xBE8F00FF, 0x00020000 -.long 0x96553304 -.long 0x92543304 -.long 0x8ED48254 -.long 0x800C540C -.long 0x820D550D -.long 0x9254A032 -.long 0x92545403 -.long 0x92558832 -.long 0x92555552 -.long 0x81545554 -.long 0x2004CE85 -.long 0xD2850004, 0x00020432 -.long 0x2606CE9F -.long 0x32280704 -.long 0x68282854 -.long 0x24282882 -.long 0x8E4A8332 -.long 0x80CAFF4A, 0x00000120 -.long 0x682A284A -.long 0x682C2A4A -.long 0x682E2C4A -.long 0xBECE00FF, 0x00000480 -.long 0x924E4E52 -.long 0x814EFF4E, 0x00004800 -.long 0xBF8A0000 -.long 0xBEFC004E -.long 0x814FFF4E, 0x00001200 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0x814DFF4C, 0x00002400 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBEFC004F -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 
0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF85004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004C -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004F -.long 0xBF8A0000 -.long 0xE0511000, 0x80030814 -.long 0xE0511120, 0x80030915 -.long 0xE0511240, 0x80030A16 -.long 0xE0511360, 0x80030B17 -.long 0xBEFC004D -.long 0xBF800000 -.long 0xE0511000, 0x8002000C -.long 0xE0511120, 0x8002010D -.long 0xE0511240, 0x8002020E -.long 0xE0511360, 0x8002030F -.long 0xE0511480, 0x80020410 -.long 0xE05115A0, 0x80020511 -.long 0xE05116C0, 0x80020612 -.long 0xE05117E0, 0x80020713 -.long 0xBF8C4F74 -.long 0xBF8A0000 -.long 0x800CFF0C, 0x00000080 -.long 0x820D800D -.long 0xBF8C0F7C -.long 0xBF8A0000 -.long 0x8008FF08, 0x00000080 -.long 0x82098009 -.long 0xBEFC004E -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FFB1 -.long 0xBF8C0F78 -.long 0xBF8A0000 -.long 0xBF8C0F70 -.long 0xBF8A0000 -.long 0xBF810000 -.long 0xC0060700, 0x00000000 -.long 0xC00A0800, 
0x00000018 -.long 0xC00A0A00, 0x00000038 -.long 0xC00A0900, 0x00000040 -.long 0xD3D94000, 0x18000080 -.long 0xD3D94001, 0x18000080 -.long 0xD3D94002, 0x18000080 -.long 0xD3D94003, 0x18000080 -.long 0xD3D94004, 0x18000080 -.long 0xD3D94005, 0x18000080 -.long 0xD3D94006, 0x18000080 -.long 0xD3D94007, 0x18000080 -.long 0xD1130001, 0x00011F67 -.long 0xD2850040, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68808101 -.long 0x24808082 -.long 0x68808102 -.long 0x9254FF52, 0x00000900 -.long 0x68808054 -.long 0x68808080 -.long 0x688280FF, 0x00002400 -.long 0xBF8A0000 -.long 0xD1130001, 0x00011F67 -.long 0xD2850042, 0x000202A0 -.long 0x20040281 -.long 0xD2850002, 0x000204A0 -.long 0x2002CE84 -.long 0x24020282 -.long 0x68848501 -.long 0x24848482 -.long 0x68848502 -.long 0x688484FF, 0x00004800 -.long 0x688684FF, 0x00001200 -.long 0xBF8CC07F -.long 0xBE900022 -.long 0xBE910023 -.long 0xBE9200FF, 0x80000000 -.long 0xBE9300FF, 0x00020000 -.long 0x925603A0 -.long 0x96552656 -.long 0x92542656 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0x96552704 -.long 0x92542704 -.long 0x8ED48254 -.long 0x80105410 -.long 0x82115511 -.long 0xD2850003, 0x0002D090 -.long 0x2608CE8F -.long 0xD2850005, 0x00004D04 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0044, 0x020AD76A -.long 0x925426C0 -.long 0x688A8854 -.long 0xBE940020 -.long 0xBE950021 -.long 0xBE9600FF, 0x80000000 -.long 0xBE9700FF, 0x00020000 -.long 0x925603A0 -.long 0x96552456 -.long 0x92542456 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0x96552504 -.long 0x92542504 -.long 0x8ED48254 -.long 0x80145414 -.long 0x82155515 -.long 0xD2850003, 0x0002D090 -.long 0x2608CE8F -.long 0xD2850005, 0x00004904 -.long 0x200CCE84 -.long 0x240C0C82 -.long 0x68D60B03 -.long 0x925402C0 -.long 0x32D40C54 -.long 0xD1FE0046, 0x020AD76A -.long 0x925426C0 -.long 0x688E8C54 -.long 0xBF8A0000 -.long 0xD9FE0000, 
0x20000042 -.long 0xD9FE0900, 0x28000042 -.long 0xD9FE0040, 0x24000042 -.long 0xD9FE0940, 0x2C000042 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD9FE0040, 0x14000040 -.long 0x8F2E852D -.long 0x80AE2E80 -.long 0xBF03C22E -.long 0xBF850065 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04024110 -.long 0xD3C50004, 0x04125110 -.long 0xD3C50000, 0x04024311 -.long 0xD3C50004, 0x04125311 -.long 0xD3C50000, 0x04024512 -.long 0xBF8A0000 -.long 0xD3C50004, 0x04125512 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C50000, 0x04024713 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C50004, 0x04125713 -.long 0xD9FE0040, 0x34000043 -.long 0xBF8CC37F -.long 0xD3C50000, 0x04024914 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C50004, 0x04125914 -.long 0xD3C50000, 0x04024B15 -.long 0xD3C50004, 0x04125B15 -.long 0xD3C50000, 0x04024D16 -.long 0xD3C50004, 0x04125D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C50000, 0x04024F17 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C50004, 0x04125F17 -.long 0xBF8A0000 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04026118 -.long 0xD3C50004, 0x04127118 -.long 0xD3C50000, 0x04026319 -.long 0xD3C50004, 0x04127319 -.long 0xD3C50000, 0x0402651A -.long 0xBF8A0000 -.long 0xD3C50004, 0x0412751A -.long 0xD9FE0000, 0x20000042 -.long 0xD3C50000, 0x0402671B -.long 0xD9FE0900, 0x28000042 -.long 0xD3C50004, 0x0412771B -.long 0xD9FE0040, 0x24000042 -.long 0xBF8CC37F -.long 0xD3C50000, 0x0402691C -.long 0xD9FE0940, 0x2C000042 -.long 0xD3C50004, 0x0412791C -.long 0xD3C50000, 0x04026B1D -.long 0xD3C50004, 0x04127B1D -.long 0xD3C50000, 0x04026D1E -.long 0xD3C50004, 0x04127D1E -.long 0xBF8A0000 -.long 0xD9FE0000, 0x10000040 -.long 0xD3C50000, 0x04026F1F -.long 0xD9FE0040, 0x14000040 -.long 0xD3C50004, 0x04127F1F -.long 0x802E822E -.long 0xBF03C22E -.long 0xBF84FF9B -.long 0xBF8CC17F -.long 0xD3C50000, 0x04024110 -.long 0xE05C1000, 0x80040844 -.long 0xE05C1000, 0x80040C45 -.long 0xD3C50004, 0x04125110 -.long 0xD3C50000, 0x04024311 -.long 0xBF8A0000 -.long 
0xD3C50004, 0x04125311 -.long 0xD9FE0000, 0x30000043 -.long 0xD3C50000, 0x04024512 -.long 0xD9FE0900, 0x38000043 -.long 0xD3C50004, 0x04125512 -.long 0xD9FE0040, 0x34000043 -.long 0xD3C50000, 0x04024713 -.long 0xD9FE0940, 0x3C000043 -.long 0xD3C50004, 0x04125713 -.long 0xBF8CC37F -.long 0xD3C50000, 0x04024914 -.long 0xD3C50004, 0x04125914 -.long 0xD3C50000, 0x04024B15 -.long 0xD3C50004, 0x04125B15 -.long 0xD3C50000, 0x04024D16 -.long 0xBF8A0000 -.long 0xD9FE0000, 0x18000041 -.long 0xD3C50004, 0x04125D16 -.long 0xD9FE0040, 0x1C000041 -.long 0xD3C50000, 0x04024F17 -.long 0xD3C50004, 0x04125F17 -.long 0xBF8CC17F -.long 0xD3C50000, 0x04026118 -.long 0xD3C50004, 0x04127118 -.long 0xD3C50000, 0x04026319 -.long 0xD3C50004, 0x04127319 -.long 0xD3C50000, 0x0402651A -.long 0xD3C50004, 0x0412751A -.long 0xD3C50000, 0x0402671B -.long 0xD3C50004, 0x0412771B -.long 0xBF8CC07F -.long 0xD3C50000, 0x0402691C -.long 0xD3C50000, 0x04026B1D -.long 0xD3C50000, 0x04026D1E -.long 0xD3C50000, 0x04026F1F -.long 0xD3C50004, 0x0412791C -.long 0xD3C50004, 0x04127B1D -.long 0xD3C50004, 0x04127D1E -.long 0xD3C50004, 0x04127F1F -.long 0xD3D84000, 0x18000100 -.long 0x0A000028 -.long 0xD3D84001, 0x18000101 -.long 0x0A020228 -.long 0xD3D84002, 0x18000102 -.long 0x0A040428 -.long 0xD3D84003, 0x18000103 -.long 0x0A060628 -.long 0xBF8C0F71 -.long 0xD1CB0000, 0x04005308 -.long 0xD1CB0001, 0x04045309 -.long 0xD1CB0002, 0x0408530A -.long 0xD1CB0003, 0x040C530B -.long 0xE07C1000, 0x80050046 -.long 0xD3D84004, 0x18000104 -.long 0x0A080828 -.long 0xD3D84005, 0x18000105 -.long 0x0A0A0A28 -.long 0xD3D84006, 0x18000106 -.long 0x0A0C0C28 -.long 0xD3D84007, 0x18000107 -.long 0x0A0E0E28 -.long 0xBF8C0F71 -.long 0xD1CB0004, 0x0410530C -.long 0xD1CB0005, 0x0414530D -.long 0xD1CB0006, 0x0418530E -.long 0xD1CB0007, 0x041C530F -.long 0xE07C1000, 0x80050447 -.long 0xBF8C0000 -.long 0xBF810000 diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 8c39edb5d..e44a256cc 100644 --- 
a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -231,6 +231,12 @@ def initForwardConvolution(self, problemTypeOut, config, \ nidx = i ; i+=1 kidx = i ; i+=1 sumIdx = i + elif formatD in ("CHWN", "CDHWN"): + i = 0 + nidx = i ; i+=1 + sidx = i ; i+=len(sdims) + kidx = i ; i+=1 + sumIdx = i else: raise RuntimeError ("unknown formatD '%s'"%formatD) @@ -263,6 +269,8 @@ def initForwardConvolution(self, problemTypeOut, config, \ self.registerA( [RegDim(nidx,Fbs.Batch,ndim)] + self.spatialRegDims + self.filterRegDims + chinRegDim ) elif formatA in ("CNHW", "CNDHW"): self.registerA( chinRegDim + [RegDim(nidx,Fbs.Batch,ndim)] + self.spatialRegDims + self.filterRegDims ) + elif formatA in ("CHWN", "CDHWN"): + self.registerA( chinRegDim + self.spatialRegDims + self.filterRegDims + [RegDim(nidx,Fbs.Batch,ndim)] ) else: raise RuntimeError ("unknown formatA '%s'"%formatA) @@ -291,6 +299,9 @@ def initForwardConvolution(self, problemTypeOut, config, \ elif nonFilterDims[-1].dim==cdim: setStride = True cdim.strideA = 1 + elif nonFilterDims[-1].dim==ndim: + setStride = True + ndim.strideA = 1 if self.filterRegDims and self.regDimsA[-1].dim == self.filterRegDims[-1].dim: if self.filterRegDims[-1].dim.shortChar in self.ValidLowestFilterDim: @@ -334,10 +345,11 @@ def makeZeroPadConvProblemType(self, padStart, padEnd): rv.append([anchorIdx, sumIdx, -1, -1]) return rv - def makeZeroPadProblemType(self, zps, padStart, padEnd, cc): + def makeZeroPadProblemType(self, zps, padStart, padEnd, c, cc): """ Convert padStart/padEnd into the format expected by ProblemType ZeroPad* """ rv = [] - ss = 1 + ss = c if self.regDimsA[0].dim.shortChar == 'C' else 1 + for (i,zp) in enumerate(zps): (anchorIdx, sumIdx) = zp[:2] rv.append([anchorIdx, sumIdx, padStart[i]*ss, padEnd[i]*ss]) @@ -607,11 +619,13 @@ def makeProblem(self, n, c, k, pcc): #print ("spatialOut=", spatialOut, "padStart=", pcc.padStart, "padEnd=", pcc.padEnd) + cScalar = c if self.regDimsA[0].dim.shortChar == 'C' else 1 + 
for fi,filterValue in enumerate(pcc.fil): try: pos = self.convolutionDims[chr(ord('X')+fi)].idx sizes[pos] = filterValue - astrides[pos] = pcc.dilation[0] if fi==0 else pcc.spatial[fi-1]*pcc.dilation[fi] + astrides[pos] = pcc.dilation[0]*cScalar if fi==0 else pcc.spatial[fi-1]*pcc.dilation[fi]*cScalar except KeyError: None @@ -619,14 +633,13 @@ def makeProblem(self, n, c, k, pcc): spatialName="DHW"[3-self.formatNumSpatialDims:] pos=self.convolutionDims[spatialName].idx sizes[pos] = reduce((lambda x, y: x * y), spatialOut) # product of all spatial dimes - astrides[pos] = pcc.stride[0] + astrides[pos] = pcc.stride[0]*cScalar else: for si,sout in enumerate(spatialOut): spatialChars=['W','H','D'] pos = self.convolutionDims[spatialChars[si]].idx sizes[pos] = sout - - astrides[pos]=pcc.stride[0] if si==0 else pcc.spatial[si-1]*pcc.stride[si] + astrides[pos]=pcc.stride[0]*cScalar if si==0 else pcc.spatial[si-1]*pcc.stride[si]*cScalar assert all(i!=-1 for i in sizes) @@ -1453,7 +1466,7 @@ def __init__(self, e, convolution): (sizes, stridesA, stridesB) = convolution.makeProblem(e['n'], e['c'], e['k'], self.convConfig) zeroPadA = convolution.makeZeroPadProblemType(convolution.problemTypeOut["ZeroPadA"], - self.convConfig.padStart, self.convConfig.padEnd, self.convConfig) + self.convConfig.padStart, self.convConfig.padEnd, e['c'], self.convConfig) Problem.__init__(self, sizes, stridesA, stridesB=stridesB, zeroPadA=zeroPadA, count=e['count']) @@ -1891,9 +1904,9 @@ def assignProblemIndependentDerivedParameters(state): state["SubGroup0"] = state["MIWaveGroup"][0] * state["MatrixInstM"] * state["MatrixInstBM"] // state["MIOutputVectorWidth"] state["SubGroup1"] = state["MIWaveGroup"][1] * state["MatrixInstN"] * state["MatrixInstBN"] else: - state["ThreadTile0"] = state["MatrixInstBM"] * state["MIWaveTile"][0] * (state["MatrixInstM"] * state["MatrixInstN"] // globalParameters["WavefrontWidth"]) + state["ThreadTile0"] = state["MatrixInstBM"] * state["MIWaveTile"][0] * 
(state["MatrixInstM"] * state["MatrixInstN"] // state["WavefrontSize"]) state["ThreadTile1"] = state["MatrixInstBN"] * state["MIWaveTile"][1] - state["SubGroup0"] = state["MIWaveGroup"][0] * (globalParameters["WavefrontWidth"] // state["MatrixInstN"]) + state["SubGroup0"] = state["MIWaveGroup"][0] * (state["WavefrontSize"] // state["MatrixInstN"]) state["SubGroup1"] = state["MIWaveGroup"][1] * state["MatrixInstN"] elif EnableMatrixInstruction == False: @@ -1992,7 +2005,7 @@ def setGlobalLoadVectorWidth(state, tc, totalVectors, grvw): def setGlobalLoadTileDimClassic(state, tc, numLoads, totalVectorsCoalesced, totalElementsPerp): if state["WaveSeparateGlobalRead%s"%tc]: - totalElementsPerp = roundupRatio(totalElementsPerp, state["NumThreads"] // globalParameters["WavefrontWidth"]) + totalElementsPerp = roundupRatio(totalElementsPerp, state["NumThreads"] // state["WavefrontSize"]) # nlc = 1 if state["NumLoadsCoalesced%s"%tc] == 1 : @@ -2059,7 +2072,7 @@ def setGlobalLoadTileDimClassic(state, tc, numLoads, totalVectorsCoalesced, tota state["LSP%s"%tc] = state["MacroTile%s"%tc] // state["NumLoadsPerpendicular%s"%tc] if state["WaveSeparateGlobalRead%s"%tc]: - state["LSP%s"%tc] = roundupRatio(state["LSP%s"%tc], state["NumThreads"] // globalParameters["WavefrontWidth"]) + state["LSP%s"%tc] = roundupRatio(state["LSP%s"%tc], state["NumThreads"] // state["WavefrontSize"]) return True @@ -2251,7 +2264,7 @@ def parameterWrapper(state): state["ThreadTile"][0] = state["MatrixInstruction"][5] state["ThreadTile"][1] = state["MatrixInstruction"][6] * state["MatrixInstruction"][1] state["WorkGroup"][0] = state["MatrixInstruction"][4] * state["MatrixInstruction"][0] * state["MatrixInstruction"][7] - state["WorkGroup"][1] = waves*globalParameters["WavefrontWidth"] // state["WorkGroup"][0] + state["WorkGroup"][1] = waves*state["WavefrontSize"] // state["WorkGroup"][0] #print("9-tuple: ", state["MatrixInstruction"], " TT=", state["ThreadTile"], " WG=", state["WorkGroup"]) if 
state["MatrixInstruction"]: state["MatrixInstruction"] = [state["MatrixInstruction"][0],state["MatrixInstruction"][1],state["MatrixInstruction"][2],state["MatrixInstruction"][3]] @@ -2286,7 +2299,7 @@ def parameterWrapper(state): state["MIBlock"][5] = MIBlock_BN # set MIWaveGroup - numOfWave = (state["WorkGroup"][0] * state["WorkGroup"][1]) // globalParameters["WavefrontWidth"] + numOfWave = (state["WorkGroup"][0] * state["WorkGroup"][1]) // state["WavefrontSize"] state['MIWaveGroup'] = [1, 1] state['MIWaveGroup'][0] = min((miwg0 // state["MatrixInstruction"][0]) // MIBlock_BM, numOfWave) state['MIWaveGroup'][1] = numOfWave // state['MIWaveGroup'][0] @@ -2308,7 +2321,7 @@ def parameterWrapper(state): @staticmethod def checkAndAssignWaveSeparateGlobalRead(state, tc): # check can we use WaveSeparateGlobalRead - numOfWaves = state["NumThreads"] // globalParameters["WavefrontWidth"] + numOfWaves = state["NumThreads"] // state["WavefrontSize"] if state["WaveSeparateGlobalRead%s"%tc]: if state["FractionalLoad"] != 0: reject(state, "didn't support WaveSeparateGlobalRead with FractionalLoad(%u) != 0" % state["FractionalLoad"]) @@ -2334,7 +2347,7 @@ def isDirectToLdsDoable(state, tc): print2("can't use DirectToLds for BF16 with AssertSummationElementMultiple %u" % state["AssertSummationElementMultiple"]) return False - if state["NumThreads"] % globalParameters["WavefrontWidth"] != 0: + if state["NumThreads"] % state["WavefrontSize"] != 0: return False if state["GlobalLoadVectorWidth%c"%tc] * numBytes != 4: @@ -2344,7 +2357,7 @@ def isDirectToLdsDoable(state, tc): return False if state["WaveSeparateGlobalRead%c" % tc]: - if state["LSC%c"%tc] * state["LSP%c"%tc] * numBytes != globalParameters["WavefrontWidth"] * 4: + if state["LSC%c"%tc] * state["LSP%c"%tc] * numBytes != state["WavefrontSize"] * 4: return False else: if state["LSC%c"%tc] * state["LSP%c"%tc] * numBytes != state["NumThreads"] * 4: @@ -2353,13 +2366,13 @@ def isDirectToLdsDoable(state, tc): if 
(state["LdsBlockSizePerPad%c"%tc] == 0) \ and (state["LdsPad%c"%tc] != 0): # and ((state["LSC%c"%tc] * numBytes) != (state["NumThreads"] * 4)): // TODO: -# and ((state["LSC%c"%tc] * numBytes) % (globalParameters["WavefrontWidth"] * 4) != 0): +# and ((state["LSC%c"%tc] * numBytes) % (state["WavefrontSize"] * 4) != 0): return False if (state["LdsBlockSizePerPad%c"%tc] != 0) \ and (state["LdsPad%c"%tc] != 0) \ - and (state["LdsBlockSizePerPad%c"%tc] != globalParameters["WavefrontWidth"] * 4): -# and (state["LdsBlockSizePerPad%tc"] % (globalParameters["WavefrontWidth"] * 4) != 0): // TODO: + and (state["LdsBlockSizePerPad%c"%tc] != state["WavefrontSize"] * 4): +# and (state["LdsBlockSizePerPad%tc"] % (state["WavefrontSize"] * 4) != 0): // TODO: return False return True @@ -2424,6 +2437,20 @@ def assignDerivedParameters(state): if state["DisableVgprOverlapping"] is True and state["EnableMatrixInstruction"] is not True: reject(state, "Non-MI kernels are already non-overlapping in pre-allocated registers") + # F32 only for now but we should extend this for other data types as well. 
+ isa = tuple(state["ISA"]) + if "MACInstruction" not in state or state["MACInstruction"] not in validParameters["MACInstruction"]: + if globalParameters["AsmCaps"][isa]["v_mac_f32"]: + state["MACInstruction"] = "MAC" + else: + state["MACInstruction"] = "FMA" + + if state["WavefrontSize"] == 32 and not globalParameters["ArchCaps"][isa]["HasWave32"]: + reject(state, "WavefrontSize=32 not supported for ISA {}".format(isa)) + + if state["WavefrontSize"] == 32 and state["KernelLanguage"] == "Source": + reject(state, "WavefrontSize=32 not yet supported for source kernels.") + if state["EnableMatrixInstruction"]: if not (state["ProblemType"]["DataType"].isSingle() \ or state["ProblemType"]["DataType"].isDouble() \ @@ -2463,6 +2490,9 @@ def assignDerivedParameters(state): state["SubGroupB"] = state["SubGroup1"] state["MacroTileA"] = state["MacroTile0"] state["MacroTileB"] = state["MacroTile1"] + if state["EnableMatrixInstruction"]: + state["MIWaveTileA"] = state["MIWaveTile"][0] + state["MIWaveTileB"] = state["MIWaveTile"][1] else: state["ThreadTileB"] = state["ThreadTile0"] state["ThreadTileA"] = state["ThreadTile1"] @@ -2470,6 +2500,9 @@ def assignDerivedParameters(state): state["SubGroupA"] = state["SubGroup1"] state["MacroTileB"] = state["MacroTile0"] state["MacroTileA"] = state["MacroTile1"] + if state["EnableMatrixInstruction"]: + state["MIWaveTileA"] = state["MIWaveTile"][1] + state["MIWaveTileB"] = state["MIWaveTile"][0] Solution.checkAndAssignWaveSeparateGlobalRead(state, 'A') Solution.checkAndAssignWaveSeparateGlobalRead(state, 'B') @@ -2506,11 +2539,6 @@ def assignDerivedParameters(state): print2("PAP requires Assembly, PK!=0, PGR==1, SuppressNoLoadLoop=True, forcing PAP=False") state["PrefetchAcrossPersistent"] = False - # TODO- fix this, avoid the bug for now - if state["PrefetchAcrossPersistent"] and state["StaggerU"] == 0: - print2("PAP has some defects so far and would cause error when SU=0, disable PAP temporarily") - state["PrefetchAcrossPersistent"] = 
False - problemType = state["ProblemType"] if not problemType["UseInitialStridesAB"]: for (tc) in ('A','B'): @@ -2546,15 +2574,19 @@ def assignDerivedParameters(state): if problemType["Index0"] in problemType["IndexAssignmentsA"]: tc0 = 'A' tc1 = 'B' + batch0Mask = 0x1 + batch1Mask = 0x2 else: tc0 = 'B' tc1 = 'A' + batch0Mask = 0x2 + batch1Mask = 0x1 assert(isPackedIndex(state, problemType["Index01A"], 0x1)) assert(isPackedIndex(state, problemType["Index01B"], 0x2)) # Pack all the dimensions (batch and free) of A into grid[0] for idx in problemType["IndexAssignments%s"%tc0]: - if isPackedIndex(state, idx, 0x1): + if isPackedIndex(state, idx, batch0Mask): assert (idx < problemType["NumIndicesC"]) state["PackedC0IdxChars"].append("%s" % indexChars[idx]) state["PackedC0IndicesX"].append(idx) @@ -2562,7 +2594,7 @@ def assignDerivedParameters(state): state["PackedC1IdxChars"] = [] state["PackedC1IndicesX"] = [] for idx in problemType["IndexAssignments%s"%tc1]: - if isPackedIndex(state, idx, 0x2): + if isPackedIndex(state, idx, batch1Mask): assert (idx < problemType["NumIndicesC"]) state["PackedC1IdxChars"].append("%s" % indexChars[idx]) state["PackedC1IndicesX"].append(idx) @@ -2776,7 +2808,7 @@ def assignDerivedParameters(state): depthU = 2 depthULds = 2 maxDepthU = globalParameters["MaxDepthU"] - numOfWaves = state["NumThreads"] // globalParameters["WavefrontWidth"] + numOfWaves = state["NumThreads"] // state["WavefrontSize"] if state["ProblemType"]["TLUA"] and state["WaveSeparateGlobalReadA"]: depthU = max(depthU, numOfWaves) if state["ProblemType"]["TLUB"] and state["WaveSeparateGlobalReadB"]: @@ -2819,18 +2851,18 @@ def assignDerivedParameters(state): # how many elements to load if state["ProblemType"]["TLUA"]: - totalElementsCoalescedA = state["MacroTile0"] + totalElementsCoalescedA = state["MacroTileA"] totalElementsPerpA = depthU else: totalElementsCoalescedA = depthU - totalElementsPerpA = state["MacroTile0"] + totalElementsPerpA = state["MacroTileA"] if 
state["ProblemType"]["TLUB"]: - totalElementsCoalescedB = state["MacroTile1"] + totalElementsCoalescedB = state["MacroTileB"] totalElementsPerpB = depthU else: totalElementsCoalescedB = depthU - totalElementsPerpB = state["MacroTile1"] + totalElementsPerpB = state["MacroTileB"] totalElementsA = totalElementsCoalescedA * totalElementsPerpA totalElementsB = totalElementsCoalescedB * totalElementsPerpB @@ -3066,7 +3098,7 @@ def assignDerivedParameters(state): if state["EnableMatrixInstruction"]: state["LocalReadVectorWidth"] = state["MIInputPerThread"] else: - state["LocalReadVectorWidth"] = state["VectorWidth"] + state["LocalReadVectorWidth"] = state["VectorWidth"] else: if state["EnableMatrixInstruction"]: if state["LocalReadVectorWidth"] < state["MIInputPerThread"]: @@ -3112,23 +3144,23 @@ def assignDerivedParameters(state): ldsAlign = int(64 / state["ProblemType"]["DataType"].numRegisters()) if state["UnrollMajorLDSA"]: - ldsNumElementsA = (state["_DepthULds"] + state["LdsPadA"]) * state["MacroTile0"] + ldsNumElementsA = (state["_DepthULds"] + state["LdsPadA"]) * state["MacroTileA"] padInterval = state["LdsBlockSizePerPadA"] // bpeAB if padInterval != 0: - ldsNumElementsA = int((state["_DepthULds"] * state["MacroTile0"]) / padInterval * (padInterval + state["LdsPadA"])) + ldsNumElementsA = int((state["_DepthULds"] * state["MacroTileA"]) / padInterval * (padInterval + state["LdsPadA"])) ldsNumElementsAlignedA = roundUpToNearestMultiple(ldsNumElementsA, ldsAlign) else: - ldsNumElementsA = state["_DepthULds"] * (state["MacroTile0"] + state["LdsPadA"]) + ldsNumElementsA = state["_DepthULds"] * (state["MacroTileA"] + state["LdsPadA"]) ldsNumElementsAlignedA = roundUpToNearestMultiple(ldsNumElementsA, ldsAlign) if state["UnrollMajorLDSB"]: - ldsNumElementsB = (state["_DepthULds"] + state["LdsPadB"]) * state["MacroTile1"] + ldsNumElementsB = (state["_DepthULds"] + state["LdsPadB"]) * state["MacroTileB"] padInterval = state["LdsBlockSizePerPadB"] // bpeAB if padInterval 
!= 0: - ldsNumElementsB = int((state["_DepthULds"] * state["MacroTile1"]) / padInterval * (padInterval + state["LdsPadB"])) + ldsNumElementsB = int((state["_DepthULds"] * state["MacroTileB"]) / padInterval * (padInterval + state["LdsPadB"])) ldsNumElementsAlignedB = roundUpToNearestMultiple(ldsNumElementsB, ldsAlign) else: - ldsNumElementsB = state["_DepthULds"] * (state["MacroTile1"] + state["LdsPadB"]) + ldsNumElementsB = state["_DepthULds"] * (state["MacroTileB"] + state["LdsPadB"]) ldsNumElementsAlignedB = roundUpToNearestMultiple(ldsNumElementsB, ldsAlign) # todo, can the alignment be a power of 2? @@ -3194,6 +3226,17 @@ def assignDerivedParameters(state): else: state["StoreRemapVectorWidth"] = defaultRemap + if state["SourceSwap"]: + if not state["EnableMatrixInstruction"]: + reject(state, "SourceSwap only applies to MatrixInstruction kernels") + return + if not state["ProblemType"]["DataType"].isDouble(): + reject(state, "SourceSwap currently only available for dgemm") + return + if state["StoreRemapVectorWidth"]: + reject(state, "SourceSwap not compatibile with StoreRemap") + return + #check not support cases and calculate lds resources if state["StoreRemapVectorWidth"]: if not state["EnableMatrixInstruction"]: @@ -3227,11 +3270,11 @@ def assignDerivedParameters(state): reject(state, "StoreRemapVectorWidth %u is not allowed for this data type" % state["StoreRemapVectorWidth"]) return - if state["StoreRemapVectorWidth"] * globalParameters["WavefrontWidth"] < state["MacroTile0"]: + if state["StoreRemapVectorWidth"] * state["WavefrontSize"] < state["MacroTile0"]: reject(state, "storeRemap: Per wave single global write instruction doesn't enough to write one M column." 
+ \ " Please use larger StoreRemapVectorWidth.") return - if (state["MacroTile0"]*state["MatrixInstN"])//state["MIWaveGroup"][0] < state["StoreRemapVectorWidth"]*globalParameters["WavefrontWidth"]: + if (state["MacroTile0"]*state["MatrixInstN"])//state["MIWaveGroup"][0] < state["StoreRemapVectorWidth"]*state["WavefrontSize"]: reject(state, "storeRemap: number elements of lds less than per wave per local read elements." + \ " Please use smaller StoreRemapVectorWidth.") return @@ -3281,8 +3324,8 @@ def assignDerivedParameters(state): if ldl > 1: # Disable DirectToLds for LDL > 1. Necessary because we need to swizzle the input data state["DirectToLds"] = False - if (state["AssertSummationElementMultiple"] % ldl != 0): - reject(state, "LocalDotLayout > 1 only supports ASEM a multiple of LDL") + if (state["AssertSummationElementMultiple"] % ldl != 0) and (ldl != 2): + reject(state, "LocalDotLayout > 1 only supports ASEM a multiple of LDL, except ldl = 2") return if (state["ProblemType"]["HighPrecisionAccumulate"] != True or state["InnerUnroll"] != ldl): reject(state, "LocalDotLayout > 1 only supports HighPrecisionAccumulate set to true and InnerUnroll equal to LocalDotLayout") @@ -3529,7 +3572,8 @@ def assignDerivedParameters(state): # likely have more performant options. 
for tc in ('A', 'B'): if problemType["ZeroPad%s"%tc] and state["KernelLanguage"] == "Assembly": - if state["GlobalLoadVectorWidth%s"%tc] != 1: + if state["GlobalLoadVectorWidth%s"%tc] != 1 \ + and problemType["IndexAssignments%s"%tc][0] in problemType["ZeroPad%s"%tc][0][0:1]: reject(state, "asm ZeroPad requires GlobalLoadVectorWidth==1") if not bufferLoad: reject(state, "asm ZeroPad requires BufferLoad") @@ -3697,6 +3741,11 @@ def getParametersIndented(state, indent): ######################################## @ staticmethod def getParameterNameAbbreviation( name ): + specialValues = { + 'MACInstruction': '' # Conflicts with MatrixInstruction, but _MAD and _FMA should be enough differentiation for the kernel name. + } + if name in specialValues: return specialValues[name] + return ''.join([c for c in name if not c.islower()]) ######################################## @@ -3731,7 +3780,7 @@ def getParameterValueAbbreviation( key, value ): s = "_".join(["%d%d"%(pos,k) for pos,k in value.items()]) return s else: - printExit("Parameter \"%s\" is new object type" % str(value) ) + printExit('Parameter {key}={value} is new object type ({t})'.format(key=key, value=value, t=type(value))) return str(value) diff --git a/Tensile/Source/CMakeLists.txt b/Tensile/Source/CMakeLists.txt index 674d31e5c..5dba3eccb 100644 --- a/Tensile/Source/CMakeLists.txt +++ b/Tensile/Source/CMakeLists.txt @@ -19,7 +19,7 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
################################################################################ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.13) # Override all paths arguments as they do not work properly list(APPEND CMAKE_PREFIX_PATH $ENV{ROCM_PATH} /opt/rocm) @@ -51,7 +51,7 @@ if(TENSILE_NEW_CLIENT) endif() if(CMAKE_CXX_COMPILER STREQUAL "hipcc") - set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 CACHE STRING "GPU architectures") + set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 CACHE STRING "GPU architectures") else() set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a CACHE STRING "GPU architectures") endif() @@ -65,21 +65,17 @@ if(TENSILE_NEW_CLIENT) endif() if(TENSILE_USE_OPENMP) - #set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_OPENMP") - find_package(OpenMP QUIET) - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + # Workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/21787 + # ensures we link to HIP's libomp and get an rpath to it. 
+ add_library(custom_openmp_cxx INTERFACE) + + if(TENSILE_USE_HIP) + target_compile_options(custom_openmp_cxx INTERFACE "-fopenmp") + target_link_options(custom_openmp_cxx INTERFACE "-fopenmp") else () - if(EXISTS /etc/redhat-release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - set(OPENMP_LIBRARY /usr/lib/x86_64-linux-gnu/libomp.so) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENMP_LIBRARY}") - endif() - endif() + find_package(OpenMP REQUIRED) + target_link_libraries(custom_openmp_cxx INTERFACE OpenMP::OpenMP_CXX) + endif () endif() add_subdirectory(lib) @@ -108,7 +104,7 @@ else() set_property( CACHE Tensile_CODE_OBJECT_VERSION PROPERTY STRINGS V2 V3) set_property( CACHE Tensile_COMPILER PROPERTY STRINGS hipcc) set(Tensile_ARCHITECTURE all CACHE STRING "Which GPU architecture to use") - set_property( CACHE Tensile_ARCHITECTURE PROPERTY STRINGS all gfx000 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack-) + set_property( CACHE Tensile_ARCHITECTURE PROPERTY STRINGS all gfx000 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030) set_property( CACHE Tensile_LIBRARY_FORMAT PROPERTY STRINGS yaml msgpack) message( STATUS "Tensile_CODE_OBJECT_VERSION from Tensile/Source/CMakeLists.txt: ${Tensile_CODE_OBJECT_VERSION}") @@ -210,7 +206,7 @@ else() ${Tensile_RUNTIME_LANGUAGE} # HIP or OCL ${Tensile_COMPILER} # hipcc ${Tensile_CODE_OBJECT_VERSION} # V2 or V3 - ${Tensile_ARCHITECTURE} # all, gfx000, gfx803, gfx900, gfx906:xnack-, gfx908:xnack-, gfx90a:xnack- + ${Tensile_ARCHITECTURE} # all, gfx000, gfx803, gfx900, gfx906:xnack-, gfx908:xnack-, gfx90a:xnack-, gfx1010, gfx1011, gfx1012, gfx1030 ${Tensile_LIBRARY_FORMAT} # yaml or msgpack ${Tensile_MERGE_FILES} # ON or OFF ${Tensile_SHORT_FILE_NAMES} # ON or OFF diff --git a/Tensile/Source/TensileCreateLibrary.cmake b/Tensile/Source/TensileCreateLibrary.cmake index 2245c6b7f..5e8eac981 100644 --- 
a/Tensile/Source/TensileCreateLibrary.cmake +++ b/Tensile/Source/TensileCreateLibrary.cmake @@ -138,7 +138,7 @@ function(TensileCreateLibraryCmake add_library(Tensile ${options} ${Tensile_SOURCE_FILES}) # specify gpu targets if( Tensile_ARCHITECTURE MATCHES "all" ) - set( Tensile_HIP_ISA "gfx803" "gfx900" "gfx906" "gfx908") + set( Tensile_HIP_ISA "gfx803" "gfx900" "gfx906" "gfx908" "gfx1010" "gfx1011" "gfx1012" "gfx1030") else() set( Tensile_HIP_ISA ${Tensile_ARCHITECTURE}) endif() diff --git a/Tensile/Source/client/CMakeLists.txt b/Tensile/Source/client/CMakeLists.txt index 57329abfa..fe699b461 100644 --- a/Tensile/Source/client/CMakeLists.txt +++ b/Tensile/Source/client/CMakeLists.txt @@ -11,6 +11,7 @@ set(client_sources source/DataInitialization.cpp source/HardwareMonitor.cpp source/HardwareMonitorListener.cpp + source/LibraryUpdateReporter.cpp source/MetaRunListener.cpp source/PerformanceReporter.cpp source/ProgressListener.cpp @@ -34,17 +35,16 @@ endif() target_include_directories(TensileClient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")# "${rocm_smi_root}/include") -target_link_libraries(TensileClient TensileHost ${Boost_LIBRARIES} rocm_smi) +target_link_libraries(TensileClient PRIVATE TensileHost ${Boost_LIBRARIES} rocm_smi) +if(TENSILE_USE_OPENMP) + target_link_libraries(TensileClient PRIVATE custom_openmp_cxx) +endif() add_executable(tensile_client main.cpp) target_link_libraries(tensile_client PRIVATE TensileHost TensileClient ${Boost_LIBRARIES}) -if(OpenMP_FOUND) - target_link_libraries(tensile_client PRIVATE "${OpenMP_EXE_LINKER_FLAGS}") -else() - set(OPENMP_LIBRARY /usr/lib/x86_64-linux-gnu/libomp.so) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENMP_LIBRARY}") - target_link_libraries(tensile_client PRIVATE "${OpenMP_EXE_LINKER_FLAGS}") +if(TENSILE_USE_OPENMP) + target_link_libraries(tensile_client PRIVATE custom_openmp_cxx) endif() foreach(arch IN LISTS TENSILE_GPU_ARCHS) diff --git a/Tensile/Source/client/include/BenchmarkTimer.hpp 
b/Tensile/Source/client/include/BenchmarkTimer.hpp index ce79754ff..6d9b8c9b1 100644 --- a/Tensile/Source/client/include/BenchmarkTimer.hpp +++ b/Tensile/Source/client/include/BenchmarkTimer.hpp @@ -87,13 +87,15 @@ namespace Tensile virtual int error() const override; private: - const int m_numWarmups; - const int m_numBenchmarks; - const int m_numEnqueuesPerSync; - const int m_numSyncsPerBenchmark; - const int m_numEnqueuesPerSolution; + const int m_numWarmups; + const int m_numBenchmarks; + const int m_numEnqueuesPerSync; + const int m_numSyncsPerBenchmark; + const int m_numEnqueuesPerSolution; + const size_t m_minFlopsPerSync = 0; const bool m_useGPUTimer; + const bool m_syncAfterWarmups = true; const int m_sleepPercent; int m_numBenchmarksRun = 0; diff --git a/Tensile/Source/client/include/CSVStackFile.hpp b/Tensile/Source/client/include/CSVStackFile.hpp index dbb6f13fe..780df23ac 100644 --- a/Tensile/Source/client/include/CSVStackFile.hpp +++ b/Tensile/Source/client/include/CSVStackFile.hpp @@ -40,9 +40,9 @@ namespace Tensile class CSVStackFile { public: - CSVStackFile(std::string const& filename); - CSVStackFile(std::ostream& stream); - CSVStackFile(std::shared_ptr stream); + CSVStackFile(std::string const& filename, std::string const& separator = ", "); + CSVStackFile(std::ostream& stream, std::string const& separator = ", "); + CSVStackFile(std::shared_ptr stream, std::string const& separator = ", "); ~CSVStackFile(); @@ -73,6 +73,8 @@ namespace Tensile std::shared_ptr m_stream; + std::string m_separator; + bool m_firstRow = true; std::vector m_keyOrder; std::unordered_map m_headers; diff --git a/Tensile/Source/client/include/LibraryUpdateReporter.hpp b/Tensile/Source/client/include/LibraryUpdateReporter.hpp new file mode 100644 index 000000000..41197ffbb --- /dev/null +++ b/Tensile/Source/client/include/LibraryUpdateReporter.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * 
Copyright 2019-2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "CSVStackFile.hpp" +#include "ResultReporter.hpp" + +#include + +#include + +namespace Tensile +{ + namespace Client + { + namespace po = boost::program_options; + + class LibraryUpdateReporter : public ResultReporter + { + public: + static std::shared_ptr Default(po::variables_map const& args); + + static std::shared_ptr FromFilename(std::string const& filename, + bool addComment); + + /// This one will not close the stream. Useful for writing to cout. + LibraryUpdateReporter(std::ostream& stream, bool addComment); + /// This one has shared ownership of the stream. 
+ LibraryUpdateReporter(std::shared_ptr stream, bool addComment); + + virtual void reportValue_string(std::string const& key, + std::string const& value) override; + virtual void reportValue_uint(std::string const& key, uint64_t value) override; + virtual void reportValue_int(std::string const& key, int64_t value) override; + virtual void reportValue_double(std::string const& key, double value) override; + virtual void reportValue_sizes(std::string const& key, + std::vector const& value) override; + + virtual void postProblem() override; + virtual void postSolution() override; + + void finalizeReport() override; + + private: + template + void reportValue(std::string const& key, T const& value); + + std::ostream& m_stream; + std::shared_ptr m_ownedStream; + + std::vector m_problemSizes; + + bool m_addComment = false; + + int64_t m_curSolutionIdx = -1; + std::string m_curSolutionName; + int64_t m_curSolutionSpeed = -1; + bool m_curSolutionPassed = false; + + int64_t m_fastestSolutionIdx = -1; + std::string m_fastestSolutionName; + int64_t m_fastestSolutionSpeed = -1; + }; + } // namespace Client +} // namespace Tensile diff --git a/Tensile/Source/client/include/LogReporter.hpp b/Tensile/Source/client/include/LogReporter.hpp index 7ba01735f..be649387a 100644 --- a/Tensile/Source/client/include/LogReporter.hpp +++ b/Tensile/Source/client/include/LogReporter.hpp @@ -51,7 +51,7 @@ namespace Tensile bool dumpTensors) : m_level(level) , m_stream(stream) - , m_csvOutput(stream) + , m_csvOutput(stream, ",") , m_dumpTensors(dumpTensors) { for(auto const& key : keys) @@ -64,7 +64,7 @@ namespace Tensile bool dumpTensors) : m_level(level) , m_stream(stream) - , m_csvOutput(stream) + , m_csvOutput(stream, ",") , m_dumpTensors(dumpTensors) { for(auto const& key : keys) @@ -78,7 +78,7 @@ namespace Tensile : m_level(level) , m_stream(*stream) , m_ownedStream(stream) - , m_csvOutput(stream) + , m_csvOutput(stream, ",") , m_dumpTensors(dumpTensors) { for(auto const& key : keys) @@ 
-87,19 +87,21 @@ namespace Tensile template static std::shared_ptr Default(po::variables_map const& args, - Stream& stream) + Stream& stream, + LogLevel level = LogLevel::Count) { bool dumpTensors = args["dump-tensors"].as(); using namespace ResultKey; - auto logLevel = args["log-level"].as(); - std::cout << "Log level: " << logLevel << std::endl; + if(level == LogLevel::Count) + level = args["log-level"].as(); + std::cout << "Log level: " << level << std::endl; PerformanceMetric metric = args["performance-metric"].as(); // Default to 'DeviceEfficiency' benchmarking if CUEfficiency not specified const std::string perfUnit = (metric == PerformanceMetric::CUEfficiency ? SpeedGFlopsPerCu : SpeedGFlops); - return std::shared_ptr(new LogReporter(logLevel, + return std::shared_ptr(new LogReporter(level, {BenchmarkRunNumber, ProblemProgress, SolutionProgress, @@ -172,7 +174,7 @@ namespace Tensile void acceptValidation(std::string const& value) { if(value == "PASSED" || value == "NO_CHECK") - m_rowLevel = LogLevel::Verbose; + m_rowLevel = LogLevel::Normal; else if(value == "FAILED" || value == "FAILED_CONV") m_rowLevel = LogLevel::Error; else if(value == "WRONG_HARDWARE") @@ -285,7 +287,7 @@ namespace Tensile virtual void preSolution(ContractionSolution const& solution) override { m_csvOutput.push(); - m_rowLevel = LogLevel::Verbose; + m_rowLevel = LogLevel::Normal; } virtual void postSolution() override diff --git a/Tensile/Source/client/include/MetaResultReporter.hpp b/Tensile/Source/client/include/MetaResultReporter.hpp index 9588081d4..cdad328b5 100644 --- a/Tensile/Source/client/include/MetaResultReporter.hpp +++ b/Tensile/Source/client/include/MetaResultReporter.hpp @@ -39,7 +39,8 @@ namespace Tensile public: virtual void addReporter(std::shared_ptr reporter) { - m_reporters.push_back(reporter); + if(reporter != nullptr) + m_reporters.push_back(reporter); } virtual void reportValue_string(std::string const& key, diff --git 
a/Tensile/Source/client/include/ReferenceValidator.hpp b/Tensile/Source/client/include/ReferenceValidator.hpp index 3c08f9676..f390db587 100644 --- a/Tensile/Source/client/include/ReferenceValidator.hpp +++ b/Tensile/Source/client/include/ReferenceValidator.hpp @@ -118,12 +118,15 @@ namespace Tensile virtual int error() const override; private: + void allocateResultBuffer(size_t bytes); + std::shared_ptr m_dataInit; std::shared_ptr m_referenceInputs; ConvolutionProblem m_convolutionProblem; - std::vector m_cpuResultBuffer; + size_t m_cpuResultBufferSize = 0; + std::shared_ptr m_cpuResultBuffer; ContractionProblem m_problem; diff --git a/Tensile/Source/client/include/ResultReporter.hpp b/Tensile/Source/client/include/ResultReporter.hpp index aa79e040b..4028759b0 100644 --- a/Tensile/Source/client/include/ResultReporter.hpp +++ b/Tensile/Source/client/include/ResultReporter.hpp @@ -39,6 +39,7 @@ namespace Tensile { Error = 0, Terse, + Normal, Verbose, Debug, Count @@ -77,11 +78,12 @@ namespace Tensile const std::string ProblemSizes = "problem-sizes"; // Solution information - const std::string SolutionName = "solution"; - const std::string SolutionIndex = "solution-index"; - const std::string SolutionProgress = "solution-progress"; - const std::string SolutionWinnerIdx = "solution-winner-idx"; - const std::string SolutionWinner = "solution-winner"; + const std::string SolutionName = "solution"; + const std::string SolutionIndex = "solution-index"; + const std::string SolutionLibraryIndex = "solution-library-index"; + const std::string SolutionProgress = "solution-progress"; + const std::string SolutionWinnerIdx = "solution-winner-idx"; + const std::string SolutionWinner = "solution-winner"; // Performance-related const std::string Validation = "validation"; diff --git a/Tensile/Source/client/main.cpp b/Tensile/Source/client/main.cpp index fe723ddce..10f363d8c 100644 --- a/Tensile/Source/client/main.cpp +++ b/Tensile/Source/client/main.cpp @@ -43,6 +43,7 @@ #include 
"SolutionIterator.hpp" #include "TimingEvents.hpp" +#include "LibraryUpdateReporter.hpp" #include "LogReporter.hpp" #include "MetaResultReporter.hpp" #include "PerformanceReporter.hpp" @@ -124,7 +125,7 @@ namespace Tensile ("init-d", po::value()->default_value(InitMode::Zero), "Initialization for D") ("init-alpha", po::value()->default_value(InitMode::Two), "Initialization for alpha") ("init-beta", po::value()->default_value(InitMode::Two), "Initialization for beta") - ("pristine-on-gpu", po::value()->default_value(false), "Keep a pristine copy of inputs on GPU for performance") + ("pristine-on-gpu", po::value()->default_value(true), "Keep a pristine copy of inputs on GPU for performance") ("c-equal-d", po::value()->default_value(false), "C equals D") ("offset-a", po::value()->default_value(0), "buffer a start offset") ("offset-b", po::value()->default_value(0), "buffer b start offset") @@ -155,9 +156,11 @@ namespace Tensile ("platform-idx", po::value()->default_value(0), "OpenCL Platform Index") ("num-warmups", po::value()->default_value(0), "Number of warmups to run") + ("sync-after-warmups", po::value()->default_value(true), "Synchronize GPU after warmup kernel runs") ("num-benchmarks", po::value()->default_value(1), "Number of benchmarks to run") - ("num-enqueues-per-sync", po::value()->default_value(1), "Enqueues per sync") + ("num-enqueues-per-sync", po::value()->default_value(1), "Enqueues per sync, will affect by min-flops-per-sync") ("num-syncs-per-benchmark", po::value()->default_value(1), "Syncs per benchmark") + ("min-flops-per-sync", po::value()->default_value(0), "Minimum number of flops per sync to increase stability for small problems.") ("use-gpu-timer", po::value()->default_value(true), "Use GPU timer") ("sleep-percent", po::value()->default_value(0), "Sleep percentage") ("hardware-monitor", po::value()->default_value(true), "Use hardware monitor.") @@ -221,7 +224,16 @@ namespace Tensile ("log-file", po::value(), "File name for output log.") 
("log-file-append", po::value()->default_value(false), "Append to log file.") ("log-level", po::value()->default_value(LogLevel::Debug), "Log level") - ("exit-on-failure", po::value()->default_value(false), "Exit run early on failed kernels.") + + ("library-update-file", po::value()->default_value(""), "File name for writing indices " + "and speeds suitable for updating " + "an existing library logic file.") + ("library-update-comment", po::value()->default_value(false), "Include solution name as a " + "comment in library update " + "file.") + + + ("exit-on-error", po::value()->default_value(false), "Exit run early on failed kernels or other errors.") ("selection-only", po::value()->default_value(false), "Don't run any solutions, only print kernel selections.") ("max-workspace-size", po::value()->default_value(32*1024*1024), "Max workspace for training") ("granularity-threshold", po::value()->default_value(0.0), "Don't run a solution if total granularity is below") @@ -233,7 +245,16 @@ namespace Tensile std::shared_ptr GetHardware(po::variables_map const& args) { - HIP_CHECK_EXC(hipSetDevice(args["device-idx"].as())); + int deviceCount = 0; + HIP_CHECK_EXC(hipGetDeviceCount(&deviceCount)); + + int deviceIdx = args["device-idx"].as(); + + if(deviceIdx < 0 || deviceIdx >= deviceCount) + throw std::runtime_error(concatenate( + "Invalid device index ", deviceIdx, " (", deviceCount, " total found.)")); + + HIP_CHECK_EXC(hipSetDevice(deviceIdx)); return hip::GetCurrentDevice(); } @@ -355,7 +376,10 @@ namespace Tensile auto configFiles = args["config-file"].as>(); for(auto filename : configFiles) { + std::cout << "loading config file " << filename << std::endl; std::ifstream file(filename.c_str()); + if(!file.is_open()) + throw std::runtime_error(concatenate("Could not open ", filename)); po::store(po::parse_config_file(file, options), args); } } @@ -455,6 +479,7 @@ int main(int argc, const char* argv[]) int numSolutions = args["num-solutions"].as(); bool gpuTimer = args["use-gpu-timer"].as();
bool runKernels = !args["selection-only"].as(); + bool exitOnError = args["exit-on-error"].as(); if(firstSolutionIdx < 0) firstSolutionIdx = library->solutions.begin()->first; @@ -499,6 +524,7 @@ int main(int argc, const char* argv[]) // will be missing reporters->addReporter(LogReporter::Default(args)); reporters->addReporter(ResultFileReporter::Default(args)); + reporters->addReporter(LibraryUpdateReporter::Default(args)); if(args.count("log-file")) { @@ -506,7 +532,7 @@ int main(int argc, const char* argv[]) auto logFile = std::make_shared( filename.c_str(), args["log-file-append"].as() ? std::ios::app : std::ios::out); - reporters->addReporter(LogReporter::Default(args, logFile)); + reporters->addReporter(LogReporter::Default(args, logFile, LogLevel::Normal)); } listeners.setReporter(reporters); @@ -573,10 +599,10 @@ int main(int argc, const char* argv[]) size_t syncs = listeners.numSyncs(); size_t enq = listeners.numEnqueuesPerSync(); + listeners.preSyncs(); + for(int i = 0; i < syncs; i++) { - listeners.preSyncs(); - TimingEvents startEvents(enq, eventCount); TimingEvents stopEvents(enq, eventCount); @@ -593,9 +619,9 @@ int main(int argc, const char* argv[]) listeners.postEnqueues(startEvents, stopEvents); listeners.validateEnqueues(inputs, startEvents, stopEvents); - - listeners.postSyncs(); } + + listeners.postSyncs(); } } catch(std::runtime_error const& err) @@ -607,6 +633,9 @@ int main(int argc, const char* argv[]) } listeners.postSolution(); + + if(exitOnError && listeners.error() > 0) + return listeners.error(); } listeners.postProblem(); diff --git a/Tensile/Source/client/source/BenchmarkTimer.cpp b/Tensile/Source/client/source/BenchmarkTimer.cpp index 40dba9bdb..0dff1633c 100644 --- a/Tensile/Source/client/source/BenchmarkTimer.cpp +++ b/Tensile/Source/client/source/BenchmarkTimer.cpp @@ -43,8 +43,10 @@ namespace Tensile BenchmarkTimer::BenchmarkTimer(po::variables_map const& args, Hardware const& hardware) : m_numWarmups(args["num-warmups"].as()) + , 
m_syncAfterWarmups(args["sync-after-warmups"].as()) , m_numBenchmarks(args["num-benchmarks"].as()) , m_numEnqueuesPerSync(args["num-enqueues-per-sync"].as()) + , m_minFlopsPerSync(args["min-flops-per-sync"].as()) , m_numSyncsPerBenchmark(args["num-syncs-per-benchmark"].as()) , m_hardware(hardware) , m_numEnqueuesPerSolution(m_numEnqueuesPerSync * m_numSyncsPerBenchmark) @@ -147,7 +149,7 @@ namespace Tensile TimingEvents const& startEvents, TimingEvents const& stopEvents) { - if((stopEvents->size() > 0) && (stopEvents->back().size() > 0)) + if(m_syncAfterWarmups && (stopEvents->size() > 0) && (stopEvents->back().size() > 0)) HIP_CHECK_EXC(hipEventSynchronize(stopEvents->back().back())); } @@ -167,7 +169,14 @@ namespace Tensile size_t BenchmarkTimer::numEnqueuesPerSync() { - return m_numEnqueuesPerSync; + size_t enqueuesByFlops = 0; + if(m_minFlopsPerSync > 0) + { + size_t flopsInProblem = m_problem.flopCount(); + enqueuesByFlops = CeilDivide(m_minFlopsPerSync, flopsInProblem); + } + + return std::max(m_numEnqueuesPerSync, enqueuesByFlops); } void BenchmarkTimer::setNumEnqueuesPerSync(size_t count) diff --git a/Tensile/Source/client/source/CSVStackFile.cpp b/Tensile/Source/client/source/CSVStackFile.cpp index eb8eb6385..c9ac270f5 100644 --- a/Tensile/Source/client/source/CSVStackFile.cpp +++ b/Tensile/Source/client/source/CSVStackFile.cpp @@ -33,20 +33,24 @@ namespace Tensile { namespace Client { - CSVStackFile::CSVStackFile(std::string const& filename) + CSVStackFile::CSVStackFile(std::string const& filename, std::string const& separator) : m_stream(new std::ofstream(filename.c_str())) + , m_separator(separator) { } void null_deleter(void* ptr) {} - CSVStackFile::CSVStackFile(std::ostream& stream) + CSVStackFile::CSVStackFile(std::ostream& stream, std::string const& separator) : m_stream(&stream, null_deleter) + , m_separator(separator) { } - CSVStackFile::CSVStackFile(std::shared_ptr stream) + CSVStackFile::CSVStackFile(std::shared_ptr stream, + std::string const& 
separator) : m_stream(stream) + , m_separator(separator) { } @@ -117,7 +121,7 @@ namespace Tensile for(auto const& key : m_keyOrder) { if(!firstCol) - (*m_stream) << ", "; + (*m_stream) << m_separator; std::string value = ""; diff --git a/Tensile/Source/client/source/LibraryUpdateReporter.cpp b/Tensile/Source/client/source/LibraryUpdateReporter.cpp new file mode 100644 index 000000000..c4456cb19 --- /dev/null +++ b/Tensile/Source/client/source/LibraryUpdateReporter.cpp @@ -0,0 +1,182 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 2019-2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include + +namespace Tensile +{ + namespace Client + { + std::shared_ptr + LibraryUpdateReporter::Default(po::variables_map const& args) + { + auto filename = args["library-update-file"].as(); + auto comment = args["library-update-comment"].as(); + if(filename != "") + { + return FromFilename(filename, comment); + } + return std::shared_ptr(); + } + + std::shared_ptr + LibraryUpdateReporter::FromFilename(std::string const& filename, bool addComment) + { + auto file = std::make_shared(filename); + return std::make_shared(file, addComment); + } + + LibraryUpdateReporter::LibraryUpdateReporter(std::ostream& stream, bool addComment) + : m_stream(stream) + , m_addComment(addComment) + { + } + + LibraryUpdateReporter::LibraryUpdateReporter(std::shared_ptr stream, + bool addComment) + : m_stream(*stream.get()) + , m_ownedStream(stream) + , m_addComment(addComment) + { + if(!stream) + throw std::runtime_error("Invalid stream! 
nullptr is not allowed."); + } + + template + void LibraryUpdateReporter::reportValue(std::string const& key, T const& value) + { + std::string valueStr = boost::lexical_cast(value); + //m_stream << key << " = " << valueStr << std::endl; + + if(key == ResultKey::Validation) + { + m_curSolutionPassed = (valueStr == "PASSED" || valueStr == "NO_CHECK"); + } + else if(key == ResultKey::SolutionLibraryIndex) + { + m_curSolutionIdx = std::stoi(valueStr); + } + else if(key == ResultKey::SolutionName) + { + m_curSolutionName = valueStr; + } + else if(key == ResultKey::SpeedGFlops) + { + try + { + int64_t speed = std::stoll(valueStr); + m_curSolutionSpeed = speed; + //m_stream << "Fastest: " << m_fastestSolutionSpeed << std::endl; + } + catch(std::out_of_range const& exc) + { + } + } + } + + void LibraryUpdateReporter::reportValue_string(std::string const& key, + std::string const& value) + { + reportValue(key, value); + } + + void LibraryUpdateReporter::reportValue_uint(std::string const& key, uint64_t value) + { + reportValue(key, value); + } + + void LibraryUpdateReporter::reportValue_int(std::string const& key, int64_t value) + { + reportValue(key, value); + } + + void LibraryUpdateReporter::reportValue_double(std::string const& key, double value) + { + reportValue(key, value); + } + + void LibraryUpdateReporter::reportValue_sizes(std::string const& key, + std::vector const& value) + { + if(key == ResultKey::ProblemSizes) + { + m_problemSizes = value; + } + } + + void LibraryUpdateReporter::postProblem() + { + if(m_fastestSolutionIdx < 0) + { + m_stream << "# ["; + streamJoin(m_stream, m_problemSizes, ", "); + m_stream << "] no valid solutions." 
<< std::endl; + } + else + { + // - - [1024, 4096, 1, 6336] + // - [289, 4853.07] + m_stream << " - - ["; + streamJoin(m_stream, m_problemSizes, ", "); + m_stream << "]" << std::endl; + m_stream << " - [" << m_fastestSolutionIdx << ", " << m_fastestSolutionSpeed + << "]"; + if(m_addComment) + m_stream << " # " << m_fastestSolutionName; + m_stream << std::endl; + } + + // reset + m_fastestSolutionIdx = -1; + m_fastestSolutionName = ""; + m_fastestSolutionSpeed = -1; + } + + void LibraryUpdateReporter::postSolution() + { + // cascade from BenchmarkTimer, SpeedGFlops second + if(m_curSolutionPassed && m_curSolutionSpeed > m_fastestSolutionSpeed) + { + m_fastestSolutionIdx = m_curSolutionIdx; + m_fastestSolutionName = m_curSolutionName; + m_fastestSolutionSpeed = m_curSolutionSpeed; + } + + m_curSolutionName = ""; + m_curSolutionIdx = -1; + m_curSolutionSpeed = -1; + m_curSolutionPassed = false; + } + + void LibraryUpdateReporter::finalizeReport() + { + // Close file if we're the last owner. 
+ m_ownedStream.reset(); + } + } // namespace Client +} // namespace Tensile diff --git a/Tensile/Source/client/source/Reference.cpp b/Tensile/Source/client/source/Reference.cpp index feb785a9e..df884d02d 100644 --- a/Tensile/Source/client/source/Reference.cpp +++ b/Tensile/Source/client/source/Reference.cpp @@ -160,6 +160,22 @@ namespace Tensile auto boundCount = CoordCount(boundSize.begin() + 1, boundSize.end()); + if(inputs.alpha != static_cast(0)) + { + if(inputs.a == nullptr || inputs.b == nullptr) + { + std::ostringstream msg; + msg << "Unsupported nullptr for"; + if(!inputs.a) + msg << " A"; + if(!inputs.b) + msg << " B"; + msg << " when Alpha !=0"; + + throw std::runtime_error(msg.str()); + } + } + #pragma omp parallel for for(size_t dNum = 0; dNum < d.totalLogicalElements(); dNum += validationStride) { @@ -199,17 +215,6 @@ namespace Tensile // Check short-circuit for alpha = 0 if(inputs.alpha != static_cast(0)) { - if(inputs.a == nullptr || inputs.b == nullptr) - { - std::string matrixID = inputs.a == nullptr ?
"A" : "B"; - std::string msg = std::string("Unsupported nullptr for ") + matrixID - + std::string(" when Alpha !=0\n"); - // HACK moving throw temporarily to test targetid update - // throw is currently causing a compiler error in this function - throwException(msg); - // throw std::runtime_error(msg.c_str()); - } - for(size_t boundNum = 0; boundNum < boundCount; boundNum++) { std::vector bound(problem.boundIndices().size()); diff --git a/Tensile/Source/client/source/ReferenceValidator.cpp b/Tensile/Source/client/source/ReferenceValidator.cpp index 49efc7075..860191dd8 100644 --- a/Tensile/Source/client/source/ReferenceValidator.cpp +++ b/Tensile/Source/client/source/ReferenceValidator.cpp @@ -299,6 +299,18 @@ namespace Tensile return rv; } + void ReferenceValidator::allocateResultBuffer(size_t bytes) + { + if(m_cpuResultBufferSize == bytes) + return; + m_cpuResultBuffer.reset(); + + uint8_t* buffer; + HIP_CHECK_EXC(hipHostMalloc(&buffer, bytes, 0)); + m_cpuResultBuffer.reset(buffer, hipHostFree); + m_cpuResultBufferSize = bytes; + } + template void ReferenceValidator::printTensorsTyped(ManagedInputs const& reference, ManagedInputs const& result) @@ -326,29 +338,29 @@ namespace Tensile requiredBufferSize = std::max(requiredBufferSize, m_problem.d().totalAllocatedBytes()); - if(m_cpuResultBuffer.size() < requiredBufferSize) - m_cpuResultBuffer.resize(requiredBufferSize); + if(m_cpuResultBufferSize < requiredBufferSize) + allocateResultBuffer(requiredBufferSize); if(m_printTensorA) { - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.a, m_problem.a().totalAllocatedBytes(), hipMemcpyDeviceToHost)); auto const* buffer = reinterpret_cast( - m_cpuResultBuffer.data()); + m_cpuResultBuffer.get()); m_reporter->logTensor(LogLevel::Verbose, "A", buffer, m_problem.a(), result.a); } if(m_printTensorB) { - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.b,
m_problem.b().totalAllocatedBytes(), hipMemcpyDeviceToHost)); auto const* buffer = reinterpret_cast( - m_cpuResultBuffer.data()); + m_cpuResultBuffer.get()); m_reporter->logTensor(LogLevel::Verbose, "B", buffer, m_problem.b(), result.b); } @@ -356,12 +368,12 @@ namespace Tensile if(result.c == result.d && (m_printTensorC || m_printTensorD)) { // If the pointers are the same, only print the buffer once. - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.c, m_problem.c().totalAllocatedBytes(), hipMemcpyDeviceToHost)); auto const* buffer = reinterpret_cast( - m_cpuResultBuffer.data()); + m_cpuResultBuffer.get()); m_reporter->logTensor(LogLevel::Verbose, "C_D", buffer, m_problem.c(), result.c); } @@ -369,24 +381,24 @@ namespace Tensile { if(m_printTensorC) { - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.c, m_problem.c().totalAllocatedBytes(), hipMemcpyDeviceToHost)); auto const* buffer = reinterpret_cast( - m_cpuResultBuffer.data()); + m_cpuResultBuffer.get()); m_reporter->logTensor(LogLevel::Verbose, "C", buffer, m_problem.c(), result.c); } if(m_printTensorD) { - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.d, m_problem.d().totalAllocatedBytes(), hipMemcpyDeviceToHost)); auto const* buffer = reinterpret_cast( - m_cpuResultBuffer.data()); + m_cpuResultBuffer.get()); m_reporter->logTensor(LogLevel::Verbose, "D", buffer, m_problem.d(), result.d); } @@ -418,15 +430,15 @@ namespace Tensile elementsToCopy = result.dElements; size_t bytesToCopy = elementsToCopy * sizeof(Type); - if(m_cpuResultBuffer.size() < bytesToCopy) - m_cpuResultBuffer.resize(bytesToCopy); + if(m_cpuResultBufferSize < bytesToCopy) + allocateResultBuffer(bytesToCopy); if(boundsCheck == BoundsCheckMode::GuardPageBack) elementsOffsetToCopy = result.dElements - tensor.totalAllocatedElements(); auto copykind = result.gpu ? 
hipMemcpyDeviceToHost : hipMemcpyHostToHost; - HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.data(), + HIP_CHECK_EXC(hipMemcpy(m_cpuResultBuffer.get(), result.managedD.get() + elementsOffsetToCopy, bytesToCopy, copykind)); @@ -440,7 +452,7 @@ namespace Tensile // If there was extra data allocated before the tensor to do bounds // checking, resultBuffer is the whole allocation, while resultData // points directly to the result. - Type const* resultBuffer = reinterpret_cast(m_cpuResultBuffer.data()); + Type const* resultBuffer = reinterpret_cast(m_cpuResultBuffer.get()); Type const* resultData = resultBuffer + elementsBeforeData; Type const* resultAfterData = resultData + tensor.totalAllocatedElements(); @@ -462,34 +474,6 @@ namespace Tensile compareInvalid.before(resultBuffer[i], i, elementsBeforeData); } - auto compareValues - = [&](Type referenceValue, Type resultValue, size_t elemIndex, size_t elemNumber) { - bool match = AlmostEqual(referenceValue, resultValue); - if(!match) - errors++; - - if(!match || m_printValids) - { - if(doPrint) - { - if(printed == 0) - { - std::cout << "Index: Device | Reference" << std::endl; - } - - std::cout << "[" << (printed) << "] " - << " elem=" << elemNumber << " idx=" << elemIndex << ": " - << resultValue << (match ? 
"==" : "!=") << referenceValue - << std::endl; - - printed++; - - if(m_printMax >= 0 && printed >= m_printMax) - doPrint = false; - } - } - }; - if(m_validationStride == 1) { std::vector coord(tensor.dimensions()); @@ -497,6 +481,7 @@ namespace Tensile size_t prevBaseIndex = 0; const size_t innerDimSize = tensor.sizes()[0]; + const size_t initialStride = tensor.strides()[0]; for(size_t i = 0; i < outerCount; i++) { @@ -523,7 +508,7 @@ namespace Tensile for(size_t j = 0; j < innerDimSize; j++) { - size_t elemIndex = baseElemIndex + j; + size_t elemIndex = baseElemIndex + (j * initialStride); Type referenceValue = reference.d[elemIndex]; Type resultValue = resultData[elemIndex]; diff --git a/Tensile/Source/client/source/ResultReporter.cpp b/Tensile/Source/client/source/ResultReporter.cpp index 6d56f520b..e793f6376 100644 --- a/Tensile/Source/client/source/ResultReporter.cpp +++ b/Tensile/Source/client/source/ResultReporter.cpp @@ -38,6 +38,8 @@ namespace Tensile return "Error"; case LogLevel::Terse: return "Terse"; + case LogLevel::Normal: + return "Normal"; case LogLevel::Verbose: return "Verbose"; case LogLevel::Debug: diff --git a/Tensile/Source/client/source/SolutionIterator.cpp b/Tensile/Source/client/source/SolutionIterator.cpp index e8107450f..bbc2f570b 100644 --- a/Tensile/Source/client/source/SolutionIterator.cpp +++ b/Tensile/Source/client/source/SolutionIterator.cpp @@ -79,6 +79,7 @@ namespace Tensile { std::ostringstream msg; solution.hardwarePredicate->debugEval(*m_hardware, msg); + msg << std::endl; m_reporter->log(LogLevel::Verbose, msg.str()); } @@ -94,6 +95,7 @@ namespace Tensile { std::ostringstream msg; solution.problemPredicate->debugEval(m_problem, msg); + msg << std::endl; m_reporter->log(LogLevel::Verbose, msg.str()); } @@ -167,6 +169,13 @@ namespace Tensile void AllSolutionsIterator::preSolution(ContractionSolution const& solution) { + { + std::string idx = "-1"; + auto iter = solution.info.find("SolutionIndex"); + if(iter != 
solution.info.end()) + idx = iter->second; + m_reporter->report(ResultKey::SolutionLibraryIndex, idx); + } m_reporter->report(ResultKey::SolutionIndex, m_currentSolutionIdx); m_reporter->report(ResultKey::SolutionProgress, concatenate(m_currentSolutionIdx, "/", m_lastSolutionIdx)); @@ -225,6 +234,14 @@ namespace Tensile void BestSolutionIterator::preSolution(ContractionSolution const& solution) { + { + std::string idx = "-1"; + auto iter = solution.info.find("SolutionIndex"); + if(iter != solution.info.end()) + idx = iter->second; + m_reporter->report(ResultKey::SolutionLibraryIndex, idx); + } + m_reporter->report(ResultKey::SolutionIndex, 0); m_reporter->report(ResultKey::SolutionProgress, "1/1"); } diff --git a/Tensile/Source/lib/CMakeLists.txt b/Tensile/Source/lib/CMakeLists.txt index f842508bc..f70bb73fc 100644 --- a/Tensile/Source/lib/CMakeLists.txt +++ b/Tensile/Source/lib/CMakeLists.txt @@ -19,7 +19,7 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.13) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14" ) diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp index c16f7fdd6..d43975624 100644 --- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp @@ -60,9 +60,9 @@ namespace Tensile gfx908 = 908, gfx90a = 910, gfx1010 = 1010, - gfx1011 = 1011 - //gfx1011 = 10, - //gfx1012 = 11 + gfx1011 = 1011, + gfx1012 = 1012, + gfx1030 = 1030 }; AMDGPU(); diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp b/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp index b294be867..d952afce2 100644 --- a/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp +++ b/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp @@ -49,7 +49,7 
@@ namespace Tensile */ namespace Contraction { - struct FreeSizeAMultiple : public Predicate_CRTP + struct Free0SizeMultiple : public Predicate_CRTP { enum { @@ -59,8 +59,8 @@ namespace Tensile size_t index; size_t value; - FreeSizeAMultiple() = default; - FreeSizeAMultiple(size_t index, size_t value) + Free0SizeMultiple() = default; + Free0SizeMultiple(size_t index, size_t value) : index(index) , value(value) { @@ -68,16 +68,19 @@ namespace Tensile static std::string Type() { - return "FreeSizeAMultiple"; + return "Free0SizeMultiple"; } virtual bool operator()(ContractionProblem const& problem) const override { - return problem.freeSizeA(index) % value == 0; + return (!problem.transposeC01() ? problem.freeSizeA(index) + : problem.freeSizeB(index)) + % value + == 0; } }; - struct FreeSizeBMultiple : public Predicate_CRTP + struct Free1SizeMultiple : public Predicate_CRTP { enum { @@ -87,8 +90,8 @@ namespace Tensile size_t index; size_t value; - FreeSizeBMultiple() = default; - FreeSizeBMultiple(size_t index, size_t value) + Free1SizeMultiple() = default; + Free1SizeMultiple(size_t index, size_t value) : index(index) , value(value) { @@ -96,12 +99,15 @@ namespace Tensile static std::string Type() { - return "FreeSizeBMultiple"; + return "Free1SizeMultiple"; } virtual bool operator()(ContractionProblem const& problem) const override { - return problem.freeSizeB(index) % value == 0; + return (!problem.transposeC01() ? 
problem.freeSizeB(index) + : problem.freeSizeA(index)) + % value + == 0; } }; diff --git a/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp b/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp index 47256efd4..55efe77f3 100644 --- a/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp +++ b/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp @@ -248,11 +248,11 @@ namespace Tensile size_t staggerStrideShift = 0; int workGroupMapping = 0; - size_t packBatchDims = 0; - int packSummationDims = 0; - int magicDivAlg = 1; - int persistentKernel = 0; - bool persistentKernelAlongBatch; + size_t packBatchDims = 0; + int packSummationDims = 0; + int magicDivAlg = 1; + int persistentKernel = 0; + bool persistentKernelAlongBatch = false; bool sourceKernel = false; diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp b/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp index e49c0ff56..57efe5d6a 100644 --- a/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp +++ b/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp @@ -56,8 +56,8 @@ namespace Tensile static SubclassMap GetSubclasses() { SubclassMap rv({ - Base::template Pair(), - Base::template Pair(), + Base::template Pair(), + Base::template Pair(), Base::template Pair(), Base::template Pair(), Base::template Pair(), @@ -105,14 +105,14 @@ namespace Tensile = ContractionProblemPredicateSMT::GetSubclasses(); template - struct MappingTraits - : public AutoMappingTraits + struct MappingTraits + : public AutoMappingTraits { }; template - struct MappingTraits - : public AutoMappingTraits + struct MappingTraits + : public AutoMappingTraits { }; diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp b/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp index e044b0c1f..78c317d7f 100644 --- a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp +++ 
b/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp @@ -213,6 +213,8 @@ namespace Tensile iot::enumCase(io, value, "gfx90a", AMDGPU::Processor::gfx90a); iot::enumCase(io, value, "gfx1010", AMDGPU::Processor::gfx1010); iot::enumCase(io, value, "gfx1011", AMDGPU::Processor::gfx1011); + iot::enumCase(io, value, "gfx1012", AMDGPU::Processor::gfx1012); + iot::enumCase(io, value, "gfx1030", AMDGPU::Processor::gfx1030); } }; } // namespace Serialization diff --git a/Tensile/Source/lib/source/AMDGPU.cpp b/Tensile/Source/lib/source/AMDGPU.cpp index 31b644ef0..18d67eeaf 100644 --- a/Tensile/Source/lib/source/AMDGPU.cpp +++ b/Tensile/Source/lib/source/AMDGPU.cpp @@ -78,6 +78,10 @@ namespace Tensile return stream << "gfx1010"; case AMDGPU::Processor::gfx1011: return stream << "gfx1011"; + case AMDGPU::Processor::gfx1012: + return stream << "gfx1012"; + case AMDGPU::Processor::gfx1030: + return stream << "gfx1030"; } return stream; } diff --git a/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp b/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp index ffa06ff54..173373cd8 100644 --- a/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp +++ b/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp @@ -74,6 +74,10 @@ namespace Tensile void ArithmeticUnitTypeInfo::registerAllTypeInfoOnce() { static int call_once = (registerAllTypeInfo(), 0); + + // Use the variable to quiet the compiler. 
+ if(call_once) + return; } void ArithmeticUnitTypeInfo::addInfoObject(ArithmeticUnitTypeInfo const& info) diff --git a/Tensile/Source/lib/source/ContractionProblem.cpp b/Tensile/Source/lib/source/ContractionProblem.cpp index 6563b0f80..832a6852d 100644 --- a/Tensile/Source/lib/source/ContractionProblem.cpp +++ b/Tensile/Source/lib/source/ContractionProblem.cpp @@ -1261,9 +1261,9 @@ namespace Tensile , batchB(_batchB) , batchC(_batchC) , batchD(_batchD) + , ws(_ws) , alpha(_alpha) , beta(_beta) - , ws(_ws) { } diff --git a/Tensile/Source/lib/source/ContractionSolution.cpp b/Tensile/Source/lib/source/ContractionSolution.cpp index cc8d18703..12d65b79c 100644 --- a/Tensile/Source/lib/source/ContractionSolution.cpp +++ b/Tensile/Source/lib/source/ContractionSolution.cpp @@ -293,12 +293,12 @@ namespace Tensile rv.numWorkGroups.z *= problem.batchSize(i); } - rv.numWorkGroups.x = CeilDivide(rv.numWorkGroups.x, sizeMapping.macroTile.x); - rv.numWorkGroups.y = CeilDivide(rv.numWorkGroups.y, sizeMapping.macroTile.y); - if(problem.transposeC01()) std::swap(rv.numWorkGroups.x, rv.numWorkGroups.y); + rv.numWorkGroups.x = CeilDivide(rv.numWorkGroups.x, sizeMapping.macroTile.x); + rv.numWorkGroups.y = CeilDivide(rv.numWorkGroups.y, sizeMapping.macroTile.y); + uint32_t problemNumGroupTiles0 = rv.numWorkGroups.x; uint32_t problemNumGroupTiles1 = rv.numWorkGroups.y; // used only when persistent kernel along batch @@ -995,10 +995,9 @@ namespace Tensile auto cInfo = DataTypeInfo::Get(problemType.cType); auto dInfo = DataTypeInfo::Get(problemType.dType); - double l2ReadBwMultiplier = perf.l2ReadBwMul; - spm.memReadBytesA = (NumBatches * M * N * K) / MT1 * aInfo.elementSize; - spm.memReadBytesB = (NumBatches * M * N * K) / MT0 * bInfo.elementSize; - spm.memReadBytesC = (NumBatches * M * N) * betaReads * cInfo.elementSize; + spm.memReadBytesA = (NumBatches * M * N * K) / MT1 * aInfo.elementSize; + spm.memReadBytesB = (NumBatches * M * N * K) / MT0 * bInfo.elementSize; + spm.memReadBytesC 
= (NumBatches * M * N) * betaReads * cInfo.elementSize; if(GlobalSplitU == 1) spm.memWriteBytesD = (NumBatches * M * N) * (1 + betaWrites) * dInfo.elementSize; @@ -1017,17 +1016,6 @@ namespace Tensile + spm.memReadBytesC / cInfo.elementSize; spm.memGlobalWrites = spm.memWriteBytesD / dInfo.elementSize; - double readEfficiency = perf.readEff; - double l2ReadHit = perf.l2ReadHitRate; - double l2WriteHit = perf.l2WriteHitRate; - double frequency = perf.clock; - double memFrequency = perf.memClock; - double memBandwidthMBps = perf.memBandwidthMBps; - double l2BandwidthMBps = perf.memBandwidthMBps * perf.l2ReadBwMul; - double peakMFlops = perf.peakGFlops * 1000.0; - - double flops = 2.0 * l2ReadBwMultiplier * NumBatches * M * N * K; - return spm; } diff --git a/Tensile/Source/lib/source/DataTypes.cpp b/Tensile/Source/lib/source/DataTypes.cpp index 82a600fef..c6dd94fa7 100644 --- a/Tensile/Source/lib/source/DataTypes.cpp +++ b/Tensile/Source/lib/source/DataTypes.cpp @@ -127,6 +127,10 @@ namespace Tensile void DataTypeInfo::registerAllTypeInfoOnce() { static int call_once = (registerAllTypeInfo(), 0); + + // Use the variable to quiet the compiler. + if(call_once) + return; } void DataTypeInfo::addInfoObject(DataTypeInfo const& info) diff --git a/Tensile/Source/lib/source/KernelLanguageTypes.cpp b/Tensile/Source/lib/source/KernelLanguageTypes.cpp index 6ff876e55..10e8cc24e 100644 --- a/Tensile/Source/lib/source/KernelLanguageTypes.cpp +++ b/Tensile/Source/lib/source/KernelLanguageTypes.cpp @@ -92,6 +92,10 @@ namespace Tensile void KernelLanguageTypeInfo::registerAllTypeInfoOnce() { static int call_once = (registerAllTypeInfo(), 0); + + // Use the variable to quiet the compiler. 
+ if(call_once) + return; } void KernelLanguageTypeInfo::addInfoObject(KernelLanguageTypeInfo const& info) diff --git a/Tensile/Source/lib/source/TensorOps.cpp b/Tensile/Source/lib/source/TensorOps.cpp index bfff7a42b..390b93ef8 100644 --- a/Tensile/Source/lib/source/TensorOps.cpp +++ b/Tensile/Source/lib/source/TensorOps.cpp @@ -71,6 +71,9 @@ namespace Tensile if(iter == typeNames.end()) throw std::runtime_error(concatenate("Invalid TensorOp type: ", name)); + // Use variable to quiet the compiler. + if(call_once) + return iter->second; return iter->second; } diff --git a/Tensile/Tensile.py b/Tensile/Tensile.py index bd73d1a7c..93802b237 100644 --- a/Tensile/Tensile.py +++ b/Tensile/Tensile.py @@ -1,5 +1,5 @@ ################################################################################ -# Copyright 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,8 +30,8 @@ assignGlobalParameters, restoreDefaultGlobalParameters, HR from . import BenchmarkProblems from . import ClientWriter -from . import LibraryLogic from . import LibraryIO +from . import LibraryLogic from . 
import __version__ ############################################################################### @@ -215,7 +215,7 @@ def Tensile(userArgs): globalParameters['LibraryFormat'] = args.LibraryFormat # read config - config = LibraryIO.readConfig( configPath ) + config = LibraryIO.readYAML( configPath ) globalParameters["ConfigPath"] = configPath # assign global parameters @@ -281,4 +281,3 @@ def TensileSGEMM5760(): # installed "tensile" command def main(): Tensile(sys.argv[1:]) - diff --git a/Tensile/TensileBenchmarkCluster.py b/Tensile/TensileBenchmarkCluster.py index 7d2f039ad..3ba7a9bf0 100644 --- a/Tensile/TensileBenchmarkCluster.py +++ b/Tensile/TensileBenchmarkCluster.py @@ -1,3 +1,24 @@ +################################################################################ +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################ + import shlex, subprocess import sys import os diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py index 897e25e07..80806fc03 100644 --- a/Tensile/TensileCreateLibrary.py +++ b/Tensile/TensileCreateLibrary.py @@ -267,18 +267,32 @@ def prepAsmNewClient(kernelWriterAssembly): assemblerFile.write("copy %1.o %1.co\n") else: assemblerFile.write("#!/bin/sh {log}\n".format(log = "-x" if globalParameters["PrintLevel"] >=2 else "")) - assemblerFile.write("# usage: asm-new.sh kernelName(no extension)\n") + assemblerFile.write("# usage: asm-new.sh kernelName(no extension) [--wave32]\n") assemblerFile.write("f=$1\n") assemblerFile.write("shift\n") + assemblerFile.write('if [ ! -z "$1" ] && [ "$1" = "--wave32" ]; then\n') + assemblerFile.write(" wave=32\n") + assemblerFile.write(" shift\n") + assemblerFile.write("else\n") + assemblerFile.write(" wave=64\n") + assemblerFile.write("fi\n") + isa = globalParameters["CurrentISA"] assemblerFile.write("h={gfxName}\n".format(gfxName = Common.gfxName(isa))) - cArgs = kernelWriterAssembly.getCompileArgs("$f.s", "$f.o", useGlobalISA=True) + cArgs32 = kernelWriterAssembly.getCompileArgs("$f.s", "$f.o", isa=isa, wavefrontSize=32) + cArgs64 = kernelWriterAssembly.getCompileArgs("$f.s", "$f.o", isa=isa, wavefrontSize=64) lArgs = kernelWriterAssembly.getLinkCodeObjectArgs(["$f.o"], "$f.co") - assemblerFile.write(" ".join(cArgs) + "\n") + assemblerFile.write("if [ $wave -eq 32 ]; then\n") + assemblerFile.write(" ".join(cArgs32) + "\n") + assemblerFile.write("else\n") + assemblerFile.write(" ".join(cArgs64) + "\n") + assemblerFile.write("fi\n") + + assemblerFile.write(" ".join(lArgs) + "\n") assemblerFile.write("cp $f.co ../../../library/${f}_$h.co\n") @@ -1253,7 +1267,7 @@ def writeBenchmarkClientFiles(libraryWorkingPath, tensileSourcePath, solutions, newLibrary = MasterSolutionLibrary.BenchmarkingLibrary(solutions) 
newLibrary.applyNaming(kernelMinNaming) - LibraryIO.YAMLWriter().write(newLibraryFile, Utils.state(newLibrary)) + LibraryIO.writeYAML(newLibraryFile, Utils.state(newLibrary)) return (codeObjectFiles, newLibrary) @@ -1273,7 +1287,7 @@ def WriteClientLibraryFromSolutions(solutionList, libraryWorkingPath, tensileSou mataDataFilePath = os.path.join(effectiveWorkingPath, 'metadata.yaml') metaData = {"ProblemType":problemType} - LibraryIO.YAMLWriter().write(mataDataFilePath, metaData) + LibraryIO.writeYAML(mataDataFilePath, metaData) codeObjectFiles, newLibrary = writeBenchmarkClientFiles(libraryWorkingPath, tensileSourcePath, solutionList, cxxCompiler ) @@ -1309,7 +1323,7 @@ def splitExtraParameters(par): argParser.add_argument("--cxx-compiler", dest="CxxCompiler", choices=["hipcc"], action="store", default="hipcc") argParser.add_argument("--cmake-cxx-compiler", dest="CmakeCxxCompiler", action="store") argParser.add_argument("--code-object-version", dest="CodeObjectVersion", choices=["V2", "V3"], action="store", default="V3") - argParser.add_argument("--architecture", dest="Architecture", choices=["all", "gfx000", "gfx803", "gfx900", "gfx906:xnack-", "gfx908:xnack-", "gfx90a:xnack+", "gfx90a:xnack-"], action="store", default="all") + argParser.add_argument("--architecture", dest="Architecture", type=str, action="store", default="all", help="Supported archs: " + " ".join(architectureMap.keys())) argParser.add_argument("--merge-files", dest="MergeFiles", action="store_true") argParser.add_argument("--no-merge-files", dest="MergeFiles", action="store_false") argParser.add_argument("--short-file-names", dest="ShortNames", action="store_true") @@ -1392,17 +1406,21 @@ def splitExtraParameters(par): if not os.path.exists(logicPath): printExit("LogicPath %s doesn't exist" % logicPath) - for key in architectureMap: - if arguments["Architecture"] == key: - arguments["Architecture"] = architectureMap[key] + archs = arguments["Architecture"].split(";") + logicArchs = set() + for 
arch in archs: + if arch in architectureMap: + logicArchs.add(architectureMap[arch]) + else: + printExit("Architecture %s not supported" % arch) # Recursive directory search logicFiles = [] for root, dirs, files in os.walk(logicPath): logicFiles += [os.path.join(root, f) for f in files if os.path.splitext(f)[1]==".yaml" \ - and arguments["Architecture"] in os.path.splitext(f)[0] \ - or "hip" in os.path.splitext(f)[0] ] + and (any(logicArch in os.path.splitext(f)[0] for logicArch in logicArchs) \ + or "hip" in os.path.splitext(f)[0]) ] print1("# LibraryLogicFiles:" % logicFiles) for logicFile in logicFiles: @@ -1414,7 +1432,7 @@ def splitExtraParameters(par): solutions = [] logicData = {} # keys are problemTypes, values are schedules - libraries = Common.ParallelMap(LibraryIO.readLibraryLogicForSchedule, logicFiles, "Reading logic files") + libraries = Common.ParallelMap(LibraryIO.parseLibraryLogicFile, logicFiles, "Reading logic files") masterLibraries = {} fullMasterLibrary = None @@ -1422,35 +1440,28 @@ def splitExtraParameters(par): (scheduleName, deviceNames, problemType, solutionsForSchedule, \ indexOrder, exactLogic, rangeLogic, newLibrary, architectureName) = logic - if not globalParameters["PackageLibrary"]: - if fullMasterLibrary is None: - fullMasterLibrary = deepcopy(newLibrary) - fullMasterLibrary.version = args.version - else: - fullMasterLibrary.merge(deepcopy(newLibrary)) - if globalParameters["PackageLibrary"]: if architectureName in masterLibraries: masterLibraries[architectureName].merge(deepcopy(newLibrary)) else: masterLibraries[architectureName] = deepcopy(newLibrary) + masterLibraries[architectureName].version = args.version + else: + if fullMasterLibrary is None: + fullMasterLibrary = deepcopy(newLibrary) + fullMasterLibrary.version = args.version + else: + fullMasterLibrary.merge(deepcopy(newLibrary)) if problemType not in logicData: logicData[problemType] = [] logicData[problemType].append((scheduleName, deviceNames, \ solutionsForSchedule, 
indexOrder, exactLogic, rangeLogic )) + for solution in solutionsForSchedule: if solution not in solutions: solutions.append(solution) - if globalParameters["PackageLibrary"]: - if architectureName in masterLibraries: - masterLibraries[architectureName].merge(deepcopy(newLibrary)) - else: - masterLibraries[architectureName] = deepcopy(newLibrary) - masterLibraries[architectureName].version = args.version - - kernels, kernelHelperOjbs, _ = generateKernelObjectsFromSolutions(solutions) # if any kernels are assembly, append every ISA supported @@ -1523,28 +1534,28 @@ def splitExtraParameters(par): if globalParameters["AsmCaps"][arch]["SupportedISA"]] newLibraryDir = ensurePath(os.path.join(outputPath, 'library')) - libraryWriter = LibraryIO.configWriter(args.LibraryFormat) - tensileLibraryFilename = "TensileLibrary.yaml" if args.LibraryFormat == "yaml" \ - else "TensileLibrary.dat" if globalParameters["PackageLibrary"]: for archName, newMasterLibrary in masterLibraries.items(): if (archName in archs): archPath = ensurePath(os.path.join(newLibraryDir, archName)) - masterFile = os.path.join(archPath, tensileLibraryFilename) + masterFile = os.path.join(archPath, "TensileLibrary") newMasterLibrary.applyNaming(kernelMinNaming) - libraryWriter.write(masterFile, Utils.state(newMasterLibrary)) + LibraryIO.write(masterFile, Utils.state(newMasterLibrary), args.LibraryFormat) else: - masterFile = os.path.join(newLibraryDir, tensileLibraryFilename) + masterFile = os.path.join(newLibraryDir, "TensileLibrary") fullMasterLibrary.applyNaming(kernelMinNaming) - libraryWriter.write(masterFile, Utils.state(fullMasterLibrary)) + LibraryIO.write(masterFile, Utils.state(fullMasterLibrary), args.LibraryFormat) theMasterLibrary = fullMasterLibrary if globalParameters["PackageLibrary"]: theMasterLibrary = list(masterLibraries.values())[0] + if args.EmbedLibrary is not None: embedFileName = os.path.join(outputPath, "library/{}.cpp".format(args.EmbedLibrary)) with 
EmbeddedData.EmbeddedDataFile(embedFileName) as embedFile: - embedFile.embed_file(theMasterLibrary.cpp_base_class, masterFile, nullTerminated=True, + + ext = ".yaml" if globalParameters["LibraryFormat"] == "yaml" else ".dat" + embedFile.embed_file(theMasterLibrary.cpp_base_class, masterFile + ext, nullTerminated=True, key=args.EmbedLibraryKey) for co in Utils.tqdm(codeObjectFiles): diff --git a/Tensile/TensileRetuneLibrary.py b/Tensile/TensileRetuneLibrary.py new file mode 100644 index 000000000..d257fa688 --- /dev/null +++ b/Tensile/TensileRetuneLibrary.py @@ -0,0 +1,173 @@ +############################################################################### +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +from . import BenchmarkProblems +from . import ClientExecutable +from . import ClientWriter +from . 
import LibraryIO +from . import Common +from .Common import globalParameters, print1, printExit, ensurePath, assignGlobalParameters, \ + pushWorkingPath, popWorkingPath, restoreDefaultGlobalParameters, HR +from .Tensile import addCommonArguments +from .SolutionStructs import ProblemSizes +from . import __version__ + +import argparse +import copy +import os +import shutil +import sys + + +def parseCurrentLibrary(libPath): + libYaml = LibraryIO.readYAML(libPath) + # parseLibraryLogicData mutates the original data, so make a copy + fields = LibraryIO.parseLibraryLogicData(copy.deepcopy(libYaml), libPath) + (_, _, problemType, solutions, _, exactLogic, _, _, _) = fields + + # get performance metric + if len(libYaml) > 10: + Common.globalParameters["PerformanceMetric"] = libYaml[10] + + # process exactLogic into ProblemSizes + sizes = [] + for (size, _) in exactLogic: + sizes.append({"Exact": size}) + problemSizes = ProblemSizes(problemType, sizes) + + return (libYaml, solutions, problemSizes) + + +def runBenchmarking(solutions, problemSizes, outPath): + # TODO some copy-pasting from BenchmarkProblems.benchmarkProblemType + # could use a refactor to elimate duplicated code + ClientExecutable.getClientExecutable() + + shortName = "benchmark" + benchmarkDir = os.path.join(outPath, shortName) + sourceDir = os.path.join(benchmarkDir, "source") + ensurePath(sourceDir) + + pushWorkingPath(shortName) + pushWorkingPath("source") + + filesToCopy = [ + "SolutionMapper.h", + "Client.cpp", + "Client.h", + "CMakeLists.txt", + "DeviceStats.h", + "TensorUtils.h", + "MathTemplates.cpp", + "MathTemplates.h", + "TensileTypes.h", + "tensile_bfloat16.h", + "KernelHeader.h", + "ReferenceCPU.h", + "SolutionHelper.cpp", + "SolutionHelper.h", + "Tools.cpp", + "Tools.h", + ] + + for f in filesToCopy: + shutil.copy( + os.path.join(globalParameters["SourcePath"], f), + globalParameters["WorkingPath"] ) + if globalParameters["RuntimeLanguage"] == "OCL": + shutil.copy( + 
os.path.join(globalParameters["SourcePath"], "FindOpenCL.cmake"), + globalParameters["WorkingPath"] ) + else: + shutil.copy( + os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"), + globalParameters["WorkingPath"] ) + + # make directory for results and set update yaml file + resultsDir = os.path.normpath(os.path.join(globalParameters["WorkingPath"], "../../Data")) + ensurePath(resultsDir) + updateFile = os.path.join(resultsDir, "update.yaml") + Common.globalParameters["LibraryUpdateFile"] = updateFile + + BenchmarkProblems.writeBenchmarkFiles(benchmarkDir, solutions, problemSizes, shortName, filesToCopy, []) + + popWorkingPath() # source + + libraryLogicPath = None + forBenchmark = True + # TODO make this work with TileAware selection + returncode = ClientWriter.runClient(libraryLogicPath, forBenchmark, False) + + if returncode: + printExit("BenchmarkProblems: Benchmark Process exited with code %u" % returncode) + + return updateFile + + +def TensileRetuneLibrary(userArgs): + print1("") + print1(HR) + print1("#") + print1("# Tensile Retune Library v{}".format(__version__)) + + # setup argument parsing + argParser = argparse.ArgumentParser() + argParser.add_argument("library_file", type=os.path.realpath, help="library logic file to retune") + argParser.add_argument("output_path", help="path where to conduct benchmark") + addCommonArguments(argParser) + args = argParser.parse_args(userArgs) + + libPath = args.library_file + + print1("# Library Logic: {}".format(libPath)) + print1("#") + print1(HR) + print1("") + + # setup global parameters + outPath = ensurePath(os.path.abspath(args.output_path)) + restoreDefaultGlobalParameters() + assignGlobalParameters({"LibraryFormat": "msgpack", + "OutputPath": outPath, + "WorkingPath": outPath}) + + # run main steps + (rawYaml, solutions, problemSizes) = parseCurrentLibrary(libPath) + updateFile = runBenchmarking(solutions, problemSizes, outPath) + + # read update yaml from benchmark client and update logic + 
print1("") + print1(HR) + print1("# Reading update file from Benchmarking Client") + updateLogic = LibraryIO.readYAML(updateFile) + rawYaml[7] = updateLogic + + # write updated library logic (does not overwrite original) + libName = os.path.basename(libPath) + outFile = os.path.join(outPath, libName) + + print1("# Writing updated Library Logic: {}".format(outFile)) + LibraryIO.writeYAML(outFile, rawYaml, explicit_start=False, explicit_end=False) + print(HR) + + +def main(): + TensileRetuneLibrary(sys.argv[1:]) diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py b/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py index a1f09f0b8..1b38b90fd 100644 --- a/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py +++ b/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py @@ -19,7 +19,7 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ -import logging,pytest +import logging from Tensile.SolutionStructs import Convolution log =logging.getLogger("testlog") @@ -27,6 +27,7 @@ def test_nhwc_defaults(tensile_state, run_convolution_level): z={} # problemType definition conv = Convolution(z, 'ConvolutionForward', config={'TensorAFormat': 'NHWC', + 'TensorBFormat': 'KYXC', }) log.debug(conv.printUsage(z)) if not tensile_state.args["no_conv_assertions"]: @@ -37,16 +38,13 @@ def test_nhwc_defaults(tensile_state, run_convolution_level): assert(conv.solutionParms["AssertStrideAEqual"] == {0:1}) assert(conv.solutionParms["AssertStrideBEqual"] == {0:1,2:0}) assert(conv.solutionParms["AssertSizeEqual"] == {}) + run_convolution_level.func(conv, z, run_convolution_level.solution) - solutionName = run_convolution_level.solution.__name__ - if solutionName.startswith("asm"): - pytest.skip("bug with asm NHWC") - #run_convolution_level.func(conv, z, run_convolution_level.solution) - -def test_nhwc_filter2x2(tensile_state, 
run_convolution_level): +def test_nhwc_filter3x2(tensile_state, run_convolution_level): z={} # problemType definition conv = Convolution(z, 'ConvolutionForward', config={'TensorAFormat': 'NHWC', + 'TensorBFormat': 'KYXC', 'Filter': '3x2', }) log.debug(conv.printUsage(z)) @@ -55,14 +53,73 @@ def test_nhwc_filter2x2(tensile_state, run_convolution_level): cdim = 5 if conv.unrollOnChannel else 3 assert(z['NumIndicesC']==3) assert(z['IndexAssignmentsA']==[cdim] + filterDims + [1, 2]) - assert(z['IndexAssignmentsB']==filterDims + [cdim, 0, 2]) + assert(z['IndexAssignmentsB']==[cdim] + filterDims + [0, 2]) assert(not z['UseInitialStridesAB']) assert(conv.solutionParms["AssertStrideAEqual"] == {0:1}) assert(conv.solutionParms["AssertStrideBEqual"] == {0:1,4:0}) assert(conv.solutionParms["AssertSizeEqual"] == {filterDims[0]:2, filterDims[1]:3}) - #skip since bug in asm output swap required by NHWC, impacts both source and asm - solutionName = run_convolution_level.solution.__name__ - if solutionName.startswith("asm"): - pytest.skip("bug with asm NHWC") - #run_convolution_level.func(conv, z, run_convolution_level.solution) + run_convolution_level.func(conv, z, run_convolution_level.solution) + +def test_nhwc_filter2x2_dilation(tensile_state, run_convolution_level): + z={} # problemType definition + conv = Convolution(z, 'ConvolutionForward', + config={'TensorAFormat': 'NHWC', + 'TensorBFormat': 'KYXC', + 'Filter': '2x2', + 'Dilation': '2x2', + }) + log.debug(conv.printUsage(z)) + if not tensile_state.args["no_conv_assertions"]: + filterDims = [4,3] if conv.unrollOnChannel else [5,4] + cdim = 5 if conv.unrollOnChannel else 3 + assert(z['NumIndicesC']==3) + assert(z['IndexAssignmentsA']==[cdim] + filterDims + [1, 2]) + assert(z['IndexAssignmentsB']==[cdim] + filterDims + [0, 2]) + assert(not z['UseInitialStridesAB']) + assert(conv.solutionParms["AssertStrideAEqual"] == {0:1}) + assert(conv.solutionParms["AssertStrideBEqual"] == {0:1,4:0}) + 
assert(conv.solutionParms["AssertSizeEqual"] == {filterDims[0]:2, filterDims[1]:2}) + run_convolution_level.func(conv, z, run_convolution_level.solution) + +def test_nhwc_filter2x2_stride(tensile_state, run_convolution_level): + z={} # problemType definition + conv = Convolution(z, 'ConvolutionForward', + config={'TensorAFormat': 'NHWC', + 'TensorBFormat': 'KYXC', + 'Filter': '2x2', + 'Stride': '2x2', + }) + log.debug(conv.printUsage(z)) + if not tensile_state.args["no_conv_assertions"]: + filterDims = [4,3] if conv.unrollOnChannel else [5,4] + cdim = 5 if conv.unrollOnChannel else 3 + assert(z['NumIndicesC']==3) + assert(z['IndexAssignmentsA']==[cdim] + filterDims + [1, 2]) + assert(z['IndexAssignmentsB']==[cdim] + filterDims + [0, 2]) + assert(not z['UseInitialStridesAB']) + assert(conv.solutionParms["AssertStrideAEqual"] == {0:1}) + assert(conv.solutionParms["AssertStrideBEqual"] == {0:1,4:0}) + assert(conv.solutionParms["AssertSizeEqual"] == {filterDims[0]:2, filterDims[1]:2}) + run_convolution_level.func(conv, z, run_convolution_level.solution) +def test_nhwc_filter3x3_pad(tensile_state, run_convolution_level): + z={} # problemType definition + conv = Convolution(z, 'ConvolutionForward', + config={'TensorAFormat': 'NHWC', + 'TensorBFormat': 'KYXC', + 'Filter': '3x3', + 'PadStart': '1x1', + 'PadEnd': '1x1', + }) + log.debug(conv.printUsage(z)) + if not tensile_state.args["no_conv_assertions"]: + filterDims = [4,3] if conv.unrollOnChannel else [5,4] + cdim = 5 if conv.unrollOnChannel else 3 + assert(z['NumIndicesC']==3) + assert(z['IndexAssignmentsA']==[cdim] + filterDims + [1, 2]) + assert(z['IndexAssignmentsB']==[cdim] + filterDims + [0, 2]) + assert(not z['UseInitialStridesAB']) + assert(conv.solutionParms["AssertStrideAEqual"] == {0:1}) + assert(conv.solutionParms["AssertStrideBEqual"] == {0:1,4:0}) + assert(conv.solutionParms["AssertSizeEqual"] == {filterDims[0]:2, filterDims[1]:2}) + run_convolution_level.func(conv, z, run_convolution_level.solution) diff 
--git a/Tensile/Tests/integration/test_integration.py b/Tensile/Tests/integration/test_integration.py index 6fb16f377..5130b27b8 100644 --- a/Tensile/Tests/integration/test_integration.py +++ b/Tensile/Tests/integration/test_integration.py @@ -83,7 +83,7 @@ def str2bool(mergeFiles, shortNames, legacyComponents): @pytest.mark.parametrize("testYamls", ["quick", "pre_checkin"]) @pytest.mark.parametrize("mergeFiles", ["mergeFiles", "noMergeFiles"]) -@pytest.mark.parametrize("libraryFormat", ["yaml", pytest.param("msgpack", marks=pytest.mark.xfail)]) +@pytest.mark.parametrize("libraryFormat", ["yaml", "msgpack"]) @pytest.mark.parametrize("shortNames", ["shortNames", "noShortName"]) @pytest.mark.parametrize("legacyComponents", ["legacyComponents", "noLegacyComponents"]) def test_integration(useGlobalParameters, builddir, getLogicFileDir, @@ -124,7 +124,7 @@ def test_integration(useGlobalParameters, builddir, getLogicFileDir, clientParametersPaths = [] isaStr = "".join([str(e) for e in Common.globalParameters["CurrentISA"]]) for logicFileName in logicFiles: - (scheduleName, _, problemType, _, _, exactLogic, _, newLibrary, archName) = LibraryIO.readLibraryLogicForSchedule(logicFileName) + (scheduleName, _, problemType, _, _, exactLogic, _, newLibrary, archName) = LibraryIO.parseLibraryLogicFile(logicFileName) problemSizes = ProblemSizesMock(random.sample(exactLogic, min(len(exactLogic), 16))) # sample at most 16 problems if isaStr in archName: clientParametersPaths.append(ClientWriter.writeClientConfig( @@ -143,4 +143,4 @@ def test_integration(useGlobalParameters, builddir, getLogicFileDir, enableTileSelection = False returncode = ClientWriter.runClient(logicFileDir, forBenchmark, enableTileSelection, clientParametersPaths) - assert(returncode == 0) \ No newline at end of file + assert(returncode == 0) diff --git a/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml b/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml index 94d17e4a1..356fb436e 100644 --- 
a/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml +++ b/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml b/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml index 17d6fb255..9fa303855 100644 --- a/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml +++ b/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumBenchmarks: 3 # we need to run it a few more times to repro the waitcnt bug diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml index 8b27fd10f..630d43530 100644 --- a/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml b/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml index 2338257bd..1b8c54088 100644 --- a/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml +++ b/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, 
skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml index ee2e35ce2..b4938e5a9 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml index 310118c99..ceefe66de 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml index e60022c83..8fdfd8695 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml 
index d549d12ed..dd3eef031 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml index 034db02e4..47ac72886 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml index e0938c27c..da8c19e41 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml index e3d43ee0f..34ecf42a7 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, 
skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 @@ -43,6 +43,7 @@ BenchmarkProblems: - ScheduleIterAlg: [3] - VectorWidth: [4, 8] - 1LDSBuffer: [0, 1] + - StaggerU: [0,32] - PersistentKernel: [0, 1] - PersistentKernelAlongBatch: [False] - PrefetchAcrossPersistent: [0, 1] diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml index 73b5ad942..4bec1d80b 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml b/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml index 6f05233e9..43d637304 100644 --- a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 2048 @@ -331,4 +331,4 @@ BenchmarkProblems: - Exact: [128, 64, 1, 3328] - Exact: [448, 64, 1, 256] - Exact: [6784, 3584, 1, 1280] - - Exact: [1024, 6784, 1, 1280] \ No newline at end of file + - Exact: [1024, 6784, 1, 1280] diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml index 53f022a48..b8f7e2b0e 100644 --- a/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml @@ -1,5 
+1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml index 0c03b547d..15f57f5f4 100644 --- a/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml index 5746f6499..f0931ff0b 100644 --- a/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: 65536 @@ -42,6 +42,7 @@ BenchmarkProblems: - ScheduleIterAlg: [3] - VectorWidth: [4, 8] - 1LDSBuffer: [0, 1] + - StaggerU: [0,32] - PersistentKernel: [0, 1] - PersistentKernelAlongBatch: [False] - PrefetchAcrossPersistent: [0, 1] diff --git a/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml b/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml index 68d5824e8..8f8853d10 100644 --- a/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml +++ b/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml @@ -1,5 +1,5 @@ TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx1010, 
skip-gfx1011] # not supported by arch + marks: [skip-gfx900, skip-gfx906, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch GlobalParameters: NumElementsToValidate: -1 diff --git a/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml b/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml index 71eb4e72a..391c130e4 100644 --- a/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml +++ b/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml @@ -33,6 +33,7 @@ BenchmarkProblems: - [ 8, 8, 1 ] - [ 32, 32, 1 ] - DepthU: [8] + - StaggerU: [0,32] - PersistentKernel: [1,2] - PersistentKernelAlongBatch: [False,True] - PrefetchAcrossPersistent: [False,True] diff --git a/Tensile/Tests/unit/test_Component.py b/Tensile/Tests/unit/test_Component.py index 435d1ec7a..0ec318b07 100644 --- a/Tensile/Tests/unit/test_Component.py +++ b/Tensile/Tests/unit/test_Component.py @@ -71,7 +71,11 @@ def vega10(): 'v_dot2c_f32_f16': False, 'v_dot2_f32_f16': False, "v_mad_mix_f32": True, - "v_fma_mix_f32": False} + "v_fma_mix_f32": False, + "v_mac_f32": True, + "v_fma_f32": True, + "v_fmac_f32": False, + } } @pytest.fixture @@ -82,7 +86,10 @@ def navi10(): 'v_dot2c_f32_f16': False, 'v_dot2_f32_f16': False, "v_mad_mix_f32": False, - "v_fma_mix_f32": True} + "v_fma_mix_f32": True, + "v_mac_f32": True, + "v_fma_f32": True, + "v_fmac_f32": True} } @pytest.fixture @@ -93,7 +100,10 @@ def navi12(): 'v_dot2c_f32_f16': True, 'v_dot2_f32_f16': True, "v_mad_mix_f32": False, - "v_fma_mix_f32": True} + "v_fma_mix_f32": True, + "v_mac_f32": True, + "v_fma_f32": True, + "v_fmac_f32": True} } @pytest.fixture @@ -145,13 +155,13 @@ def test_find(navi10, f16): writer = MockWriter(**navi10, **f16) found = Component.MAC.find(writer) - assert isinstance(found, Components.MAC_F16.FMA_NonPacked) + assert isinstance(found, Components.MAC_F16.FMA_F16_NonPacked) def test_find2(vega10, f16_hpa): writer = MockWriter(**vega10, **f16_hpa) found = 
Component.MAC.find(writer) - assert isinstance(found, Components.MAC_F16_HPA.FMA_HPA_MAD_MIX) + assert isinstance(found, Components.MAC_F16_HPA.FMA_F16_HPA_MAD_MIX) def test_MAC_F16_FMA_NonPacked(navi10, f16): writer = MockWriter(**navi10, **f16) @@ -161,7 +171,7 @@ def test_MAC_F16_FMA_NonPacked(navi10, f16): print(kernelText) def test_componentPath(): - assert Components.MAC_F16.FMA_NonPacked.componentPath() == ["Component", "MAC", "FMA_NonPacked"] + assert Components.MAC_F16.FMA_F16_NonPacked.componentPath() == ["Component", "MAC", "FMA_F16_NonPacked"] def test_find_macs(useGlobalParameters, f16, f16_hpa, f16_hpa_ldl): with useGlobalParameters() as globals: diff --git a/Tensile/Tests/unit/test_LibraryIO.py b/Tensile/Tests/unit/test_LibraryIO.py new file mode 100644 index 000000000..519f39709 --- /dev/null +++ b/Tensile/Tests/unit/test_LibraryIO.py @@ -0,0 +1,123 @@ +################################################################################ +# Copyright 2020-2021 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +import os +from copy import deepcopy + +import yaml + +from Tensile.__init__ import __version__ +import Tensile.LibraryIO as LibraryIO + +version = "- {MinimumRequiredVersion: " + __version__ + "}\n" + +vega20Prefix = version + r""" +- vega20 +- gfx906 +- [Device 66a0, Device 66a1, Device 66a7, Device 66af, Vega 20] +""" + +sizes = r""" +- DummyIndexAssignment +- - - [128, 128, 1, 128] + - [0, 80.0] + - - [512, 512, 1, 512] + - [1, 85.0] +""" + +dvEffLogicSuffix = r""" +- null +- null +- DeviceEfficiency +""" + +cuEffLogicSuffix = r""" +- null +- null +- CUEfficiency +""" + +legacyLogicSuffix = r""" +- null +""" + +def createLibraryLogic(suffix): + # paths to test data + scriptDir = os.path.dirname(os.path.realpath(__file__)) + dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) + problemTypePath = os.path.realpath( \ + os.path.join(dataDir, "library_data", "problemType.yaml")) + solutionParametersPath = os.path.realpath( \ + os.path.join(dataDir, "library_data", "initialSolutionParameters.yaml")) + + # read test data + problemType = LibraryIO.readYAML(problemTypePath)["ProblemType"] + solutionParameters = LibraryIO.readYAML(solutionParametersPath) + + # solutions + sol0 = deepcopy(solutionParameters) + sol0["SolutionIndex"] = 0 + sol0["SolutionNameMin"] = "foo" + sol0["ProblemType"] = problemType + + sol1 = deepcopy(solutionParameters) + sol1["SolutionIndex"] = 1 + sol1["SolutionNameMin"] = "bar" + sol0["ProblemType"] = problemType + + # other components + prefixData = yaml.load(vega20Prefix, yaml.SafeLoader) + sizeData = yaml.load(sizes, yaml.SafeLoader) + suffixData = yaml.load(suffix, 
yaml.SafeLoader) + + # combine all components + return prefixData + [problemType] + [[sol0, sol1]] + sizeData + suffixData + +def test_parseSolutionsData(useGlobalParameters): + with useGlobalParameters(): + # paths to test data + scriptDir = os.path.dirname(os.path.realpath(__file__)) + dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) + solutionsPath = os.path.realpath(os.path.join( \ + dataDir, "solutions", "solutions_nn_3.yaml")) + + solutions = LibraryIO.readYAML(solutionsPath) + + LibraryIO.parseSolutionsData(solutions, "test_parseSolutionsData") + assert True + +def test_parseLibraryLogicData_legacy(useGlobalParameters): + with useGlobalParameters(): + LibraryIO.parseLibraryLogicData(createLibraryLogic(legacyLogicSuffix), \ + "test_parseLibraryLogicData_legacy") + assert True + +def test_parseLibraryLogicData_dvEff(useGlobalParameters): + with useGlobalParameters(): + LibraryIO.parseLibraryLogicData(createLibraryLogic(dvEffLogicSuffix), \ + "test_parseLibraryLogicData_dvEff") + assert True + +def test_parseLibraryLogicData_cuEff(useGlobalParameters): + with useGlobalParameters(): + LibraryIO.parseLibraryLogicData(createLibraryLogic(cuEffLogicSuffix), \ + "test_parseLibraryLogicData_cuEff") + assert True diff --git a/Tensile/Tests/unit/test_ReplacementKernels.py b/Tensile/Tests/unit/test_ReplacementKernels.py index 5f9b88d93..a480dc267 100644 --- a/Tensile/Tests/unit/test_ReplacementKernels.py +++ b/Tensile/Tests/unit/test_ReplacementKernels.py @@ -27,7 +27,7 @@ def test_DefaultInstance(): assert ReplacementKernels.Get("asdf") is None - myReplacement = ReplacementKernels.Get("Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WG16_32_1_WGM8") + myReplacement = 
ReplacementKernels.Get("Cijk_Alik_Bljk_SB_MT64x128x32_SE_1LDSB0_APM1_AF0EM8_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM8_BL1_DTL0_DVO0_EPS1_FL0_GRVW4_GSU1_ISA908_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_RK1_SIA1_SS0_SU32_SUM0_SUS256_SRVW0_SVW4_SNLL0_TT4_4_TLDS0_USFGRO1_VAW1_VS1_VW4_WSGRA0_WSGRB0_WS64_WG16_32_1_WGM8") assert os.path.isfile(myReplacement) assert os.path.isabs(myReplacement) diff --git a/Tensile/Tests/unit/test_TensileCreateLibrary.py b/Tensile/Tests/unit/test_TensileCreateLibrary.py index 64a0a7f85..0b12c807f 100644 --- a/Tensile/Tests/unit/test_TensileCreateLibrary.py +++ b/Tensile/Tests/unit/test_TensileCreateLibrary.py @@ -1,5 +1,5 @@ ################################################################################ -# Copyright 2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright 2020-2021 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -53,64 +53,64 @@ def test_assignParameters(): assert hardcodedParameters != None assert initialSolutionParameters != None -def test_generateSolutions(): +def test_generateSolutions(useGlobalParameters): + with useGlobalParameters(): + scriptDir = os.path.dirname(os.path.realpath(__file__)) + dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) + problemTypeFilePath = os.path.join(dataDir, "library_data", "problemType.yaml") + hardcodedParametersFilePath = os.path.join(dataDir, "library_data", "hardcodedParameters.yaml") + initialSolutionParametersFilePath = os.path.join(dataDir, "library_data", "initialSolutionParameters.yaml") - scriptDir = os.path.dirname(os.path.realpath(__file__)) - dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) - problemTypeFilePath = os.path.join(dataDir, "library_data", "problemType.yaml") - hardcodedParametersFilePath = 
os.path.join(dataDir, "library_data", "hardcodedParameters.yaml") - initialSolutionParametersFilePath = os.path.join(dataDir, "library_data", "initialSolutionParameters.yaml") - - problemType = LibraryIO.readConfig(problemTypeFilePath)["ProblemType"] - problemTypeObject = SolutionStructs.ProblemType(problemType) - hardcodedParameters = LibraryIO.readConfig(hardcodedParametersFilePath) - initialSolutionParameters = LibraryIO.readConfig(initialSolutionParametersFilePath) + problemType = LibraryIO.readYAML(problemTypeFilePath)["ProblemType"] + problemTypeObject = SolutionStructs.ProblemType(problemType) + hardcodedParameters = LibraryIO.readYAML(hardcodedParametersFilePath) + initialSolutionParameters = LibraryIO.readYAML(initialSolutionParametersFilePath) - solutionList = BenchmarkProblems.generateForkedSolutions (problemTypeObject, hardcodedParameters, [initialSolutionParameters]) + solutionList = BenchmarkProblems.generateForkedSolutions (problemTypeObject, hardcodedParameters, [initialSolutionParameters]) - assert len(solutionList) == 2 + assert len(solutionList) == 2 -def test_loadSolutions(caplog): - - mylogger.debug("this is a test of debug log") - mylogger.info("this is some info") - scriptDir = os.path.dirname(os.path.realpath(__file__)) - dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) - solutionsFilePath = os.path.join(dataDir, "solutions", "solutions_nn_3.yaml") +def test_loadSolutions(caplog, useGlobalParameters): + with useGlobalParameters(): + mylogger.debug("this is a test of debug log") + mylogger.info("this is some info") + scriptDir = os.path.dirname(os.path.realpath(__file__)) + dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) + solutionsFilePath = os.path.join(dataDir, "solutions", "solutions_nn_3.yaml") - fileSolutions = LibraryIO.readSolutions(solutionsFilePath) - solutions = fileSolutions[1] - kernels, _, _ = TensileCreateLibrary.generateKernelObjectsFromSolutions(solutions) - assert 
len(solutions) == 3 - assert len(kernels) == 3 + fileSolutions = LibraryIO.parseSolutionsFile(solutionsFilePath) + solutions = fileSolutions[1] + kernels, _, _ = TensileCreateLibrary.generateKernelObjectsFromSolutions(solutions) + assert len(solutions) == 3 + assert len(kernels) == 3 - solutionWriter, _, kernelWriterAssembly, \ - _, _ = TensileCreateLibrary.getSolutionAndKernelWriters(solutions, kernels) + solutionWriter, _, kernelWriterAssembly, \ + _, _ = TensileCreateLibrary.getSolutionAndKernelWriters(solutions, kernels) - expectedSolutionName0 = "Cijk_Ailk_Bljk_SB_MT128x128x2_SE_TT8_8_WG16_16_1" - expectedSolutionName1 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_TT4_4_WG16_16_1" - expectedSolutionName2 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_TT4_8_WG16_8_1" + expectedSolutionName0 = "Cijk_Ailk_Bljk_SB_MT128x128x2_SE_TT8_8_WG16_16_1" + expectedSolutionName1 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_TT4_4_WG16_16_1" + expectedSolutionName2 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_TT4_8_WG16_8_1" - actualSolutionName0 = solutionWriter.getSolutionName(solutions[0]) - actualSolutionName1 = solutionWriter.getSolutionName(solutions[1]) - actualSolutionName2 = solutionWriter.getSolutionName(solutions[2]) + actualSolutionName0 = solutionWriter.getSolutionName(solutions[0]) + actualSolutionName1 = solutionWriter.getSolutionName(solutions[1]) + actualSolutionName2 = solutionWriter.getSolutionName(solutions[2]) - assert expectedSolutionName0 == actualSolutionName0 - assert expectedSolutionName1 == actualSolutionName1 - assert expectedSolutionName2 == actualSolutionName2 + assert expectedSolutionName0 == actualSolutionName0 + assert expectedSolutionName1 == actualSolutionName1 + assert expectedSolutionName2 == actualSolutionName2 - expectedKernelName0 = "Cijk_Ailk_Bljk_SB_MT128x128x2_SE_K1_TT8_8_WG16_16_1" - expectedKernelName1 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_K1_TT4_4_WG16_16_1" - expectedKernelName2 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_K1_TT4_8_WG16_8_1" + expectedKernelName0 = 
"Cijk_Ailk_Bljk_SB_MT128x128x2_SE_K1_TT8_8_WG16_16_1" + expectedKernelName1 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_K1_TT4_4_WG16_16_1" + expectedKernelName2 = "Cijk_Ailk_Bljk_SB_MT64x64x2_SE_K1_TT4_8_WG16_8_1" - actualKernelName0 = kernelWriterAssembly.getKernelName(kernels[0]) - actualKernelName1 = kernelWriterAssembly.getKernelName(kernels[1]) - actualKernelName2 = kernelWriterAssembly.getKernelName(kernels[2]) + actualKernelName0 = kernelWriterAssembly.getKernelName(kernels[0]) + actualKernelName1 = kernelWriterAssembly.getKernelName(kernels[1]) + actualKernelName2 = kernelWriterAssembly.getKernelName(kernels[2]) - assert expectedKernelName0 == actualKernelName0 - assert expectedKernelName1 == actualKernelName1 - assert expectedKernelName2 == actualKernelName2 + assert expectedKernelName0 == actualKernelName0 + assert expectedKernelName1 == actualKernelName1 + assert expectedKernelName2 == actualKernelName2 @pytest.mark.skip(reason="System issue with find assempler called when assigning defaults") def test_WriteClientLibraryFromSolutions(tmpdir): @@ -128,7 +128,7 @@ def test_WriteClientLibraryFromSolutions(tmpdir): dataDir = os.path.realpath(os.path.join(scriptDir, "..", "test_data", "unit")) solutionsFilePath = os.path.join(dataDir, "solutions", "solutions_nn_3.yaml") - fileSolutions = LibraryIO.readSolutions(solutionsFilePath) + fileSolutions = LibraryIO.parseSolutionsFile(solutionsFilePath) solutions = fileSolutions[1] Common.setWorkingPath(buildWorkingPath) @@ -183,7 +183,7 @@ def test_CreateBenchmarkClientParametersForSizes(tmpdir): metadataFilepath = os.path.join(libraryPath, "metadata.yaml") - metadataFile = LibraryIO.readConfig(metadataFilepath) + metadataFile = LibraryIO.readYAML(metadataFilepath) problemTypeDict = metadataFile["ProblemType"] sizes = [{"Exact": [196, 256, 64, 1024]}] problemSizes = SolutionStructs.ProblemSizes(problemTypeDict, sizes) @@ -193,7 +193,3 @@ def test_CreateBenchmarkClientParametersForSizes(tmpdir): 
ClientWriter.CreateBenchmarkClientParametersForSizes(testDataPath, problemSizes, dataFilePath, configFile) assert os.path.exists(configFile) == 1 - - - - diff --git a/Tensile/Tests/unit/test_mergeLogic.py b/Tensile/Tests/unit/test_mergeLogic.py index e3edf83d0..d70d56782 100644 --- a/Tensile/Tests/unit/test_mergeLogic.py +++ b/Tensile/Tests/unit/test_mergeLogic.py @@ -104,6 +104,92 @@ - [42, 999.9] """ +mfmaMergeBaseLogic=logicPrefix+r""" +- + - SolutionIndex: 0 + SolutionNameMin: MFMA_base + EnableMatrixInstruction: True + MatrixInstruction: [16, 16, 4, 1] + - SolutionIndex: 1 + SolutionNameMin: VALU_base + EnableMatrixInstruction: False + MatrixInstruction: [] +- DummyIndexAssignment +""" + +mfmaMergeIncLogic=logicPrefix+r""" +- + - SolutionIndex: 0 + SolutionNameMin: MFMA_inc + EnableMatrixInstruction: True + MatrixInstruction: [16, 16, 4, 1] + - SolutionIndex: 1 + SolutionNameMin: VALU_inc + EnableMatrixInstruction: False + MatrixInstruction: [] +- DummyIndexAssignment +""" + +mfmaMergeBaseSizes=r""" +- + - - [128, 128, 1, 128] + - [0, 3.0] + - - [128, 128, 1, 128] + - [1, 6.0] + - - [130, 128, 1, 128] + - [1, 9.0] + - - [131, 128, 1, 128] + - [0, 12.0] +""" + +mfmaMergeIncFasterSizes=r""" +- + - - [128, 128, 1, 128] + - [0, 4.0] + - - [128, 128, 1, 128] + - [1, 7.0] + - - [131, 128, 1, 128] + - [0, 13.0] + - - [130, 128, 1, 128] + - [1, 10.0] +""" + +mfmaMergeIncSlowerSizes=r""" +- + - - [128, 128, 1, 128] + - [0, 2.0] + - - [128, 128, 1, 128] + - [1, 5.0] + - - [131, 128, 1, 128] + - [0, 11.0] + - - [130, 128, 1, 128] + - [1, 8.0] +""" + +mfmaMergeIncNotMatchingMFMA=r""" +- + - - [130, 128, 1, 128] + - [0, 7.0] + - - [131, 128, 1, 128] + - [1, 12.0] +""" + +mfmaMergeResNotMatchingMFMA=r""" +- + - - [128, 128, 1, 128] + - [0, 3.0] + - - [128, 128, 1, 128] + - [1, 6.0] + - - [130, 128, 1, 128] + - [1, 9.0] + - - [131, 128, 1, 128] + - [0, 12.0] + - - [130, 128, 1, 128] + - [2, 7.0] + - - [131, 128, 1, 128] + - [3, 12.0] +""" + def 
checkUniqueSolution(solutionPool): uniq = set() # note: any([False or None, True or None]) -> True @@ -194,6 +280,41 @@ def test_checkUniqueSolution(input, expected): data = yaml.load(input, yaml.SafeLoader) assert checkUniqueSolution(data[5]) == expected +@pytest.mark.parametrize("baseLogic, incLogic, expectedSizesYaml, expectedSolutions", [ +# test case #1: Slower sizes in incremental logic file + (mfmaMergeBaseLogic+mfmaMergeBaseSizes, mfmaMergeIncLogic+mfmaMergeIncSlowerSizes, + mfmaMergeBaseSizes, ["MFMA_base", "VALU_base"]), +# test case #2: Faster sizes in incremental logic file + (mfmaMergeBaseLogic+mfmaMergeBaseSizes, mfmaMergeIncLogic+mfmaMergeIncFasterSizes, + mfmaMergeIncFasterSizes, ["MFMA_inc", "VALU_inc"]), +# test case #3: Test that VALU size is included alongside MFMA size, and vice versa (regardless of efficiency) + (mfmaMergeBaseLogic+mfmaMergeBaseSizes, mfmaMergeIncLogic+mfmaMergeIncNotMatchingMFMA, + mfmaMergeResNotMatchingMFMA, ["MFMA_base", "VALU_base", "MFMA_inc", "VALU_inc"]) +]) +def test_mfmaMergeLogic(baseLogic, incLogic, expectedSizesYaml, expectedSolutions): + baseData = yaml.load(baseLogic, yaml.SafeLoader) + incData = yaml.load(incLogic, yaml.SafeLoader) + expectedSizes = yaml.load(expectedSizesYaml, yaml.SafeLoader)[0] + + mergedData, _, _, _ = mergeLogic(baseData, incData, False, True, True) + + solutionIndices = {s['SolutionNameMin']: s['SolutionIndex'] for s in mergedData[5]} # size -> solutionName + + #Ensure all correct solutions are present in merged data + for solution in expectedSolutions: + assert solution in solutionIndices.keys() + + assert len(expectedSolutions) == len(mergedData[5]) + + #Convert expected sizes to use mergedData's solution indices + expectedSizes = [ [size, [solutionIndices[expectedSolutions[solIndex]], eff]] for size, [solIndex, eff] in expectedSizes ] + + #Ensure all expected sizes are present in merged data + for item in expectedSizes: + assert item in mergedData[7] + + assert len(expectedSizes) == 
len(mergedData[7]) + if __name__ == "__main__": # test_mergeLogic(baseLogic, incLogic, [1,2,2], [(1024, 1024, 1, 1024), (256, 256, 1, 256), (128, 128, 1, 128), (64, 64, 1, 64)], [ "InUseForSize256or1024xxx", "InUseForSize256or1024xxx", "InUseForSize128xxx", "InUseForSize128or64"]) # test_checkUniqueSolution(uniqueSolution, True) diff --git a/Tensile/Tests/yaml_only/test_config.py b/Tensile/Tests/yaml_only/test_config.py index 9c174c7a1..da4a6f2f7 100644 --- a/Tensile/Tests/yaml_only/test_config.py +++ b/Tensile/Tests/yaml_only/test_config.py @@ -12,6 +12,7 @@ ################################################################################ def isExe( filePath ): return os.path.isfile(filePath) and os.access(filePath, os.X_OK) + def locateExe( defaultPath, exeName ): # /opt/rocm/bin, hip-clang # look in path first for path in os.environ["PATH"].split(os.pathsep): diff --git a/Tensile/Utilities/merge.py b/Tensile/Utilities/merge.py index bae630d26..e817663d4 100644 --- a/Tensile/Utilities/merge.py +++ b/Tensile/Utilities/merge.py @@ -24,6 +24,7 @@ import sys import argparse from copy import deepcopy +from enum import IntEnum verbosity = 1 @@ -52,7 +53,8 @@ def fixSizeInconsistencies(sizes, fileType): duplicates = list() for i in range(0,len(sizes)): currSize = sizes[i][0] - if len(currSize) == 8: + # >= so size will be trimmed when a SolutionTag is included + if len(currSize) >= 8: currSize = currSize[:-4] if currSize in (item for index in sizes for item in index): duplicates.append(i-len(duplicates)) @@ -147,8 +149,43 @@ def debug(*args, **kwargs): if verbosity < 2: return msg(*args, **kwargs) +# Tags distinguishing solution types +# Can be added to size key to allow solutions of each type to be present +# in logic file for a given size +class SolutionTag(IntEnum): + VALU = 0 + MFMA = 1 + + def __str__(self): + return ["VALU", "MFMA"][self] + def __repr__(self): + return str(self) + +def getSolutionTag(solution): + if solution.get("EnableMatrixInstruction", False) or 
solution.get("MatrixInstruction", False): + return SolutionTag.MFMA + else: + return SolutionTag.VALU + +def findSolutionWithIndex(solutionData, solIndex): + # Check solution at the index corresponding to solIndex first + if solIndex < len(solutionData) and solutionData[solIndex]["SolutionIndex"] == solIndex: + return solutionData[solIndex] + else: + debug("Searching for index...") + solution = [s for s in solutionData if s["SolutionIndex"]==solIndex] + assert(len(solution) == 1) + return solution[0] + +def addSolutionTagToKeys(solutionMap, solutionPool): + return [[[getSolutionTag(findSolutionWithIndex(solutionPool, idx))] + keys, [idx, eff]] + for [keys, [idx, eff]] in solutionMap] + +def removeSolutionTagFromKeys(solutionMap): + return [[keys[1:], [idx, incEff]] for keys, [idx, incEff] in solutionMap] + # returns merged logic data as list -def mergeLogic(origData, incData, forceMerge, trimSize=True): +def mergeLogic(origData, incData, forceMerge, trimSize=True, addMfmaTag=False): origNumSizes = len(origData[7]) origNumSolutions = len(origData[5]) @@ -158,6 +195,28 @@ def mergeLogic(origData, incData, forceMerge, trimSize=True): verbose(origNumSizes, "sizes and", origNumSolutions, "kernels in base logic file") verbose(incNumSizes, "sizes and", incNumSolutions, "kernels in incremental logic file") + # Add SolutionTag to distinguish mfma and non-mfma solutions + origTaggedSizes = addSolutionTagToKeys(origData[7], origData[5]) + incTaggedSizes = addSolutionTagToKeys(incData[7], incData[5]) + if addMfmaTag: + origData[7] = origTaggedSizes + incData[7] = incTaggedSizes + # Print warning if addMfmaTag=False results in removed sizes + else: + origSet = {tuple(size) for size, [_, _] in origData[7]} + origTaggedSet = {tuple(size) for size, [_, _] in origTaggedSizes} + incSet = {tuple(size) for size, [_, _] in incData[7]} + incTaggedSet = {tuple(size) for size, [_, _] in incTaggedSizes} + + if len(origSet) != len(origTaggedSet): + verbose("Warning:", len(origTaggedSet) - 
len(origSet), "duplicate sizes are present in base logic", + "that may not be handled correctly unless --add_mfma_tag is used") + if len(incSet) != len(incTaggedSet): + verbose("Warning:", len(incTaggedSet) - len(incSet), "duplicate sizes are present in incremental logic", + "that may not be handled correctly unless --add_mfma_tag is used") + + + if trimSize: # trim 8-tuple gemm size format to 4-tuple [m, n, b, k] # TODO future gemm size could include dictionary format so need robust preprocessing @@ -172,9 +231,7 @@ def mergeLogic(origData, incData, forceMerge, trimSize=True): origDict = {tuple(origSize): [i, origEff] for i, [origSize, [origIndex, origEff]] in enumerate(origData[7])} for incSize, [incIndex, incEff] in incData[7]: - incSolution = [s for s in incData[5] if s["SolutionIndex"]==incIndex] # TODO this is slow - assert len(incSolution)==1 - incSolution = incSolution[0] + incSolution = findSolutionWithIndex(incData[5], incIndex) try: j, origEff = origDict[tuple(incSize)] @@ -187,7 +244,7 @@ def mergeLogic(origData, incData, forceMerge, trimSize=True): solutionPool, index = addKernel(solutionPool, incSolution) solutionMap[j][1] = [index, incEff] else: - verbose("[X]", incSize, " already exists but does not improve in performance.", end="") + verbose("[X]", incSize, "already exists but does not improve in performance.", end="") verbose("Efficiency:", origEff, "->", incEff) except KeyError: verbose("[-]", incSize, "has been added to solution table, Efficiency: N/A ->", incEff) @@ -197,6 +254,10 @@ def mergeLogic(origData, incData, forceMerge, trimSize=True): verbose(numOrigRemoved, "unused kernels removed from base logic file") verbose(numIncRemoved, "unused kernels removed from incremental logic file") + # Remove SolutionTag for yaml output + if addMfmaTag: + solutionMap = removeSolutionTagFromKeys(solutionMap) + mergedData = deepcopy(origData) mergedData[5] = solutionPool mergedData[7] = solutionMap @@ -208,7 +269,7 @@ def mergeLogic(origData, incData, 
forceMerge, trimSize=True): return [mergedData, numSizesAdded, numSolutionsAdded, numSolutionsRemoved] -def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSize=True): +def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSize=True, addMfmaTag=False): originalFiles = allFiles(originalDir) incrementalFiles = allFiles(incrementalDir) ensurePath(outputPath) @@ -222,7 +283,8 @@ def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSi origFile = os.path.join(originalDir, basename) forceMerge = defaultForceMergePolicy(incFile) if forceMerge is None else forceMerge - msg("Base logic file:", origFile, "| Incremental:", incFile, "| Merge policy: %s"%("Forced" if forceMerge else "Winner"), "| Trim size:", trimSize) + msg("Base logic file:", origFile, "| Incremental:", incFile, "| Merge policy: %s"%("Forced" if forceMerge else "Winner"), "| Trim size:", trimSize, + "| Add MFMA tag:", addMfmaTag) origData = loadData(origFile) incData = loadData(incFile) @@ -231,7 +293,7 @@ def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSi origData = reindexSolutions(origData) incData = reindexSolutions(incData) - mergedData, *stats = mergeLogic(origData, incData, forceMerge, trimSize) + mergedData, *stats = mergeLogic(origData, incData, forceMerge, trimSize, addMfmaTag) msg(stats[0], "size(s) and", stats[1], "kernel(s) added,", stats[2], "kernel(s) removed") with open(os.path.join(outputPath, basename), "w") as outFile: @@ -251,7 +313,7 @@ def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSi # This is useful for when a tuning task is # shared between multiple machines who each # will provide a partial result. 
-def mergePartialLogics(partialLogicFilePaths, outputDir, forceMerge, trimSize=True): +def mergePartialLogics(partialLogicFilePaths, outputDir, forceMerge, trimSize=True, addMfmaTag=False): logicFiles = deepcopy(partialLogicFilePaths) ensurePath(outputDir) @@ -269,7 +331,7 @@ def mergePartialLogics(partialLogicFilePaths, outputDir, forceMerge, trimSize=Tr baseLogicData = reindexSolutions(baseLogicData) incLogicData = reindexSolutions(incLogicData) - mergedData, *stats = mergeLogic(baseLogicData, incLogicData, forceMerge, trimSize) + mergedData, *stats = mergeLogic(baseLogicData, incLogicData, forceMerge, trimSize, addMfmaTag) msg(stats[0], "size(s) and", stats[1], "kernel(s) added,", stats[2], "kernel(s) removed") # Use the merged data as the base data for the next partial logic file @@ -291,6 +353,7 @@ def mergePartialLogics(partialLogicFilePaths, outputDir, forceMerge, trimSize=Tr argParser.add_argument("-v", "--verbosity", help="0: summary, 1: verbose, 2: debug", default=1, type=int) argParser.add_argument("--force_merge", help="Merge previously known sizes unconditionally. Default behavior if not arcturus", default="none") argParser.add_argument("--notrim", help="Do not trim long size format down to short format (m,n,b,k). Default is --trim", action="store_false") + argParser.add_argument("--add_mfma_tag", help="Add a tag to the size key for usage of MFMA instructions, allowing for both a MFMA and non-MFMA kernel to exist for the same size. 
Default doesn't add this tag.", action="store_true") args = argParser.parse_args(sys.argv[1:]) originalDir = args.original_dir @@ -299,9 +362,10 @@ def mergePartialLogics(partialLogicFilePaths, outputDir, forceMerge, trimSize=Tr verbosity = args.verbosity forceMerge = args.force_merge.lower() trimSize = args.notrim + addMfmaTag = args.add_mfma_tag if forceMerge in ["none"]: forceMerge=None elif forceMerge in ["true", "1"]: forceMerge=True elif forceMerge in ["false", "0"]: forceMerge=False - avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSize) + avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, trimSize, addMfmaTag) diff --git a/Tensile/__init__.py b/Tensile/__init__.py index 34b7183ea..736ff6b5c 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -23,7 +23,7 @@ from __future__ import print_function # hardcoded tensile version; also in Tensile/Source/TensileConfigVersion.cmake -__version__ = "4.27.0" +__version__ = "4.28.0" def PrintTensileRoot(): import os.path diff --git a/Tensile/bin/TensileBenchmarkCluster b/Tensile/bin/TensileBenchmarkCluster index 7c712754e..e4f563469 100755 --- a/Tensile/bin/TensileBenchmarkCluster +++ b/Tensile/bin/TensileBenchmarkCluster @@ -1,6 +1,6 @@ #!/usr/bin/env python3 ################################################################################ -# Copyright 2016-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/Tensile/bin/TensileRetuneLibrary b/Tensile/bin/TensileRetuneLibrary new file mode 100755 index 000000000..4b71da946 --- /dev/null +++ b/Tensile/bin/TensileRetuneLibrary @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +################################################################################ +# Copyright 2016-2021 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################ + + +try: + from Tensile import TensileRetuneLibrary +except ImportError: + import os.path + import sys + parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")) + sys.path.append(parentdir) + + from Tensile import TensileRetuneLibrary + +# script run from commandline +if __name__ == "__main__": + TensileRetuneLibrary.main() diff --git a/Tensile/cmake/TensileConfig.cmake b/Tensile/cmake/TensileConfig.cmake index cf2c814bd..cc2f4faf6 100644 --- a/Tensile/cmake/TensileConfig.cmake +++ b/Tensile/cmake/TensileConfig.cmake @@ -104,7 +104,6 @@ function(TensileCreateLibraryFiles # Single value settings set(oneValueArgs - ARCHITECTURE CODE_OBJECT_VERSION COMPILER COMPILER_PATH @@ -115,7 +114,11 @@ function(TensileCreateLibraryFiles VAR_PREFIX ) - set(multiValueArgs "") + # Multi value settings + set(multiValueArgs + ARCHITECTURE + ) + cmake_parse_arguments(Tensile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(Tensile_UNPARSED_ARGUMENTS) @@ -184,7 +187,8 @@ function(TensileCreateLibraryFiles endif() if(Tensile_ARCHITECTURE) - set(Options ${Options} "--architecture=${Tensile_ARCHITECTURE}") + string (REPLACE ";" "\\\\\\\;" archString "${Tensile_ARCHITECTURE}") + set(Options ${Options} "--architecture=${archString}") endif() if(Tensile_LIBRARY_FORMAT) @@ -212,10 +216,12 @@ function(TensileCreateLibraryFiles # Create the manifest file of the output libraries. 
set(Tensile_CREATE_MANIFEST_COMMAND ${CommandLine} "--generate-manifest-and-exit") set(Tensile_MANIFEST_FILE_PATH "${Tensile_OUTPUT_PATH}/library/TensileManifest.txt") + message(STATUS "Tensile_MANIFEST_FILE_PATH: ${Tensile_MANIFEST_FILE_PATH}") execute_process( COMMAND ${Tensile_CREATE_MANIFEST_COMMAND} - RESULT_VARIABLE Tensile_CREATE_MANIFEST_RESULT) + RESULT_VARIABLE Tensile_CREATE_MANIFEST_RESULT + COMMAND_ECHO STDOUT) if(Tensile_CREATE_MANIFEST_RESULT OR (NOT EXISTS ${Tensile_MANIFEST_FILE_PATH})) message(FATAL_ERROR "Error creating Tensile library: ${Tensile_CREATE_MANIFEST_RESULT}") diff --git a/Tensile/cmake/TensileConfigVersion.cmake b/Tensile/cmake/TensileConfigVersion.cmake index 28ef1f20b..abb003d9e 100644 --- a/Tensile/cmake/TensileConfigVersion.cmake +++ b/Tensile/cmake/TensileConfigVersion.cmake @@ -21,7 +21,7 @@ # hardcoded tensile version; also in Tensile/__init__.py set(TENSILE_VERSION_MAJOR 4) -set(TENSILE_VERSION_MINOR 27) +set(TENSILE_VERSION_MINOR 28) set(TENSILE_VERSION_PATCH 0) # export version diff --git a/bump-version.sh b/bump-version.sh index 92c65ba97..40810a6ae 100755 --- a/bump-version.sh +++ b/bump-version.sh @@ -3,8 +3,8 @@ # This script needs to be edited to bump version for new release. # Version will be bumped in Tensile/__init__.py and in .yaml files -OLD_VERSION="4.26.0" -NEW_VERSION="4.27.0" +OLD_VERSION="4.27.0" +NEW_VERSION="4.28.0" OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.7.2" NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.8.0" diff --git a/pytest.ini b/pytest.ini index b678042ef..9607cd3d2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,21 +4,23 @@ junit_logging = all junit_log_passing_tests = False xfail_strict = True markers = + extended: Longer tests, formerly nightly. + pre_checkin: All pre-checkin tests. + disabled: Disabled tests. + regression: directory: tests for some bug hotspots + + bufferload_offset: directory bugs: directory dot: directory - extended: Longer tests, formerly nightly. 
nightly: directory - pre_checkin: All pre-checkin tests. + no_load_loop: Directory, tests for the OptNoLoadLoop option. special: directory unit: Unit tests - no_load_loop: Directory, tests for the OptNoLoadLoop option. use_initial_strides: directory use_initial_strides_cd: directory vega_20: directory weekly: directory yaml_only: directory, all tests which simply run a YAML file. - disabled: Disabled tests. - regression: directory: tests for some bug hotspots integration: emulation: Quick set of essential tests to run on emulation. emulation1: Emulation tests part 1 @@ -27,6 +29,7 @@ markers = emulation4: Emulation tests part 4 emulation5: Emulation tests part 5 + validate: All tests which validate the results. validateAll: All tests which validate all data points. GEMM: All tests which run standard GEMMs @@ -83,6 +86,8 @@ markers = xfail-gfx90a: architecture xfail-gfx1010: architecture xfail-gfx1011: architecture + xfail-gfx1012: architecture + xfail-gfx1030: architecture skip-gfx000: architecture skip-gfx900: architecture skip-gfx906: architecture @@ -90,3 +95,5 @@ markers = skip-gfx90a: architecture skip-gfx1010: architecture skip-gfx1011: architecture + skip-gfx1012: architecture + skip-gfx1030: architecture diff --git a/setup.py b/setup.py index 4442c7b48..b6818d75b 100644 --- a/setup.py +++ b/setup.py @@ -50,5 +50,7 @@ def readVersionFromInit(): "tensile_sgemm = Tensile.Tensile:TensileSGEMM5760", # Run tensile benchmark from cluster "TensileBenchmarkCluster = Tensile.TensileBenchmarkCluster:main", + # Retune library logic file + "TensileRetuneLibrary = Tensile.TensileRetuneLibrary:main" ]} ) diff --git a/tox.ini b/tox.ini index 211a2c7e5..eb965b16e 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ envlist = py35,py36,py27,lint deps = -r{toxinidir}/requirements.txt pytest>=5.4.1 - pytest-xdist==1.32.0 + pytest-xdist>=1.32.0 filelock commands = python3 ./Tensile/bin/Tensile Tensile/Configs/build_client.yaml {envdir}/client