From 006a5d653ce0d82fecb05d5e215d053749b57c04 Mon Sep 17 00:00:00 2001
From: Alex Brown
Date: Sat, 29 Oct 2022 11:39:08 -0600
Subject: [PATCH] hotfix: Assembly syntax and coverage for DirectToLds have changed (#1598) (#1610)

---
.jenkins/common.groovy | 4 +-
Tensile/Common.py | 177 +++++++++---------
Tensile/KernelWriterAssembly.py | 81 +++++---
Tensile/SolutionStructs.py | 8 +
.../direct_to_lds/dtl_dgemm.yaml | 0
.../direct_to_lds/dtl_dgemm_lite.yaml | 0
.../direct_to_lds/dtl_hgemm.yaml | 0
tox.ini | 3 +-
8 files changed, 155 insertions(+), 118 deletions(-)
rename Tensile/Tests/{extended => disabled}/direct_to_lds/dtl_dgemm.yaml (100%)
rename Tensile/Tests/{pre_checkin => disabled}/direct_to_lds/dtl_dgemm_lite.yaml (100%)
rename Tensile/Tests/{extended => disabled}/direct_to_lds/dtl_hgemm.yaml (100%)
diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 016289ca2..c8416a4e5 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -30,7 +30,7 @@ def runCompileCommand(platform, project, jobName, boolean debug=false)
 project.paths.construct_build_prefix()
 String compiler = 'hipcc'
- String pythonVersion = 'py36'
+ String pythonVersion = 'py3'
 String cov = "V3"
 String buildType = debug ? 'Debug' : 'RelWithDebInfo'
 String parallelJobs = "export HIPCC_COMPILE_FLAGS_APPEND=-parallel-jobs=2"
@@ -110,7 +110,7 @@ def runTestCommand (platform, project, jobName, test_marks, boolean skipHostTest
 def test_dir = "Tensile/Tests"
 String compiler = 'hipcc'
- String pythonVersion = 'py36'
+ String pythonVersion = 'py3'
 String markSkipHostTest = skipHostTest ? "#" : ""
 String markSkipExtendedTest = !test_marks.contains("extended") ? "--gtest_filter=-\"*Extended*\"" : ""
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 13c39a3cd..be8bbbab8 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -83,7 +83,7 @@
 globalParameters["ForceRedoLibraryLogic"] = True # if False and library logic already analyzed, then library logic will be skipped when tensile is re-run
 globalParameters["ForceRedoLibraryClient"] = True # if False and library client already built, then building library client will be skipped when tensile is re-run
-# Compare CPU reference convolution model vs golden tensor contracton model
+# Compare CPU reference convolution model vs golden tensor contraction model
 # Useful to test if conversion from tensor contraction is working as expected
 # In this mode, the filter,stride,dilation are specified in the problem type.
 # If the problem type uses constant Filter,Stride,Dilation,Pad* (ie these are not 'N'), then the
@@ -92,7 +92,7 @@
 globalParameters["ConvolutionVsContraction"] = False
 globalParameters["ShowProgressBar"] = True # if False and library client already built, then building library client will be skipped when tensile is re-run
-globalParameters["SolutionSelectionAlg"] = 1 # algorithm to detetermine which solutions to keep. 0=removeLeastImportantSolutions, 1=keepWinnerSolutions (faster)
+globalParameters["SolutionSelectionAlg"] = 1 # algorithm to determine which solutions to keep. 0=removeLeastImportantSolutions, 1=keepWinnerSolutions (faster)
 globalParameters["ExpandRanges"] = True # expand ranges into exact configs before writing logic file. False ignores ranges.
 globalParameters["ExitAfterKernelGen"] = False # Exit after generating kernels
 globalParameters["GenerateSourcesAndExit"] = False # Exit after kernel source generation. 
@@ -162,7 +162,7 @@ # build parameters globalParameters["CMakeCXXFlags"] = "" # pass flags to cmake globalParameters["CMakeCFlags"] = "" # pass flags to cmake -globalParameters["DebugKernel"] = False # assembly only, kernel gets buffer for debug "printing"; kernel writes data to memory, gets coppied to host and printed +globalParameters["DebugKernel"] = False # assembly only, kernel gets buffer for debug "printing"; kernel writes data to memory, gets copied to host and printed globalParameters["LibraryPrintDebug"] = False # solutions will print enqueue info when enqueueing a kernel # debug for assembly @@ -185,7 +185,7 @@ globalParameters["PrintCodeCommands"] = False # print the commands used to generate the code objects (asm,link,hip-clang, etc) globalParameters["DumpTensors"] = False # If True, dump tensors to binary files instead of printing them. -# If PrintMax* is greater than the dimension, the middle elements will be repaced with "..." +# If PrintMax* is greater than the dimension, the middle elements will be replaced with "..." # device selection @@ -256,7 +256,7 @@ # directory where custom kernels are located globalParameters["CustomKernelDirectory"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "CustomKernels") -globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile trainning verification or not +globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile training verification or not globalParameters["SeparateArchitectures"] = False # write Tensile library metadata to separate files for each architecture @@ -340,8 +340,8 @@ def getArchitectureName(gfxName): # The supported typed GEMM, each entry is (Ti, To, Tc). # DataType (Ti) = The data-type of the input matrices: A/B # DestDataType (To) = The data-type of the output matrices: C/D -# ComputeDataType (Tc) = The data-type of computaiton: alpha/beta: -# Cinternal: basically should == ComputeDataType +# ComputeDataType (Tc) = The data-type of computation: alpha/beta: +# CInternal: basically should == ComputeDataType # Align the supported GEMM type with rocBLAS: [A/B/ C/D/ alpha/beta] # (rocblas/library/include/internal/rocblas_functions.h) @@ -465,7 +465,7 @@ def getArchitectureName(gfxName): "GlobalSplitU": list(range(1, 1024+1)), # choose how to do GlobalSplitU - # 1: use atomic operation to accumlate on one buffer + # 1: use atomic operation to accumulate on one buffer # 2: each GSU group write to each own buffer and accumulate by another kernel "GlobalSplitUAlgorithm": ["SingleBuffer", "MultipleBuffer"], @@ -477,14 +477,14 @@ def getArchitectureName(gfxName): # GSUSARR=True means the 4 workgroups round robin split up the chunks of the summation: k=0 -> DU-1, 4DU -> 5DU-1, ...; k=1DU -> 2DU-1, 5DU -> 6DU-1...; ... 
"GlobalSplitUSummationAssignmentRoundRobin": [ False, True ], - # in opencl for some compilers, performance improved by putting a memfence after each subiteration; it prevented the loads of one subiteration from being moved + # in opencl for some compilers, performance improved by putting a memfence after each sub-iteration; it prevented the loads of one sub-iteration from being moved # into a prior iteration, which would help latency but it consumed more vgprs which was a net loss "UnrollMemFence": [ False, True ], # not used yet; will refer to combining multiple reads into single instruction # such as ds_read_b32 -> ds_read2_b32 # the pro is that it cuts in half the number of instructions - # the con is that bits per offset is half, so arithmatic might be required to increment and reset offset vgprs + # the con is that bits per offset is half, so arithmetic might be required to increment and reset offset vgprs "GlobalRead2A": [ False, True ], "GlobalRead2B": [ False, True ], "LocalWrite2A": [ False, True ], @@ -501,7 +501,7 @@ def getArchitectureName(gfxName): # This eliminates 4 vector XOR instructions used for pointer swap "ExpandPointerSwap": [False, True], - # Schedule global reads and global read incrementsinto LocalRead iterations + # Schedule global reads and global read increments into LocalRead iterations # Can reduce pressure on local read instruction dispatch queue # 0=perform global reads at start of instruction loop # 1=schedule into the local read instruction iterations @@ -535,7 +535,7 @@ def getArchitectureName(gfxName): # the purpose of this parameter is to control density of local write instruction scheduling # In PGR1, we want to schedule local write more denser, so we can have more # latency to hide global read - # In PGR2, since LW is followed by GR, every LW has same whole loop latecy + # In PGR2, since LW is followed by GR, every LW has same whole loop latency # to hide global read. We want to schedule LW less denser, can # avoid full of vmem FIFO. # Range from 0.01 to 32 @@ -568,7 +568,7 @@ def getArchitectureName(gfxName): # the next tile in the sequence. "PrefetchAcrossPersistent": [0, 1], - # Changes the behaviour of prefetch across persistent. + # Changes the behavior of prefetch across persistent. # Mode 0 is default, works for all sizes # Mode 1 disables static tile setup for prefetch and merges prefetch with ord. noLoadLoop, "PrefetchAcrossPersistentMode": [0, 1], @@ -610,7 +610,7 @@ def getArchitectureName(gfxName): # + Uses hardware buffer limit for edge detection # - Limited range - the bot-right corner of macro-tile (plus padding=GRVW # for shift-pointer, if ShiftPtr is required) must be within 2^32. - # ShiftPtrPad = MayShift ? GRWV*BPE : 0 + # ShiftPtrPad = MayShift ? GRVW*BPE : 0 # For TLU=1: Unroll*StrideA1 + ShiftPtrPad <= 2^32 # For TLU=0: MT*StrideA1 + ShiftPtrPad <= 2^32 # These conditions should be checked using Assert - TODO @@ -620,7 +620,7 @@ def getArchitectureName(gfxName): # + Each SGPR stores an offset from base GlobalReadOffset+0. 
# - Requirements for UseSgprForGRO=1: # - BufferLoad=1 - # - Use appropriate Assert*ElementMultiple or GRVW=1 to eliminate need for ShifPtr + # - Use appropriate Assert*ElementMultiple or GRVW=1 to eliminate need for ShiftPtr # (UseSgprForGRO does not support ShiftPtr since ShiftPtr needs to potentially shift GRO) # = KernelWriterAssembly also supports 64-bit 2D buffer size (see use64bPbcLimit) # - Requires 4 instructions to move scalar limit and a couple SGPR @@ -633,18 +633,18 @@ def getArchitectureName(gfxName): # - DirectToLds=1 # converting m0 update from LocalWriteAddrSGpr using is usually win - # -1 attempt to use a hueristic to determine when the tile size will use too many SGPR and fall back to VGPR + # -1 attempt to use a heuristic to determine when the tile size will use too many SGPR and fall back to VGPR "UseInstOffsetForGRO": [ -1, 0, 1], # Converting VGPR GRO into SGPR GRO is usually a win # However, the mode may exhaust all available SGPR, in particular for large unroll - # -1 attempt to use a hueristic to determine when the tile size will use too many SGPR and fall back to VGPR + # -1 attempt to use a heuristic to determine when the tile size will use too many SGPR and fall back to VGPR "UseSgprForGRO": [ -1, 0, 1], # Some work-items in the group may not participate in the final buffer load. # Allows more flexibility in choosing DepthU. - # 1= allocate extra addressing vpgr for edge cases + # 1= allocate extra addressing vgpr for edge cases # 2= use temp vgpr inside unroll loop, may save 1 VPR if both A and B have a fractional edge but costs v_alu "FractionalLoad": [ 0, 1, 2] , @@ -775,11 +775,11 @@ def getArchitectureName(gfxName): # - Higher values will spread traffic to more channels but provide less L2 re-use. # - StaggerU and WorkGroupMapping interact and should be tuned together - # The WGM controls how tiles are assigned in C matrix, while StaggerU controls where those - # tiles start reading their summation dim parms. + # tiles start reading their summation dim params. # - StaggerU requires BufferLoad==1 and is silently ignored if BufferLoad==0 "StaggerU": [0,2,4,8,16,32,64], - # Stride in bytes for each staggeru 'click'. + # Stride in bytes for each staggerU 'click'. # 256 is recommended since this is the width of memory channel (on gfx803,gfx900,gf906) - so # each click will start in a new memory channel and spread traffic among the 16 available channels. # For example StaggerUStride=256 and StaggerU=8 will use 8 unique starting points @@ -796,7 +796,7 @@ def getArchitectureName(gfxName): # 1: Use wg1 # 2: Use wg2 # 3: Use wgSerial, wgSerial = wg0 + wg1 * nwg0 + wg2 * (nwg0 * nwg1) - # 4: Debug mode, offset each tile max allowed StaggerU. This just moves hotspot + # 4: Debug mode, offset each tile max allowed StaggerU. This just moves hot-spot # to a different bank since all workgroups still start at same point. "StaggerUMapping": [0,1,2,3,4], @@ -813,7 +813,7 @@ def getArchitectureName(gfxName): # Tensor C always mapped with first free coord as fastest moving # (Elements in this dimension are sequential in memory. # - # For 2D nonbatched Matrix this means index order is I, then J + # For 2D non-batched Matrix this means index order is I, then J # For 2D batched Matrix this means index order is I, then J, then K. # # Then for 2D case: @@ -827,7 +827,7 @@ def getArchitectureName(gfxName): # Examples for 2D matrix: # WGM=8: on CU64 machine this is a square box # WGM=1: Short/Fat - this will cover maximum width in I dimension of C. 
This matches hardware assigned mapping.
- # WGM=64: Tall/Skinny - this will cover maximum width in J dimention of C.
+ # WGM=64: Tall/Skinny - this will cover maximum width in J dimension of C.
 #
 # Formula for wgSerial:
 # wgSerial = wg0 + (wg1 % WorkGroupMapping) * nwg0
@@ -872,18 +872,18 @@ def getArchitectureName(gfxName):
 # StoreRemap: Optimize MatrixInstruction store patterns to enhance performance.
 # MI output data between each threads are along N dims.
- # But global memory is along M dim continous.
- # That mean global write between each threads are not continous.
+ # But global memory is continuous along the M dim.
+ # That means global writes between threads are not continuous.
 # Therefore, store performance for MI instruction is poor.
 # How StoreRemap works in final store stage:
 # 1. Put all thread output data into LDS.
 # 2. All thread read data from LDS along M dims.
- # (match global Memory continous direction)
+ # (matches the continuous direction of global memory)
 # 3. All thread write out data into global memory.
 # 0: Disable StoreRemap (default)
 # 1~8: Enable StoreRemap and set the global write vector width
- # Suggest optimum value: fp32 = [2,4], fp16 or bf16 = [4,8] (dwordx2 and dowrdx4)
- # -1: Use dwordx2 if support SRVW, or set SRVW to 0
+ # Suggested optimum values: fp32 = [2,4], fp16 or bf16 = [4,8] (dwordX2 and dwordX4)
+ # -1: Use dwordX2 if SRVW is supported, or set SRVW to 0
 "StoreRemapVectorWidth": [-1,0,1,2,4,8],
 # SourceSwap: Optimizes MatrixInstruction store pattern by swapping mfma input order.
@@ -957,7 +957,7 @@ def getArchitectureName(gfxName):
 # Disable overlapping AB-tile vgpr and read/write addr vgprs with C-tile vgprs
 # Valid only for MatrixInstruction enabled kernels, which by default overlaps
- # C-tile w/ AB-tile until it's due for v_accvgpr_read before the writeback. Illustrated below:
+ # C-tile w/ AB-tile until it's due for v_accvgpr_read before the write-back. Illustrated below:
 # |<----------------------- valuC ----------------------->|
 # |<--- valuA/B --->|<-- R/W pointers -->|xxx|<- Spares ->|
 # ^ ^
@@ -979,7 +979,7 @@ def getArchitectureName(gfxName):
 # 7= +NoPreLoop+ NoGlobalReadInc
 # 9= NullKernel
 # For example set DisableKernelPieces: [0,1,2,3,4,5,6,7,9]
- # this will create a set of kernels with progessively more pieces of the kernel disabled
+ # this will create a set of kernels with progressively more pieces of the kernel disabled
 "DisableKernelPieces": list(range(-9,10)), # disable pieces of the kernel, for performance isolation
 # assume atomics always work correctly.
@@ -998,7 +998,7 @@ def getArchitectureName(gfxName):
 # - Host code will not launch more groups than tiles in the C space
 # -1 : Automatically choose a "heuristic" value that can possibly get a better gain: (TilesPerWorkgroup = 1~2)
 # Not based on any theory, but on some experiment observation, can be used to reduce the kernels
- # Recommand [-1,0,1] for basic tuning
+ # Recommend [-1,0,1] for basic tuning
 # Assertions/Requirements: NumWorkGroups0 * NumWorkGroups1 < 2^32
 "PersistentKernel": range(-1,512+1) , # Use persistent kernel.
@@ -1048,9 +1048,9 @@ def getArchitectureName(gfxName):
 # Controls desired width (#elements) for loads from global memory -> LDS.
# and eliminates the pointer unshift logic # -1 : Set GlobalReadVectorWidth = VectorWidth - # NOTE: for input bpe=32, max GRVW is 4 (to fit dwordx4) (FP32), min GRVW is 1 (dword) - # bpe=16, max GRVW is 8 (to fit dwordx4) (FP16), min GRVW is 2 (dword) - # bpe=8, max GRVW is 16 (to fit dwordx4) (INT8), min GRVW is 4 (dword) + # NOTE: for input bpe=32, max GRVW is 4 (to fit dwordX4) (FP32), min GRVW is 1 (dword) + # bpe=16, max GRVW is 8 (to fit dwordX4) (FP16), min GRVW is 2 (dword) + # bpe=8, max GRVW is 16 (to fit dwordX4) (INT8), min GRVW is 4 (dword) "GlobalReadVectorWidth": [ -1, 1, 2, 3, 4, 6, 8, 16 ], # Controls desired width (#elements) for loads from LDS -> VGPR. @@ -1066,12 +1066,12 @@ def getArchitectureName(gfxName): "LocalReadVectorWidth": [ -1, 1, 2, 4, 8, 16 ], # threads should read/write/operate on this many contiguous elements from the C matrix. - # If VW=4 then thread0 will process 4 consec C elements, then thread1 next 4, etc. + # If VW=4 then thread0 will process 4 consecutive C elements, then thread1 next 4, etc. # If the ThreadTile is > VectorWidth then thread0 will next operate on the 4 elements in C at (4*NumThreads) # Typically the load vector width and store vector width are directly related to the VW. # The global load width is closely related to the width of local stores so # GlobalReadVectorWidth also controls local write width. - # Local read width also matches since VectorWidth consec elements must be read + # Local read width also matches since VectorWidth consecutive elements must be read # Typically matching 16 bytes is good choice since the stores will be optimally coalesced with 16 bytes/WI. # -1 means use the largest vector width up to 128 bits. # Using a VW too large which results in >16bytes/thread isn't supported @@ -1085,7 +1085,7 @@ def getArchitectureName(gfxName): "VectorStore": [-1, 0, 1], # Controls desired width (#elements) for stores from reg to global memory. - # When MatrixInstruciton == None, derived parameter gwvw takes precedence. + # When MatrixInstruction == None, derived parameter gwvw takes precedence. # -1 : Set StoreVectorWidth = VectorWidth "StoreVectorWidth": [ -1, 1, 2, 3, 4, 6, 8 ], @@ -1097,20 +1097,20 @@ def getArchitectureName(gfxName): # when loading all the data from global into lds requires multiple load instructions, these parameters govern which # loads will pull which rectangle of data from global into lds # NLC=1 means one load along the coalesced dimension, which results in the most coalescing possible - # NLC=-1 looks for the largest number of reads along the coalesced dimension which results in the least ammount of coalescing; + # NLC=-1 looks for the largest number of reads along the coalesced dimension which results in the least amount of coalescing; # however in this case the stride between one load and another is a static value, therefore buffer loads only need one set of registers # whereas the =1 case has a stride which is a multiple of a kernel argument and therefore needs one address per load in the perpendicular dimension "NumLoadsCoalescedA": list(range(-1, 64+1)), "NumLoadsCoalescedB": list(range(-1, 64+1)), # DepthU, LocalSplitU (which is the 3rd number in WorkGroup), and LoopUnroll are closely related - # LoopUnroll=4 means there are 4 subiterations within the loop, 4 actual iterations written in the code. + # LoopUnroll=4 means there are 4 sub-iterations within the loop, 4 actual iterations written in the code. 
# LocalSplit=2 means the workgroup is split up into 2 subgroups, and each subgroup is doing different parts of the summation.
 # subgroup0 does k=0-3, 8-11... and subgroup1 does k=4-7, 12-15...
- # So, each iteration through the summation loop, which has 4 actual subiterations, does 8 summation iterations, because each subgroup did 4;
+ # So, each iteration through the summation loop, which has 4 actual sub-iterations, does 8 summation iterations, because each subgroup did 4;
 # and when data is read from global memory the threads read 8 elements along the summation dimension.
 # DepthU = LoopUnroll * LocalSplitU = 4*2 in this case
- # it made more sense for the user to directly control LocalSplitU and DepthU, then derrive afterwards LoopUnroll=DepthU/LocalSplitU
+ # it made more sense for the user to directly control LocalSplitU and DepthU, then derive afterwards LoopUnroll=DepthU/LocalSplitU
 # -1 : Only allow GLVW=1
 # -2 : Only allow max(GLVWA,GLVWB) < VW ?
 # -3 : Only allow min(GLVWA,GLVWB) < VW ?
@@ -1125,7 +1125,7 @@ def getArchitectureName(gfxName):
 # was previously a problem for TN since it implies DepthU is large, and that leads to oversubscription of LDS.
 #
 # Preconditions:
- # ScheduleIterAlg=3, TransposeLDS=1, PGR=0/1 exlcuding 2, DirectToLds=0 (DirectToLds=0 because part of the data loaded *need* to reside in registers),
+ # ScheduleIterAlg=3, TransposeLDS=1, PGR=0/1 excluding 2, DirectToLds=0 (DirectToLds=0 because part of the data loaded *needs* to reside in registers),
 # nRegs per load >= DepthULdsDivisor (since we artificially require at least 1 register per LDS write)
 #
 # Example: DepthULdsDivisor=2
@@ -1135,10 +1135,10 @@ def getArchitectureName(gfxName):
 # 2nd subloop writes v2,v3 to LDS
 "DepthULdsDivisor": [1, 2, 4],
- # integer ammount of padding to put into LDS, in 2016 this didn't seem to help performance, profilers were showing that channel conflicts weren't really hurting
+ # integer amount of padding to put into LDS, in 2016 this didn't seem to help performance, profilers were showing that channel conflicts weren't really hurting
 # performance so this has been deprecated and probably doesn't work
 # -1 means use same padding as the VectorWidth if TLU=0 else 0. (Padding only helps when transpose is required)
- # With MatrixInstruciton: -1 means max(GRVW,MIInput) if TLU=0
+ # With MatrixInstruction: -1 means max(GRVW,MIInput) if TLU=0
 "LdsPadA": [ -1, 0, 1, 2, 3, 4, 8, 16, 32],
 "LdsPadB": [ -1, 0, 1, 2, 3, 4, 8, 16, 32],
@@ -1149,7 +1149,7 @@ def getArchitectureName(gfxName):
 # -1 means round up to nearest power of 2 begin with 128
 "LdsBlockSizePerPad": [-1, 0, 64, 128, 256, 512, 1024],
- # Transpose LDS format. Local store in Coalsced dimension , same as optimized global fetch dimension . applicable only in TLU=0 case for miSIMD(s)
+ # Transpose LDS format. Local store in Coalesced dimension, same as optimized global fetch dimension. Applicable only in TLU=0 case for miSIMD(s)
 # TODO: No code for -1 ?
"TransposeLDS": [-1, 1, 0], @@ -1158,7 +1158,7 @@ def getArchitectureName(gfxName): "PerformanceWaitLocation": list(range(-1, 16*16+1)), "PerformanceWaitCount": list(range(-1, 16)), - # add gls or slc after global memory read/writes to change cacheing, not cacheing the writes is promising and improved performance a tiny bit + # add gls or slc after global memory read/writes to change caching, not caching the writes is promising and improved performance a tiny bit # 1: glc, 2: slc, 3: glc+slc "NonTemporalD": list(range(0,4)), "NonTemporalC": list(range(0,4)), @@ -1179,7 +1179,7 @@ def getArchitectureName(gfxName): # For example, InnerUnroll=2 will fetch LDS for two unroll iterations "InnerUnroll": [1,2,4,8,16,32,64], - # Arrange elements in LDS so N elements consec in U-dim are adjacent in LDS + # Arrange elements in LDS so N elements consecutive in U-dim are adjacent in LDS # 1 is default and results in no interleaving. # Implementation only supports LocalDotLayout that is a power-of-two "LocalDotLayout": [1,2,4,8], @@ -1381,7 +1381,7 @@ def getArchitectureName(gfxName): "TensorBFormat", # see validTensorBFormats "TensorDFormat", # see validTensorDFormats - # Each of the parms below specifies dimensions separated by 'x". + # Each of the params below specifies dimensions separated by 'x". # - The notation follows 'convolution' convention so fastest-moving dimensions are last, # and should mirror the order of the spatial dimension in the activation format. # For example, in NCHW format Filter=3x1 is 3 in the H dimension and 1 in the W dimension. @@ -1422,7 +1422,7 @@ def getArchitectureName(gfxName): # iteration count for the unroll loop. # If 0: # - Unroll index is filter index (Forward,BackwardData) or spatial index (BackwardWeights) - # - provides better cache locality for most formats, but tigher looping. + # - provides better cache locality for most formats, but tighter looping. # - Likely a good idea with PackSummationDims=1 since there is only one unroll loop. "UnrollOnChannel", @@ -1430,7 +1430,7 @@ def getArchitectureName(gfxName): # Optional parameter for debug and testing. This does not impact kernel generation. # If set,then each problem dimension size/stride will be checked to ensure they are # correctly specified. (TBD) - # Also used by testbenches to compute consistent strides and sizes for auto-generated + # Also used by test benches to compute consistent strides and sizes for auto-generated # problem sizes and strides. 'Spatial', # examples 56x56, 7x7. @@ -1440,7 +1440,7 @@ def getArchitectureName(gfxName): # Default Problem Type ################################################################################ defaultProblemType = { - # =GEMM uses TransposeA,B paramters and makes the problem type more readeable for users + # =GEMM uses TransposeA,B parameters and makes the problem type more readable for users # =TensorContraction requires specifying "OperationType": "GEMM", # GEMM, TensorContraction, ConvolutionForward, ConvolutionBackwardData, ConvolutionBackwardWeights @@ -1543,7 +1543,7 @@ def getArchitectureName(gfxName): # - Typical use case is to set summationStride < freeSize, with padStart+padEnd+1 == summationStride. 
# - Caveats: # - ZeroPad requires that the ElementEdge <= 2^32: - # This is SizeFree+SizeSum + Pad_Leading + PadTrailingPad + padding=GRWW for shift-pointer) bytes < 2^32 + # This is SizeFree+SizeSum + Pad_Leading + PadTrailingPad + padding=GRVW for shift-pointer) bytes < 2^32 # Likely this is less than the standard buffer load limits (bottom-right corner of macro-tile) # EX: ZeroPadA: [ [0,1, 2,3]] # TensorA free index 0 with sum index 1 has leading pad=2 and trailing pad=3 @@ -1680,47 +1680,52 @@ def locateExe( defaultPath, exeName ): # /opt/rocm/bin, hip-clang def GetAsmCaps(isaVersion): """ Determine assembler capabilities by testing short instructions sequences """ rv = {} - rv["SupportedISA"] = tryAssembler(isaVersion, "") - rv["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,1") - rv["HasExplicitNC"] = tryAssembler(isaVersion, "v_add_nc_u32 v0,v0,1") + rv["SupportedISA"] = tryAssembler(isaVersion, "") + rv["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,1") + rv["HasExplicitNC"] = tryAssembler(isaVersion, "v_add_nc_u32 v0,v0,1") - rv["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds") \ - or tryAssembler(isaVersion, "buffer_load_b32 v40, v36, s[24:27], s28 offen offset:0 lds") - rv["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2") - rv["HasLshlOr"] = tryAssembler(isaVersion, "v_lshl_or_b32 v47, v36, 0x2, v34") - rv["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34") - rv["HasCodeObjectV3"] = tryAssembler(isaVersion, "", False, "-mcode-object-version=2") + # Syntax of DirectToLds loads has changed: destination vgpr should be omitted + # Old syntax should be removed in a future update as it is no longer supported + rv["HasDirectToLdsDest"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds") \ + or tryAssembler(isaVersion, "buffer_load_b32 v40, v36, s[24:27], s28 offen offset:0 lds") + rv["HasDirectToLdsNoDest"] = tryAssembler(isaVersion, "buffer_load_dword v36, s[24:27], s28 offen offset:0 lds") \ + or tryAssembler(isaVersion, "buffer_load_b32 v36, s[24:27], s28 offen offset:0 lds") - rv["HasMFMA"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x2bf16 a[0:31], v32, v33, a[0:31]") - rv["HasMFMA_f64"] = tryAssembler(isaVersion, "v_mfma_f64_16x16x4f64 v[0:7], v[32:33], v[36:37], v[0:7]") - rv["HasMFMA_bf16_1k"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x4bf16_1k a[0:31], v[32:33], v[36:37], a[0:31]") + rv["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2") + rv["HasLshlOr"] = tryAssembler(isaVersion, "v_lshl_or_b32 v47, v36, 0x2, v34") + rv["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34") + rv["HasCodeObjectV3"] = tryAssembler(isaVersion, "", False, "-mcode-object-version=2") - rv["v_mac_f16"] = tryAssembler(isaVersion, "v_mac_f16 v47, v36, v34") + rv["HasMFMA"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x2bf16 a[0:31], v32, v33, a[0:31]") + rv["HasMFMA_f64"] = tryAssembler(isaVersion, "v_mfma_f64_16x16x4f64 v[0:7], v[32:33], v[36:37], v[0:7]") + rv["HasMFMA_bf16_1k"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x4bf16_1k a[0:31], v[32:33], v[36:37], a[0:31]") - rv["v_fma_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0,0]") - rv["v_fmac_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34") + rv["v_mac_f16"] = tryAssembler(isaVersion, "v_mac_f16 v47, v36, v34") - rv["v_pk_fma_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, 
v36, v34, v47, op_sel:[0,0,0]")
- rv["v_pk_fmac_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34")
- rv["v_mad_mix_f32"] = tryAssembler(isaVersion, "v_mad_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
- rv["v_fma_mix_f32"] = tryAssembler(isaVersion, "v_fma_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
+ rv["v_pk_fma_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0]")
+ rv["v_pk_fmac_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34")
- rv["v_dot2_f32_f16"] = tryAssembler(isaVersion, "v_dot2_f32_f16 v20, v36, v34, v20")
- rv["v_dot2c_f32_f16"] = tryAssembler(isaVersion, "v_dot2c_f32_f16 v47, v36, v34") \
- or tryAssembler(isaVersion, "v_dot2acc_f32_f16 v47, v36, v34")
+ rv["v_mad_mix_f32"] = tryAssembler(isaVersion, "v_mad_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
+ rv["v_fma_mix_f32"] = tryAssembler(isaVersion, "v_fma_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
- rv["v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34")
- rv["v_dot4c_i32_i8"] = tryAssembler(isaVersion, "v_dot4c_i32_i8 v47, v36, v34")
- rv["VOP3v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34, v47")
+ rv["v_dot2_f32_f16"] = tryAssembler(isaVersion, "v_dot2_f32_f16 v20, v36, v34, v20")
+ rv["v_dot2c_f32_f16"] = tryAssembler(isaVersion, "v_dot2c_f32_f16 v47, v36, v34") \
+ or tryAssembler(isaVersion, "v_dot2acc_f32_f16 v47, v36, v34")
- rv["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22")
- rv["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23")
- rv["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22")
+ rv["v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34")
+ rv["v_dot4c_i32_i8"] = tryAssembler(isaVersion, "v_dot4c_i32_i8 v47, v36, v34")
+ rv["VOP3v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34, v47")
- rv["v_fma_f64"] = tryAssembler(isaVersion, "v_fma_f64 v[20:21], v[22:23], v[24:25], v[20:21]")
+ rv["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22")
+ rv["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23")
+ rv["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22")
- rv["HasAtomicAdd"] = tryAssembler(isaVersion, "buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:0")
+ rv["v_fma_f64"] = tryAssembler(isaVersion, "v_fma_f64 v[20:21], v[22:23], v[24:25], v[20:21]")
+
+ rv["HasAtomicAdd"] = tryAssembler(isaVersion, "buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:0")
 if tryAssembler(isaVersion, "s_waitcnt vmcnt(63)"):
@@ -1819,7 +1824,7 @@ def detectGlobalCurrentISA():
 for line_in in process.stdout.decode().splitlines():
 if 'gcnArchName' in line_in:
 line += line_in.split()[1]
- break # detemine if hipinfo will support multiple arch
+ break # determine if hipinfo will support multiple arch
 arch = gfxArch(line.strip())
 if arch is not None:
 if arch in globalParameters["SupportedISA"]:
@@ -1897,7 +1902,7 @@ def assignGlobalParameters( config ):
 """ Assign Global Parameters
 Each global parameter has a default parameter, and the user
- can override them, those overridings happen here
+ can override them; the overriding happens here
 """
 global globalParameters
@@ -1917,7 +1922,7 @@ def assignGlobalParameters( config ):
 if configValue == defaultValue:
print2(" %24s: %8s (same)" % (key, configValue)) else: - print2(" %24s: %8s (overriden)" % (key, configValue)) + print2(" %24s: %8s (overridden)" % (key, configValue)) else: print2(" %24s: %8s (unspecified)" % (key, defaultValue)) @@ -2025,7 +2030,7 @@ def assignGlobalParameters( config ): for key in config: value = config[key] if key not in globalParameters: - printWarning("Global parameter %s = %s unrecognised." % ( key, value )) + printWarning("Global parameter %s = %s unrecognized." % ( key, value )) globalParameters[key] = value def setupRestoreClocks(): diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index a3b25872d..7cd72e8c5 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -743,7 +743,10 @@ def initKernel(self, kernel, tPA, tPB ): self.AsmBugs["ExplicitCO"] = globalParameters["AsmCaps"][self.version]["HasExplicitCO"] self.AsmBugs["ExplicitNC"] = globalParameters["AsmCaps"][self.version]["HasExplicitNC"] - if not globalParameters["AsmCaps"][self.version]["HasDirectToLds"]: + hasDtl = globalParameters["AsmCaps"][self.version]["HasDirectToLdsDest"] or globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] + if not hasDtl: + if kernel["DirectToLds"]: + printExit("DirectToLds requested, but not available on this architecture ( {} )".format(self.version)) kernel["DirectToLdsA"] = False kernel["DirectToLdsB"] = False kernel["LocalWriteUseSgprA"] = False # Requires DirectToLdsA @@ -2223,6 +2226,16 @@ def defineBufferMemoryMacros(self): replace = f'{type_list[t]}' if (self.version[0] < 11) else f'{t}' kStr += self.generalMacro('buffer_load_', origin, replace, 'dst', 'voffset', 'base', 'soffset', 'offen', 'ioffset', 'md0', 'md1', 'md2') + self.endLine + # Extra macro for DirectToLds loads with no destination register + type_list = { + 'b32' : 'dword', + 'u16' : 'ushort' + } + for t in type_list: + origin = f'{t}' + replace = f'{type_list[t]}' if (self.version[0] < 11) else f'{t}' + kStr += self.generalMacro('buffer_load_', origin + '_dtl', replace, 'voffset', 'base', 'soffset', 'offen', 'ioffset', 'md0', 'md1', 'md2') + self.endLine + type_list = { 'b32' : 'dword', 'b64' : 'dwordx2', @@ -7294,8 +7307,11 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): extraFields += " glc" if tP["NonTemporal"]//2==1: extraFields += " slc" + dtlNoDestVgpr = False if kernel["DirectToLds%s"%tc]: extraFields += " lds" + dtlNoDestVgpr = globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] + directToLdsLoads = 0 prevLdsOffset = 0 @@ -7503,6 +7519,7 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): addr0=vgpr(offsetVgpr), addr1=sgpr("Srd%s"%tc, 4), \ soffset=soffset, offset=offset, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=hi16, \ comment=comment).toStr() @@ -7529,6 +7546,7 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): addr0=vgpr("GlobalReadAddr%s+%u"%(tc,graIdx),2), addr1="", \ soffset=0, offset=0, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=hi16, \ comment="load one flat value").toStr() @@ -7793,8 +7811,10 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): extraFields += " glc" if tP["NonTemporal"]//2==1: extraFields += " slc" + dtlNoDestVgpr = False if kernel["DirectToLds%s"%tc]: extraFields += " lds" + dtlNoDestVgpr = globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] directToLdsLoads = 0 instOffset = 0 @@ -7900,6 +7920,7 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): addr0=vgpr(offsetVgpr), addr1=sgpr("Srd%s"%tc, 4), \ 
soffset=soffset, offset=instOffset, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=(kernel["ProblemType"]["DataType"].isHalf() or kernel["ProblemType"]["DataType"].isBFloat16()) and loopCnt%2==1, \ comment="G -> Reg %u_%u_%u_%u"%(para, sPara, perp, sPerp))) @@ -7925,6 +7946,7 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): addr0=vgpr("GlobalReadAddr%s+%u"%(tc,graIdx),2), addr1="", \ soffset=0, offset=0, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=(kernel["ProblemType"]["DataType"].isHalf() or kernel["ProblemType"]["DataType"].isBFloat16()) and loopCnt%2==1, \ comment="G -> Reg %u_%u_%u_%u"%(para, sPara, perp, sPerp ))) @@ -10982,7 +11004,7 @@ def globalWriteElements(self, kernel, vectorWidths, elements, # bpl = bytes per load op ############################################################################## def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ - addr0, addr1, soffset, offset, extraFields, hi16=0, comment="load C"): + addr0, addr1, soffset, offset, extraFields, dtlNoDestVgpr, hi16=0, comment="load C"): # rpv = regs per vector rpv = bpl/4.0 @@ -11000,34 +11022,25 @@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ assert 0, "offset too large and soffset set" if extraFields != "": tailFields += ", %s"% extraFields + globalReadInst = None if bpl==1 and hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_hi_u8", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_hi_u8" + rpv *= 4 elif bpl==1 and not hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_u8", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_u8" + rpv *= 4 elif bpl==2 and hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_hi_b16", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_hi_b16" + rpv *= 2 elif bpl==2 and not hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_b16", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_b16" + rpv *= 2 elif bpl==4: - rv.addCode(Code.GlobalReadInst("_buffer_load_b32", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b32" elif bpl==8: - rv.addCode(Code.GlobalReadInst("_buffer_load_b64", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b64" elif bpl==16: - rv.addCode(Code.GlobalReadInst("_buffer_load_b128", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b128" elif bpl==32: # split into two dwordx4 loads. 
Second load offset is +0.5 bpl tailFields1 = "offen offset:%u"%(offset + bpl/2) @@ -11039,9 +11052,17 @@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ addr1, soffset, tailFields, comment)) rv.addCode(Code.GlobalReadInst("_buffer_load_b128", vgpr(int(destVgpr + rpv/2), rpv/2), addr0, \ addr1, soffset, tailFields1, comment)) + return rv else: assert 0, "chooseGlobalRead: bad bpl" + if dtlNoDestVgpr: + globalReadInst += "_dtl" + args = [globalReadInst] + if not dtlNoDestVgpr: + args.append(vgpr(destVgpr, rpv)) + args.extend([addr0, addr1, soffset, tailFields, comment]) + rv.addCode(Code.GlobalReadInst(*args)) return rv else: @@ -11310,7 +11331,7 @@ def readCInput(self, kernel, ss, addrCalc, vc0, data, gwvw, addr, tmpS01): if kernel["ProblemType"]["DestDataType"].isHalf(): kStr += self.chooseGlobalRead(useBuffer, bps, data, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, \ - extraFields=extraStr, hi16=vc0 % 2, + extraFields=extraStr, dtlNoDestVgpr=False, hi16=vc0 % 2, comment="load C for beta calc").toStr() elif kernel["ProblemType"]["DestDataType"].isBFloat16() or \ kernel["ProblemType"]["DestDataType"].isInt32() or \ @@ -11321,6 +11342,7 @@ def readCInput(self, kernel, ss, addrCalc, vc0, data, gwvw, addr, tmpS01): kStr += self.chooseGlobalRead(useBuffer, bps, data, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, \ extraFields=extraStr, \ + dtlNoDestVgpr=False, \ comment="load C for beta calc").toStr() return kStr @@ -11501,6 +11523,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, vgprIdx = 1*(bpm//4) kStr += self.chooseGlobalRead(useBuffer, bpm, dataV+vgprIdx, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, extraFields="", + dtlNoDestVgpr=False, \ comment="load D (atomic) bpm=%u vaw=%u"%(bpm,atomicW)).toStr() if kernel["InterleaveAlpha"] and applyAlpha: @@ -12172,19 +12195,19 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, if kernel["ProblemType"]["DestDataType"].isHalf() or kernel["ProblemType"]["DestDataType"].isBFloat16(): if not kernel["ProblemType"]["HighPrecisionAccumulate"]: kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx//2, \ - addr0, addr1, soffset=0, offset=0, extraFields="", hi16=sumIdx%2).toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False, hi16=sumIdx%2).toStr() else: kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx, \ - addr0, addr1, soffset=0, offset=0, extraFields="", hi16=0).toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False, hi16=0).toStr() elif kernel["ProblemType"]["DestDataType"].isInt32() or kernel["ProblemType"]["DestDataType"].isSingle(): kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() elif kernel["ProblemType"]["DestDataType"].isDouble() or kernel["ProblemType"]["DestDataType"].isSingleComplex() : kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx*2, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() elif kernel["ProblemType"]["DestDataType"].isDoubleComplex(): kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx*4, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() kStr += inst("s_waitcnt", "vmcnt(0)", "CheckStoreC, wait for stores to complete" ) if 
self.archCaps["SeparateVscnt"]: kStr += inst("s_waitcnt_vscnt", "null", "0", "writes") diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 15ab45103..87e63f6d7 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -2435,6 +2435,11 @@ def isDirectToLdsDoable(state, tc): #TN # use for all precisions with TransposeLDS=1 + numRegisters = state["ProblemType"]["DataType"].numRegisters() + if numRegisters * state["GlobalLoadVectorWidth%c"%tc] != 1: + reject(state, "DirectToLds can only be used with buffer loads requiring 1 register") + return False + if state["ProblemType"]["DataType"].isHalf(): if state["AssertSummationElementMultiple"] % (2 * state["GlobalLoadVectorWidth%c"%tc]) != 0: reject(state, "can't use DirectToLds for FP16 with AssertSummationElementMultiple %u" % state["AssertSummationElementMultiple"]) @@ -3437,6 +3442,9 @@ def assignDerivedParameters(state): state["DirectToLdsB"] = True state["LocalWriteUseSgprB"] = True #print("DirectToLdsB", state["DirectToLdsB"]) + + if state["Valid"] and state["DirectToLds"] and not (state["DirectToLdsA"] or state["DirectToLdsB"]): + printWarning("DirectToLds requested, but not enabled for A or B, check kernel configuration!") # Update parent variable so kernel display is accurate state["DirectToLds"] = state["DirectToLdsA"] or state["DirectToLdsB"] diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml diff --git a/Tensile/Tests/pre_checkin/direct_to_lds/dtl_dgemm_lite.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/direct_to_lds/dtl_dgemm_lite.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_hgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_hgemm.yaml diff --git a/tox.ini b/tox.ini index eb965b16e..cb38e81fb 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,6 @@ [tox] -envlist = py35,py36,py27,lint +envlist = py35,py36,py38,py27,lint + [testenv] # Some versions of Pytest versions have a bug: