From 006a5d653ce0d82fecb05d5e215d053749b57c04 Mon Sep 17 00:00:00 2001
From: Alex Brown
Date: Sat, 29 Oct 2022 11:39:08 -0600
Subject: [PATCH] hotfix: Assembly syntax and coverage for DirectToLds have changed (#1598) (#1610)

---
.jenkins/common.groovy | 4 +-
Tensile/Common.py | 177 +++++++++---------
Tensile/KernelWriterAssembly.py | 81 +++++---
Tensile/SolutionStructs.py | 8 +
.../direct_to_lds/dtl_dgemm.yaml | 0
.../direct_to_lds/dtl_dgemm_lite.yaml | 0
.../direct_to_lds/dtl_hgemm.yaml | 0
tox.ini | 3 +-
8 files changed, 155 insertions(+), 118 deletions(-)
rename Tensile/Tests/{extended => disabled}/direct_to_lds/dtl_dgemm.yaml (100%)
rename Tensile/Tests/{pre_checkin => disabled}/direct_to_lds/dtl_dgemm_lite.yaml (100%)
rename Tensile/Tests/{extended => disabled}/direct_to_lds/dtl_hgemm.yaml (100%)
diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy
index 016289ca2..c8416a4e5 100644
--- a/.jenkins/common.groovy
+++ b/.jenkins/common.groovy
@@ -30,7 +30,7 @@ def runCompileCommand(platform, project, jobName, boolean debug=false)
 project.paths.construct_build_prefix()
 String compiler = 'hipcc'
- String pythonVersion = 'py36'
+ String pythonVersion = 'py3'
 String cov = "V3"
 String buildType = debug ? 'Debug' : 'RelWithDebInfo'
 String parallelJobs = "export HIPCC_COMPILE_FLAGS_APPEND=-parallel-jobs=2"
@@ -110,7 +110,7 @@ def runTestCommand (platform, project, jobName, test_marks, boolean skipHostTest
 def test_dir = "Tensile/Tests"
 String compiler = 'hipcc'
- String pythonVersion = 'py36'
+ String pythonVersion = 'py3'
 String markSkipHostTest = skipHostTest ? "#" : ""
 String markSkipExtendedTest = !test_marks.contains("extended") ? "--gtest_filter=-\"*Extended*\"" : ""
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 13c39a3cd..be8bbbab8 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -83,7 +83,7 @@
 globalParameters["ForceRedoLibraryLogic"] = True # if False and library logic already analyzed, then library logic will be skipped when tensile is re-run
 globalParameters["ForceRedoLibraryClient"] = True # if False and library client already built, then building library client will be skipped when tensile is re-run
-# Compare CPU reference convolution model vs golden tensor contracton model
+# Compare CPU reference convolution model vs golden tensor contraction model
 # Useful to test if conversion from tensor contraction is working as expected
 # In this mode, the filter,stride,dilation are specified in the problem type.
 # If the problem type uses constant Filter,Stride,Dilation,Pad* (ie these are not 'N'), then the
@@ -92,7 +92,7 @@
 globalParameters["ConvolutionVsContraction"] = False
 globalParameters["ShowProgressBar"] = True # if False and library client already built, then building library client will be skipped when tensile is re-run
-globalParameters["SolutionSelectionAlg"] = 1 # algorithm to detetermine which solutions to keep. 0=removeLeastImportantSolutions, 1=keepWinnerSolutions (faster)
+globalParameters["SolutionSelectionAlg"] = 1 # algorithm to determine which solutions to keep. 0=removeLeastImportantSolutions, 1=keepWinnerSolutions (faster)
 globalParameters["ExpandRanges"] = True # expand ranges into exact configs before writing logic file. False ignores ranges.
 globalParameters["ExitAfterKernelGen"] = False # Exit after generating kernels
 globalParameters["GenerateSourcesAndExit"] = False # Exit after kernel source generation. 
@@ -162,7 +162,7 @@ # build parameters globalParameters["CMakeCXXFlags"] = "" # pass flags to cmake globalParameters["CMakeCFlags"] = "" # pass flags to cmake -globalParameters["DebugKernel"] = False # assembly only, kernel gets buffer for debug "printing"; kernel writes data to memory, gets coppied to host and printed +globalParameters["DebugKernel"] = False # assembly only, kernel gets buffer for debug "printing"; kernel writes data to memory, gets copied to host and printed globalParameters["LibraryPrintDebug"] = False # solutions will print enqueue info when enqueueing a kernel # debug for assembly @@ -185,7 +185,7 @@ globalParameters["PrintCodeCommands"] = False # print the commands used to generate the code objects (asm,link,hip-clang, etc) globalParameters["DumpTensors"] = False # If True, dump tensors to binary files instead of printing them. -# If PrintMax* is greater than the dimension, the middle elements will be repaced with "..." +# If PrintMax* is greater than the dimension, the middle elements will be replaced with "..." # device selection @@ -256,7 +256,7 @@ # directory where custom kernels are located globalParameters["CustomKernelDirectory"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "CustomKernels") -globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile trainning verification or not +globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile training verification or not globalParameters["SeparateArchitectures"] = False # write Tensile library metadata to separate files for each architecture @@ -340,8 +340,8 @@ def getArchitectureName(gfxName): # The supported typed GEMM, each entry is (Ti, To, Tc). # DataType (Ti) = The data-type of the input matrices: A/B # DestDataType (To) = The data-type of the output matrices: C/D -# ComputeDataType (Tc) = The data-type of computaiton: alpha/beta: -# Cinternal: basically should == ComputeDataType +# ComputeDataType (Tc) = The data-type of computation: alpha/beta: +# CInternal: basically should == ComputeDataType # Align the supported GEMM type with rocBLAS: [A/B/ C/D/ alpha/beta] # (rocblas/library/include/internal/rocblas_functions.h) @@ -465,7 +465,7 @@ def getArchitectureName(gfxName): "GlobalSplitU": list(range(1, 1024+1)), # choose how to do GlobalSplitU - # 1: use atomic operation to accumlate on one buffer + # 1: use atomic operation to accumulate on one buffer # 2: each GSU group write to each own buffer and accumulate by another kernel "GlobalSplitUAlgorithm": ["SingleBuffer", "MultipleBuffer"], @@ -477,14 +477,14 @@ def getArchitectureName(gfxName): # GSUSARR=True means the 4 workgroups round robin split up the chunks of the summation: k=0 -> DU-1, 4DU -> 5DU-1, ...; k=1DU -> 2DU-1, 5DU -> 6DU-1...; ... 
"GlobalSplitUSummationAssignmentRoundRobin": [ False, True ], - # in opencl for some compilers, performance improved by putting a memfence after each subiteration; it prevented the loads of one subiteration from being moved + # in opencl for some compilers, performance improved by putting a memfence after each sub-iteration; it prevented the loads of one sub-iteration from being moved # into a prior iteration, which would help latency but it consumed more vgprs which was a net loss "UnrollMemFence": [ False, True ], # not used yet; will refer to combining multiple reads into single instruction # such as ds_read_b32 -> ds_read2_b32 # the pro is that it cuts in half the number of instructions - # the con is that bits per offset is half, so arithmatic might be required to increment and reset offset vgprs + # the con is that bits per offset is half, so arithmetic might be required to increment and reset offset vgprs "GlobalRead2A": [ False, True ], "GlobalRead2B": [ False, True ], "LocalWrite2A": [ False, True ], @@ -501,7 +501,7 @@ def getArchitectureName(gfxName): # This eliminates 4 vector XOR instructions used for pointer swap "ExpandPointerSwap": [False, True], - # Schedule global reads and global read incrementsinto LocalRead iterations + # Schedule global reads and global read increments into LocalRead iterations # Can reduce pressure on local read instruction dispatch queue # 0=perform global reads at start of instruction loop # 1=schedule into the local read instruction iterations @@ -535,7 +535,7 @@ def getArchitectureName(gfxName): # the purpose of this parameter is to control density of local write instruction scheduling # In PGR1, we want to schedule local write more denser, so we can have more # latency to hide global read - # In PGR2, since LW is followed by GR, every LW has same whole loop latecy + # In PGR2, since LW is followed by GR, every LW has same whole loop latency # to hide global read. We want to schedule LW less denser, can # avoid full of vmem FIFO. # Range from 0.01 to 32 @@ -568,7 +568,7 @@ def getArchitectureName(gfxName): # the next tile in the sequence. "PrefetchAcrossPersistent": [0, 1], - # Changes the behaviour of prefetch across persistent. + # Changes the behavior of prefetch across persistent. # Mode 0 is default, works for all sizes # Mode 1 disables static tile setup for prefetch and merges prefetch with ord. noLoadLoop, "PrefetchAcrossPersistentMode": [0, 1], @@ -610,7 +610,7 @@ def getArchitectureName(gfxName): # + Uses hardware buffer limit for edge detection # - Limited range - the bot-right corner of macro-tile (plus padding=GRVW # for shift-pointer, if ShiftPtr is required) must be within 2^32. - # ShiftPtrPad = MayShift ? GRWV*BPE : 0 + # ShiftPtrPad = MayShift ? GRVW*BPE : 0 # For TLU=1: Unroll*StrideA1 + ShiftPtrPad <= 2^32 # For TLU=0: MT*StrideA1 + ShiftPtrPad <= 2^32 # These conditions should be checked using Assert - TODO @@ -620,7 +620,7 @@ def getArchitectureName(gfxName): # + Each SGPR stores an offset from base GlobalReadOffset+0. 
# - Requirements for UseSgprForGRO=1: # - BufferLoad=1 - # - Use appropriate Assert*ElementMultiple or GRVW=1 to eliminate need for ShifPtr + # - Use appropriate Assert*ElementMultiple or GRVW=1 to eliminate need for ShiftPtr # (UseSgprForGRO does not support ShiftPtr since ShiftPtr needs to potentially shift GRO) # = KernelWriterAssembly also supports 64-bit 2D buffer size (see use64bPbcLimit) # - Requires 4 instructions to move scalar limit and a couple SGPR @@ -633,18 +633,18 @@ def getArchitectureName(gfxName): # - DirectToLds=1 # converting m0 update from LocalWriteAddrSGpr using is usually win - # -1 attempt to use a hueristic to determine when the tile size will use too many SGPR and fall back to VGPR + # -1 attempt to use a heuristic to determine when the tile size will use too many SGPR and fall back to VGPR "UseInstOffsetForGRO": [ -1, 0, 1], # Converting VGPR GRO into SGPR GRO is usually a win # However, the mode may exhaust all available SGPR, in particular for large unroll - # -1 attempt to use a hueristic to determine when the tile size will use too many SGPR and fall back to VGPR + # -1 attempt to use a heuristic to determine when the tile size will use too many SGPR and fall back to VGPR "UseSgprForGRO": [ -1, 0, 1], # Some work-items in the group may not participate in the final buffer load. # Allows more flexibility in choosing DepthU. - # 1= allocate extra addressing vpgr for edge cases + # 1= allocate extra addressing vgpr for edge cases # 2= use temp vgpr inside unroll loop, may save 1 VPR if both A and B have a fractional edge but costs v_alu "FractionalLoad": [ 0, 1, 2] , @@ -775,11 +775,11 @@ def getArchitectureName(gfxName): # - Higher values will spread traffic to more channels but provide less L2 re-use. # - StaggerU and WorkGroupMapping interact and should be tuned together - # The WGM controls how tiles are assigned in C matrix, while StaggerU controls where those - # tiles start reading their summation dim parms. + # tiles start reading their summation dim params. # - StaggerU requires BufferLoad==1 and is silently ignored if BufferLoad==0 "StaggerU": [0,2,4,8,16,32,64], - # Stride in bytes for each staggeru 'click'. + # Stride in bytes for each staggerU 'click'. # 256 is recommended since this is the width of memory channel (on gfx803,gfx900,gf906) - so # each click will start in a new memory channel and spread traffic among the 16 available channels. # For example StaggerUStride=256 and StaggerU=8 will use 8 unique starting points @@ -796,7 +796,7 @@ def getArchitectureName(gfxName): # 1: Use wg1 # 2: Use wg2 # 3: Use wgSerial, wgSerial = wg0 + wg1 * nwg0 + wg2 * (nwg0 * nwg1) - # 4: Debug mode, offset each tile max allowed StaggerU. This just moves hotspot + # 4: Debug mode, offset each tile max allowed StaggerU. This just moves hot-spot # to a different bank since all workgroups still start at same point. "StaggerUMapping": [0,1,2,3,4], @@ -813,7 +813,7 @@ def getArchitectureName(gfxName): # Tensor C always mapped with first free coord as fastest moving # (Elements in this dimension are sequential in memory. # - # For 2D nonbatched Matrix this means index order is I, then J + # For 2D non-batched Matrix this means index order is I, then J # For 2D batched Matrix this means index order is I, then J, then K. # # Then for 2D case: @@ -827,7 +827,7 @@ def getArchitectureName(gfxName): # Examples for 2D matrix: # WGM=8: on CU64 machine this is a square box # WGM=1: Short/Fat - this will cover maximum width in I dimension of C. 
This matches hardware assigned mapping.
- # WGM=64: Tall/Skinny - this will cover maximum width in J dimention of C.
+ # WGM=64: Tall/Skinny - this will cover maximum width in J dimension of C.
 #
 # Formula for wgSerial:
 # wgSerial = wg0 + (wg1 % WorkGroupMapping) * nwg0
@@ -872,18 +872,18 @@ def getArchitectureName(gfxName):
 # StoreRemap: Optimize MatrixInstruction store patterns to enhance performance.
 # MI output data between each threads are along N dims.
- # But global memory is along M dim continous.
- # That mean global write between each threads are not continous.
+ # But global memory is continuous along the M dim.
+ # That means global writes between threads are not continuous.
 # Therefore, store performance for MI instruction is poor.
 # How StoreRemap works in final store stage:
 # 1. Put all thread output data into LDS.
 # 2. All thread read data from LDS along M dims.
- # (match global Memory continous direction)
+ # (matches the continuous direction of global memory)
 # 3. All thread write out data into global memory.
 # 0: Disable StoreRemap (default)
 # 1~8: Enable StoreRemap and set the global write vector width
- # Suggest optimum value: fp32 = [2,4], fp16 or bf16 = [4,8] (dwordx2 and dowrdx4)
- # -1: Use dwordx2 if support SRVW, or set SRVW to 0
+ # Suggested optimum values: fp32 = [2,4], fp16 or bf16 = [4,8] (dwordX2 and dwordX4)
+ # -1: Use dwordX2 if SRVW is supported, or set SRVW to 0
 "StoreRemapVectorWidth": [-1,0,1,2,4,8],
 # SourceSwap: Optimizes MatrixInstruction store pattern by swapping mfma input order.
@@ -957,7 +957,7 @@ def getArchitectureName(gfxName):
 # Disable overlapping AB-tile vgpr and read/write addr vgprs with C-tile vgprs
 # Valid only for MatrixInstruction enabled kernels, which by default overlaps
- # C-tile w/ AB-tile until it's due for v_accvgpr_read before the writeback. Illustrated below:
+ # C-tile w/ AB-tile until it's due for v_accvgpr_read before the write-back. Illustrated below:
 # |<----------------------- valuC ----------------------->|
 # |<--- valuA/B --->|<-- R/W pointers -->|xxx|<- Spares ->|
 # ^ ^
@@ -979,7 +979,7 @@ def getArchitectureName(gfxName):
 # 7= +NoPreLoop+ NoGlobalReadInc
 # 9= NullKernel
 # For example set DisableKernelPieces: [0,1,2,3,4,5,6,7,9]
- # this will create a set of kernels with progessively more pieces of the kernel disabled
+ # this will create a set of kernels with progressively more pieces of the kernel disabled
 "DisableKernelPieces": list(range(-9,10)), # disable pieces of the kernel, for performance isolation
 # assume atomics always work correctly.
@@ -998,7 +998,7 @@ def getArchitectureName(gfxName):
 # - Host code will not launch more groups than tiles in the C space
 # -1 : Automatically choose a "heuristic" value that can possibly get a better gain: (TilesPerWorkgroup = 1~2)
 # Not based on any theory, but on some experiment observation, can be used to reduce the kernels
- # Recommand [-1,0,1] for basic tuning
+ # Recommend [-1,0,1] for basic tuning
 # Assertions/Requirements: NumWorkGroups0 * NumWorkGroups1 < 2^32
 "PersistentKernel": range(-1,512+1) , # Use persistent kernel.
@@ -1048,9 +1048,9 @@ def getArchitectureName(gfxName):
 # Controls desired width (#elements) for loads from global memory -> LDS.
# and eliminates the pointer unshift logic # -1 : Set GlobalReadVectorWidth = VectorWidth - # NOTE: for input bpe=32, max GRVW is 4 (to fit dwordx4) (FP32), min GRVW is 1 (dword) - # bpe=16, max GRVW is 8 (to fit dwordx4) (FP16), min GRVW is 2 (dword) - # bpe=8, max GRVW is 16 (to fit dwordx4) (INT8), min GRVW is 4 (dword) + # NOTE: for input bpe=32, max GRVW is 4 (to fit dwordX4) (FP32), min GRVW is 1 (dword) + # bpe=16, max GRVW is 8 (to fit dwordX4) (FP16), min GRVW is 2 (dword) + # bpe=8, max GRVW is 16 (to fit dwordX4) (INT8), min GRVW is 4 (dword) "GlobalReadVectorWidth": [ -1, 1, 2, 3, 4, 6, 8, 16 ], # Controls desired width (#elements) for loads from LDS -> VGPR. @@ -1066,12 +1066,12 @@ def getArchitectureName(gfxName): "LocalReadVectorWidth": [ -1, 1, 2, 4, 8, 16 ], # threads should read/write/operate on this many contiguous elements from the C matrix. - # If VW=4 then thread0 will process 4 consec C elements, then thread1 next 4, etc. + # If VW=4 then thread0 will process 4 consecutive C elements, then thread1 next 4, etc. # If the ThreadTile is > VectorWidth then thread0 will next operate on the 4 elements in C at (4*NumThreads) # Typically the load vector width and store vector width are directly related to the VW. # The global load width is closely related to the width of local stores so # GlobalReadVectorWidth also controls local write width. - # Local read width also matches since VectorWidth consec elements must be read + # Local read width also matches since VectorWidth consecutive elements must be read # Typically matching 16 bytes is good choice since the stores will be optimally coalesced with 16 bytes/WI. # -1 means use the largest vector width up to 128 bits. # Using a VW too large which results in >16bytes/thread isn't supported @@ -1085,7 +1085,7 @@ def getArchitectureName(gfxName): "VectorStore": [-1, 0, 1], # Controls desired width (#elements) for stores from reg to global memory. - # When MatrixInstruciton == None, derived parameter gwvw takes precedence. + # When MatrixInstruction == None, derived parameter gwvw takes precedence. # -1 : Set StoreVectorWidth = VectorWidth "StoreVectorWidth": [ -1, 1, 2, 3, 4, 6, 8 ], @@ -1097,20 +1097,20 @@ def getArchitectureName(gfxName): # when loading all the data from global into lds requires multiple load instructions, these parameters govern which # loads will pull which rectangle of data from global into lds # NLC=1 means one load along the coalesced dimension, which results in the most coalescing possible - # NLC=-1 looks for the largest number of reads along the coalesced dimension which results in the least ammount of coalescing; + # NLC=-1 looks for the largest number of reads along the coalesced dimension which results in the least amount of coalescing; # however in this case the stride between one load and another is a static value, therefore buffer loads only need one set of registers # whereas the =1 case has a stride which is a multiple of a kernel argument and therefore needs one address per load in the perpendicular dimension "NumLoadsCoalescedA": list(range(-1, 64+1)), "NumLoadsCoalescedB": list(range(-1, 64+1)), # DepthU, LocalSplitU (which is the 3rd number in WorkGroup), and LoopUnroll are closely related - # LoopUnroll=4 means there are 4 subiterations within the loop, 4 actual iterations written in the code. + # LoopUnroll=4 means there are 4 sub-iterations within the loop, 4 actual iterations written in the code. 
# LocalSplit=2 means the workgroup is split up into 2 subgroups, and each subgroup is doing different parts of the summation.
 # subgroup0 does k=0-3, 8-11... and subgroup1 does k=4-7, 12-15...
- # So, each iteration through the summation loop, which has 4 actual subiterations, does 8 summation iterations, because each subgroup did 4;
+ # So, each iteration through the summation loop, which has 4 actual sub-iterations, does 8 summation iterations, because each subgroup did 4;
 # and when data is read from global memory the threads read 8 elements along the summation dimension.
 # DepthU = LoopUnroll * LocalSplitU = 4*2 in this case
- # it made more sense for the user to directly control LocalSplitU and DepthU, then derrive afterwards LoopUnroll=DepthU/LocalSplitU
+ # it made more sense for the user to directly control LocalSplitU and DepthU, then derive afterwards LoopUnroll=DepthU/LocalSplitU
 # -1 : Only allow GLVW=1
 # -2 : Only allow max(GLVWA,GLVWB) < VW ?
 # -3 : Only allow min(GLVWA,GLVWB) < VW ?
@@ -1125,7 +1125,7 @@ def getArchitectureName(gfxName):
 # was previously a problem for TN since it implies DepthU is large, and that leads to oversubscription of LDS.
 #
 # Preconditions:
- # ScheduleIterAlg=3, TransposeLDS=1, PGR=0/1 exlcuding 2, DirectToLds=0 (DirectToLds=0 because part of the data loaded *need* to reside in registers),
+ # ScheduleIterAlg=3, TransposeLDS=1, PGR=0/1 excluding 2, DirectToLds=0 (DirectToLds=0 because part of the data loaded *needs* to reside in registers),
 # nRegs per load >= DepthULdsDivisor (since we artificially require at least 1 register per LDS write)
 #
 # Example: DepthULdsDivisor=2
@@ -1135,10 +1135,10 @@ def getArchitectureName(gfxName):
 # 2nd subloop writes v2,v3 to LDS
 "DepthULdsDivisor": [1, 2, 4],
- # integer ammount of padding to put into LDS, in 2016 this didn't seem to help performance, profilers were showing that channel conflicts weren't really hurting
+ # integer amount of padding to put into LDS, in 2016 this didn't seem to help performance, profilers were showing that channel conflicts weren't really hurting
 # performance so this has been deprecated and probably doesn't work
 # -1 means use same padding as the VectorWidth if TLU=0 else 0. (Padding only helps when transpose is required)
- # With MatrixInstruciton: -1 means max(GRVW,MIInput) if TLU=0
+ # With MatrixInstruction: -1 means max(GRVW,MIInput) if TLU=0
 "LdsPadA": [ -1, 0, 1, 2, 3, 4, 8, 16, 32],
 "LdsPadB": [ -1, 0, 1, 2, 3, 4, 8, 16, 32],
@@ -1149,7 +1149,7 @@ def getArchitectureName(gfxName):
 # -1 means round up to nearest power of 2 begin with 128
 "LdsBlockSizePerPad": [-1, 0, 64, 128, 256, 512, 1024],
- # Transpose LDS format. Local store in Coalsced dimension , same as optimized global fetch dimension . applicable only in TLU=0 case for miSIMD(s)
+ # Transpose LDS format. Local store in Coalesced dimension, same as optimized global fetch dimension. Applicable only in TLU=0 case for miSIMD(s)
 # TODO: No code for -1 ?
"TransposeLDS": [-1, 1, 0], @@ -1158,7 +1158,7 @@ def getArchitectureName(gfxName): "PerformanceWaitLocation": list(range(-1, 16*16+1)), "PerformanceWaitCount": list(range(-1, 16)), - # add gls or slc after global memory read/writes to change cacheing, not cacheing the writes is promising and improved performance a tiny bit + # add gls or slc after global memory read/writes to change caching, not caching the writes is promising and improved performance a tiny bit # 1: glc, 2: slc, 3: glc+slc "NonTemporalD": list(range(0,4)), "NonTemporalC": list(range(0,4)), @@ -1179,7 +1179,7 @@ def getArchitectureName(gfxName): # For example, InnerUnroll=2 will fetch LDS for two unroll iterations "InnerUnroll": [1,2,4,8,16,32,64], - # Arrange elements in LDS so N elements consec in U-dim are adjacent in LDS + # Arrange elements in LDS so N elements consecutive in U-dim are adjacent in LDS # 1 is default and results in no interleaving. # Implementation only supports LocalDotLayout that is a power-of-two "LocalDotLayout": [1,2,4,8], @@ -1381,7 +1381,7 @@ def getArchitectureName(gfxName): "TensorBFormat", # see validTensorBFormats "TensorDFormat", # see validTensorDFormats - # Each of the parms below specifies dimensions separated by 'x". + # Each of the params below specifies dimensions separated by 'x". # - The notation follows 'convolution' convention so fastest-moving dimensions are last, # and should mirror the order of the spatial dimension in the activation format. # For example, in NCHW format Filter=3x1 is 3 in the H dimension and 1 in the W dimension. @@ -1422,7 +1422,7 @@ def getArchitectureName(gfxName): # iteration count for the unroll loop. # If 0: # - Unroll index is filter index (Forward,BackwardData) or spatial index (BackwardWeights) - # - provides better cache locality for most formats, but tigher looping. + # - provides better cache locality for most formats, but tighter looping. # - Likely a good idea with PackSummationDims=1 since there is only one unroll loop. "UnrollOnChannel", @@ -1430,7 +1430,7 @@ def getArchitectureName(gfxName): # Optional parameter for debug and testing. This does not impact kernel generation. # If set,then each problem dimension size/stride will be checked to ensure they are # correctly specified. (TBD) - # Also used by testbenches to compute consistent strides and sizes for auto-generated + # Also used by test benches to compute consistent strides and sizes for auto-generated # problem sizes and strides. 'Spatial', # examples 56x56, 7x7. @@ -1440,7 +1440,7 @@ def getArchitectureName(gfxName): # Default Problem Type ################################################################################ defaultProblemType = { - # =GEMM uses TransposeA,B paramters and makes the problem type more readeable for users + # =GEMM uses TransposeA,B parameters and makes the problem type more readable for users # =TensorContraction requires specifying "OperationType": "GEMM", # GEMM, TensorContraction, ConvolutionForward, ConvolutionBackwardData, ConvolutionBackwardWeights @@ -1543,7 +1543,7 @@ def getArchitectureName(gfxName): # - Typical use case is to set summationStride < freeSize, with padStart+padEnd+1 == summationStride. 
# - Caveats: # - ZeroPad requires that the ElementEdge <= 2^32: - # This is SizeFree+SizeSum + Pad_Leading + PadTrailingPad + padding=GRWW for shift-pointer) bytes < 2^32 + # This is SizeFree+SizeSum + Pad_Leading + PadTrailingPad + padding=GRVW for shift-pointer) bytes < 2^32 # Likely this is less than the standard buffer load limits (bottom-right corner of macro-tile) # EX: ZeroPadA: [ [0,1, 2,3]] # TensorA free index 0 with sum index 1 has leading pad=2 and trailing pad=3 @@ -1680,47 +1680,52 @@ def locateExe( defaultPath, exeName ): # /opt/rocm/bin, hip-clang def GetAsmCaps(isaVersion): """ Determine assembler capabilities by testing short instructions sequences """ rv = {} - rv["SupportedISA"] = tryAssembler(isaVersion, "") - rv["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,1") - rv["HasExplicitNC"] = tryAssembler(isaVersion, "v_add_nc_u32 v0,v0,1") + rv["SupportedISA"] = tryAssembler(isaVersion, "") + rv["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,1") + rv["HasExplicitNC"] = tryAssembler(isaVersion, "v_add_nc_u32 v0,v0,1") - rv["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds") \ - or tryAssembler(isaVersion, "buffer_load_b32 v40, v36, s[24:27], s28 offen offset:0 lds") - rv["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2") - rv["HasLshlOr"] = tryAssembler(isaVersion, "v_lshl_or_b32 v47, v36, 0x2, v34") - rv["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34") - rv["HasCodeObjectV3"] = tryAssembler(isaVersion, "", False, "-mcode-object-version=2") + # Syntax of DirectToLds loads has changed: destination vgpr should be omitted + # Old syntax should be removed in a future update as it is no longer supported + rv["HasDirectToLdsDest"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds") \ + or tryAssembler(isaVersion, "buffer_load_b32 v40, v36, s[24:27], s28 offen offset:0 lds") + rv["HasDirectToLdsNoDest"] = tryAssembler(isaVersion, "buffer_load_dword v36, s[24:27], s28 offen offset:0 lds") \ + or tryAssembler(isaVersion, "buffer_load_b32 v36, s[24:27], s28 offen offset:0 lds") - rv["HasMFMA"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x2bf16 a[0:31], v32, v33, a[0:31]") - rv["HasMFMA_f64"] = tryAssembler(isaVersion, "v_mfma_f64_16x16x4f64 v[0:7], v[32:33], v[36:37], v[0:7]") - rv["HasMFMA_bf16_1k"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x4bf16_1k a[0:31], v[32:33], v[36:37], a[0:31]") + rv["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2") + rv["HasLshlOr"] = tryAssembler(isaVersion, "v_lshl_or_b32 v47, v36, 0x2, v34") + rv["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34") + rv["HasCodeObjectV3"] = tryAssembler(isaVersion, "", False, "-mcode-object-version=2") - rv["v_mac_f16"] = tryAssembler(isaVersion, "v_mac_f16 v47, v36, v34") + rv["HasMFMA"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x2bf16 a[0:31], v32, v33, a[0:31]") + rv["HasMFMA_f64"] = tryAssembler(isaVersion, "v_mfma_f64_16x16x4f64 v[0:7], v[32:33], v[36:37], v[0:7]") + rv["HasMFMA_bf16_1k"] = tryAssembler(isaVersion, "v_mfma_f32_32x32x4bf16_1k a[0:31], v[32:33], v[36:37], a[0:31]") - rv["v_fma_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0,0]") - rv["v_fmac_f16"] = tryAssembler(isaVersion, "v_fma_f16 v47, v36, v34") + rv["v_mac_f16"] = tryAssembler(isaVersion, "v_mac_f16 v47, v36, v34") - rv["v_pk_fma_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, 
v36, v34, v47, op_sel:[0,0,0]")
- rv["v_pk_fmac_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34")
- rv["v_mad_mix_f32"] = tryAssembler(isaVersion, "v_mad_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
- rv["v_fma_mix_f32"] = tryAssembler(isaVersion, "v_fma_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
+ rv["v_pk_fma_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34, v47, op_sel:[0,0,0]")
+ rv["v_pk_fmac_f16"] = tryAssembler(isaVersion, "v_pk_fma_f16 v47, v36, v34")
- rv["v_dot2_f32_f16"] = tryAssembler(isaVersion, "v_dot2_f32_f16 v20, v36, v34, v20")
- rv["v_dot2c_f32_f16"] = tryAssembler(isaVersion, "v_dot2c_f32_f16 v47, v36, v34") \
- or tryAssembler(isaVersion, "v_dot2acc_f32_f16 v47, v36, v34")
+ rv["v_mad_mix_f32"] = tryAssembler(isaVersion, "v_mad_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
+ rv["v_fma_mix_f32"] = tryAssembler(isaVersion, "v_fma_mix_f32 v47, v36, v34, v47, op_sel:[0,0,0] op_sel_hi:[1,1,0]")
- rv["v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34")
- rv["v_dot4c_i32_i8"] = tryAssembler(isaVersion, "v_dot4c_i32_i8 v47, v36, v34")
- rv["VOP3v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34, v47")
+ rv["v_dot2_f32_f16"] = tryAssembler(isaVersion, "v_dot2_f32_f16 v20, v36, v34, v20")
+ rv["v_dot2c_f32_f16"] = tryAssembler(isaVersion, "v_dot2c_f32_f16 v47, v36, v34") \
+ or tryAssembler(isaVersion, "v_dot2acc_f32_f16 v47, v36, v34")
- rv["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22")
- rv["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23")
- rv["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22")
+ rv["v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34")
+ rv["v_dot4c_i32_i8"] = tryAssembler(isaVersion, "v_dot4c_i32_i8 v47, v36, v34")
+ rv["VOP3v_dot4_i32_i8"] = tryAssembler(isaVersion, "v_dot4_i32_i8 v47, v36, v34, v47")
- rv["v_fma_f64"] = tryAssembler(isaVersion, "v_fma_f64 v[20:21], v[22:23], v[24:25], v[20:21]")
+ rv["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22")
+ rv["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23")
+ rv["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22")
- rv["HasAtomicAdd"] = tryAssembler(isaVersion, "buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:0")
+ rv["v_fma_f64"] = tryAssembler(isaVersion, "v_fma_f64 v[20:21], v[22:23], v[24:25], v[20:21]")
+
+ rv["HasAtomicAdd"] = tryAssembler(isaVersion, "buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:0")
 if tryAssembler(isaVersion, "s_waitcnt vmcnt(63)"):
@@ -1819,7 +1824,7 @@ def detectGlobalCurrentISA():
 for line_in in process.stdout.decode().splitlines():
 if 'gcnArchName' in line_in:
 line += line_in.split()[1]
- break # detemine if hipinfo will support multiple arch
+ break # determine if hipinfo will support multiple arch
 arch = gfxArch(line.strip())
 if arch is not None:
 if arch in globalParameters["SupportedISA"]:
@@ -1897,7 +1902,7 @@ def assignGlobalParameters( config ):
 """ Assign Global Parameters
 Each global parameter has a default parameter, and the user
- can override them, those overridings happen here
+ can override them; the overriding happens here
 """
 global globalParameters
@@ -1917,7 +1922,7 @@ def assignGlobalParameters( config ):
 if configValue == defaultValue:
print2(" %24s: %8s (same)" % (key, configValue)) else: - print2(" %24s: %8s (overriden)" % (key, configValue)) + print2(" %24s: %8s (overridden)" % (key, configValue)) else: print2(" %24s: %8s (unspecified)" % (key, defaultValue)) @@ -2025,7 +2030,7 @@ def assignGlobalParameters( config ): for key in config: value = config[key] if key not in globalParameters: - printWarning("Global parameter %s = %s unrecognised." % ( key, value )) + printWarning("Global parameter %s = %s unrecognized." % ( key, value )) globalParameters[key] = value def setupRestoreClocks(): diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index a3b25872d..7cd72e8c5 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -743,7 +743,10 @@ def initKernel(self, kernel, tPA, tPB ): self.AsmBugs["ExplicitCO"] = globalParameters["AsmCaps"][self.version]["HasExplicitCO"] self.AsmBugs["ExplicitNC"] = globalParameters["AsmCaps"][self.version]["HasExplicitNC"] - if not globalParameters["AsmCaps"][self.version]["HasDirectToLds"]: + hasDtl = globalParameters["AsmCaps"][self.version]["HasDirectToLdsDest"] or globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] + if not hasDtl: + if kernel["DirectToLds"]: + printExit("DirectToLds requested, but not available on this architecture ( {} )".format(self.version)) kernel["DirectToLdsA"] = False kernel["DirectToLdsB"] = False kernel["LocalWriteUseSgprA"] = False # Requires DirectToLdsA @@ -2223,6 +2226,16 @@ def defineBufferMemoryMacros(self): replace = f'{type_list[t]}' if (self.version[0] < 11) else f'{t}' kStr += self.generalMacro('buffer_load_', origin, replace, 'dst', 'voffset', 'base', 'soffset', 'offen', 'ioffset', 'md0', 'md1', 'md2') + self.endLine + # Extra macro for DirectToLds loads with no destination register + type_list = { + 'b32' : 'dword', + 'u16' : 'ushort' + } + for t in type_list: + origin = f'{t}' + replace = f'{type_list[t]}' if (self.version[0] < 11) else f'{t}' + kStr += self.generalMacro('buffer_load_', origin + '_dtl', replace, 'voffset', 'base', 'soffset', 'offen', 'ioffset', 'md0', 'md1', 'md2') + self.endLine + type_list = { 'b32' : 'dword', 'b64' : 'dwordx2', @@ -7294,8 +7307,11 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): extraFields += " glc" if tP["NonTemporal"]//2==1: extraFields += " slc" + dtlNoDestVgpr = False if kernel["DirectToLds%s"%tc]: extraFields += " lds" + dtlNoDestVgpr = globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] + directToLdsLoads = 0 prevLdsOffset = 0 @@ -7503,6 +7519,7 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): addr0=vgpr(offsetVgpr), addr1=sgpr("Srd%s"%tc, 4), \ soffset=soffset, offset=offset, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=hi16, \ comment=comment).toStr() @@ -7529,6 +7546,7 @@ def globalReadGuardK(self, kernel, tP, vregSetIdx): addr0=vgpr("GlobalReadAddr%s+%u"%(tc,graIdx),2), addr1="", \ soffset=0, offset=0, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=hi16, \ comment="load one flat value").toStr() @@ -7793,8 +7811,10 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): extraFields += " glc" if tP["NonTemporal"]//2==1: extraFields += " slc" + dtlNoDestVgpr = False if kernel["DirectToLds%s"%tc]: extraFields += " lds" + dtlNoDestVgpr = globalParameters["AsmCaps"][self.version]["HasDirectToLdsNoDest"] directToLdsLoads = 0 instOffset = 0 @@ -7900,6 +7920,7 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): addr0=vgpr(offsetVgpr), addr1=sgpr("Srd%s"%tc, 4), \ 
soffset=soffset, offset=instOffset, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=(kernel["ProblemType"]["DataType"].isHalf() or kernel["ProblemType"]["DataType"].isBFloat16()) and loopCnt%2==1, \ comment="G -> Reg %u_%u_%u_%u"%(para, sPara, perp, sPerp))) @@ -7925,6 +7946,7 @@ def globalReadDo(self, kernel, mode, tP, vregSetIdx=0): addr0=vgpr("GlobalReadAddr%s+%u"%(tc,graIdx),2), addr1="", \ soffset=0, offset=0, \ extraFields=extraFields, \ + dtlNoDestVgpr=dtlNoDestVgpr, \ hi16=(kernel["ProblemType"]["DataType"].isHalf() or kernel["ProblemType"]["DataType"].isBFloat16()) and loopCnt%2==1, \ comment="G -> Reg %u_%u_%u_%u"%(para, sPara, perp, sPerp ))) @@ -10982,7 +11004,7 @@ def globalWriteElements(self, kernel, vectorWidths, elements, # bpl = bytes per load op ############################################################################## def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ - addr0, addr1, soffset, offset, extraFields, hi16=0, comment="load C"): + addr0, addr1, soffset, offset, extraFields, dtlNoDestVgpr, hi16=0, comment="load C"): # rpv = regs per vector rpv = bpl/4.0 @@ -11000,34 +11022,25 @@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ assert 0, "offset too large and soffset set" if extraFields != "": tailFields += ", %s"% extraFields + globalReadInst = None if bpl==1 and hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_hi_u8", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_hi_u8" + rpv *= 4 elif bpl==1 and not hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_u8", vgpr(destVgpr, rpv*4), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_u8" + rpv *= 4 elif bpl==2 and hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_hi_b16", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_hi_b16" + rpv *= 2 elif bpl==2 and not hi16: - rv.addCode(Code.GlobalReadInst("_buffer_load_d16_b16", vgpr(destVgpr, rpv*2), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_d16_b16" + rpv *= 2 elif bpl==4: - rv.addCode(Code.GlobalReadInst("_buffer_load_b32", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b32" elif bpl==8: - rv.addCode(Code.GlobalReadInst("_buffer_load_b64", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b64" elif bpl==16: - rv.addCode(Code.GlobalReadInst("_buffer_load_b128", vgpr(destVgpr, rpv), addr0, \ - addr1, soffset, tailFields, comment)) - return rv + globalReadInst = "_buffer_load_b128" elif bpl==32: # split into two dwordx4 loads. 
Second load offset is +0.5 bpl tailFields1 = "offen offset:%u"%(offset + bpl/2) @@ -11039,9 +11052,17 @@ def chooseGlobalRead(self, useBuffer, bpl, destVgpr, \ addr1, soffset, tailFields, comment)) rv.addCode(Code.GlobalReadInst("_buffer_load_b128", vgpr(int(destVgpr + rpv/2), rpv/2), addr0, \ addr1, soffset, tailFields1, comment)) + return rv else: assert 0, "chooseGlobalRead: bad bpl" + if dtlNoDestVgpr: + globalReadInst += "_dtl" + args = [globalReadInst] + if not dtlNoDestVgpr: + args.append(vgpr(destVgpr, rpv)) + args.extend([addr0, addr1, soffset, tailFields, comment]) + rv.addCode(Code.GlobalReadInst(*args)) return rv else: @@ -11310,7 +11331,7 @@ def readCInput(self, kernel, ss, addrCalc, vc0, data, gwvw, addr, tmpS01): if kernel["ProblemType"]["DestDataType"].isHalf(): kStr += self.chooseGlobalRead(useBuffer, bps, data, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, \ - extraFields=extraStr, hi16=vc0 % 2, + extraFields=extraStr, dtlNoDestVgpr=False, hi16=vc0 % 2, comment="load C for beta calc").toStr() elif kernel["ProblemType"]["DestDataType"].isBFloat16() or \ kernel["ProblemType"]["DestDataType"].isInt32() or \ @@ -11321,6 +11342,7 @@ def readCInput(self, kernel, ss, addrCalc, vc0, data, gwvw, addr, tmpS01): kStr += self.chooseGlobalRead(useBuffer, bps, data, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, \ extraFields=extraStr, \ + dtlNoDestVgpr=False, \ comment="load C for beta calc").toStr() return kStr @@ -11501,6 +11523,7 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, vgprIdx = 1*(bpm//4) kStr += self.chooseGlobalRead(useBuffer, bpm, dataV+vgprIdx, \ addr0, addr1, soffset=0, offset=addrCalc.globalOffset, extraFields="", + dtlNoDestVgpr=False, \ comment="load D (atomic) bpm=%u vaw=%u"%(bpm,atomicW)).toStr() if kernel["InterleaveAlpha"] and applyAlpha: @@ -12172,19 +12195,19 @@ def globalWriteBatch(self, kernel, ss, batchIdx, applyAlpha, beta, edge, atomic, if kernel["ProblemType"]["DestDataType"].isHalf() or kernel["ProblemType"]["DestDataType"].isBFloat16(): if not kernel["ProblemType"]["HighPrecisionAccumulate"]: kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx//2, \ - addr0, addr1, soffset=0, offset=0, extraFields="", hi16=sumIdx%2).toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False, hi16=sumIdx%2).toStr() else: kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx, \ - addr0, addr1, soffset=0, offset=0, extraFields="", hi16=0).toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False, hi16=0).toStr() elif kernel["ProblemType"]["DestDataType"].isInt32() or kernel["ProblemType"]["DestDataType"].isSingle(): kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() elif kernel["ProblemType"]["DestDataType"].isDouble() or kernel["ProblemType"]["DestDataType"].isSingleComplex() : kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx*2, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() elif kernel["ProblemType"]["DestDataType"].isDoubleComplex(): kStr += self.chooseGlobalRead(useBuffer, bps, sumIdx*4, \ - addr0, addr1, soffset=0, offset=0, extraFields="").toStr() + addr0, addr1, soffset=0, offset=0, extraFields="", dtlNoDestVgpr=False).toStr() kStr += inst("s_waitcnt", "vmcnt(0)", "CheckStoreC, wait for stores to complete" ) if 
self.archCaps["SeparateVscnt"]: kStr += inst("s_waitcnt_vscnt", "null", "0", "writes") diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 15ab45103..87e63f6d7 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -2435,6 +2435,11 @@ def isDirectToLdsDoable(state, tc): #TN # use for all precisions with TransposeLDS=1 + numRegisters = state["ProblemType"]["DataType"].numRegisters() + if numRegisters * state["GlobalLoadVectorWidth%c"%tc] != 1: + reject(state, "DirectToLds can only be used with buffer loads requiring 1 register") + return False + if state["ProblemType"]["DataType"].isHalf(): if state["AssertSummationElementMultiple"] % (2 * state["GlobalLoadVectorWidth%c"%tc]) != 0: reject(state, "can't use DirectToLds for FP16 with AssertSummationElementMultiple %u" % state["AssertSummationElementMultiple"]) @@ -3437,6 +3442,9 @@ def assignDerivedParameters(state): state["DirectToLdsB"] = True state["LocalWriteUseSgprB"] = True #print("DirectToLdsB", state["DirectToLdsB"]) + + if state["Valid"] and state["DirectToLds"] and not (state["DirectToLdsA"] or state["DirectToLdsB"]): + printWarning("DirectToLds requested, but not enabled for A or B, check kernel configuration!") # Update parent variable so kernel display is accurate state["DirectToLds"] = state["DirectToLdsA"] or state["DirectToLdsB"] diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml diff --git a/Tensile/Tests/pre_checkin/direct_to_lds/dtl_dgemm_lite.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/direct_to_lds/dtl_dgemm_lite.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml b/Tensile/Tests/disabled/direct_to_lds/dtl_hgemm.yaml similarity index 100% rename from Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml rename to Tensile/Tests/disabled/direct_to_lds/dtl_hgemm.yaml diff --git a/tox.ini b/tox.ini index eb965b16e..cb38e81fb 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,6 @@ [tox] -envlist = py35,py36,py27,lint +envlist = py35,py36,py38,py27,lint + [testenv] # Some versions of Pytest versions have a bug: